HW9: Organizing Code With Structured Programming
April 18, 2024
User-designed Functions
In order to generate random data, and run linear regression and ANOVA on the dataset several functions were designed as shown below:
- sim_norm(): a function that creates a simulated (from a normal
distribution) AGER gene expression dataset for different groups.
- Input Argument: ‘list’ (a list containing the mean, variance and size of each categories) and ‘groups’ (an atomic vector of all the predictor groups)
- Output: ‘sim_df’ (a dataframe containing a unique ID variable, Group and Resp_var (simulated from normal distribution))
- Reg_Stat(): Performs the linear regression analysis on a dataset
- Input Arguments: ‘df’ (a dataframe containing the variable the linear regression will be ran on)
- Output: ‘a’ (an atomic vector containing the coefficient ‘b’ (Response = a + bPredictor), the p-value and conclusion (significant (p < 0.05) or insignificant))
- Reg_Plot(): Plot the dataset
- Input Arguments: ‘df’ (a dataframe containing the variable to be plotted)
- Output: ‘plot’ (a scatterplot with a trendline showing the correlation between the response and predictor variables))
- ANOVA_Stat(): a function that runs the ANOVA analysis on a dataset
- Input: ‘df’ (a dataframe containing the variable the ANOVA will be ran on)
- Outputs: ‘a’ (an atomic vector containing the p-value and conclusion (significant (p < 0.05) or insignificant))
- ANOVA_Plot(): Plot the dataset
- Input Arguments: ‘df’ (a dataframe containing the variable to be plotted)
- Output: ‘plot’ (a boxplot)
Analysis
Two randomly generated datasets were simulated using the ‘sim_norm()’ function and the the linear regression analysis was ran on the first dataset using the ‘Reg_Stat()’ and ‘Reg_Plot()’ functions and ANOVA was ran on the second dataset using the ‘ANOVA_Stat()’ and ‘ANOVA_Plot()’ functions. The ooutputs from these analyses are shown in Tables 1 and 2 and Fig 1.
To automate running these tests several times, a for loop was used to run the ANOVA analysis 10 times and the output shown in Table 3.
# Relevant libraries ------------------------------------------------------
library(tidyverse) # To use ggplot2 and dplyr packages
library(kableExtra) # To make fancy tables
library(patchwork)
# FUNCTIONS ------------------------------------------------------
##############################################################################################
# FUNCTION: sim_rnorm()
# Description: a function that creates a simulated gene expression dataset for different groups.
# Input Arguments: 'list' (a list containing the mean, variance and size of each categories) and 'groups' (an atomic vector of all the predictor groups)
# Output: 'sim_df' (a dataframe containing a unique ID variable, Group and Resp_var (simulated from normal distribution))
#___________________________________________________________________________________________
<- function(groups = c(), list = input){
sim_rnorm
#Times = c("24h", "+4h", "+24h", "5d")
= rnorm(n = list[[1]][1], mean = list[[1]][2], sd = sqrt(list[[1]][3]))
g1 = rnorm(n = list[[2]][1], mean = list[[2]][2], sd = sqrt(list[[2]][3]))
g2 = rnorm(n = list[[3]][1], mean = list[[3]][2], sd = sqrt(list[[3]][3]))
g3 = rnorm(n = list[[4]][1], mean = list[[4]][2], sd = sqrt(list[[4]][3]))
g4
<- c(g1, g2, g3, g4)
Resp_var<- 1:length(Resp_var)
ID <- c (rep(groups[1], length(g1)), rep(groups[2], length(g2)), rep(groups[3], length(g3)), rep(groups[4], length(g4))) # Assigning Time categories to the rnorm() generated data
Group
<- data.frame(ID, Group, Resp_var) # Create a dataframe from the simulated Group and Resp_var data.
sim_df
return(sim_df)
}
##############################################################################################
##############################################################################################
# FUNCTION: Reg_Stat()
# Description: Performs linear regression on a dataset
# Input Arguments: 'df' (a dataframe containing the variable the linear regression will be ran on)
# Output: p-value, coefficient and conclusion (significant / not-significant)
#___________________________________________________________________________________________
<- function(df = DF){
Reg_Stat
<- lm(Resp_var ~ Group, data = df)
z <-summary(z)$coefficients[2,1] # pulls out the coefficient of the relationship btw the response and predictor variables
coeff<- summary(z)$coefficients[2,4] # pulls out the p-value of the relationship btw the response and predictor variables
pVal
<-c(coeff, pVal, ifelse(pVal < 0.05, "Significant", "Non-significant"))
a names(a) <- c("Coefficient", "P-Value", "Conclusion")
return(a)
}
##############################################################################################
##############################################################################################
# FUNCTION: Reg_Plot()
# Description: Plot the datset
# Input Arguments: 'df' (a dataframe containing the variable the ANOVA will be ran on)
# Output:'plot'
#___________________________________________________________________________________________
<- function(df = DF){
Reg_Plot
### Plot of the data
<- ggplot(df) +
plot aes(x = Group, y = Resp_var) +
geom_point() +
stat_smooth(method=lm,se=0.95) + # default se=0.95
labs(title = "Simulated AGER Response vs Time", x = "Time (hrs)", y = "Simulated AGER Expression")
return(plot)
}################################################################################################
##############################################################################################
# FUNCTION: ANOVA_Stat()
# Description: Runs the ANOVA on a dataset
# Input Arguments: 'df' (a dataframe containing the variables the ANOVA will be ran on)
# Output: p-value and conclusion (significant/not significant)
#___________________________________________________________________________________________
<- function(df = DF){
ANOVA_Stat
<- aov(Resp_var ~ Group, data = df)
z <- unlist(summary(z))[9] # pulls out the p-value of the relationship btw the response and predictor variables
pVal
<-c(pVal, ifelse(pVal < 0.05, "Significant", "Non-significant"))
a names(a) <- c("P-Value","Conclusion")
return(a)
}
##############################################################################################
##############################################################################################
# FUNCTION: ANOVA_Plot()
# Description: Plot the datset
# Input Arguments: 'df' (a dataframe containing the variable the ANOVA will be ran on)
# Output:'plot'
#___________________________________________________________________________________________
<- function(df = DF){
ANOVA_Plot
### Plot of the data (boxplot)
<- ggplot(df) +
plot aes(x = Group, y = Resp_var, fill = Group) +
geom_boxplot() +
labs(title = "Simulated AGER Response vs Stiffness",x = "Stiffness (kPA)", y = "Simulated AGER Expression")
return(plot)
}################################################################################################
# Global Variables ------------------------------------------------------
= 84; mean_24h = 0.8439886; var_24h = 0.0604894;
n_24h = 86; mean_48h = 2.2456403; var_48h = 3.0218083;
n_48h = 84; mean_72h = 2.7526489; var_72h = 7.8850666;
n_72h = 82; mean_144h = 2.4733090; var_144h = 9.2351024;
n_144h
# Create a list with all input data
<- list(sim24h = c(n_24h, mean_24h,var_24h),
input1 sim48h = c(n_48h, mean_48h,var_48h),
sim72h = c(n_72h, mean_72h,var_72h),
sim144h = c(n_144h, mean_144h,var_144h))
= 96; mean_2kpa = 1.190448; var_2kpa = 0.650671;
n_2kpa = 96; mean_16kpa = 1.886097; var_16kpa = 2.287650;
n_16kpa = 95; mean_64kpa = 2.730308; var_64kpa = 10.417837;
n_64kpa = 49; mean_plastic = 2.892691; var_plastic = 8.975734;
n_plastic
<- list(sim2kpa = c(n_2kpa, mean_2kpa, var_2kpa),
input2 sim16kpa = c(n_16kpa, mean_16kpa, var_16kpa),
sim64kpa = c(n_64kpa, mean_64kpa, var_64kpa),
simplastic = c(n_plastic, mean_plastic, var_plastic))
# Simulating Random Dataset and Running LR and ANOVA ------------------------------------------------------
<- sim_rnorm(groups = c(24, 48, 72, 144), list = input1) # create a randomly generated Time and AGER dataset
df1 <- Reg_Stat(df1) # computes the linear regression analysis of AGER(response) and Time (predictor)
LR <- Reg_Plot(df1) # plot the AGER and Time variables in the dataframe
p1 kable(data.frame(t(LR)), caption = "Table 1 - Sample Linear Regression Analysis Output")
Coefficient | P.Value | Conclusion |
---|---|---|
0.0127452755442848 | 4.87035159561295e-06 | Significant |
<- sim_rnorm(groups = c("2kpa", "16kpa", "64kpa", "Plastic"), list = input2)
df2 <- ANOVA_Stat(df2)
AV <- ANOVA_Plot(df2)
p2 kable(data.frame(t(AV)), caption = "Table 2 - Sample ANOVA Output")
P.Value | Conclusion |
---|---|
3.73175692738276e-08 | Significant |
/p2 + plot_annotation(tag_levels = "A") # Return both plots 1 and 2 on the same figure p1
# Using a for loop to automate the ANOVA analysis --------------------------------------------------------------------
<- NULL
AV_Stats
for (i in seq_len(10)){
<- sim_rnorm(groups = c("2kpa", "16kpa", "64kpa", "Plastic"), list = input2)
df2
<- data.frame(t(i))
Test <- data.frame(t(ANOVA_Stat(df2)))
AnovaOutput
<- bind_rows(AV_Stats, cbind(Test, AnovaOutput))
AV_Stats
}
names(AV_Stats) <- c("Test No.", "P-Value", "Conclusion")
kable(AV_Stats, caption = "*Table 3 - P-Values and Conclusions from Running ANOVA Ten Times on Randomly Generated Data*", align = "c")
Test No. | P-Value | Conclusion |
---|---|---|
1 | 4.50746673009505e-10 | Significant |
2 | 0.000879276854316227 | Significant |
3 | 1.24489616406898e-07 | Significant |
4 | 4.63994484898574e-07 | Significant |
5 | 5.23012754567744e-05 | Significant |
6 | 7.42684006051392e-11 | Significant |
7 | 1.20313076467997e-10 | Significant |
8 | 0.00065235808088938 | Significant |
9 | 5.09822158669315e-05 | Significant |
10 | 0.000232057689730662 | Significant |