# Māhuahua Session # Statistics with R workshop # 6 July 2017 # A "pound" symbol (aka "hashtag") preceeding some text turns the text green. This is called a comment. It will not run ##### You can put as as many "pound" symbols as you like. Commenting allows you to annotate your scripts. **VERY IMPORTANT** # A scripting or script language is a programming language that supports scripts: # programs written for a special run-time environment that automate the execution # of tasks that could alternatively be executed one-by-one by a human operator. # Scripting languages are often interpreted (rather than compiled). (via Wikipedia) # Hello World! print("hello world") # basic concepts # math 1 + 1 # assignment x <- 5 #built in functions log(10) pi sin(pi/2) floor(5.7) ceiling(5.7) round(5.7) round(5.4) # creating a vector y <- c(10,1,12,13,2,4) y 2 * y x <- 2 * y x # vector functions max(x) min(x) mean(x) median(x) range(x) sum(x) sd(x) sort(x) ################################################ # setwd() # set working directory # setwd(mydirectory) # change to mydirectory # setwd("c:/docs/mydir") # note / instead of \ in windows # setwd("/usr/rob/mydir") # on linux # On your computer, navigate to the folder where you downloaded the data # Identify the pathway of the data and type/paste it into the function setwd(). # Make sure the pathway is in double quotes # set working diretory # setwd("/Users/jonathankoch/Google Drive/PIPES/PIPES2017/Statistics101/MahuahuaStatsSession/DataAnalysis") #mac # list files # list.files() # read data # df <- read.csv("MouseData_v1.csv", header = T) #reads a .csv file # You can setwd to access data. However, for the purpose of this tutorial, we will # use the fread() function to download data from the web! #call library data.table library(data.table) # if you get an error, you may need to download it with the following command install.packages("data.table") library(ggplot2) # download the MouseData_v1.csv from my webpage df <- fread('https://jonathanbkoch.weebly.com/uploads/1/6/1/2/16126472/mousedata_v1.csv') # print out header columns and read the header columns into R names(df) attach(df) # how many rows of data do I have? nrow(df) # how many columns of data do I have? ncol(df) # let's take a look at our data in the "Environment Window" # Discuss the dataset # A PIPES student travelled to Nevada to study the effect of climate and plant # abundance (seeds) on mouse health. The researcher collected weight, sex, elevation, # host plant abundance, and temperature. # In the study ################################################################################################### # what kind of data do I have? # unique unique(df$Sex) # Did we collect both male and female data? df$CollectionEvent unique(df$CollectionEvent) sort(unique(df$CollectionEvent)) # how many unique values do I have? length(unique(df$CollectionEvent)) # Use the functions: "unique", "sort", and "length", # to examine other columns in the dataset, and answer # the following questions: # 1) How many seasons did I collect data in? What are they? # 2) How many unique Sites was data collected from? # 3) How many unique Age Classes are there? What are they? ################################################################################################# # visualize your response variable (dependent variable) hist(df$Weight_g) ?hist() hist(df$Weight_g, main = "Histogram of Mouse Weight Measrements (g)") # Examine histograms and check for normality of the other variables in your data (df) # e.g., Elevation and Temperature # 2 minutes #################### T.Test ############################ # Is there a difference in female vs. male weights? # Is data normally distributed? hist(df$Weight_g2) # T-test # Null Hypothesis: # Alternative Hypothesis: t.test(df$Weight_g2~df$Sex) #parametric wilcox.test(df$Weight_g2~df$Sex) #non parametric; notice the differences # What does this result look like? boxplot(df$Weight_g2~df$Sex) # This boxplot is accurately labeled! We need x-axis, y-axis, and a title boxplot(df$Weight_g2~df$Sex, xlab = "Sex", ylab = "Weight (g)", main = "Weight (g) differences in Male and Female Mice") # Use the t.test and boxplot knowledge that you gained to see if there # is a difference in mouse weights (regardless of sex) with respect to # seasonal differences (Spring vs. Summer)? # Null Hypothesis: # Alternative Hypothesis: # What is the t-value? # How many df? # What is the p-value? # What are the means (averages) of each group (i.e., Spring vs. Summer)? # Make sure to label your boxplot. boxplot(df$Weight_g~df$Season, xlab = "Season", ylab = "Weight (g)", main = "Weight (g) differences between Spring and Summer") ################# ANOVA ################################ # Sort f.df <- df[which(df$Sex == 'F'), ] #extract data that are only female m.df <- #extract data thar are only male nrow(f.df) hist(f.df$Weight_g2) shapiro.test(f.df$Weight_g2) a1 <- aov(f.df$Weight_g2~f.df$AgeClass) summary(aov(f.df$Weight_g2~f.df$AgeClass)) # Plotting the data library(plyr) # Run the functions length, mean, and sd on the value of "Weight_g2" for each group, # broken down by AgeClass of females cdata <- ddply(f.df, c("AgeClass"), summarise, N = length(Weight_g2), mean = mean(Weight_g2), sd = sd(Weight_g2), se = sd / sqrt(N) ) cdata # make a bargraph (appropriate for ANOVA and other parametric tests) p<- ggplot(cdata, aes(x=AgeClass, y=mean)) + geom_bar(stat="identity", color="black", position=position_dodge()) + geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.2, position=position_dodge(.9)) + xlab("Age Class") + ylab("Weight (g)") print(p) posthoc <- TukeyHSD(x=a1, 'f.df$AgeClass', conf.level=0.95) posthoc summary(a1) ############## Correlation ########################## hist(df$Temperature_C) # Not Normal (use kendall rank correlation) shapiro.test(df$Temperature_C) ?cor.test #examine optioins cor.test(df$Elevation_m, df$Temperature_C, method = "kendall") plot(df$Elevation_m, df$Temperature_C) #plot the data plot(df$Elevation_m, df$Temperature_C, xlab = "Elevation (m)", ylab = "Temperature (C)", main = "Relationship between Temperature (C) and Elevation (m)") ######### Linear Regression ############################# # Examine the effects of Elevation, Season, AgeClass, HostPlant Abundance on Weight of Female Mouse hist(f.df$Weight_g2) null <-lm(Weight_g2~1, data = f.df) # null model m1 <- lm(Weight_g2~Elevation_m, data = f.df) # alternative model m1 summary(m1) anova(null, m1, test = "Chisq") #is the alternative model better than the null model? #Deviance = 76.061, P < 2.2e-16 # Scatterplot of Elevation vs Weight in Female mice plot(f.df$Elevation_m, f.df$Weight_g2) #scatterplot; make a scatterplot with appropriate labels and title # Histogram of residuals hist(residuals(m1)) #do shapiro.wilk test of residuals # Residuals Vs. Explanatory Variable (X-variable) to exame linearity and homoscedasticity plot(f.df$Elevation_m, residuals(m1)) cor.test(f.df$Elevation_m, residuals(m1)) resid_model <- lm(residuals(m1)~f.df$Elevation_m) abline(resid_model) ##### Chi-square ################################################### # Mouse abundance across three land types and in the presence abscence of disease sr <- matrix(c(78,28,10,92,42,46),ncol=3,byrow=TRUE) colnames(sr) <- c("Wild","City","Farm") rownames(sr) <- c("Disease","No-Disease") sr <- as.table(sr) sr chisq.test(sr) barplot(sr, legend.text = TRUE, beside = TRUE, args.legend = list(x ='topright', bty='n', inset=c(-0.10,0)), xlab = "", ylab = "", main = "Data") # label barplot with appropirate labels ######## END OF EXCERCISE ###################################################################### # EXTRA CODE wilcox.test() #wilcox test; non-parametric paired t-test and 2-sample t-test kruskal.test() cor.test() #parametric = Pearson's Correlation; Non-parametric = Spearman's and Kendall Tau # random numbers x <- runif(20, min=6.0, max=10.0) pf <-cbind(x) pf