## import and summary data<- read.table(file="C:/r_data/therbook/SimpleRegression.txt", header=TRUE) data names(data) attach(data) par(cex.lab=1.3, cex.axis=1.3, font.axis=5, font.lab=6, cex=1.3, pch=16) ##plot the first dataset plot(x2, y2, xlab="X", ylab="Y", main="Regression model", pch=16, cex=1, cex.lab=1, cex.axis=1) ## Estimate the simple regression line mod<-lm(y2~x2) abline(mod) ## Add segments segments(x2,fitted(mod),x2,y2,lty=2) ## Add some text to the graph text(0.3,45,labels="Is the relation linear?",cex=2) #Model checking #1. Linearity res<-residuals(mod) yfit<-fitted(mod) plot(yfit, res, ylab="Residuals", xlab="Fitted", main="Residuals vs fitted") abline(lm(res~yfit)) #2. Normality library(car) qq.plot(res, col='black') qqnorm(scale(res)) abline(0,1) shapiro.test(res) #3. Omoscedasticity plot(yfit, abs(res), ylab="Residuals", xlab="Fitted", main="Residuals in absolute value vs fitted") g<-lm(abs(res)~ yfit) abline(g) summary(g) #4. Outlier library(outliers) help(dixon.test) library(car) outlier.test(mod) ##test to spot outliers help(outlier.test) plot(mod) ############################################################### ## Example 2 ## Estimate the simple regression line plot(x1, y1, xlab="X", ylab="Y", main="Regression model", pch=16,cex=1.2) mod1<-lm(y1~x1) abline(mod1) #1. Linearity res<-residuals(mod1) yfit<-fitted(mod1) plot(yfit, res, ylab="Residuals", xlab="Fitted", main="Residuals vs fitted") abline(lm(res~yfit)) #2. Normality qqnorm(scale(res)) abline(0,1) shapiro.test(res) #3. Omoscedasticity plot(yfit, abs(res), ylab="Residuals", xlab="Fitted", main="Residuals in absolute value vs fitted") g<-lm(abs(res)~ yfit) summary(g) ###Built-in function to check the assumptions of the regression plot(mod1, which=1) plot(mod1, which=2) plot(mod1, which=3) plot(mod1, which=4) ## if the Cook's distance is >1 be careful #4. Outlier library(car) outlier.test(mod) ##If assumptions are not fulfilled, we have two possibilities: #(a) transformation or #(b) polynomial regression ################################################################################ #a Transformation ## Box-cox transformation library(MASS) boxcox(mod1, plotit=TRUE) ## grafico ##Log-transformation x1_log<-log(x1) plot(x1_log,y1) mod3<-lm(y1~x1_log) summary(mod3) abline(mod3) mod3<-lm(y1~x1_log+I(x1_log^2)) summary(mod3) #1. Linearity res<-residuals(mod3) yfit<-fitted(mod3) plot(yfit, res, ylab="Residuals", xlab="Fitted", main="Residuals vs fitted") abline(lm(res~yfit)) plot(x1_log, res, ylab="Residuals", xlab="X", main="Residuals vs predictor") abline(lm(res~x1_log)) #2. Normality qqnorm(scale(res)) abline(0,1) shapiro.test(res) #3. Omoscedasticity plot(yfit, abs(res), ylab="Residuals", xlab="Fitted", main="Residuals in absolute value vs fitted") g<-lm(abs(res)~ yfit) summary(g) #4. Outlier library(car) outlier.test(mod) ################################################################################ ############### b polynomial regression ####################################à# plot(x1,y1, xlab="X", ylab="Y", main="Quadratic regression", pch=16, cex=1.2, cex.lab=1.5, cex.axis=1.5, cex.main=1.5) modx<-lm(y1~x1+I(x1^2)+I(x1^3)) summary(modx) ##can we simplify the model? ## remove non-significant term modx<-lm(y1~x1+I(x1^2)) ## polynomial regression summary(modx) x<-0:60 x y<-predict(modx, list(x1=x)) y lines(x,y) plot(modx) ################################################################################ ################################################################################ rm(list=ls(all=TRUE)) ##remove all the objects ################################################################################ ##Multiple regression models ###We want to test the effect of age and 2 hormones' concentration [Metabolite] ## import and summary blood.data<- read.table(file="C:/r_data/therbook/ozone.data.txt", header=TRUE) attach(blood.data) names(blood.data)<-c("Age","HormoneI","HormoneII","Metabolite") attach(blood.data) #1. Explore the linearity and the degree of correlation between predictors pairs(blood.data, panel=panel.smooth) ##get an idea on the relation you've got cor(blood.data) ##correlation ##Start with a complex model ##Fit a complicate model - from complex to simple model1<-lm(Metabolite~Age*HormoneI*HormoneII+I(Age^2)+I(HormoneI^2)+I(HormoneII^2)) summary(model1) ##Now check the parameters. Look at the highest interaction: is it significant? ## DELETE JUST ONE TERM A TIME!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ##Simplification (NB starts with the highest interactions!!!!!!!!!!!!!) model2<-update(model1,~. - Age:HormoneI:HormoneII) summary(model2) #Look at the highest interaction: is it significant? model3<-update(model2,~. - Age:HormoneI) summary(model3) #Look at the highest interaction: is it significant? model4<-update(model3,~. - HormoneI:HormoneII) summary(model4) #Look at the highest interaction: is it significant? model5<-update(model4,~. - Age:HormoneII) summary(model5) #Look at the quadratic term: is it significant? model6<-update(model5,~. - I(Age^2)) coef(model6) summary(model6) #Now we have our MAM (Minimum adequate model) ###Check assumptions plot(model6) ###outlyer test library(car) outlier.test(model6) ##good or bad??? ##try some transformation ################################################################################ ##Log-transformation of Y model7<-lm(log(Metabolite)~Age+HormoneI+HormoneII+I(HormoneI^2)+I(HormoneII^2)) summary(model7) model8<-update(model7,~. - I(HormoneI^2)) summary(model8) plot(model8) plot(model8, which=4) ################################################################################ ######################## Variation partitioning ############################## #Separate the effect relative effect of Age and the concetration of two hormones #I and II) on [Metabolite] ##Work with the MAM model and then separate the relative effects #The question is: How much variation can age explain accounting for the #effect of hormones? #Load vegan package library(vegan) #Have a look at the manual to work out your script help(varpart) varpart(log(Metabolite), ~Age, ~ HormoneI+HormoneII+I(HormoneII^2)) showvarparts(2, labels=c("Age","shared","Hormones","Unexplained"))