##########################################################################
# Import the andy data set
library(readxl)
andy <- read_excel("andy.xlsx")
summary(andy)

# Multiple regression model: sales ~ price + advert
mod1 <- lm(sales ~ price + advert, data = andy)
print(mod1)    # only coefficients and intercept
summary(mod1)  # the full coefficient table

# Residuals and SSE
andy$res <- mod1$residuals
head(andy)  # visualize the first six units in the dataset
SSE <- sum(andy$res^2)
SSE

# Variance of the error term
df <- mod1$df.residual
df
var_e <- SSE / df
var_e
se_e <- sqrt(var_e)
se_e

# Confidence intervals for coefficients and intercept
summary(mod1)
confint(mod1)

# Hypothesis testing in multiple regression
# H0: B2 = 0, HA: B2 != 0 -- coefficient b2, two-tailed test
alpha <- 0.05
df <- mod1$df.residual  # degrees of freedom of the model (n - k - 1)
# NOTE: renamed from `qt` to `tcrit` so the variable does not shadow stats::qt()
tcrit <- qt(1 - alpha / 2, df)
tcrit <- c(-tcrit, tcrit)
tcrit
b2 <- coef(mod1)[["price"]]                   # save coefficient of price
seb2 <- coef(summary(mod1))[2, "Std. Error"]  # save std. error of price
t <- b2 / seb2  # t-statistic
t

# Coefficient b3 (advert)
alpha <- 0.05
df <- mod1$df.residual
tcrit <- qt(1 - alpha / 2, df)  # critical values
tcrit <- c(-tcrit, tcrit)
tcrit
b3 <- coef(mod1)[[3]]
seb3 <- sqrt(vcov(mod1)[3, 3])
t <- b3 / seb3
# Calculated t = 2.726 and critical value = 1.993. Since t > critical value,
# we reject the null hypothesis and conclude that there is a statistically
# significant relationship between advert and sales.

######################################################################################
################# Homoskedasticity and normality of the error term
plot(andy$price, andy$res)
plot(andy$advert, andy$res)
hist(andy$res)
ebar <- mean(andy$res)
sde <- sd(andy$res)

# Histogram of the residuals with an overlaid normal density curve
hist(andy$res, col = "grey", freq = FALSE, main = "", ylab = "density", xlab = "ehat")
curve(dnorm(x, ebar, sde), col = 2, add = TRUE, ylab = "density", xlab = "ehat")

# Jarque-Bera normality test (in package 'tseries')
# H0: residuals are normally distributed
# H1: residuals are not normally distributed
# If X-squared > chi-square critical value with 2 df -> reject normality.
library(tseries)
jarque.bera.test(andy$res)
# Chi-square critical value (2 df) for the normality test above:
# reject the normality assumption when the test statistic exceeds it.
alpha <- 0.05
qchisq(1 - alpha, 2)

##################################### F-test (joint hypothesis test) ######################
# Estimate the full (unrestricted) model with both regressors and save its SSE.
mod1 <- lm(sales ~ price + advert, data = andy)
andy$res <- mod1$residuals
head(andy)  # visualize the first six units in the dataset
SSEu <- sum(andy$res^2)
SSEu

# Estimate the restricted model and save its SSE.
# BUG FIX: with J = 2 restrictions (price and advert both zero) the restricted
# model must keep the intercept, i.e. sales ~ 1; the original `sales ~ +0`
# dropped the intercept as well, inflating SSEr and the F statistic.
mod2 <- lm(sales ~ 1, data = andy)
summary(mod2)
andy$res <- mod2$residuals
head(andy)  # visualize the first six units in the dataset
SSEr <- sum(andy$res^2)
SSEr
# BUG FIX: renamed from `F` so the statistic does not shadow the logical F.
Fstat <- ((SSEr - SSEu) / 2) / (SSEu / 72)  # J = 2 restrictions, 72 residual df
Fstat

# Goodness-of-fit in multiple regression
mod1 <- lm(sales ~ price + advert, data = andy)
smod1 <- summary(mod1)
adjRsq <- smod1$adj.r.squared  # adjusted R^2
adjRsq
AIC(mod1)
BIC(mod1)

####### Indicator variables
# Load the utown data.
# BUG FIX: read_excel() comes from readxl (not readr), and the result must be
# assigned to `utown` -- the original assigned it to `data`, so every
# subsequent reference to `utown` would fail.
library(readxl)
utown <- read_excel("utown.xlsx")
summary(utown)
# Declare the dummy variables as factors
utown$utown <- as.factor(utown$utown)
utown$pool <- as.factor(utown$pool)
utown$fplace <- as.factor(utown$fplace)
mod4 <- lm(price ~ sqft + utown + age + pool + fplace, data = utown)
summary(mod4)
summary(mod4)$adj.r.squared
# Coefficients
coef(mod4)

################################################################################
################################################################################
# Omitted variable bias
edu_inc <- read_excel("edu_inc.xlsx")
summary(edu_inc)
mod1 <- lm(faminc ~ he + we, data = edu_inc)
mod2 <- lm(faminc ~ he, data = edu_inc)  # we omit the wife's earnings (we)
summary(mod1)
summary(mod2)  # the coefficients display different values from mod1

# Irrelevant variables
mod3 <- lm(faminc ~ he + we + kl6, data = edu_inc)
mod4 <- lm(faminc ~ he + we + kl6 + xtra_x5 + xtra_x6, data = edu_inc)
library(stats)
smod1 <- summary(mod1)
Rsq <- smod1$r.squared
AdjRsq <- smod1$adj.r.squared
aic <- AIC(mod1)
bic <- BIC(mod1)
c(Rsq, AdjRsq, aic, bic)

# Collinearity test
library(readxl)
cars <- read_excel("cars.xlsx")
mod1 <- lm(mpg ~ cyl, data = cars)
mod2 <- lm(mpg ~ cyl + eng + wgt, data = cars)
library(car)
# Variance inflation factor for each regressor of mod2.
# Rule of thumb: a regressor produces collinearity if its VIF exceeds 10.
tab <- vif(mod2)
tab

#####################################################################################################
# Heteroskedasticity
library(lmtest)  # provides bptest()
library(broom)
food <- read_excel("food.xlsx")
mod1 <- lm(food_exp ~ income, data = food)

# Scatter of the data with the fitted regression line overlaid
plot(food$income, food$food_exp, type = "p", xlab = "income", ylab = "food expenditure")
abline(mod1)

res <- residuals(mod1)
yhat <- fitted(mod1)
plot(food$income, res, xlab = "income", ylab = "residuals")  # income vs residuals
plot(yhat, res, xlab = "fitted values", ylab = "residuals")  # fitted values vs residuals

# Heteroskedasticity tests: the Breusch-Pagan heteroskedasticity test
summary(mod1)
library(lmtest)
bptest(mod1)