##########################################################################
# Import the andy data set
library(readxl)
andy <- read_excel("andy.xlsx")
summary(andy)

# Multiple regression model: sales ~ price + advert
mod1 <- lm(sales ~ price + advert, data = andy)
print(mod1)    # only coefficients and intercept
summary(mod1)  # the full coefficient table

# Residuals and SSE
andy$res <- mod1$residuals
head(andy)  # visualize the first six units in the dataset
SSE <- sum(andy$res^2)
SSE

# Variance of the error term
df <- mod1$df.residual
df
var_e <- SSE / df
var_e
se_e <- sqrt(var_e)
se_e

# Confidence intervals for coefficients and intercept
summary(mod1)
confint(mod1)

# Hypothesis testing in multiple regression
# H0: B2 = 0, HA: B2 != 0 -- coefficient b2, two-tailed test
alpha <- 0.05
df <- mod1$df.residual  # degrees of freedom of the model (n - k - 1)
# NOTE: renamed from `qt` to `tcrit` so the variable does not shadow stats::qt()
tcrit <- qt(1 - alpha / 2, df)
tcrit <- c(-tcrit, tcrit)
tcrit
b2 <- coef(mod1)[["price"]]                   # save coefficient of price
seb2 <- coef(summary(mod1))[2, "Std. Error"]  # save std. error of price
t <- b2 / seb2  # t-statistic
t

# Coefficient b3 (advert)
alpha <- 0.05
df <- mod1$df.residual
tcrit <- qt(1 - alpha / 2, df)  # critical values
tcrit <- c(-tcrit, tcrit)
tcrit
b3 <- coef(mod1)[[3]]
seb3 <- sqrt(vcov(mod1)[3, 3])
t <- b3 / seb3
# Calculated t = 2.726 and critical value = 1.993. Since t > critical value,
# we reject the null hypothesis and conclude that there is a statistically
# significant relationship between advert and sales.

######################################################################################
################# Homoskedasticity and normality of the error term
plot(andy$price, andy$res)
plot(andy$advert, andy$res)
hist(andy$res)
ebar <- mean(andy$res)
sde <- sd(andy$res)

# Histogram of the residuals with an overlaid normal density curve
hist(andy$res, col = "grey", freq = FALSE, main = "", ylab = "density", xlab = "ehat")
curve(dnorm(x, ebar, sde), col = 2, add = TRUE, ylab = "density", xlab = "ehat")

# Jarque-Bera normality test (in package 'tseries')
# H0: residuals are normally distributed
# H1: residuals are not normally distributed
# If X-squared > chi-square critical value with 2 df -> reject normality.
library(tseries)
jarque.bera.test(andy$res)
# Chi-square critical value (2 df) for the normality test above:
# reject the normality assumption when the test statistic exceeds it.
alpha <- 0.05
qchisq(1 - alpha, 2)

##################################### F-test (joint hypothesis test) ######################
# Estimate the full (unrestricted) model with both regressors and save its SSE.
mod1 <- lm(sales ~ price + advert, data = andy)
andy$res <- mod1$residuals
head(andy)  # visualize the first six units in the dataset
SSEu <- sum(andy$res^2)
SSEu

# Estimate the restricted model and save its SSE.
# BUG FIX: with J = 2 restrictions (price and advert both zero) the restricted
# model must keep the intercept, i.e. sales ~ 1; the original `sales ~ +0`
# dropped the intercept as well, inflating SSEr and the F statistic.
mod2 <- lm(sales ~ 1, data = andy)
summary(mod2)
andy$res <- mod2$residuals
head(andy)  # visualize the first six units in the dataset
SSEr <- sum(andy$res^2)
SSEr
# BUG FIX: renamed from `F` so the statistic does not shadow the logical F.
Fstat <- ((SSEr - SSEu) / 2) / (SSEu / 72)  # J = 2 restrictions, 72 residual df
Fstat

# Goodness-of-fit in multiple regression
mod1 <- lm(sales ~ price + advert, data = andy)
smod1 <- summary(mod1)
adjRsq <- smod1$adj.r.squared  # adjusted R^2
adjRsq
AIC(mod1)
BIC(mod1)

####### Indicator variables
# Load the utown data.
# BUG FIX: read_excel() comes from readxl (not readr), and the result must be
# assigned to `utown` -- the original assigned it to `data`, so every
# subsequent reference to `utown` would fail.
library(readxl)
utown <- read_excel("utown.xlsx")
summary(utown)
# Declare the dummy variables as factors
utown$utown <- as.factor(utown$utown)
utown$pool <- as.factor(utown$pool)
utown$fplace <- as.factor(utown$fplace)
mod4 <- lm(price ~ sqft + utown + age + pool + fplace, data = utown)
summary(mod4)
summary(mod4)$adj.r.squared
# Coefficients
coef(mod4)

################################################################################
################################################################################
# Omitted variable bias
edu_inc <- read_excel("edu_inc.xlsx")
summary(edu_inc)
mod1 <- lm(faminc ~ he + we, data = edu_inc)
mod2 <- lm(faminc ~ he, data = edu_inc)  # we omit the wife's earnings (we)
summary(mod1)
summary(mod2)  # the coefficients display different values from mod1

# Irrelevant variables
mod3 <- lm(faminc ~ he + we + kl6, data = edu_inc)
mod4 <- lm(faminc ~ he + we + kl6 + xtra_x5 + xtra_x6, data = edu_inc)
library(stats)
smod1 <- summary(mod1)
Rsq <- smod1$r.squared
AdjRsq <- smod1$adj.r.squared
aic <- AIC(mod1)
bic <- BIC(mod1)
c(Rsq, AdjRsq, aic, bic)

# Collinearity test
library(readxl)
cars <- read_excel("cars.xlsx")
mod1 <- lm(mpg ~ cyl, data = cars)
mod2 <- lm(mpg ~ cyl + eng + wgt, data = cars)
library(car)
# Variance inflation factor for each regressor of mod2.
# Rule of thumb: a regressor produces collinearity if its VIF exceeds 10.
tab <- vif(mod2)
tab

#####################################################################################################
# Heteroskedasticity
library(lmtest)  # provides bptest()
library(broom)
food <- read_excel("food.xlsx")
mod1 <- lm(food_exp ~ income, data = food)

# Scatter of the data with the fitted regression line overlaid
plot(food$income, food$food_exp, type = "p", xlab = "income", ylab = "food expenditure")
abline(mod1)

res <- residuals(mod1)
yhat <- fitted(mod1)
plot(food$income, res, xlab = "income", ylab = "residuals")  # income vs residuals
plot(yhat, res, xlab = "fitted values", ylab = "residuals")  # fitted values vs residuals

# Heteroskedasticity tests: the Breusch-Pagan heteroskedasticity test
summary(mod1)
library(lmtest)
bptest(mod1)