# Load the food expenditure data --------------------------------------------
# food.xlsx contains food_exp (food expenditure) and income.
library(readxl)
food <- read_excel("~/Dropbox/DIDATTICA/Econometria a.a.2023_2024/Dati/food.xlsx")

# Fit the simple linear regression food_exp ~ income.
# Named `fit` rather than `lm` so the base function stats::lm() is not shadowed.
fit <- lm(food_exp ~ income, data = food)

# Output table from the regression model.
print(fit)
summary(fit)

# Confidence intervals: confint() estimates the CIs (default level 95%).
confint(fit)

# Goodness-of-Fit ------------------------------------------------------------
# The total variation of y about its sample mean, SST, decomposes into the
# variation explained by the regression line, SSR, plus the residual variation
# about the line, SSE. The coefficient of determination, R^2, is the
# proportion of the total variation in y explained by the regression:
# R^2 = SSR / SST = 1 - SSE / SST.

# Pearson correlation coefficient (rho); in the SIMPLE linear regression
# model its square equals R^2, so it is itself a goodness-of-fit measure.
cor(food$food_exp, food$income)
(cor(food$food_exp, food$income))^2

# Standard error of the regression (SER), computed by hand.
SSE <- sum(fit$residuals^2)  # sum of squared residuals
n <- nrow(food)
var_e <- SSE / (n - 2)       # error-variance estimate (df = n - 2 in the simple model)
se_e <- sqrt(var_e)          # standard error of the error term (SER)

# Same quantity with a single command: summary(fit)$sigma is the SER.
sigma_m1 <- summary(fit)$sigma
sigma_m1

# R^2 and the SER also appear in the regression output.
summary(fit)

# To compute R^2 by hand you need SST, SSE, and SSR. Take them from the
# ANOVA decomposition of the fit instead of hard-coding numeric values
# (the original script pasted in fixed magic numbers), so the code works
# for any dataset.
aov_tab <- anova(fit)
SSR <- aov_tab["income", "Sum Sq"]     # sum of squares due to regression
SSE <- aov_tab["Residuals", "Sum Sq"]  # sum of squared errors
SST <- SSR + SSE                       # total sum of squares
R2 <- SSR / SST                        # equivalently: 1 - SSE / SST

# R^2 straight from the fitted model.
summary(fit)$r.squared      # for the simple linear regression model
summary(fit)$adj.r.squared  # adjusted R^2, used for multiple regression models

# Residuals and Diagnostics ---------------------------------------------------
# Plotting the residuals and calculating certain test statistics help decide
# whether assumptions such as homoskedasticity, absence of serial correlation,
# and normality of the errors are violated.
# In R, the residuals are stored in the `residuals` component of the
# regression output; resid() extracts them.
fit <- lm(food_exp ~ income, data = food)
ehat <- resid(fit)
plot(food$income, ehat, xlab = "income", ylab = "residuals")
# The spread of the residuals seems higher at higher incomes, which may
# indicate that the homoskedasticity assumption is violated.

# Normality assumption --------------------------------------------------------
# Normality of the residuals assures reliable hypothesis testing and
# confidence intervals even in small samples. It can be assessed by
# inspecting a histogram of the residuals and by performing a Jarque-Bera
# test, whose null hypothesis is "series is normally distributed"; a small
# p-value rejects the null, i.e. the series fails the normality test.

# The Jarque-Bera test requires the `tseries` package. Install it only if
# it is missing. NOTE: the package name must be a quoted character string —
# the original call install.packages(tseries) errors because no object
# named `tseries` exists.
if (!requireNamespace("tseries", quietly = TRUE)) {
  install.packages("tseries")
}
library(tseries)

ebar <- mean(ehat)
sde <- sd(ehat)

# Histogram of the residuals (density scale) with an overlaid normal curve
# using the residuals' own mean and standard deviation.
hist(ehat, col = "grey", freq = FALSE, main = "", ylab = "density", xlab = "ehat")
curve(dnorm(x, ebar, sde), col = 2, add = TRUE, ylab = "density", xlab = "ehat")

# Jarque-Bera test (in package `tseries`).
# H0: residuals are normally distributed
# H1: residuals are not normally distributed
# Reject H0 when the X-squared statistic exceeds the chi-squared critical
# value with 2 degrees of freedom.
jarque.bera.test(ehat)
alpha <- 0.05
qchisq(1 - alpha, 2)

# While the histogram may not strongly support one conclusion or another
# about the normality of ehat, the Jarque-Bera test is unambiguous: there
# is no evidence against the normality hypothesis.