# Load the food expenditure data --------------------------------------------
# food.xlsx contains food_exp (food expenditure) and income.
library(readxl)
food <- read_excel("~/Dropbox/DIDATTICA/Econometria a.a.2023_2024/Dati/food.xlsx")

# Fit the simple linear regression food_exp ~ income.
# Named `fit` rather than `lm` so the base function stats::lm() is not shadowed.
fit <- lm(food_exp ~ income, data = food)

# Output table from the regression model.
print(fit)
summary(fit)

# Confidence intervals: confint() estimates the CIs (default level 95%).
confint(fit)

# Goodness-of-Fit ------------------------------------------------------------
# The total variation of y about its sample mean, SST, decomposes into the
# variation explained by the regression line, SSR, plus the residual variation
# about the line, SSE. The coefficient of determination, R^2, is the
# proportion of the total variation in y explained by the regression:
# R^2 = SSR / SST = 1 - SSE / SST.

# Pearson correlation coefficient (rho); in the SIMPLE linear regression
# model its square equals R^2, so it is itself a goodness-of-fit measure.
cor(food$food_exp, food$income)
(cor(food$food_exp, food$income))^2

# Standard error of the regression (SER), computed by hand.
SSE <- sum(fit$residuals^2)  # sum of squared residuals
n <- nrow(food)
var_e <- SSE / (n - 2)       # error-variance estimate (df = n - 2 in the simple model)
se_e <- sqrt(var_e)          # standard error of the error term (SER)

# Same quantity with a single command: summary(fit)$sigma is the SER.
sigma_m1 <- summary(fit)$sigma
sigma_m1

# R^2 and the SER also appear in the regression output.
summary(fit)

# To compute R^2 by hand you need SST, SSE, and SSR. Take them from the
# ANOVA decomposition of the fit instead of hard-coding numeric values
# (the original script pasted in fixed magic numbers), so the code works
# for any dataset.
aov_tab <- anova(fit)
SSR <- aov_tab["income", "Sum Sq"]     # sum of squares due to regression
SSE <- aov_tab["Residuals", "Sum Sq"]  # sum of squared errors
SST <- SSR + SSE                       # total sum of squares
R2 <- SSR / SST                        # equivalently: 1 - SSE / SST

# R^2 straight from the fitted model.
summary(fit)$r.squared      # for the simple linear regression model
summary(fit)$adj.r.squared  # adjusted R^2, used for multiple regression models

# Residuals and Diagnostics ---------------------------------------------------
# Plotting the residuals and calculating certain test statistics help decide
# whether assumptions such as homoskedasticity, absence of serial correlation,
# and normality of the errors are violated.
# In R, the residuals are stored in the `residuals` component of the
# regression output; resid() extracts them.
fit <- lm(food_exp ~ income, data = food)
ehat <- resid(fit)
plot(food$income, ehat, xlab = "income", ylab = "residuals")
# The spread of the residuals seems higher at higher incomes, which may
# indicate that the homoskedasticity assumption is violated.

# Normality assumption --------------------------------------------------------
# Normality of the residuals assures reliable hypothesis testing and
# confidence intervals even in small samples. It can be assessed by
# inspecting a histogram of the residuals and by performing a Jarque-Bera
# test, whose null hypothesis is "series is normally distributed"; a small
# p-value rejects the null, i.e. the series fails the normality test.

# The Jarque-Bera test requires the `tseries` package. Install it only if
# it is missing. NOTE: the package name must be a quoted character string —
# the original call install.packages(tseries) errors because no object
# named `tseries` exists.
if (!requireNamespace("tseries", quietly = TRUE)) {
  install.packages("tseries")
}
library(tseries)

ebar <- mean(ehat)
sde <- sd(ehat)

# Histogram of the residuals (density scale) with an overlaid normal curve
# using the residuals' own mean and standard deviation.
hist(ehat, col = "grey", freq = FALSE, main = "", ylab = "density", xlab = "ehat")
curve(dnorm(x, ebar, sde), col = 2, add = TRUE, ylab = "density", xlab = "ehat")

# Jarque-Bera test (in package `tseries`).
# H0: residuals are normally distributed
# H1: residuals are not normally distributed
# Reject H0 when the X-squared statistic exceeds the chi-squared critical
# value with 2 degrees of freedom.
jarque.bera.test(ehat)
alpha <- 0.05
qchisq(1 - alpha, 2)

# While the histogram may not strongly support one conclusion or another
# about the normality of ehat, the Jarque-Bera test is unambiguous: there
# is no evidence against the normality hypothesis.