##################################################################### ###### Simple regression model ######### ##################################################################### #set your Working directory #setwd("~/..../dataset") # load data on food expenditure #food<- read.csv("food.csv",sep=",",dec=".", header=TRUE) library(readxl) food <- read_excel("food.xlsx") str(food) summary(food) boxplot(food$food_exp) boxplot(food$income) hist(food$income) # graphical representation of two variable income and food exp plot(x=food$income, y=food$food_exp, main="Food_exp ~ Income") cor(food$income,food$food_exp ) #rho of pearson # fit the simple linear regression model (function lm) linearMod <- lm(food_exp ~ income, data=food) # call for coefficients: #several ways in R print(linearMod) #only coefficient and intercept beta_food<-coef(linearMod) # save coeff. beta_food linearMod$coefficients # recall coeff. # interpret the coeff b1 and b2 names(linearMod) # to see all the elements stored in the lm command names summary(linearMod) # see all the ouput computed by R # plot: scatter plot + regress line plot(x=food$income, y=food$food_exp, main="Food_exp ~ Income") abline(a = 83.42, b = 10.21) abline(linearMod) # alternative way # mean value of the residuals mean(linearMod$residuals) # Assumption SR2:The expected value of the random error e is ZERO # elasticity - epsilon (slope*(x/y)) # mean x and y variables ym<-mean(food$food_exp) xm<-mean(food$income) e<-10.21*(xm/ym) e #elasticity=0.70 # We estimate that a 1% increase in weekly household income will lead, on average, #to a 0.71% increase in weekly household expenditure on food, #when x and y take their sample mean values, (x; y) = (19:60; 283:57). # Since the estimated income elasticity is less than one, we would classify #food as a ??????necessity?????? rather than a ??????luxury,?????? which is consistent with what we would expect for an average household. #prediction (Yhat) # estimates of expected food expenditure when income=20 Yhat<-83.42+10.21*(20) expected<- predict(linearMod, newdata = data.frame(income = 20),interval = "confidence") print(expected) ######################################################################################################################### ######################################################################################################################### # assessing the least square - least squares residuals # degree of freedom of linear model df<-linearMod$df.residual #n-2 b1 b2 df #SSE - squared sum of errors SSE<- sum(linearMod$residuals^2) VAR<-SSE/(df) #estimated variance of the error (variance of the regression) VAR # standard error of the regression SER<- sqrt(VAR) #measure of goodness of fit SER ################################################################################ # NON LINEAR RELATIONAHIP # Quadratic and log-linear models #load data br br<- read.csv("br.csv",sep=",",dec=".", header=TRUE) summary(br$PRICE) summary(br$SQFT) boxplot(br$PRICE) hist(br$PRICE) boxplot(br$SQFT) hist(br$SQFT) plot(br$SQFT, br$PRICE) cor(br$SQFT, br$PRICE) linearMod1 <- lm(PRICE ~ SQFT, data=br) # build linear regression model on full data print(linearMod1) #save coefficinet beta<-coef(linearMod1) #squared model - add new variable in the dataset br$sqft2<-br$SQFT^2 summary(br$sqft2) plot(br$sqft2, br$PRICE) linearMod2 <-lm(PRICE~I(SQFT^2), data=br) print(linearMod2) b1 <- coef(linearMod2)[[1]] b2 <- coef(linearMod2)[[2]] #Y=55776.56 + 0.0154x # slope: 2*b2*X # eg: For a 2000-square-foot house: 2*0.0154*2000 2*0.0154*8000 # = 61.6 (price increase for a 2000 sqft house) # Elasticity - estimate that a 1% increase in house size will increase price by 1.05% # epsilon=slope*(x/y) # eg. elasticity sqft=2000 y=55776.56 + 0.0154*2000^2 x=2000 e=61.6*(x/y) # 1.049 # elasticity for more than 1 x value..... sqftx=c(2000, 4000, 6000) #given values for sqft pricex=b1+b2*sqftx^2 #prices corresponding to given sqft DpriceDsqft <- 2*b2*sqftx # marginal effect of sqft on price elasticity=DpriceDsqft*sqftx/pricex b1; b2; DpriceDsqft; elasticity #prints results # log- linear model #exponential model - -log-lin model hist(br$PRICE, col='grey') # asimmetric distrib - long righ tail ... use log(price) hist(log(br$PRICE), col='grey') # run log-lin mod (y is in logaritm) linearMod3 <-lm(log(PRICE)~SQFT, data=br) print(linearMod3) b1 <- coef(linearMod3)[[1]] b2 <- coef(linearMod3)[[2]] b2*100 #semi-elasticity # for a one-square-foot increase in size, we estimate a price increase of 0.04% # a 100-square-foot increase will increase price by approximately 4%. #save b1 and b2 in vectors b1 <- coef(linearMod3)[[1]] b2 <- coef(linearMod3)[[2]] # predicted price #for a house with 2000 SQFT -> antilogaritm -> exp function Yhat<-exp(b1+b2*2000) Yhat #the predicted price (Yhat) is 115975.47 # slope log-lin model: b2*y # for a house price equal to 100000 slope=b2*100000 # 41,12 slope=b2*500000 # 205,63 #elasticity (b2*x) b2*2000 # elasticity: For a house with 2000-square-feet, the estimated elasticity is 0.823: # A 1% increase in house size with 2000 SQFT is estimated to increase selling price by 0.823% b2*4000 # elasticity: For a house with 4000-square-feet, the estimated elasticity is 1,64: # A 1% increase in house size with 4000 SQFT is estimated to increase selling price by 1.64% # semielasticity (b2*100) b2*100 # for a one-square-foot increase in size, we estimate a price increase of 0.04% ########################################################## ########## Linear regression model ########## ########## Indicator variables ########## ######################################################### #indicator variables utown<- read.csv("utown.csv",sep=",",dec=".", header=TRUE) summary(utown) #utown dicotomicos variable #freq. distr table(utown$utown) #absolute freq. prop.table(table(utown$utown)) #relative freq. #price istogram hist(utown$price) #price istogram by utown hist(utown$price[utown$utown==0], breaks=10) # utown=0 hist(utown$price[utown$utown==1], breaks=10) # utown=1 #plot price - utown plot(x=utown$utown, y=utown$price) abline(mod1) #linear model mod1 <- lm(price ~ utown, data=utown) print(mod1) #example with pool (1=yes; 0=no) mod2 <- lm(price ~ pool, data=utown) print(mod2)