## Stockton exercise - linear and non-linear regression models
##
## Fits a sequence of simple regressions of the selling price (sprice) on
## living area (livarea), age (age) and the large-lot indicator (lgelot),
## and compares the fits graphically and by their sum of squared errors.

# Sum of squared residuals (SSE) of a fitted model.
sse <- function(model) {
  sum(residuals(model)^2)
}

# Load the data. The former follow-up assignment `data <- stockton4` was
# dropped: it replaced the sheet just read with an object that this script
# never creates.
data <- readxl::read_excel("stockton4.xlsx")
summary(data)

# a) graphical representation: sprice against livarea ----
plot(data$livarea, data$sprice, main = "livarea ~ sprice")

# b) linear regression: sprice = b1 + b2 * livarea + e ----
mod1 <- lm(sprice ~ livarea, data = data)
summary(mod1)
coeff <- coef(mod1)
coeff
# Interpretation: with a slope of about 9181.7, the selling price increases
# by approx. 9182 dollars for each additional 100 square feet of living area.
# Intercept: taken literally, a house with 0 square feet costs -30069 dollars.
plot(data$livarea, data$sprice, main = "livarea ~ sprice")
abline(mod1)

# c) quadratic model: sprice = b1 + b2 * livarea^2 + e ----
data$livarea2 <- data$livarea^2
mod2 <- lm(sprice ~ livarea2, data = data)
summary(mod2)
coef(mod2)
# Marginal effect of an additional 100 square feet for a home with 1500
# square feet of living space: m.effect = 2 * b2 * livarea.
# b2 is read from the fitted model (about 212.611) rather than typed in.
b2 <- coef(mod2)[[2]]
livarea <- 15 # 1500 square feet, expressed in hundreds of square feet
m.effect <- 2 * b2 * livarea
# Adding 100 square feet of living space to a house of 1500 square feet is
# estimated to increase its expected price by approx. 6378 dollars.

# d) graphical representation with mod1 and mod2 ----
plot(data$livarea, data$sprice, main = "livarea ~ sprice")
abline(mod1)

plot(data$livarea, data$sprice, main = "livarea ~ sprice")
# Draw the fitted values in increasing order of livarea; joining them in row
# order would zigzag across the plot instead of tracing the quadratic curve.
data_by_livarea <- data[order(data$livarea), ]
lines(
  data_by_livarea$livarea,
  predict(mod1, newdata = data_by_livarea),
  col = "red"
)
lines(
  data_by_livarea$livarea,
  predict(mod2, newdata = data_by_livarea),
  col = "green"
)
# The quadratic model (green) appears to fit better.

# d, continued) obtain SSE ----
SSE1 <- sse(mod1) # SSE of the linear regression model
SSE1
SSE2 <- sse(mod2) # SSE of the quadratic regression model
SSE2
# The quadratic model appears to fit better since its SSE is lower.

# e) quadratic regression estimated separately by lot size ----
table(data$lgelot) # n = 95 large lots
# Subset with explicit data$ references instead of attach()/detach(), so the
# script's own variables (e.g. livarea above) cannot mask the columns.
newdata <- data[which(data$lgelot == 1), ] # n = 95 obs.
mod3 <- lm(sprice ~ livarea2, data = newdata)
summary(mod3)
mod3$coefficients
# (Intercept)    livarea2
# 113279.3592    193.8298
b1b <- coef(mod3)[[1]] # intercept, large lots
b2b <- coef(mod3)[[2]] # b2, large lots

newdata2 <- data[which(data$lgelot == 0), ] # houses not on a large lot
mod4 <- lm(sprice ~ livarea2, data = newdata2)
summary(mod4)
mod4$coefficients
# (Intercept)    livarea2
#  62172.4058    186.8586
b1s <- coef(mod4)[[1]] # intercept, small lots
b2s <- coef(mod4)[[2]] # b2, small lots

# The marginal effect of an extra 100 square feet of living area is
# 2 * b2 * livarea, so 2 * b2 is the effect per unit of livarea.
2 * b2b
2 * b2s
# Marginal effect of living area on price is 387.66$ * livarea for houses on
# large lots and 373.71$ * livarea for houses on small lots.

# Mean price for small and big lots:
# tapply(data$sprice, data$lgelot, mean)
# xm_big <- mean(newdata$sprice)
# xm_small <- mean(newdata2$sprice)

# f) price and age ----
# Plot sprice here: lnprice is only created further down, so plotting it at
# this point failed.
plot(data$age, data$sprice, main = "age ~ sprice")
mod5 <- lm(sprice ~ age, data = data)
mod5$coefficients
# The selling price is about 627 dollars lower for each additional year of
# age. The intercept suggests that a house of age 0 costs about 137403
# dollars.

# Log-linear model: ln(sprice) = b1 + b2 * age + e
data$lnprice <- log(data$sprice)
mod6 <- lm(lnprice ~ age, data = data)
mod6$coefficients
# Each additional year of age reduces the price by about 0.48%.
# Intercept: for age = 0 the estimated price of a new home is exp(b1),
# about 126121 dollars (b1 is about 11.745).
exp(coef(mod6)[[1]])

data_by_age <- data[order(data$age), ]

# Linear model
plot(data$age, data$sprice, main = "age ~ sprice")
lines(data_by_age$age, predict(mod5, newdata = data_by_age), col = "red")

# Log-linear model
plot(data$age, data$lnprice, main = "age ~ sprice")
lines(data_by_age$age, predict(mod6, newdata = data_by_age), col = "green")
# Based on the plots and the fitted regression lines, the log-linear model
# shows fewer prediction problems.

# NOTE(review): SSE5 is in squared dollars and SSE6 in squared log-dollars,
# so the two values are not directly comparable.
SSE5 <- sse(mod5) # linear
SSE5
SSE6 <- sse(mod6) # log-linear
SSE6

# g) regression with an indicator variable: sprice = b1 + b2 * lgelot + e ----
# lgelot is 1 for a large lot and 0 otherwise.
table(data$lgelot)
mod7 <- lm(sprice ~ lgelot, data = data)
mod7$coefficients
# Coefficients are read from the model instead of being typed in by hand.
b1 <- coef(mod7)[[1]] # about 115220.0
b2 <- coef(mod7)[[2]] # about 133797.3
# Expected selling price for a house on a large lot (lgelot == 1):
# b1 + b2 * 1
yhat_lage <- b1 + b2 * 1
# Expected selling price for a house not on a large lot (lgelot == 0):
# b1 + b2 * 0
yhat_notlarge <- b1 + b2 * 0
yhat_lage - yhat_notlarge
# This difference equals b2: the price increase when the house is located on
# a large lot compared to a small lot.