## Stockton exercise  - linear and non linear regression models 

# Load the Stockton house-price data from the Excel workbook.
# The function is namespace-qualified so the script does not rely on
# library(readxl) having been called beforehand.
stockton4 <- readxl::read_excel("stockton4.xlsx")
# Keep both names pointing at the table that was just read. Previously `data`
# was overwritten with a `stockton4` object this script never created, which
# fails in a fresh session.
data <- stockton4
summary(data)
# a) graphical representation: sprice against livarea
plot(data$livarea, data$sprice, main="livarea ~ sprice")

# b) simple linear regression: sprice = b1 + b2 * livarea + e
mod1 <- lm(formula = sprice ~ livarea, data = data)
summary(mod1)
(coeff <- coef(mod1))
# Interpretation of the slope (9181.7): the selling price increases by
# approx. 9182 dollars for each additional 100 square feet of living area.
# Interpretation of the intercept: a house with 0 square feet would cost
# -30069 dollars.

# Scatter plot of the data with the fitted regression line on top.
plot(x = data$livarea, y = data$sprice, main = "livarea ~ sprice")
abline(reg = mod1)

# c) quadratic model: sprice = b1 + b2 * livarea^2 + e
data$livarea2 <- data$livarea^2
mod2 <- lm(sprice ~ livarea2, data = data)
summary(mod2)
coef(mod2)
# Marginal effect of an additional 100 sq. feet for a home with 1500 sq. feet
# of living space: d(sprice)/d(livarea) = 2 * b2 * livarea
# Read b2 from the fitted model instead of retyping a rounded value, so the
# result stays consistent with mod2 if the data change.
b2 <- coef(mod2)[["livarea2"]]
livarea <- 15 # 1500 sq. feet, expressed in units of 100 sq. feet
m.effect <- 2 * b2 * livarea
m.effect
# Adding 100 sq. feet of living space to a house of 1500 sq. feet is estimated
# to increase its expected price by approx. 6378 dollars.

# d) graphical representation with mod1 and mod2
# Data with the linear fit only.
plot(data$livarea, data$sprice, main="livarea ~ sprice")
abline(mod1)

# Data with both fitted curves: linear (red) and quadratic (green).
plot(data$livarea, data$sprice, main="livarea ~ sprice")
# Fitted values must be joined in increasing livarea order; joining them in
# row order draws the quadratic curve as a tangle of criss-crossing chords.
# predict(..., newdata = data) returns one value per row of `data`, so the
# fitted values stay aligned with data$livarea even if lm() dropped rows.
livarea_order <- order(data$livarea)
lines(data$livarea[livarea_order],
      predict(mod1, newdata = data)[livarea_order],
      col = "red")
lines(data$livarea[livarea_order],
      predict(mod2, newdata = data)[livarea_order],
      col = "green")

# The quadratic model (green) appears to fit better.

# d) obtain the sum of squared errors (SSE) of both living-area models
sse_by_model <- vapply(
  list(linear = mod1, quadratic = mod2),
  function(fit) sum(fit$residuals^2),
  numeric(1)
)

SSE1 <- sse_by_model[["linear"]] # SSE of the linear regression model
SSE1

SSE2 <- sse_by_model[["quadratic"]] # SSE of the quadratic regression model
SSE2
# The quadratic model appears to fit better since its SSE is lower.

# e) regression on squared living area - large lots only
table(data$lgelot) # n = 95 large lots

# Subset with an explicit data$ reference instead of attach()/detach():
# attached columns can be silently masked by same-named global variables.
# which() is kept so that rows where lgelot is NA are still dropped.
newdata <- data[which(data$lgelot == 1), ] # n = 95 obs.

mod3 <- lm(sprice ~ livarea2, data = newdata)
summary(mod3)

mod3$coefficients
# (Intercept)    livarea2 
# 113279.3592    193.8298 

b1b <- coef(mod3)[[1]] # intercept, large lots
b2b <- coef(mod3)[[2]] # coefficient on livarea2, large lots

# Same regression for houses that are not on large lots (lgelot == 0).
# Subset with an explicit data$ reference instead of attach()/detach():
# attached columns can be silently masked by same-named global variables.
# which() is kept so that rows where lgelot is NA are still dropped.
newdata2 <- data[which(data$lgelot == 0), ]

mod4 <- lm(sprice ~ livarea2, data = newdata2)
summary(mod4)

mod4$coefficients
# (Intercept)    livarea2 
# 62172.4058    186.8586 

b1s <- coef(mod4)[[1]] # intercept, small lots
b2s <- coef(mod4)[[2]] # coefficient on livarea2, small lots

# In the quadratic model the marginal effect of an extra 100 sq. feet of
# living area is 2 * b2 * livarea, so 2 * b2 is the effect per unit of livarea.

2 * b2b
2 * b2s

# The marginal effect of living area on price is 387.66$ * livarea for houses
# on large lots.
# The marginal effect of living area on price is 373.71$ * livarea for houses
# on small lots.

# mean price for small and big lots
#tapply(data$sprice,data$lgelot, mean )
#xm_big<- mean(newdata$sprice)
#xm_small<- mean(newdata2$sprice)

# f) plot price against age

# Plot sprice here: the lnprice column is only created further down, so
# referring to data$lnprice at this point fails (or plots a stale column).
plot(data$age, data$sprice, main="age ~ sprice")
mod5 <- lm(sprice ~ age, data = data)
mod5$coefficients
# The selling price is 627$ lower for each additional year of age.
# The intercept suggests that a house of age 0 costs about 137,403$.

# log-linear model: ln(sprice) = b1 + b2 * age + e
data$lnprice <- log(data$sprice)
mod6 <- lm(lnprice ~ age, data = data)
mod6$coefficients

# Each additional year of age reduces the price by about 0.48%.
# Intercept: at age = 0 the estimated price of a new home is about 126,121$.
# Exponentiate the fitted intercept itself rather than a retyped, rounded
# value (11.745), so the figure always matches mod6.
exp(coef(mod6)[[1]])

# Fitted values are joined in increasing age order and taken from
# predict(..., newdata = data) so they stay aligned with data$age.
age_order <- order(data$age)

# linear model: price against age with the fitted line (red)
plot(data$age, data$sprice, main="age ~ sprice")
lines(data$age[age_order],
      predict(mod5, newdata = data)[age_order],
      col = "red")
# log-linear model: log price against age with the fitted line (green)
# (the title now names lnprice, the variable actually plotted)
plot(data$age, data$lnprice, main="age ~ lnprice")
lines(data$age[age_order],
      predict(mod6, newdata = data)[age_order],
      col = "green")

# Based on the plots and the fitted regression lines, the log-linear model
# shows fewer prediction problems.
SSE5 <- sum(mod5$residuals^2) # linear model, in squared dollars
SSE5

SSE6 <- sum(mod6$residuals^2) # log-linear model, in squared log-dollars
SSE6

# SSE5 and SSE6 are measured on different scales (price vs. log price), so
# they cannot be compared directly. Transform the log-linear predictions back
# to prices to get an SSE in the same units as SSE5.
SSE6_levels <- sum(
  (data$sprice - exp(predict(mod6, newdata = data)))^2,
  na.rm = TRUE
)
SSE6_levels

# g) regression: sprice = b1 + b2 * lgelot + e
# model with an indicator variable (lgelot: 1 = yes, 0 = no)
table(data$lgelot)

mod7 <- lm(sprice ~ lgelot, data = data)
mod7$coefficients
# Read the estimates from the fitted model instead of retyping rounded values,
# so the predictions below always match mod7.
b1 <- coef(mod7)[[1]] # intercept (approx. 115220.0)
b2 <- coef(mod7)[[2]] # coefficient on lgelot (approx. 133797.3)
# Expected selling price for a house on a large lot (lgelot = 1): b1 + b2 * 1
yhat_lage <- b1 + b2 * 1
# Expected selling price for a house not on a large lot (lgelot = 0):
# b1 + b2 * 0
yhat_notlarge <- b1 + b2 * 0

yhat_lage - yhat_notlarge # this value is equal to b2
# (the price increase when a house is on a large lot compared to a small lot)