## Modello di Poisson a.a. 2024--2025
##Distribuzioni teoriche di Poisson

x.poi<-rpois(n=200,lambda=2.5)
hist(x.poi, xlab="y", ylab="Prob(Y=y)", main = expression (paste("Distribuzione di Poisson, ", theta, "=2.5")))


par(mfrow=c(2,2))
lambda <- 1:4
x <- 0:10
for (i in lambda) {
x1 <- dpois(x, lambda[i])
barplot(x1, names.arg=c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), xlab="y", ylab=expression(P(Y=y)), ylim=c(0,.4), col="red")
title(main = substitute(theta == i,list(i=i)))
}


##Stima di un modello di Poisson

require(arm)
##Numero di giorni di assenza a scuola
##in funzione dei voti in math, lingue e sesso
##We have attendance data on 316 high school juniors from two urban high schools. The response variable of interest is days absent, daysabs. The variables math and langarts give the standardized test scores for math and language arts respectively. The variable male is a binary indicator of student gender.
abs<-read.table("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/poissonreg.dat", sep=",")
names(abs)<-c("id", "school", "male", "math", "langarts", "daysatt", "daysabs")
abs[1:4,]
##Analisi descrittive
attach(abs)

par(mfrow=c(2,2))
hist(daysabs, breaks=20, main="distribuzione giorni di assenza a scuola", xlab="giorni di assenza", ylab="frequenza")
plot(math, daysabs, cex.lab=1.2, pch=19, col=ifelse(male==1, "blue", "red"), xlab="Voto in matematica", ylab="giorni di assenza")
legend(70, 40, c("Ragazzi", "Ragazze"), 
	col=c("blue", "red"), pch=19, cex=0.8)

plot(langarts,daysabs,  cex.lab=1.2, pch=19, col=ifelse(male==1, "blue", "red"), xlab="voto in letteratura", ylab="giorni di assenza")
legend(70, 40, c("Ragazzi", "Ragazze"), 
	col=c("blue", "red"), pch=19, cex=0.8)

plot(jitter(male, 0.5), daysabs,xaxt="n", xlab="genere", ylab="giorni di assenza a scuola")
axis(1, c(0, 1), labels = c("Ragazze", "Ragazzi"))



##Analisi di correlazione
cor(abs[,-1])

##stima del modello 
fit.1<-glm(daysabs ~male + rescale(math) + rescale(langarts), family="poisson") 
display(fit.1,4) ## coeff interpretatibili come tax di crescita
summary(fit.1)


fit.2<-glm(daysabs ~male + rescale(math) + rescale(langarts), offset=log(daysatt), family="poisson")
display(fit.2,4)
summary(fit.2)

fit.2$linear.predictors[1:10]
fit.2$fitted[1:10]

##valori stimati
exp(fit.2$linear.predictors[1:10])
## o anche
yhat <- predict (fit.2, type="response")
round(yhat[1:10])



##Overdispersione
hist(daysabs, freq=F, ylim=c(0, 0.3), main="Distribuzione giorni di assenza")
##il miglior adattamento con theta==2
points (sort(daysabs), dpois(sort(daysabs), 2), col="blue", cex=0.5)
lines (sort(daysabs), dpois(sort(daysabs), 2), col="red", cex=0.5)


##Valori previsti 
yhat <- predict (fit.2, type="response")

##Plot dei residui
##Residui dei Pearson
ri<-residuals(fit.2, type="pearson")

par(mfrow=c(2,2))
##grafico dei residui grezzi
plot(fit.2$fitted.values, residuals(fit.2, type="pearson"), xlab = "Valori Stimati", ylab = "Residui Grezzi", cex = 0.6, ylim=c(-10, 50))
abline(h = 0)  

##ovvero
plot(fit.2$fitted.values, abs$daysabs-exp(fit.2$linear.predictors), xlab = "Valori Stimati", ylab = "Residui Grezzi", cex = 0.6, ylim=c(-10, 50))
abline(h = 0)  


##residui di Pearson STANDARDIZZATI
## zi=(yi-theta_i)/sqrt(theta_i)

z<-(abs$daysabs-exp(fit.2$linear.predictors))/sqrt(exp(fit.2$linear.predictors))
plot(yhat, z, xlab = "Valori Stimati", ylab = "Residui standardizzati", cex = 0.6, ylim=c(-10, 10))
abline(h = 0) 
abline(h = -1, col="red") 
abline(h = 1, col="red") 


##H0: the variance is equal to the expectation
##Pearson scale overdispersion = Pearson Chi SQ / DF
##gradi di liberta' 312
n=316
k=4
qchisq(0.05, (n-k), lower.tail = FALSE)
stat.test<-sum(z^2)

##Rifiuto H0 perche' sum(z^2)>>>qchisq(0.05, (n-k), lower.tail = FALSE)
##p-value
cat ("p-value of overdispersion test is ", pchisq (sum(z^2), n-k), "\n")


## Stima del fattore di sovradispersione
cat ("overdispersion ratio is ", sum(z^2)/(n-k), "\n") 


##Alternativamente
##overdispersion = Deviance Chi SQ / DF
summary(fit.2)

res.deviance<-fit.2$deviance/fit.2$df.residual
df.residual<-fit.2$df.residual
pchisq(res.deviance, df.residual, lower.tail = FALSE)

##Sovradispersione in R 
fit.2a<-glm(daysabs ~male + rescale(math) + rescale(langarts), offset=log(daysatt), family="quasipoisson")
display(fit.2a,4)
summary(fit.2a)


##Incremento gli errori standard in base al coefficiente di sovradispersione
##Valuto lo standard error di male del modello di base
summary(fit.2)$coeff[2,2]
##Incremento lo standard error della radice del fattore di sovradispersione
summary(fit.2)$coeff[2,2]*sqrt(sum(z^2)/(n-k))

##confronto con modello con family quasipoisson
summary(fit.2a)$coeff[2,2]

##grafico dei valori osservati e stimati
plot(abs$math, abs$daysabs, pch="o", col="blue", main="Valori osservati e previsti", 
     xlab="Voto in matematica", ylab="Giorni di assenze")
points(predict (fit.2, type="response"), pch="p", col="red")
legend(80,40,c("obs","pred"), pch=c("o","p"), col=c("blue","red"))
 

##stima dei valori 
abs$phat <- predict(fit.2a, type = "response")
exp(fit.2a$linear.predictors[1:10])

require(ggplot2)
## create the plot
ggplot(abs, aes(x = math, y = yhat, colour = factor(male))) + 
  geom_point(aes(y = daysabs), alpha = 0.5, 
             position = position_jitter(h = 0.2)) + geom_line(size = 1)+
              labs(x = "Punteggio in matematica", y = "Numero atteso di giorni di assenze")



###Questi dati provengono da un articolo di McCullagh and Nelder (1989) relativo ad incidenti sulle navi
##The file has 34 rows corresponding to the observed combinations of type of ship, year of construction and period of operation. 
## Each row has information on five variables as follows:

ship<- read.table("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/ships.dat", sep="")
head(ship)
## ship type, coded 1-5 for A, B, C, D and E,
## year of construction (1=1960-64, 2=1965-70, 3=1970-74, 4=1975-79),
## period of operation (1=1960-74, 2=1975-79)
## months of service, ranging from 63 to 20,370, and
## damage incidents, ranging from 0 to 53.

par(mfrow = c(1, 2))
plot(damage ~ months, data = ship)
plot(log(damage + 0.5) ~ log(months), data = ship)

par(mfrow = c(1, 3))
plot(damage ~ as.factor(type), data = ship)
plot(damage ~ months, data = ship, pch = as.character(type))
plot(damage/months ~ as.factor(type), data = ship)

## Modello senza exposure
M0 <- glm(damage ~ type + construction + operation, data = ship, family = poisson)
display(M0)

##Introduco l'exposure (mesi di servizio--in log come da analisi descrittiva)
M1 <- glm(damage ~ type + construction + operation + offset(log(months)), data = ship, family = poisson)
display(M1)

##Overdispersion
summary(M1)

##Calcolo dei residui
yhat <- predict (M1, type="response")
z <- (ship$damage-yhat)/sqrt(yhat)
df.residual<-M1$df.residual
cat ("overdispersion ratio is ", sum(z^2)/df.residual, "\n") 

M2 <- glm(damage ~ type + construction + operation + offset(log(months)), data = ship, family = quasipoisson)
display(M2)


################
##ZIP model
#################
install.packages("pscl")
library(pscl)

##Stessi predittori per il modello di conteggio e per il modello Zero-inflated
fit.zip <- zeroinfl(daysabs ~male + rescale(math), offset=log(daysatt))
summary(fit.zip)

##Escludo il sesso dal modello Zero-inflated
fit.zip.a <- zeroinfl(daysabs ~male + rescale(math) | rescale(math) , offset=log(daysatt))
summary(fit.zip.a)


##ESEMPIO DELLA PESCA
##We have data on 250 groups that went to a park.  Each group was questioned about how many fish they caught (count), how many children were in the group (child), how many people were in the group (persons), and whether or not they brought a camper to the park (camper).   
##In addition to predicting the number of fish caught, there is interest in predicting the existence of excess zeros, i.e., the probability that a group caught zero fish. We will use the variables child, persons, and camper in our model.

zinb<-read.csv("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/zinb.csv")[,-1]

zinb <- within(zinb, {
  nofish <- factor(nofish)
  livebait <- factor(livebait)
  camper <- factor(camper)
})

summary(zinb)

hist(zinb$count, breaks=10)
hist(zinb$count, breaks=100)


require(ggplot2)
## histogram with x axis in log10 scale senza gli zero
ggplot(zinb, aes(count, fill = camper)) +
  geom_histogram() +
  scale_x_log10() +
  facet_grid(camper ~ ., margins=TRUE, scales="free_y")+
  labs(x = "Numero di pesci pescati", y = "Frequenza")



##Stima del modello
fit.zip1 <- zeroinfl(count ~ child + camper, data=zinb) 
summary(fit.zip1)

##Considero predittori diversi per i due modelli (per il modello con massa in zero solo il numero di persone presenti)
fit.zip2 <- zeroinfl(count ~ child + camper | persons, data = zinb)
summary(fit.zip2)

##Dal momento che abbiamo essenzialmente predittori categorici possiamo calcolare
##i valori attesi della variabile y (numero di pesci pescati) in funzione di tutte le possibili combinazioni dei nostri predittori (calcolata con la funzione expand.grid)
##quindi facciamo il grafico

newdata1 <- expand.grid(0:3, factor(0:1), 1:4) ##tutte le possibili combinazioni
colnames(newdata1) <- c("child", "camper", "persons")
##Valori previsti
newdata1$phat <- predict(fit.zip2, newdata1)

ggplot(newdata1, aes(x = child, y = phat, colour = factor(persons))) +
  geom_point() +
  geom_line() +
  facet_wrap(~camper) +
  labs(x = "Numero di bambini", y = "Numero di pesci pescati--valori stimati")

##ESERCIZIO
## dati su "Extramarital Affairs Data"
load("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/affairs.RData")
head(data)
##Description
desc
##variable                                                                        label
##        id                                                                   identifier
##      male                                                                   =1 if male
##       age                                                                     in years
##   yrsmarr                                                                years married
##      kids                                                              =1 if have kids
##     relig        5 = very relig., 4 = somewhat, 3 = slightly, 2 = not at all, 1 = anti
##      educ                                                              years schooling
##     occup                                       occupation, reverse Hollingshead scale
##  ratemarr 5 = vry hap marr, 4 = hap than avg, 3 = avg, 2 = smewht unhap, 1 = vry unhap
## naffairs                                           number of affairs within last year
##   affair                                                =1 if had at least one affair
##   vryhap                                                                ratemarr == 5
##   hapavg                                                                ratemarr == 4
##  avgmarr                                                                ratemarr == 3
##    unhap                                                                ratemarr == 2
##   vryrel                                                                   relig == 5
##   smerel                                                                   relig == 4
## slghtrel                                                                   relig == 3
##   notrel                                                                   relig == 2

dati.affairs<-data
dati.affairs[1:5,]
hist(dati.affairs$naffairs)


fit.1 <- glm(naffairs ~ male + kids + rescale(relig) + rescale(age), data = dati.affairs, family = poisson)
summary(fit.1)
display(fit.1)


##Overdispersion
gdl<-fit.1$df.residual

##Calcolo dei residui
yhat <- predict (fit.1, type="response")
z <- (dati.affairs$naffairs-yhat)/sqrt(yhat)
n<-dim(dati.affairs)[1]
k<-length(coef(fit.1))

cat ("overdispersion ratio is ", sum(z^2)/(n-k), "\n") 

fit.2 <- glm(naffairs ~ male + kids + rescale(relig) + rescale(age), data = dati.affairs, family = quasipoisson)
summary(fit.2)
display(fit.2)


fit.3 <- glm(naffairs ~ male + kids + rescale(relig) + rescale(age) +  offset(log(yrsmarr)), data = dati.affairs, family = quasipoisson)
summary(fit.3)
display(fit.3)



fit.4<-zeroinfl(naffairs ~ male + rescale(relig) + rescale(age) | rescale(age) + kids + offset(log(yrsmarr)), data=dati.affairs)
summary(fit.4)


##Esercizio cesarei
##Births by caesarean section are said to be more frequent in private (fee paying) hospitals as ##compared to non-fee paying public hospitals. Data about total annual births and the number of ##caesarean sections carried out were obtained from the records of 4 private hospitals and 16 public ##hospitals. These are tabulated below: 
##0 = Private 1 = Public 
##Hospital Hospitals 

dati <- read.csv("cesarea.csv", header=T, sep=";")
dati$Public<-ifelse(dati$Hospital.type==1, 1, 0)
plot(dati$Births, dati$Caesareans, pch=16)

fit.1<-glm (Caesareans~ Public, family="poisson", data=dati)
display(fit.1)
fit.1a<-glm (Caesareans~ Public, family="poisson", offset=log(Births), data=dati)
display(fit.1a)


##Overdispersion
summary(fit.1a)
gdl<-fit.1a$df.residual

##Calcolo dei residui
yhat <- predict (fit.1a, type="response")
z <- (dati$Caesareans-yhat)/sqrt(yhat)
n<-dim(dati)[1]
##k=2

cat ("overdispersion ratio is ", sum(z^2)/gdl, "\n") 


fit.1b<-glm (Caesareans~ Public, family="quasipoisson", offset=log(Births), data=dati)
display(fit.1b)