## Modello di Poisson a.a. 2024--2025 ##Distribuzioni teoriche di Poisson x.poi<-rpois(n=200,lambda=2.5) hist(x.poi, xlab="y", ylab="Prob(Y=y)", main = expression (paste("Distribuzione di Poisson, ", theta, "=2.5"))) par(mfrow=c(2,2)) lambda <- 1:4 x <- 0:10 for (i in lambda) { x1 <- dpois(x, lambda[i]) barplot(x1, names.arg=c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), xlab="y", ylab=expression(P(Y=y)), ylim=c(0,.4), col="red") title(main = substitute(theta == i,list(i=i))) } ##Stima di un modello di Poisson require(arm) ##Numero di giorni di assenza a scuola ##in funzione dei voti in math, lingue e sesso ##We have attendance data on 316 high school juniors from two urban high schools. The response variable of interest is days absent, daysabs. The variables math and langarts give the standardized test scores for math and language arts respectively. The variable male is a binary indicator of student gender. abs<-read.table("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/poissonreg.dat", sep=",") names(abs)<-c("id", "school", "male", "math", "langarts", "daysatt", "daysabs") abs[1:4,] ##Analisi descrittive attach(abs) par(mfrow=c(2,2)) hist(daysabs, breaks=20, main="distribuzione giorni di assenza a scuola", xlab="giorni di assenza", ylab="frequenza") plot(math, daysabs, cex.lab=1.2, pch=19, col=ifelse(male==1, "blue", "red"), xlab="Voto in matematica", ylab="giorni di assenza") legend(70, 40, c("Ragazzi", "Ragazze"), col=c("blue", "red"), pch=19, cex=0.8) plot(langarts,daysabs, cex.lab=1.2, pch=19, col=ifelse(male==1, "blue", "red"), xlab="voto in letteratura", ylab="giorni di assenza") legend(70, 40, c("Ragazzi", "Ragazze"), col=c("blue", "red"), pch=19, cex=0.8) plot(jitter(male, 0.5), daysabs,xaxt="n", xlab="genere", ylab="giorni di assenza a scuola") axis(1, c(0, 1), labels = c("Ragazze", "Ragazzi")) ##Analisi di correlazione cor(abs[,-1]) ##stima del modello fit.1<-glm(daysabs ~male + rescale(math) + rescale(langarts), family="poisson") display(fit.1,4) ## coeff interpretatibili come tax di crescita summary(fit.1) fit.2<-glm(daysabs ~male + rescale(math) + rescale(langarts), offset=log(daysatt), family="poisson") display(fit.2,4) summary(fit.2) fit.2$linear.predictors[1:10] fit.2$fitted[1:10] ##valori stimati exp(fit.2$linear.predictors[1:10]) ## o anche yhat <- predict (fit.2, type="response") round(yhat[1:10]) ##Overdispersione hist(daysabs, freq=F, ylim=c(0, 0.3), main="Distribuzione giorni di assenza") ##il miglior adattamento con theta==2 points (sort(daysabs), dpois(sort(daysabs), 2), col="blue", cex=0.5) lines (sort(daysabs), dpois(sort(daysabs), 2), col="red", cex=0.5) ##Valori previsti yhat <- predict (fit.2, type="response") ##Plot dei residui ##Residui dei Pearson ri<-residuals(fit.2, type="pearson") par(mfrow=c(2,2)) ##grafico dei residui grezzi plot(fit.2$fitted.values, residuals(fit.2, type="pearson"), xlab = "Valori Stimati", ylab = "Residui Grezzi", cex = 0.6, ylim=c(-10, 50)) abline(h = 0) ##ovvero plot(fit.2$fitted.values, abs$daysabs-exp(fit.2$linear.predictors), xlab = "Valori Stimati", ylab = "Residui Grezzi", cex = 0.6, ylim=c(-10, 50)) abline(h = 0) ##residui di Pearson STANDARDIZZATI ## zi=(yi-theta_i)/sqrt(theta_i) z<-(abs$daysabs-exp(fit.2$linear.predictors))/sqrt(exp(fit.2$linear.predictors)) plot(yhat, z, xlab = "Valori Stimati", ylab = "Residui standardizzati", cex = 0.6, ylim=c(-10, 10)) abline(h = 0) abline(h = -1, col="red") abline(h = 1, col="red") ##H0: the variance is equal to the expectation ##Pearson scale overdispersion = Pearson Chi SQ / DF ##gradi di liberta' 312 n=316 k=4 qchisq(0.05, (n-k), lower.tail = FALSE) stat.test<-sum(z^2) ##Rifiuto H0 perche' sum(z^2)>>>qchisq(0.05, (n-k), lower.tail = FALSE) ##p-value cat ("p-value of overdispersion test is ", pchisq (sum(z^2), n-k), "\n") ## Stima del fattore di sovradispersione cat ("overdispersion ratio is ", sum(z^2)/(n-k), "\n") ##Alternativamente ##overdispersion = Deviance Chi SQ / DF summary(fit.2) res.deviance<-fit.2$deviance/fit.2$df.residual df.residual<-fit.2$df.residual pchisq(res.deviance, df.residual, lower.tail = FALSE) ##Sovradispersione in R fit.2a<-glm(daysabs ~male + rescale(math) + rescale(langarts), offset=log(daysatt), family="quasipoisson") display(fit.2a,4) summary(fit.2a) ##Incremento gli errori standard in base al coefficiente di sovradispersione ##Valuto lo standard error di male del modello di base summary(fit.2)$coeff[2,2] ##Incremento lo standard error della radice del fattore di sovradispersione summary(fit.2)$coeff[2,2]*sqrt(sum(z^2)/(n-k)) ##confronto con modello con family quasipoisson summary(fit.2a)$coeff[2,2] ##grafico dei valori osservati e stimati plot(abs$math, abs$daysabs, pch="o", col="blue", main="Valori osservati e previsti", xlab="Voto in matematica", ylab="Giorni di assenze") points(predict (fit.2, type="response"), pch="p", col="red") legend(80,40,c("obs","pred"), pch=c("o","p"), col=c("blue","red")) ##stima dei valori abs$phat <- predict(fit.2a, type = "response") exp(fit.2a$linear.predictors[1:10]) require(ggplot2) ## create the plot ggplot(abs, aes(x = math, y = yhat, colour = factor(male))) + geom_point(aes(y = daysabs), alpha = 0.5, position = position_jitter(h = 0.2)) + geom_line(size = 1)+ labs(x = "Punteggio in matematica", y = "Numero atteso di giorni di assenze") ###Questi dati provengono da un articolo di McCullagh and Nelder (1989) relativo ad incidenti sulle navi ##The file has 34 rows corresponding to the observed combinations of type of ship, year of construction and period of operation. ## Each row has information on five variables as follows: ship<- read.table("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/ships.dat", sep="") head(ship) ## ship type, coded 1-5 for A, B, C, D and E, ## year of construction (1=1960-64, 2=1965-70, 3=1970-74, 4=1975-79), ## period of operation (1=1960-74, 2=1975-79) ## months of service, ranging from 63 to 20,370, and ## damage incidents, ranging from 0 to 53. par(mfrow = c(1, 2)) plot(damage ~ months, data = ship) plot(log(damage + 0.5) ~ log(months), data = ship) par(mfrow = c(1, 3)) plot(damage ~ as.factor(type), data = ship) plot(damage ~ months, data = ship, pch = as.character(type)) plot(damage/months ~ as.factor(type), data = ship) ## Modello senza exposure M0 <- glm(damage ~ type + construction + operation, data = ship, family = poisson) display(M0) ##Introduco l'exposure (mesi di servizio--in log come da analisi descrittiva) M1 <- glm(damage ~ type + construction + operation + offset(log(months)), data = ship, family = poisson) display(M1) ##Overdispersion summary(M1) ##Calcolo dei residui yhat <- predict (M1, type="response") z <- (ship$damage-yhat)/sqrt(yhat) df.residual<-M1$df.residual cat ("overdispersion ratio is ", sum(z^2)/df.residual, "\n") M2 <- glm(damage ~ type + construction + operation + offset(log(months)), data = ship, family = quasipoisson) display(M2) ################ ##ZIP model ################# install.packages("pscl") library(pscl) ##Stessi predittori per il modello di conteggio e per il modello Zero-inflated fit.zip <- zeroinfl(daysabs ~male + rescale(math), offset=log(daysatt)) summary(fit.zip) ##Escludo il sesso dal modello Zero-inflated fit.zip.a <- zeroinfl(daysabs ~male + rescale(math) | rescale(math) , offset=log(daysatt)) summary(fit.zip.a) ##ESEMPIO DELLA PESCA ##We have data on 250 groups that went to a park. Each group was questioned about how many fish they caught (count), how many children were in the group (child), how many people were in the group (persons), and whether or not they brought a camper to the park (camper). ##In addition to predicting the number of fish caught, there is interest in predicting the existence of excess zeros, i.e., the probability that a group caught zero fish. We will use the variables child, persons, and camper in our model. zinb<-read.csv("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/zinb.csv")[,-1] zinb <- within(zinb, { nofish <- factor(nofish) livebait <- factor(livebait) camper <- factor(camper) }) summary(zinb) hist(zinb$count, breaks=10) hist(zinb$count, breaks=100) require(ggplot2) ## histogram with x axis in log10 scale senza gli zero ggplot(zinb, aes(count, fill = camper)) + geom_histogram() + scale_x_log10() + facet_grid(camper ~ ., margins=TRUE, scales="free_y")+ labs(x = "Numero di pesci pescati", y = "Frequenza") ##Stima del modello fit.zip1 <- zeroinfl(count ~ child + camper, data=zinb) summary(fit.zip1) ##Considero predittori diversi per i due modelli (per il modello con massa in zero solo il numero di persone presenti) fit.zip2 <- zeroinfl(count ~ child + camper | persons, data = zinb) summary(fit.zip2) ##Dal momento che abbiamo essenzialmente predittori categorici possiamo calcolare ##i valori attesi della variabile y (numero di pesci pescati) in funzione di tutte le possibili combinazioni dei nostri predittori (calcolata con la funzione expand.grid) ##quindi facciamo il grafico newdata1 <- expand.grid(0:3, factor(0:1), 1:4) ##tutte le possibili combinazioni colnames(newdata1) <- c("child", "camper", "persons") ##Valori previsti newdata1$phat <- predict(fit.zip2, newdata1) ggplot(newdata1, aes(x = child, y = phat, colour = factor(persons))) + geom_point() + geom_line() + facet_wrap(~camper) + labs(x = "Numero di bambini", y = "Numero di pesci pescati--valori stimati") ##ESERCIZIO ## dati su "Extramarital Affairs Data" load("/Users/mariagraziapittau/Library/CloudStorage/Dropbox-Universita'DegliStudiDiRomaLaSapienza/grazia.pittau@uniroma1.it/Lezioni/Statistica_Aziendale/R/Poisson/affairs.RData") head(data) ##Description desc ##variable label ## id identifier ## male =1 if male ## age in years ## yrsmarr years married ## kids =1 if have kids ## relig 5 = very relig., 4 = somewhat, 3 = slightly, 2 = not at all, 1 = anti ## educ years schooling ## occup occupation, reverse Hollingshead scale ## ratemarr 5 = vry hap marr, 4 = hap than avg, 3 = avg, 2 = smewht unhap, 1 = vry unhap ## naffairs number of affairs within last year ## affair =1 if had at least one affair ## vryhap ratemarr == 5 ## hapavg ratemarr == 4 ## avgmarr ratemarr == 3 ## unhap ratemarr == 2 ## vryrel relig == 5 ## smerel relig == 4 ## slghtrel relig == 3 ## notrel relig == 2 dati.affairs<-data dati.affairs[1:5,] hist(dati.affairs$naffairs) fit.1 <- glm(naffairs ~ male + kids + rescale(relig) + rescale(age), data = dati.affairs, family = poisson) summary(fit.1) display(fit.1) ##Overdispersion gdl<-fit.1$df.residual ##Calcolo dei residui yhat <- predict (fit.1, type="response") z <- (dati.affairs$naffairs-yhat)/sqrt(yhat) n<-dim(dati.affairs)[1] k<-length(coef(fit.1)) cat ("overdispersion ratio is ", sum(z^2)/(n-k), "\n") fit.2 <- glm(naffairs ~ male + kids + rescale(relig) + rescale(age), data = dati.affairs, family = quasipoisson) summary(fit.2) display(fit.2) fit.3 <- glm(naffairs ~ male + kids + rescale(relig) + rescale(age) + offset(log(yrsmarr)), data = dati.affairs, family = quasipoisson) summary(fit.3) display(fit.3) fit.4<-zeroinfl(naffairs ~ male + rescale(relig) + rescale(age) | rescale(age) + kids + offset(log(yrsmarr)), data=dati.affairs) summary(fit.4) ##Esercizio cesarei ##Births by caesarean section are said to be more frequent in private (fee paying) hospitals as ##compared to non-fee paying public hospitals. Data about total annual births and the number of ##caesarean sections carried out were obtained from the records of 4 private hospitals and 16 public ##hospitals. These are tabulated below: ##0 = Private 1 = Public ##Hospital Hospitals dati <- read.csv("cesarea.csv", header=T, sep=";") dati$Public<-ifelse(dati$Hospital.type==1, 1, 0) plot(dati$Births, dati$Caesareans, pch=16) fit.1<-glm (Caesareans~ Public, family="poisson", data=dati) display(fit.1) fit.1a<-glm (Caesareans~ Public, family="poisson", offset=log(Births), data=dati) display(fit.1a) ##Overdispersion summary(fit.1a) gdl<-fit.1a$df.residual ##Calcolo dei residui yhat <- predict (fit.1a, type="response") z <- (dati$Caesareans-yhat)/sqrt(yhat) n<-dim(dati)[1] ##k=2 cat ("overdispersion ratio is ", sum(z^2)/gdl, "\n") fit.1b<-glm (Caesareans~ Public, family="quasipoisson", offset=log(Births), data=dati) display(fit.1b)