#######################################################################
#####  Statistics Course - Descriptive statistics with dataset   ######
#########               Dr. Ilaria Benedetti                  #########
#######################################################################

# Homework assignment: Ex_caschool_desc_stat.pdf

# Run this script from the folder that contains "caschool.csv".
# The original bare `setwd()` call was removed: `setwd()` requires a directory
# argument, so calling it with none stops the script with an error.
# If needed, set the directory interactively, e.g. setwd("path/to/folder").

data <- read.csv("caschool.csv")

# 1. Observe the data structure ----
str(data)
dim(data)
head(data)

# 2. Focusing on the test score variable (var.name = testscr), calculate the
#    main position indicators (mean, quantiles, min, max, mode), the range,
#    the interquartile difference and the asymmetry indicators ----
summary(data)

mean(data$testscr)
summary(data$testscr)          # all the statistics together

quantile(data$testscr, 0.75)   # q3
quantile(data$testscr, 0.50)   # q2 (median)
quantile(data$testscr, 0.25)   # q1

min(data$testscr)
max(data$testscr)

# Mode: the most frequent value of x. A helper is defined here because the
# script needs a statistical mode. When several values share the highest
# frequency, the one that appears first in x is returned.
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
Mode(data$testscr)

# Range (max - min). Stored under a name that does not mask base::range().
testscr_range <- diff(range(data$testscr))
testscr_range

# Interquartile difference (q3 - q1)
IQR(data$testscr)

# Symmetry / asymmetry of the distribution.
# Install labstatR only when it is missing instead of on every run.
if (!requireNamespace("labstatR", quietly = TRUE)) {
  install.packages("labstatR")
}
library(labstatR)
skew(data$testscr)

# Mean or other statistics by group:
# 3. Compute the mean of the test score variable by Grades; this is a factor
#    indicating the grade span of the district (var.name = gr_span) ----
data$gr_span <- as.factor(data$gr_span)
tapply(data$testscr, data$gr_span, mean)

# 4. Compute the variance, the standard deviation and the coefficient of
#    variation of the variable test score ----
# Compute the variance of the variable testscr.
# var() returns the sample variance (n - 1 in the denominator) by default.
var(data$testscr)
sd(data$testscr)               # standard deviation = sqrt of sample variance

# Variance of the population
library(labstatR)
sigma2(data$testscr)

# Coefficient of variation (CV) of the variable testscr, in percent.
# Intermediate results use names that do not mask mean(), var() or sd(),
# which are passed as functions to tapply() and summarise() further below.
testscr_mean <- mean(data$testscr)
testscr_sd <- sd(data$testscr)
cv <- (testscr_sd / testscr_mean) * 100
cv

# CV with a package. Install ineq only when it is missing.
if (!requireNamespace("ineq", quietly = TRUE)) {
  install.packages("ineq")
}
library(ineq)
# Returned as a proportion (not multiplied by 100).
# NOTE(review): ineq::var.coeff appears to use the population (n) standard
# deviation, so it may differ slightly from cv / 100 above - confirm in docs.
var.coeff(data$testscr, square = FALSE)

# 5. Compute variance and standard deviation of the variable test score
#    by gr_span ----
tapply(data$testscr, data$gr_span, var)   # sample variance by default in R
tapply(data$testscr, data$gr_span, sd)

####### alternative command ######
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
group_by(data, gr_span) %>%
  summarise(
    count = n(),
    mean = mean(testscr, na.rm = TRUE),
    sd = sd(testscr, na.rm = TRUE)
  )

# 6. Represent the variable test score graphically using a box-plot ----
# boxplot() draws the plot and invisibly returns its statistics; keep them so
# the outliers can be read without redrawing the plot several times.
testscr_box <- boxplot(data$testscr)

# 7. Outliers - test score ----
table(testscr_box$out)         # detailed outliers
outlier <- testscr_box$out
outlier.number <- length(outlier)
outlier.number                 # the original author reports only 1 outlier

# 8. Histogram - variable income (avginc) ----
hist(data$avginc)