###############################################################################
###############################################################################

# read_excel() is provided by the readxl package, so attach it before use.
library(readxl)

# Open the marketing campaign dataset and store it as "mc".
# The path below is the author's own: adjust it to your machine.
mc <- read_excel("~/Dropbox/DIDATTICA/CORSO SAPIENZA/SBDM_2024_2025/DATA/marketing_campaign.xlsx")

# ---- Mean of quantitative variables -----------------------------------------

# food_expenditure mean
mean(mc$food_expenditure)

# income mean
mean(mc$Income)  # result: NA! there are missing values in this variable

table(is.na(mc$Income))        # TRUE = NA, FALSE = not NA
sum(is.na(mc$Income))          # number of NA
mean(mc$Income, na.rm = TRUE)  # remove the NA observations before averaging

# ---- Quantiles, minimum, maximum --------------------------------------------

quantile(mc$food_expenditure, 0.75)  # q3
quantile(mc$food_expenditure, 0.50)  # q2 (median)
quantile(mc$food_expenditure, 0.25)  # q1

min(mc$food_expenditure)
max(mc$food_expenditure)

# summary of min, q1, median, mean, q3, max in a single call
summary(mc$food_expenditure)

# ---- Mode for a qualitative variable ----------------------------------------

# the mode is the modality with the highest frequency in this table
table(mc$Education)

# ---- Boxplot and outlier identification -------------------------------------

# Flat graphical representation -> presence of outliers (the points in the
# upper part of the graph).
# boxplot() invisibly returns the statistics it computed, so the plot is drawn
# once and the result is reused instead of calling boxplot() again and again.
income_box <- boxplot(mc$Income)

table(income_box$out)  # detailed values of the outliers
outlier <- income_box$out
outlier.number <- length(outlier)
outlier.number  # 8 outliers

# same plot without drawing the outlying points
boxplot(mc$Income, outline = FALSE)

# the y axis is not clear -> change the scale of measurement
mc$income2 <- mc$Income / 1000  # new variable: income in thousands
boxplot(mc$income2, outline = FALSE)

# Box plot by groups: boxplot(x ~ y), x = quantitative var, y = qualitative var
# income box plot by marital status
boxplot(mc$income2 ~ mc$Marital_Status,
        outline = FALSE,
        xlab = "marital status",
        ylab = "income distrib")

# What is "YOLO"? It is an anomalous modality: we should remove it.
mc$Marital_Status <- as.factor(mc$Marital_Status)
levels(mc$Marital_Status)
table(mc$Marital_Status)  # 2 units with YOLO and 2 with Absurd -> remove them

# Step 1: remove the rows carrying those modalities from the dataset
mc_filtered <- mc[!(mc$Marital_Status %in% c("YOLO", "Absurd")), ]

# Step 2: drop the now-unused factor levels in Marital_Status
mc_filtered$Marital_Status <- droplevels(mc_filtered$Marital_Status)

# verify that the unused levels have been removed
levels(mc_filtered$Marital_Status)

# box plot on the filtered data
boxplot(income2 ~ Marital_Status,
        data = mc_filtered,
        main = "Box Plot of Income by Marital Status",
        xlab = "Marital Status",
        ylab = "Income",
        col = "lightblue")

# ---- Position indicators by group -------------------------------------------

# Which group has the highest average income?
# tapply(quantitative_var, qualitative_var, function)
tapply(mc_filtered$income2, mc_filtered$Marital_Status, mean, na.rm = TRUE)

###############################################################################
########################## DEALING WITH OUTLIERS ##############################
###############################################################################

# Import the organic expenditure dataset and store it as "oe".
# The path is relative to the working directory: adjust it to your machine.
oe <- read_excel("Desktop/SBDM DATASET/organic_exp.xlsx")

# View() opens the data viewer, which only makes sense in an interactive
# session; skip it when the script is run in batch mode.
if (interactive()) {
  View(oe)
}

# summary of organic food expenditure
summary(oe$organic_food_expenditure)
boxplot(oe$organic_food_expenditure)

# box plot of organic food expenditure by education level
boxplot(organic_food_expenditure ~ Education,
        data = oe,
        main = "Box Plot of Organic Food Expenditure by Educational level",
        xlab = "Educational level",
        ylab = "organic food exp",
        col = "lightblue")

# What is the modality "YOLO"? It is an outlier and should be removed from
# the dataset.

# remove the units related to this modality
oe$Education <- as.factor(oe$Education)
levels(oe$Education)

# `%in%` tests membership and `!` negates it: keep the rows whose Education
# is NOT "YOLO"
oe_filtered <- oe[!(oe$Education %in% c("YOLO")), ]

# drop the factor level left behind by the excluded modality
oe_filtered$Education <- droplevels(oe_filtered$Education)

# new box plot with the sub-dataset oe_filtered
boxplot(organic_food_expenditure ~ Education,
        data = oe_filtered,
        main = "Box Plot of Organic Food Expenditure by Educational level",
        xlab = "Educational level",
        ylab = "organic food exp",
        col = "lightblue")

###############################################################################
######################### VARIABILITY INDICATORS ##############################
###############################################################################

# Range (max - min). The result is stored under a name that does not mask the
# base function range().
income_range <- diff(range(mc$Income, na.rm = TRUE))
income_range

# interquartile difference (q3 - q1), on income expressed in thousands
diff(quantile(mc$income2, probs = c(0.25, 0.75), na.rm = TRUE))

# variance and standard deviation of a quantitative variable
var(mc$food_expenditure, na.rm = TRUE)
sd(mc$food_expenditure, na.rm = TRUE)  # same value as sqrt(var(...))

# Exercise: compute the variance of food_expenditure by marital status.

# By default R computes the sampling variance.
# Example with 5 observations:
v <- c(5, 15, 20, 25, 35)
mean(v)
var(v)
# Compute the variance of this vector by hand: the result will differ from R.
# Why? var() divides by n - 1 (sampling variance), not by n.

# ---- Coefficient of variation -----------------------------------------------

# CV of food_expenditure. The intermediate results get their own names so the
# base functions mean(), var() and sd() are not masked by numeric variables.
food_mean <- mean(mc$food_expenditure, na.rm = TRUE)
food_var <- var(mc$food_expenditure, na.rm = TRUE)
food_sd <- sqrt(food_var)
cv <- food_sd / food_mean
cv  # CVs are usually reported in % (multiply by 100)

# CV with a package: install ineq only when it is missing, instead of
# reinstalling it on every run of the script.
if (!requireNamespace("ineq", quietly = TRUE)) {
  install.packages("ineq")
}
library(ineq)

# Not expressed in %.
# NOTE(review): var.coeff() may use the divide-by-n variance, in which case it
# differs slightly from `cv` above; confirm with ?var.coeff.
var.coeff(mc$food_expenditure, square = FALSE)