#######################################################################
#######################################################################


# Load the marketing campaign dataset and store it as "mc".
# read_excel() is namespace-qualified so this line works even when
# library(readxl) has not been attached earlier in the session.
# NOTE: the path below is machine-specific; point it at your own copy.
mc <- readxl::read_excel(
  "~/Dropbox/DIDATTICA/CORSO SAPIENZA/SBDM_2024_2025/DATA/marketing_campaign.xlsx"
)
# Arithmetic mean of quantitative variables
# Mean of food_expenditure
with(mc, mean(food_expenditure))

# Mean of Income: the plain call returns NA because the variable
# contains missing values
with(mc, mean(Income))
table(is.na(mc[["Income"]])) # TRUE = missing, FALSE = observed
sum(is.na(mc[["Income"]])) # how many values are missing

# Remove the missing observations before averaging
mean(mc[["Income"]], na.rm = TRUE)

# Quartiles of the food_expenditure distribution
quantile(mc[["food_expenditure"]], probs = 0.75) # third quartile (Q3)
quantile(mc[["food_expenditure"]], probs = 0.50) # second quartile (median)
quantile(mc[["food_expenditure"]], probs = 0.25) # first quartile (Q1)

# Smallest and largest observed value
with(mc, min(food_expenditure))
with(mc, max(food_expenditure))

# Mean, Q1, Q2, Q3, min and max in a single call
summary(mc[["food_expenditure"]])

# Mode of a qualitative variable: read it from the frequency table as the
# modality with the highest frequency
table(mc[["Education"]])


# Box plot of income
# A squashed box with isolated points in the upper part of the graph signals
# the presence of outliers.
# The value returned by boxplot() is stored so the outliers can be read from
# it without redrawing the same plot for every query.
income_box <- boxplot(mc$Income)

# Outlier identification
table(income_box$out) # frequency of each outlying value
outlier <- income_box$out
outlier.number <- length(outlier)
outlier.number # number of outliers (8 according to the original course note)

# Same plot with the outliers hidden; the y axis is still hard to read, so
# the unit of measurement is changed below.
boxplot(mc$Income, outline = FALSE)

# Add a new variable: income expressed in thousands
mc$income2 <- mc$Income / 1000
boxplot(mc$income2, outline = FALSE)

# Box plot by groups: boxplot(x ~ y), with x a quantitative variable and
# y a qualitative one
# Income (in thousands) by marital status
boxplot(income2 ~ Marital_Status,
        data = mc, outline = FALSE,
        xlab = "marital status", ylab = "income distrib")

# The modality "YOLO" is not a real marital status: it is an anomalous
# category and should be removed
mc[["Marital_Status"]] <- as.factor(mc[["Marital_Status"]])
levels(mc[["Marital_Status"]])

# Frequencies per modality: 2 units with YOLO and 2 units with Absurd
# --> both should be removed
table(mc[["Marital_Status"]])

# Step 1: keep only the rows whose marital status is not one of the
# anomalous modalities
mc_filtered <- mc[!is.element(mc$Marital_Status, c("YOLO", "Absurd")), ]

# Step 2: discard the factor levels that no longer occur in Marital_Status
mc_filtered[["Marital_Status"]] <- droplevels(mc_filtered[["Marital_Status"]])

# Check that the removed modalities are gone from the levels
levels(mc_filtered[["Marital_Status"]])

# Box plot of income by marital status on the cleaned data
with(
  mc_filtered,
  boxplot(income2 ~ Marital_Status,
          main = "Box Plot of Income by Marital Status",
          xlab = "Marital Status", ylab = "Income", col = "lightblue")
)

#### Position indicators by group
# Which group has the highest average income? Mean income by marital status
# Pattern: tapply(quantitative_var, qualitative_var, function)
with(mc_filtered, tapply(income2, Marital_Status, mean, na.rm = TRUE))

##############################################################################
########################## DEALING WITH OUTLIERS #############################
##############################################################################

# Import the organic expenditure dataset and store it as "oe".
# read_excel() is namespace-qualified so the call does not depend on
# library(readxl) having been attached.
# NOTE: the path is relative, so it is resolved against the current working
# directory; adjust it to where the file lives on your machine.
oe <- readxl::read_excel("Desktop/SBDM DATASET/organic_exp.xlsx")

# View() opens the data viewer, which is only useful in an interactive
# session; skip it when the script is run in batch mode.
if (interactive()) {
  View(oe)
}

# Summary of organic food expenditure
summary(oe$organic_food_expenditure)
boxplot(oe$organic_food_expenditure)

# Box plot of organic food expenditure by education level
# (the title names the variable that is actually plotted)
boxplot(organic_food_expenditure ~ Education, data = oe,
        main = "Box Plot of Organic Food Expenditure by Educational level",
        xlab = "Educational level", ylab = "organic food exp",
        col = "lightblue")

# The modality "YOLO" is not a real education level: it is an anomalous
# category and should be removed from the dataset.

# Remove the units belonging to this modality
oe$Education <- as.factor(oe$Education)
levels(oe$Education)
# %in% tests membership in the given set; the leading ! negates it, so only
# the rows whose Education is NOT "YOLO" are kept
oe_filtered <- oe[!(oe$Education %in% c("YOLO")), ]

# Drop the factor level left empty by the excluded modality
oe_filtered$Education <- droplevels(oe_filtered$Education)

# New box plot on the filtered dataset oe_filtered
# (the title names the variable that is actually plotted)
boxplot(organic_food_expenditure ~ Education, data = oe_filtered,
        main = "Box Plot of Organic Food Expenditure by Educational level",
        xlab = "Educational level", ylab = "organic food exp",
        col = "lightblue")


####################################################################################
######################### VARIABILITY INDICATORS ###################################
####################################################################################

# Range of income (maximum minus minimum), ignoring missing values.
# The result is stored as income_range so that the base function range()
# is not masked by a numeric object of the same name.
income_range <- diff(range(mc$Income, na.rm = TRUE))
income_range

# Interquartile difference (Q3 - Q1) of income expressed in thousands
diff(quantile(mc$income2, probs = c(0.25, 0.75), na.rm = TRUE))


# Variance and standard deviation of a quantitative variable
with(mc, var(food_expenditure, na.rm = TRUE))
with(mc, sd(food_expenditure, na.rm = TRUE)) # square root of the variance

# Exercise: compute the variance of food_expenditure by marital status


# By default R computes the sample variance
# Small example with 5 observations
v <- c(5, 15, 20, 25, 35)
mean(v)
var(v)
# Exercise: compute the variance of this vector by hand -> the result will
# be different from the one returned by R. Why?

# Hint: var() in R is the sample variance (denominator n - 1)


# Coefficient of variation (CV) of food_expenditure
# The intermediate results get their own names so that the base functions
# mean(), var() and sd() are not masked by numeric objects.
# na.rm = TRUE matches the variance computed earlier on the same variable.
food_mean <- mean(mc$food_expenditure, na.rm = TRUE)
food_var <- var(mc$food_expenditure, na.rm = TRUE)
food_sd <- sqrt(food_var)

cv <- food_sd / food_mean
cv # CVs are usually reported in % (multiplied by 100)

# CV with a package
# Install "ineq" only when it is missing, instead of reinstalling it (and
# requiring a network connection) on every run of the script.
if (!requireNamespace("ineq", quietly = TRUE)) {
  install.packages("ineq")
}
library(ineq)
# Result is a proportion, not a percentage.
# NOTE(review): this may differ slightly from cv above if the package uses a
# different variance denominator -- check ?var.coeff.
var.coeff(mc$food_expenditure, square = FALSE)