########################################################################
# SBDM course 
########################################################################

# Show the current working directory.
getwd()
# setwd() needs a path: called with no argument it stops the script with
# 'argument "dir" is missing, with no default'.
# Point R to the folder that holds the course data files before running the
# rest of the script, for example:
# setwd("path/to/your/course/folder")
# 
# ---------------------------------------------------------------------
# freq. distr. with R 

# Open the data set edu.xlsx (it must be in the working directory)
# and store it under the name "mydata".
library(readxl)
mydata <- read_excel(path = "edu.xlsx")

# Inspect the imported object.
class(mydata)        # class of the object returned by read_excel()
str(object = mydata) # lists the variables and the type of each one


# Categories of the variable "edu_level" (qualitative variable)

# Store the variable as a factor so that its categories can be listed.
mydata$edu_level <- as.factor(mydata$edu_level)
levels(mydata$edu_level) # original note: 2 levels (depends on the data)

# 1 - frequency distribution of the variable "edu_level"

# absolute frequencies (nj): table() counts the cases in each category
nj <- table(mydata$edu_level)
nj

# relative frequencies (fj): each count divided by the total
fj <- prop.table(nj)
fj

# cumulative relative frequencies (Fj - upper case F!)
Fj <- cumsum(fj)
Fj

# percentage relative frequencies (pj)
pj <- fj * 100

# Combine all the frequency distributions into one data frame.
# as.vector() drops the table attributes so that each column is a plain
# vector. The cumulative column is appended last, so the position of the
# other columns is unchanged.
freq_table <- data.frame(
  Category = names(nj),
  Absolute_Freq = as.vector(nj),
  Relative_Freq = as.vector(fj),
  Percent_Freq = as.vector(pj),
  Cumulative_Rel_Freq = as.vector(Fj)
)

# View the table
print(freq_table)


# Bar plot ---------------------------------------------------------------
# For qualitative variables (e.g. educational level, degree of product
# satisfaction, ...)

# Absolute frequency distribution of "edu_level"
nj <- table(mydata$edu_level)

# Vertical bars: one bar per category, bar height = absolute frequency
barplot(
  height = nj,
  xlab = "educational level",
  ylab = "Frequency",
  main = "Simple Bar Plot"
)

# Horizontal bars: set horiz = TRUE; the frequencies then run along the
# x axis, so the two axis labels are swapped
barplot(
  height = nj,
  horiz = TRUE,
  xlab = "Frequency",
  ylab = "educational level",
  main = "Horizontal Bar Plot"
)


# to do: create a table with absolute, relative and cum. rel. freq. distributions of the variable "week_hours"


# HISTOGRAM (for quantitative variables, e.g. income, expenditure, ...)

# Load the income data set (Excel format).
# Note: this replaces the education data previously stored in "mydata".
library(readxl)
mydata <- read_excel("income.xls")

# Histogram --------------------------------------------------------------
# A histogram only makes sense for quantitative variables: check the type.
is.numeric(mydata$income)

# freq = FALSE puts densities, not counts, on the vertical axis.
# TRUE/FALSE are written in full because T and F are ordinary variables
# that can be reassigned.
hist(mydata$income, xlab = "income", main = "Income distribution",
     freq = FALSE)

# Change the unit of measurement: income / 1000, so that 1 means 1000 euros.
# The object name "tousand" is kept as it was so that any other code using
# it keeps working.
tousand <- mydata$income / 1000

hist(tousand, xlab = "Net disposable income (in thousand)",
     main = "Income distribution", freq = FALSE)

# Increase the number of breaks (narrower bins)
hist(tousand, xlab = "Net disposable income (in thousand)",
     main = "Income distribution", freq = FALSE, breaks = 30)

# Cut the long right tail: show only the range from 0 to 150
hist(tousand, xlab = " income (in thousand)",
     main = "Income distribution", freq = FALSE,
     breaks = 60, xlim = c(0, 150))


# Pie chart (for qualitative variables)

# Absolute frequencies of "fixedterm_contract".
# NOTE(review): "mydata" holds the income data set at this point - confirm
# that it contains the column "fixedterm_contract".
counts <- table(mydata$fixedterm_contract)
counts
pie(x = counts, main = "Pie chart ")


##############################################################################


# Exercise to complete -->  Open the marketing_campaign dataset

# Identify the frequency distribution (fj) of the variables "Education" and "Complaint"
# Graphically represent the variable "MntMeatProducts" using a histogram

# Graphically represent the variable "Education" using a bar chart
# Graphically represent the variable "Complaint" using a bar chart