########################################################################
# SBDM course
########################################################################

# Teaching script: frequency distributions and basic charts (bar plot,
# histogram, pie chart) for qualitative and quantitative variables.

library(readxl)

# The data files below are read relative to the working directory.
# Check it with getwd(); if it is not the folder that holds the Excel
# files, set it yourself (uncomment the next line and put in your own path).
# Calling setwd() without a path is an error, so it is not run here.
getwd()
# setwd("path/to/your/data")

# Frequency distributions with R ---------------------------------------

# Open the dataset edu.xlsx and store it as "mydata".
mydata <- read_excel("edu.xlsx")
class(mydata)  # a data frame (read_excel returns a tibble)
str(mydata)    # display the variables and their types

# Categories of the variable "edu_level" (qualitative variable).
mydata$edu_level <- as.factor(mydata$edu_level)  # define it as a factor
levels(mydata$edu_level)                         # list its categories

# 1 - Frequency distribution of the variable "edu_level"

# Absolute frequencies (nj): table() counts the cases in each category.
nj <- table(mydata$edu_level)
nj

# Relative frequencies (fj): each count divided by the total.
fj <- prop.table(nj)
fj

# Cumulative relative frequencies (Fj - upper case F!).
Fj <- cumsum(fj)
Fj

# Percentage relative frequencies (pj).
pj <- fj * 100

# Combine all the frequencies into one data frame. as.vector() strips the
# table attributes so that every column is a plain vector. The cumulative
# column is appended last, so the existing columns keep their positions.
freq_table <- data.frame(
  Category = names(nj),
  Absolute_Freq = as.vector(nj),
  Relative_Freq = as.vector(fj),
  Percent_Freq = as.vector(pj),
  Cum_Relative_Freq = as.vector(Fj)
)

# View the table
print(freq_table)

# Bar plot ---------------------------------------------------------------
# For qualitative variables (e.g. educational level, degree of product
# satisfaction, ...).

# Absolute frequencies, recomputed so this section can be run on its own
# once "mydata" holds the edu dataset.
nj <- table(mydata$edu_level)

# Vertical bars
barplot(
  nj,
  main = "Simple Bar Plot",
  xlab = "educational level",
  ylab = "Frequency"
)

# Horizontal bars --> specify horiz = TRUE (note that the axis labels swap)
barplot(
  nj,
  main = "Horizontal Bar Plot",
  xlab = "Frequency",
  ylab = "educational level",
  horiz = TRUE
)

# To do: create a table with the absolute, relative and cumulative relative
# frequency distributions of the variable "week_hours".

# Histogram --------------------------------------------------------------
# For quantitative variables (e.g. income, expenditure, ...).

# Load the income dataset (Excel format). From here on "mydata" holds the
# income data and no longer the edu data.
mydata <- read_excel("income.xls")

# Histograms are only for quantitative variables: this should print TRUE.
is.numeric(mydata$income)

# freq = FALSE plots densities instead of counts.
hist(
  mydata$income,
  xlab = "income",
  main = "Income distribution",
  freq = FALSE
)

# Change the unit of measurement: income / 1000 (1 means 1000 euros).
tousand <- mydata$income / 1000

hist(
  tousand,
  xlab = "Net disposable income (in tousand)",
  main = "Income distribution",
  freq = FALSE
)

# Increase the number of breaks (bins).
hist(
  tousand,
  xlab = "Net disposable income (in tousand)",
  main = "Income distribution",
  freq = FALSE,
  breaks = 30
)

# Cut the long right tail --> show only the range from 0 to 150.
hist(
  tousand,
  xlab = " income (in tousand)",
  main = "Income distribution",
  freq = FALSE,
  breaks = 60,
  xlim = c(0, 150)
)

# Pie chart ----------------------------------------------------------------
# For qualitative variables.
# NOTE(review): assumes the income dataset has a "fixedterm_contract"
# column - confirm against the file.
counts <- table(mydata$fixedterm_contract)
counts
pie(counts, main = "Pie chart ")

##############################################################################
# Exercise to complete --> open the marketing_campaign dataset
# - Identify the frequency distribution (fj) of the variables "Education"
#   and "Complaint"
# - Graphically represent the variable "MntMeatProducts" using a histogram
# - Graphically represent the variable "Education" using a bar chart
# - Graphically represent the variable "Complaint" using a bar chart