# Exploratory analysis of `data$salary`: type checks, summary statistics,
# outlier removal, normality diagnostics, and probabilities under a normal
# model. `data` must already exist in the workspace with a `salary` column.

# Type checks ----
is.numeric(data$salary)
class(data$salary)

# Mean of salary: first without NA handling (returns NA if any value is
# missing), then with missing values dropped.
mean(data$salary)
mean(data$salary, na.rm = TRUE)

# Standard deviation of salary
sd(data$salary, na.rm = TRUE)

hist(data$salary)
boxplot(data$salary, outline = FALSE)

# Detect outliers ----
q1 <- quantile(data$salary, 0.25, na.rm = TRUE)
q3 <- quantile(data$salary, 0.75, na.rm = TRUE)
summary(data$salary)
iqr <- q3 - q1
lambda <- 1.5
# Quantile-based fences. Note: the filter below does not use these bounds;
# it relies on boxplot.stats(), which applies the same 1.5 multiplier to the
# hinges.
lower_bound <- q1 - lambda * iqr
upper_bound <- q3 + lambda * iqr

# Remove outliers ----
# Drop every value that boxplot.stats() flags as lying beyond the whiskers.
# Missing values are not flagged, so they remain in the result; the
# summaries below pass na.rm = TRUE.
salary <- data$salary
salary_no_out <- salary[!salary %in% boxplot.stats(salary)$out]

# Distribution of salary without outliers
hist(salary_no_out)

# Estimate density
dens <- density(salary_no_out, na.rm = TRUE)
# Plot density
plot(dens)

# Normal Q-Q plot with reference line
qqnorm(salary_no_out)
qqline(salary_no_out, col = "red", lwd = 2)

# Salary mean (no outliers)
mean(salary_no_out, na.rm = TRUE)
# Standard deviation of salary (without outliers)
sd(salary_no_out, na.rm = TRUE)

# Normal model ----
# Parameters recorded in the original script (mean=1693, sd=369), presumably
# the rounded results of the two calls above -- confirm against the data.
salary_mean <- 1693
salary_sd <- 369
# NOTE(review): the original passed the bare names `mean` and `sd` to
# dnorm()/pnorm(). The numeric bindings are kept so later statements that do
# the same keep working; calls such as mean(x) still find the base functions
# because R skips non-function objects when resolving a call. Prefer
# salary_mean / salary_sd in new code.
mean <- salary_mean
sd <- salary_sd

# Normal density evaluated on a grid of salaries from 500 to 3000
x <- seq(500, 3000, by = 1)
pdf_vals <- dnorm(x, mean = salary_mean, sd = salary_sd)
pdf_df <- data.frame(x, pdf_vals)
# Plot against the salary grid so the x-axis shows salaries, not row indices.
plot(pdf_df$x, pdf_df$pdf_vals, type = "l")

# P(X>1800)
# `greater` holds the lower-tail probability P(X <= 1800); the complement is
# the requested upper-tail probability.
greater <- pnorm(1800, mean = salary_mean, sd = salary_sd)
1 - greater

# P(1500