# Confidence-interval computation using the expected survey dataset.
#
# Part 1: draw one sample of 20 salaries, build a 95% t-interval for the
#         mean, and overlay it on the population histogram.
# Part 2: repeat the sampling B times and plot the first 40 intervals
#         against the population mean.

library(ggplot2)

# Load dataset ----
data <- readxl::read_excel("dataset_clean.xlsx")

# Inspect structure
str(data)

# Population ----
# Treat the dataset as the total population. Missing salaries are dropped
# once here so that every later step (sampling, mean, s.d., n) works on the
# same set of observed values; otherwise a sampled NA would turn the
# simulated means and interval bounds into NA.
pop <- data$salary
pop <- pop[!is.na(pop)]
stopifnot(is.numeric(pop), length(pop) >= 20)

mean(pop)
sd(pop)
length(pop)

# Single sample of 20 observations ----
set.seed(123) # for reproducibility
sample_20 <- sample(pop, size = 20) # drawn without replacement

# Sample mean, sample s.d. and sample size
x_bar <- mean(sample_20)
s <- sd(sample_20)
n <- length(sample_20)

# 95% confidence interval based on the t distribution with n - 1 d.f.
alpha <- 0.05
t_crit <- qt(1 - alpha / 2, df = n - 1)
se <- s / sqrt(n)

lower <- x_bar - t_crit * se
upper <- x_bar + t_crit * se

c(lower, upper)

# Population histogram with the population mean (red dashed line) and the
# confidence interval from the single sample (blue horizontal bar at y = 0).
# geom_histogram() plots counts by default, so the y axis is labelled
# "Count".
ggplot() +
  geom_histogram(
    aes(x = pop),
    bins = 30, fill = "grey80", color = "grey30"
  ) +
  geom_vline(
    xintercept = mean(pop),
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  geom_errorbarh(
    aes(y = 0, xmin = lower, xmax = upper),
    height = 1, color = "blue", linewidth = 1.2
  ) +
  labs(
    x = "Salary",
    y = "Count",
    title = "95% Confidence Interval for the Mean Salary (sample of 20)"
  )

###########################################

set.seed(123)
n <- 20 # sample size
B <- 200 # number of samples
alpha <- 0.05

means <- numeric(B)
lower <- numeric(B)
upper <- numeric(B)

# The critical value depends only on alpha and n, so compute it once.
t_crit <- qt(1 - alpha / 2, df = n - 1)

############ Simulate repeated sampling ############################
for (i in seq_len(B)) {
  # Each replicate draws n salaries with replacement from the population.
  samp <- sample(pop, size = n, replace = TRUE)
  xbar <- mean(samp)
  s <- sd(samp)
  se <- s / sqrt(n)

  means[i] <- xbar
  lower[i] <- xbar - t_crit * se
  upper[i] <- xbar + t_crit * se
}

# Save the simulated samples (mean and confidence interval) in a data frame
df <- data.frame(
  samp_id = seq_len(B),
  xbar = means,
  ci_lo = lower,
  ci_hi = upper
)

pop_mean <- mean(pop) # this is the population mean

# Graphically visualize the estimated confidence intervals together with
# the true population mean. Only the first 40 samples are shown; head()
# avoids NA rows if B is ever set below 40.
ggplot(head(df, 40), aes(y = factor(samp_id), x = xbar)) +
  geom_errorbarh(
    aes(xmin = ci_lo, xmax = ci_hi),
    height = 0.25, color = "steelblue"
  ) +
  geom_point(size = 1.6) +
  geom_vline(
    xintercept = pop_mean,
    linetype = "dashed", color = "red", linewidth = 1
  ) +
  labs(
    title = "95% CI (orizzontali) per 40 campioni",
    y = "Sample ID",
    x = "Sample mean of salary"
  ) +
  theme_minimal()