# --- R Demo A: Sampling distribution of the mean --- set.seed(123) # 1) Create a synthetic population (e.g., salaries in ???) N <- 100000 pop_mean <- 1900 pop_sd <- 200 population <- rnorm(N, mean = pop_mean, sd = pop_sd) # 2) Draw many samples and compute sample means B <- 3000 # number of samples n <- 30 # sample size per sample samp_means <- replicate(B, mean(sample(population, n))) samp_means hist(samp_means) # 3) Compare empirical sampling distribution to theory mean_samp_means <- mean(samp_means) sd_samp_means <- sd(samp_means) # empirical SE se_theory <- pop_sd / sqrt(n) # theoretical SE (?? known) cat(sprintf("Mean of sample means: %.2f (true ?? = %.0f)\n", mean_samp_means, pop_mean)) # 4) Plot library(ggplot2) df <- data.frame(samp_means = samp_means) # plot the sampling distribution of the means with B replication hist(samp_means, freq = FALSE, # density breaks = 40, # num of bins col = "grey80", border = "grey40", # colori main = sprintf("Sampling distribution of the mean (n = %d, B = %d)", n, B), xlab = "Sample mean", ylab = "Density") # normal curve curve(dnorm(x, mean = pop_mean, sd = se_theory), add = TRUE, lwd = 2, col = "blue") # Aggiungi un sottotitolo con SE empirico e teorico mtext(sprintf("Empirical SE ??? %.1f; Theoretical SE = ??/???n = %.1f", sd_samp_means, se_theory), side = 3, line = 0.5, cex = 0.8) ############# # --- R Demo B: One-sample t CI for the mean --- set.seed(42) # Pretend this is your sample of expected salaries (n small ??? use t) n <- 25 x <- rnorm(n, mean = 1900, sd = 220) xbar <- mean(x) s <- sd(x) # st. deviation in the sample alpha <- 0.05 tcrit <- qt(1 - alpha/2, df = n - 1) se <- s / sqrt(n) L <- xbar - tcrit * se U <- xbar + tcrit * se round(c(mean = xbar, L = L, U = U), 2) # Nice plot of the CI on a number line library(ggplot2) df_ci <- data.frame(xbar = xbar, L = L, U = U) p_ci <- ggplot(df_ci) + geom_segment(aes(x = L, xend = U, y = 0, yend = 0), linewidth = 3) + geom_point(aes(x = xbar, y = 0), size = 3) + geom_vline(xintercept = 1900, linetype = 2) + labs(title = "95% t-confidence interval for ?? (?? unknown)", subtitle = sprintf("x?? = %.1f, s = %.1f, n = %d", xbar, s, n), x = "Euros", y = "") + theme_minimal(base_size = 13) + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) print(p_ci) ############################################################################ set.seed(123) # true parameters of the pop (to be used for ssimulate the data) mu_true <- 1900 sigma_true <- 220 # sample and n.of replication n <- 25 # dimension of the sample B <- 200 # n. of sample(s) conf <- 0.95 # confidence level alpha <- 1 - conf # Simulate B samples, compute sampling mean x??, s, t-IC xbar <- s <- L <- U <- rep(NA_real_, B) for (b in 1:B) { x <- rnorm(n, mean = mu_true, sd = sigma_true) xbar[b] <- mean(x) s[b] <- sd(x) tcrit <- qt(1 - alpha/2, df = n - 1) se <- s[b] / sqrt(n) L[b] <- xbar[b] - tcrit * se U[b] <- xbar[b] + tcrit * se } # Coverage (TRUE if Confidence interval include the true parameter) cover <- (L <= mu_true) & (U >= mu_true) cover coverage_rate <- mean(cover) # ord <- order(xbar) # sort the mean (only for aestethic purposes) xbar <- xbar[ord]; L <- L[ord]; U <- U[ord]; cover <- cover[ord] # ---- figure R: "caterpillar plot" of conf. int ---- par(mar = c(4, 8, 3, 2)) plot(x = xbar, y = seq_len(B), xlim = range(c(L, U, mu_true)), pch = 16, col = ifelse(cover, "forestgreen", "firebrick"), yaxt = "n", ylab = "", xlab = "Mean & 95% CI", main = sprintf("t-CIs for ?? (n = %d, B = %d) ??? Coverage ??? %.1f%%", n, B, 100*coverage_rate)) # add vertical line on the true mean ( ?? ) abline(v = mu_true, lty = 2, lwd = 2) # segments for each conf. int segments(x0 = L, y0 = seq_len(B), x1 = U, y1 = seq_len(B), col = ifelse(cover, "forestgreen", "firebrick"), lwd = 2) mtext(sprintf("Green = covers ??; Red = misses ?? | True ?? = %.0f", mu_true), side = 3, line = 0.5, cex = 0.9) ############################################################################ ###################### --- R Demo C: CI for a proportion --- set.seed(7) # Suppose we surveyed n people; y said "Yes" (success) n <- 400 y <- rbinom(1, size = n, prob = 0.32) # simulate 32% true proportion phat <- y / n se <- sqrt(phat * (1 - phat) / n) alpha <- 0.05 z <- qnorm(1 - alpha/2) L <- phat - z * se U <- phat + z * se round(c(phat = phat, L = L, U = U), 3) # --- R Demo D: Normal curve with shaded central area --- library(ggplot2) mu <- 0; sigma <- 1 Lz <- -1.96; Uz <- 1.96 x <- seq(-4, 4, length.out = 1000) df <- data.frame(x = x, d = dnorm(x, mu, sigma)) df$shade <- with(df, x >= Lz & x <= Uz) p <- ggplot(df, aes(x, d)) + geom_line(linewidth = 1) + geom_area(data = subset(df, shade), aes(y = d), alpha = 0.3) + labs(title = "Standard Normal with Central 95% Shaded", x = "z", y = "Density") + theme_minimal(base_size = 13) print(p)