##############################################
# Random Variables in R: Binomial & Normal   #
# Context: Marketing Communication Analytics #
##############################################
#
# Walk-through script:
#   1) a Binomial variable (clicks out of n users): PMF, CDF, example
#      probabilities, mean and variance, plus a bar chart of the PMF;
#   2) a Normal variable (time on page): simulated sample, empirical vs
#      theoretical density, example probabilities, and a shaded-area plot.
# Lines starting with "#>" record example console output.

# --- 0) Setup ---------------------------------------
set.seed(123) # makes the simulated sample in section 2 reproducible
library(ggplot2)

# --- 1) BINOMIAL RANDOM VARIABLE --------------------
# Scenario: X = number of users (out of 10) clicking on a campaign link
# Parameters: n = 10 trials, p = 0.3 probability of success (click)
n <- 10
p <- 0.3
x <- 0:n # possible outcomes (0 to 10 clicks)

# Probability mass function: P(X = x) for every possible outcome
pmf <- dbinom(x, size = n, prob = p)

# Cumulative distribution function: P(X <= x)
cdf <- pbinom(x, size = n, prob = p)

# One row per outcome
binom_df <- data.frame(x, pmf, cdf)

# Display the first few rows (values below are rounded to 4 decimals)
head(binom_df)
#>   x    pmf    cdf
#> 1 0 0.0282 0.0282
#> 2 1 0.1211 0.1493
#> 3 2 0.2335 0.3828
#> 4 3 0.2668 0.6496
#> 5 4 0.2001 0.8497
#> 6 5 0.1029 0.9527

# --- Plot: PMF of the Binomial variable --------------
ggplot(binom_df, aes(x = x, y = pmf)) +
  geom_col(fill = "steelblue") +
  # X is discrete: put a tick on every integer outcome instead of the
  # default continuous breaks (which include 2.5 and 7.5).
  scale_x_continuous(breaks = x) +
  labs(
    title = "Binomial Random Variable: Number of Clicks (n=10, p=0.3)",
    x = "x = Number of Clicks",
    y = "P(X = x)"
  ) +
  theme_minimal()

# --- Compute example probabilities -------------------
# P(X = 3)
p_eq_3 <- dbinom(3, n, p)

# P(X <= 4)
p_le_4 <- pbinom(4, n, p)

# P(X > 6): request the upper tail directly rather than computing
# 1 - P(X <= 6), which loses precision when the tail is small.
p_gt_6 <- pbinom(6, n, p, lower.tail = FALSE)

p_eq_3; p_le_4; p_gt_6
#> [1] 0.2668
#> [1] 0.8497
#> [1] 0.0106

# Expected value and variance of a Binomial(n, p)
E_X <- n * p
Var_X <- n * p * (1 - p)
E_X; Var_X
#> [1] 3
#> [1] 2.1

# --- 2) NORMAL RANDOM VARIABLE -----------------------
# Scenario: Y = time spent on landing page (seconds)
# Parameters: mean = 60, sd = 10
mean_y <- 60
sd_y <- 10

# Generate random data for 200 users
Y <- rnorm(200, mean = mean_y, sd = sd_y)

# Quick summary
# NOTE(review): the figures below are illustrative; re-run to confirm the
# exact values produced by this seed.
summary(Y)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>    34.2    53.3    59.8    60.1    66.8    87.5

# --- Plot: Normal density (theoretical + simulated) ---
# Theoretical curve evaluated on mean +/- 3 sd (30 to 90 seconds here)
y_vals <- seq(mean_y - 3 * sd_y, mean_y + 3 * sd_y, by = 0.5)
pdf_vals <- dnorm(y_vals, mean = mean_y, sd = sd_y)
pdf_df <- data.frame(y_vals, pdf_vals)

# Empirical (kernel) density vs theoretical density.
# Both layers are on the density scale, so the theoretical curve is drawn
# unscaled; multiplying it by a constant would put the two curves on
# different scales and make the comparison meaningless.
ggplot() +
  geom_density(
    data = data.frame(Y = Y),
    aes(x = Y),
    fill = "orange", alpha = 0.4, color = "darkred"
  ) +
  geom_line(
    data = pdf_df,
    aes(x = y_vals, y = pdf_vals),
    color = "blue", linewidth = 1
  ) +
  labs(
    title = "Normal Random Variable: Time on Page (mean=60, sd=10)",
    x = "Seconds",
    y = "Density"
  ) +
  theme_minimal()

# --- Example probabilities ----------------------------
# P(Y < 50)
p_less_50 <- pnorm(50, mean_y, sd_y)

# P(Y > 70): upper tail requested directly (see P(X > 6) above)
p_greater_70 <- pnorm(70, mean_y, sd_y, lower.tail = FALSE)

# P(50 < Y < 70)
p_between <- pnorm(70, mean_y, sd_y) - pnorm(50, mean_y, sd_y)

p_less_50; p_greater_70; p_between
#> [1] 0.1587
#> [1] 0.1587
#> [1] 0.6827

# --- Visualization of probabilities under Normal curve ------------
# Rows of the theoretical curve that fall inside [50, 70]
shaded_df <- pdf_df[pdf_df$y_vals >= 50 & pdf_df$y_vals <= 70, ]

ggplot(pdf_df, aes(x = y_vals, y = pdf_vals)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_area(data = shaded_df, fill = "skyblue", alpha = 0.5) +
  labs(
    title = "P(50 < Y < 70) under the Normal Distribution",
    # Unicode escapes keep this file ASCII-only:
    # \u2248 = approximately equal, \u00b1 = plus-minus, \u03c3 = sigma
    subtitle = paste0(
      "Shaded area \u2248 0.68 ",
      "(Empirical Rule: 68% within \u00b11\u03c3)"
    ),
    x = "Seconds",
    y = "Density"
  ) +
  theme_minimal()