##############################################
# Random Variables in R: Binomial & Normal   #
# Context: Marketing Communication Analytics #
##############################################

# --- 0) Setup ---------------------------------------
# Seed the random number generator so the rnorm() sample drawn in the Normal
# section is the same on every run.
set.seed(123)  # reproducibility
# ggplot2 provides ggplot(), aes(), the geom_*() layers, labs() and
# theme_minimal() used by every plot in this script.
library(ggplot2)

# --- 1) BINOMIAL RANDOM VARIABLE --------------------
# X counts how many users out of 10 click on a campaign link, i.e.
# X ~ Binomial(n = 10, p = 0.3) with "click" as the success event.

n <- 10   # number of trials (users)
p <- 0.3  # probability that a single user clicks
x <- 0:n  # support of X: 0, 1, ..., 10 clicks

# P(X = x) for every value in the support
pmf <- dbinom(x = x, size = n, prob = p)

# P(X <= x) for every value in the support
cdf <- pbinom(q = x, size = n, prob = p)

# One row per possible outcome, with columns x, pmf and cdf
binom_df <- data.frame(x = x, pmf = pmf, cdf = cdf)

# Show the first six rows (values below are rounded to 4 decimals)
head(binom_df, 6)
#>   x    pmf    cdf
#> 1 0 0.0282 0.0282
#> 2 1 0.1211 0.1493
#> 3 2 0.2335 0.3828
#> 4 3 0.2668 0.6496
#> 5 4 0.2001 0.8497
#> 6 5 0.1029 0.9527

# --- Plot: PMF of the Binomial variable --------------
# Bar chart with one bar per possible click count; bar height is P(X = x).
ggplot(data = binom_df) +
  geom_col(mapping = aes(x = x, y = pmf), fill = "steelblue") +
  ggtitle("Binomial Random Variable: Number of Clicks (n=10, p=0.3)") +
  xlab("x = Number of Clicks") +
  ylab("P(X = x)") +
  theme_minimal()

# --- Compute example probabilities -------------------
# Uses the Binomial parameters n (trials) and p (click probability) set above.

# P(X = 3): point probability from the PMF
p_eq_3 <- dbinom(3, size = n, prob = p)
# P(X <= 4): lower tail from the CDF
p_le_4 <- pbinom(4, size = n, prob = p)
# P(X > 6): upper tail. lower.tail = FALSE makes pbinom() return P(X > q)
# directly, which avoids the rounding loss of computing 1 - CDF.
p_gt_6 <- pbinom(6, size = n, prob = p, lower.tail = FALSE)

p_eq_3
p_le_4
p_gt_6
#> [1] 0.2668
#> [1] 0.8497
#> [1] 0.0106
# Note: 0.0473 is P(X >= 6) = pbinom(5, n, p, lower.tail = FALSE), not P(X > 6).

# Moments of a Binomial(n, p): E[X] = n * p and Var[X] = n * p * (1 - p)
E_X <- n * p
Var_X <- E_X * (1 - p)
E_X
Var_X
#> [1] 3
#> [1] 2.1


# --- 2) NORMAL RANDOM VARIABLE -----------------------
# Y is the time (in seconds) a user spends on the landing page,
# modelled as Normal with mean 60 and standard deviation 10.

mean_y <- 60  # mean of Y
sd_y <- 10    # standard deviation of Y

# Simulate the time on page for 200 users
Y <- rnorm(n = 200, mean = mean_y, sd = sd_y)

# Min, quartiles, median, mean and max of the simulated sample
# (example output)
summary(Y)
#>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
#>   34.2     53.3      59.8     60.1     66.8     87.5

# --- Plot: Normal density (theoretical + simulated) ---
# Grid for the theoretical curve: mean +/- 3 sd in steps of 0.5
# (30 to 90 seconds for mean = 60, sd = 10).
y_vals <- seq(mean_y - 3 * sd_y, mean_y + 3 * sd_y, by = 0.5)
pdf_vals <- dnorm(y_vals, mean = mean_y, sd = sd_y)
pdf_df <- data.frame(y_vals, pdf_vals)

# Kernel density estimate of the simulated sample (orange) with the
# theoretical Normal pdf (blue) drawn on top.
# geom_density() is already on the density scale (area under the curve = 1),
# so the theoretical pdf is plotted unscaled; multiplying it by a constant
# would put the two curves on different y scales and hide the comparison.
ggplot() +
  geom_density(
    data = data.frame(Y = Y),
    aes(x = Y),
    fill = "orange", alpha = 0.4, color = "darkred"
  ) +
  geom_line(
    data = pdf_df,
    aes(x = y_vals, y = pdf_vals),
    color = "blue", linewidth = 1
  ) +
  labs(
    title = "Normal Random Variable: Time on Page (mean=60, sd=10)",
    x = "Seconds",
    y = "Density / Frequency"
  ) +
  theme_minimal()

# --- Example probabilities ----------------------------
# Uses the Normal parameters mean_y and sd_y set above.

# P(Y < 50): lower tail
p_less_50 <- pnorm(50, mean = mean_y, sd = sd_y)
# P(Y > 70): upper tail, requested directly with lower.tail = FALSE
# rather than computed as 1 - CDF.
p_greater_70 <- pnorm(70, mean = mean_y, sd = sd_y, lower.tail = FALSE)
# P(50 < Y < 70): CDF at the upper bound minus CDF at the lower bound
p_between <- pnorm(70, mean = mean_y, sd = sd_y) - p_less_50

p_less_50
p_greater_70
p_between
#> [1] 0.1587
#> [1] 0.1587
#> [1] 0.6827

# --- Visualization of probabilities under Normal curve ------------
# Draws the theoretical pdf stored in pdf_df and shades the region between
# 50 and 70 seconds (mean +/- 1 sd for mean = 60, sd = 10).
ggplot(pdf_df, aes(x = y_vals, y = pdf_vals)) +
  # linewidth (not size) sets line thickness, as in the density plot above
  geom_line(color = "steelblue", linewidth = 1) +
  # Only the grid points inside [50, 70] are filled; x/y are inherited
  geom_area(
    data = subset(pdf_df, y_vals >= 50 & y_vals <= 70),
    fill = "skyblue", alpha = 0.5
  ) +
  labs(
    title = "P(50 < Y < 70) under the Normal Distribution",
    # Unicode escapes keep the source ASCII-only so the symbols
    # (approximately-equal, plus-minus, sigma) survive re-encoding.
    subtitle = paste0(
      "Shaded area \u2248 0.68 ",
      "(Empirical Rule: 68% within \u00b11\u03c3)"
    ),
    x = "Seconds",
    y = "Density"
  ) +
  theme_minimal()