I would be incredibly grateful if someone is able to just send me a PDF of my Rmd code.
# Part (a)
# Check for missing values in specified columns
# Guard the installation so knitting does not stall on an interactive prompt
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(dplyr)
library(rmarkdown)
library(readr)
library(ggplot2)
library(knitr)
data <- read_csv("dat_SalesCustomers.csv")
summary(data)
colnames(data)
missingvalues_my <- sapply(data[, c("category", "price", "gender", "age", "payment_method")],
                           function(x) sum(is.na(x)))
# Print the count of missing values for each column
cat("Missing values in each column:\n")
print(missingvalues_my)
cleaned_data <- data %>%
  filter(!is.na(category),
         !is.na(price),
         !is.na(gender),
         !is.na(age),
         !is.na(payment_method))
# Count the remaining observations
remaining_observations <- nrow(cleaned_data)
# Print the number of remaining observations
cat("Number of observations left after removing missing values:", remaining_observations, "\n")
# Part (b)
# Create a dummy variable for cash payment; tolower() guards against
# capitalisation differences such as "Cash" vs "cash"
cleaned_data <- cleaned_data %>%
  mutate(paid_in_cash = ifelse(tolower(payment_method) == "cash", 1, 0))
table(cleaned_data$payment_method)
summary(cleaned_data$price)
nrow(cleaned_data)
head(cleaned_data)
# Create a dummy variable for males (same tolower() guard for consistency)
cleaned_data <- cleaned_data %>%
  mutate(male = ifelse(tolower(gender) == "male", 1, 0))
# Fraction of transactions in cash
fraction_cash_transactions <- mean(cleaned_data$paid_in_cash)
# Fraction of overall sales (in TRY) carried out in cash
total_sales <- sum(cleaned_data$price, na.rm = TRUE)
cash_sales <- sum(cleaned_data$price[cleaned_data$paid_in_cash == 1], na.rm = TRUE)
fraction_cash_sales <- cash_sales / total_sales
# Print results
cat("Fraction of transactions carried out in cash:", fraction_cash_transactions, "\n")
cat("Fraction of overall sales (in TRY) carried out in cash:", fraction_cash_sales, "\n")
cat("Fraction of transactions carried out in cash:", fraction_cash_transactions, "\n")
cat("This represents the proportion of transactions in the dataset where cash payment was used.\n")
# Subset to the first 1000 observations; the paid_in_cash and male dummies
# carry over from cleaned_data, so they do not need to be re-created here
subset_data <- cleaned_data[1:1000, ]
cat("Generated dummy variables:\n",
"'paid_in_cash' (1 for cash payments, 0 otherwise) and 'male' (1 for male customers, 0 otherwise).\n")
fraction_cash_transactions <- mean(subset_data$paid_in_cash)
total_sales <- sum(subset_data$price, na.rm = TRUE)
cash_sales <- sum(subset_data$price[subset_data$paid_in_cash == 1], na.rm = TRUE)
fraction_cash_sales <- cash_sales / total_sales
cat("Fraction of transactions carried out in cash:", fraction_cash_transactions, "\n")
cat("Fraction of overall sales (in TRY) carried out in cash:", fraction_cash_sales, "\n")
# Part (c)
subset_data <- subset_data %>%
  mutate(category = tolower(trimws(category)))
subset_data <- subset_data %>%
  mutate(
    clothes_shoes = ifelse(category %in% c("clothing", "shoes"), 1, 0),
    cosmetics = ifelse(category == "cosmetics", 1, 0),
    food = ifelse(category == "food & beverage", 1, 0),
    technology = ifelse(category == "technology", 1, 0),
    other = ifelse(!category %in% c("clothing", "shoes", "cosmetics",
                                    "food & beverage", "technology"), 1, 0)
  )
cat("Created dummy variables for categories:\n",
    "'clothes_shoes' (clothing and shoes), 'cosmetics', 'food' (food & beverage), 'technology', and 'other' (all other categories).\n")
# Transaction split
transaction_split <- colSums(subset_data[, c("clothes_shoes", "cosmetics", "food", "technology", "other")])
total_transactions <- sum(transaction_split)
transaction_proportions <- transaction_split / total_transactions
# Sales split across categories (in TRY)
sales_split <- subset_data %>%
  summarise(
    clothes_shoes = sum(price[clothes_shoes == 1], na.rm = TRUE),
    cosmetics = sum(price[cosmetics == 1], na.rm = TRUE),
    food = sum(price[food == 1], na.rm = TRUE),
    technology = sum(price[technology == 1], na.rm = TRUE),
    other = sum(price[other == 1], na.rm = TRUE)
  )
total_sales_split <- sum(sales_split)
sales_proportions <- sales_split / total_sales_split
# Display results
cat("Transaction split across categories:\n")
print(transaction_split)
print(transaction_proportions)
cat("\nSales split across categories (in TRY):\n")
print(sales_split)
print(sales_proportions)
# Theoretical intuition
cat("### Interpretation of Proportions:\n")
cat("\n**Transaction Proportions**:\n")
cat("- The largest share of transactions (", round(transaction_proportions['clothes_shoes'] * 100, 2), "%) occurred in the 'clothes_shoes' category.\n")
cat("- ", round(transaction_proportions['technology'] * 100, 2), "% of transactions occurred in the 'technology' category.\n")
cat("- The remaining categories ('cosmetics', 'food', and 'other') accounted for smaller shares of transactions.\n\n")
# Part (d)
# Guard the installation so knitting does not stall on an interactive prompt
if (!requireNamespace("maxLik", quietly = TRUE)) install.packages("maxLik")
library(maxLik)
### Estimating the Probit Model
# Subset data to ensure no missing values in the relevant variables;
# 'male' must be kept here because it enters the likelihood below
probit_data <- subset_data %>%
  select(paid_in_cash, price, male, age, clothes_shoes, cosmetics, food, technology)
# Define the log-likelihood function for the probit model
log_likelihood <- function(beta, data) {
  # Extract the outcome and the design matrix (intercept in the first column)
  y <- data$paid_in_cash
  X <- as.matrix(cbind(1, data$price, data$male, data$age,
                       data$clothes_shoes, data$cosmetics, data$food, data$technology))
  xb <- X %*% beta
  # Log-likelihood using pnorm(..., log.p = TRUE) for log Phi(x), which is
  # numerically safer than log(pnorm(x)) in the tails
  ll <- y * pnorm(xb, log.p = TRUE) + (1 - y) * pnorm(-xb, log.p = TRUE)
  return(sum(ll))
}
# Initial guesses for beta (can start with zeros)
initial_beta <- rep(0, 8)
# Run numerical optimization using maxLik
probit_model <- maxLik(logLik = log_likelihood, start = initial_beta, data = probit_data)
# Display results
summary(probit_model)
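# Cross-check (a sketch): glm() maximizes the same probit likelihood, so its
# coefficients should agree with the maxLik estimates up to numerical tolerance.
glm_check <- glm(paid_in_cash ~ price + male + age + clothes_shoes +
                   cosmetics + food + technology,
                 family = binomial(link = "probit"), data = probit_data)
cbind(maxLik = coef(probit_model), glm = coef(glm_check))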
# Part (e)
# Effect of age doubling
categories <- list(
  clothes_shoes = c(1, 500, 1, 30, 1, 0, 0, 0),  # Clothes and shoes
  cosmetics = c(1, 500, 1, 30, 0, 1, 0, 0),      # Cosmetics
  food = c(1, 500, 1, 30, 0, 0, 1, 0),           # Food & beverage
  technology = c(1, 500, 1, 30, 0, 0, 0, 1),     # Technology
  other = c(1, 500, 1, 30, 0, 0, 0, 0)           # Other (baseline)
)
# Define covariate values for a 30-year-old male buying clothes/shoes for 500 TRY
x_30 <- c(1, 500, 1, 30, 1, 0, 0, 0) # Intercept, price, male, age, clothes_shoes, cosmetics, food, technology
x_60 <- c(1, 500, 1, 60, 1, 0, 0, 0) # Same covariates but age = 60
# Re-estimate the probit with glm(), using the same dummy-variable
# specification as the maxLik model so the coefficient vector lines up with
# x_30 and x_60 (a 'category' factor would produce different columns)
probit_model <- glm(
  formula = paid_in_cash ~ price + male + age + clothes_shoes + cosmetics + food + technology,
  family = binomial(link = "probit"),
  data = probit_data
)
# Extract coefficients from the estimated model
beta_hat <- coef(probit_model)
# Compute linear predictor values
xb_30 <- sum(x_30 * beta_hat) # For 30-year-old
xb_60 <- sum(x_60 * beta_hat) # For 60-year-old
# Compute expected probabilities using Φ(x)
p_30 <- pnorm(xb_30) # Expected probability for 30-year-old
p_60 <- pnorm(xb_60) # Expected probability for 60-year-old
# Effect of age doubling (γ₁(β̂))
gamma_1 <- p_60 - p_30
cat("Effect of age doubling on expected probability (γ₁):", gamma_1, "\n")
# Weighted effects
# Compute the age-doubling effect (p_60 - p_30) within each category
effects <- sapply(categories, function(x) {
  xb_30 <- sum(x * beta_hat)                      # Linear predictor at age 30
  xb_60 <- sum(c(x[1:3], 60, x[5:8]) * beta_hat)  # Same covariates, age = 60
  pnorm(xb_60) - pnorm(xb_30)                     # Effect for this category
})
# Weight each category's effect by its transaction share from Part (c)
proportions <- transaction_proportions
# Compute weighted average effect (γ₂)
gamma_2 <- sum(effects * proportions)
cat("Weighted effect without conditioning on categories (γ₂):", gamma_2, "\n")
# Part (f)
# Create an explanation dynamically
cat("
Yes, the estimator \\( \\hat{\\beta} \\) is consistent under the following conditions, based on a simplified version of **extremum estimator theory**:
---
### Key Conditions for Consistency:
1. **Correct Specification**:
- The probit model is assumed to be correctly specified, meaning the true data-generating process aligns with the functional form of the model (probit). This ensures that the true parameter \\( \\beta_0 \\) lies within the parameter space.
2. **Identification**:
- The function \\( Q_n(\\beta; Z_n) = -\\frac{1}{n} \\ell(\\beta; Z_n) \\) has a unique global minimum at the true parameter value \\( \\beta_0 \\).
- For probit, the log-likelihood function is:
\\[
\\ell(\\beta; Z_n) = \\sum_{i=1}^n \\left[ y_i \\log \\Phi(X_i \\beta) + (1 - y_i) \\log (1 - \\Phi(X_i \\beta)) \\right]
\\]
- \\( \\Phi(X_i \\beta) \\) is strictly increasing and continuous, ensuring identification.
3. **Continuity**:
- The objective function \\( Q_n(\\beta; Z_n) \\) is continuous in \\( \\beta \\).
- The log-likelihood components \\( \\log \\Phi(X_i \\beta) \\) and \\( \\log (1 - \\Phi(X_i \\beta)) \\) are continuous and differentiable, satisfying this condition.
4. **Compact Parameter Space**:
- The parameter space for \\( \\beta \\) is assumed to be compact, ensuring the existence of a minimum.
5. **Law of Large Numbers (LLN)**:
- As \\( n \\to \\infty \\), the sample objective function \\( Q_n(\\beta; Z_n) \\) converges uniformly to its population counterpart:
\\[
Q(\\beta) = \\mathbb{E} \\left[ - y \\log \\Phi(X \\beta) - (1 - y) \\log (1 - \\Phi(X \\beta)) \\right]
\\]
### Conclusion:
Under these assumptions, the Maximum Likelihood Estimator (MLE) for the probit model is consistent. This means:
1. The true parameter \\( \\beta_0 \\) minimizes the population objective function \\( Q(\\beta) \\).
2. The sample objective function \\( Q_n(\\beta; Z_n) \\) converges uniformly to \\( Q(\\beta) \\) as the sample size increases.
Thus, if the probit model is correctly specified, \\( \\hat{\\beta} \\) will converge in probability to the true parameter \\( \\beta_0 \\).
")
# Part (g)
# Set up parameters
M <- 100 # Number of bootstrap samples
n <- nrow(probit_data) # Sample size
bootstrap_results <- data.frame(beta_age = numeric(M), gamma1 = numeric(M), gamma2 = numeric(M))
# Define covariate values for gamma1
x_30 <- c(1, 500, 1, 30, 1, 0, 0, 0) # Covariates for 30-year-old
x_60 <- c(1, 500, 1, 60, 1, 0, 0, 0) # Covariates for 60-year-old
# Define category proportions from Part (c)
proportions <- transaction_proportions
# Perform bootstrapping
set.seed(123) # For reproducibility
for (i in 1:M) {
  # Resample data with replacement
  boot_sample <- probit_data[sample(1:n, size = n, replace = TRUE), ]
  # Refit the probit model on the bootstrap sample
  boot_model <- glm(
    paid_in_cash ~ price + male + age + clothes_shoes + cosmetics + food + technology,
    data = boot_sample,
    family = binomial(link = "probit")
  )
  # Extract the coefficient vector
  beta_hat <- coef(boot_model)
  bootstrap_results$beta_age[i] <- beta_hat["age"]
  # Compute gamma1 (effect of age doubling)
  xb_30 <- sum(x_30 * beta_hat)
  xb_60 <- sum(x_60 * beta_hat)
  bootstrap_results$gamma1[i] <- pnorm(xb_60) - pnorm(xb_30)
  # Compute gamma2 (weighted effect across categories), reusing the covariate
  # profiles in 'categories' from Part (e) rather than redefining them here
  effects <- sapply(categories, function(x) {
    xb_30 <- sum(x * beta_hat)
    xb_60 <- sum(c(x[1:3], 60, x[5:8]) * beta_hat)
    pnorm(xb_60) - pnorm(xb_30)
  })
  bootstrap_results$gamma2[i] <- sum(effects * proportions)
}
# Summarise and plot the finite-sample (bootstrap) distributions
cat("### Summary of Bootstrap Results\n\n")
cat("Mean and Standard Deviation of Bootstrap Estimates:\n")
summary_table <- data.frame(
  Statistic = c("Mean", "Std. Dev."),
  Beta_Age = c(mean(bootstrap_results$beta_age), sd(bootstrap_results$beta_age)),
  Gamma1 = c(mean(bootstrap_results$gamma1), sd(bootstrap_results$gamma1)),
  Gamma2 = c(mean(bootstrap_results$gamma2), sd(bootstrap_results$gamma2))
)
print(summary_table)
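# Optional extra (a sketch): percentile-based 95% bootstrap confidence
# intervals; keep in mind that M = 100 draws is fairly coarse.
apply(bootstrap_results, 2, quantile, probs = c(0.025, 0.975))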
cat("\n\n### Distribution of Coefficient for Age (\\(\\hat{\\beta}_{\\text{age}}\\))\n")
ggplot(bootstrap_results, aes(x = beta_age)) +
geom_histogram(bins = 20, fill = "skyblue", alpha = 0.6) +
labs(title = "Bootstrap Distribution of Coefficient on Age", x = expression(hat(beta)["age"]), y = "Frequency")
cat("\n\n### Distribution of \\(\\gamma_1(\\hat{\\beta})\\)\n")
ggplot(bootstrap_results, aes(x = gamma1)) +
geom_histogram(bins = 20, fill = "green", alpha = 0.6) +
labs(title = "Bootstrap Distribution of Gamma1", x = expression(gamma[1](hat(beta))), y = "Frequency")
cat("\n\n### Distribution of \\(\\gamma_2(\\hat{\\beta})\\)\n")
ggplot(bootstrap_results, aes(x = gamma2)) +
geom_histogram(bins = 20, fill = "maroon", alpha = 0.6) +
labs(title = "Bootstrap Distribution of Gamma2", x = expression(gamma[2](hat(beta))), y = "Frequency")
# Part (h)
# Load necessary libraries (already installed above, so no install.packages() call here)
library(ggplot2)
library(dplyr)
n <- nrow(probit_data)          # Number of observations
beta_hat <- coef(probit_model)  # Estimated coefficients
x_matrix <- model.matrix(paid_in_cash ~ price + male + age + clothes_shoes +
                           cosmetics + food + technology, data = probit_data)
# Estimated information matrix of the probit MLE: plain t(X) %*% X / n ignores
# the probit weights, so weight observation i by phi(xb_i)^2 / (Phi(xb_i) * (1 - Phi(xb_i)))
xb <- as.vector(x_matrix %*% beta_hat)
w <- dnorm(xb)^2 / (pnorm(xb) * (1 - pnorm(xb)))
H_matrix <- t(x_matrix * w) %*% x_matrix / n
if (abs(det(H_matrix)) < 1e-10) {
  cat("H_matrix is singular or nearly singular. Adding regularization...\n")
  H_matrix <- H_matrix + diag(1e-6, ncol(H_matrix))
}
# Asymptotic variance of beta_hat: inverse information, scaled by 1/n
asymptotic_var <- solve(H_matrix) / n
set.seed(123)
asymptotic_samples <- rnorm(1000, mean = beta_hat["age"],
                            sd = sqrt(asymptotic_var["age", "age"]))
ggplot(data.frame(beta_age = asymptotic_samples), aes(x = beta_age)) +
  geom_histogram(bins = 30, fill = "darkblue", alpha = 0.6) +
  labs(
    title = "Asymptotic Distribution of Coefficient on Age",
    x = expression(hat(beta)["age"]),
    y = "Frequency"
  )
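# Quick comparison (a sketch): the asymptotic standard error of beta_age
# against its bootstrap counterpart from Part (g).
cat("Asymptotic sd of beta_age:", sqrt(asymptotic_var["age", "age"]), "\n")
cat("Bootstrap sd of beta_age:", sd(bootstrap_results$beta_age), "\n")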
# Part (i)
library(ggplot2)
library(dplyr)
beta_hat <- coef(probit_model) # Estimated coefficients
x_30 <- c(1, 500, 1, 30, 1, 0, 0, 0) # Covariates for 30-year-old
x_60 <- c(1, 500, 1, 60, 1, 0, 0, 0) # Covariates for 60-year-old
compute_gradient <- function(beta, x_30, x_60) {
  xb_30 <- sum(x_30 * beta)        # Linear predictor for 30-year-old
  xb_60 <- sum(x_60 * beta)        # Linear predictor for 60-year-old
  grad_30 <- dnorm(xb_30) * x_30   # Gradient of Phi(x'beta) at age 30 (chain rule)
  grad_60 <- dnorm(xb_60) * x_60   # Gradient of Phi(x'beta) at age 60
  return(grad_60 - grad_30)
}
# Calculate the gradient
gradient_gamma1 <- compute_gradient(beta_hat, x_30, x_60)
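# Numerical check (a sketch): a forward-difference gradient of gamma_1(beta)
# should be close to the analytic gradient above.
g1 <- function(beta) pnorm(sum(x_60 * beta)) - pnorm(sum(x_30 * beta))
eps <- 1e-6
num_grad <- sapply(seq_along(beta_hat), function(j) {
  b <- beta_hat
  b[j] <- b[j] + eps
  (g1(b) - g1(beta_hat)) / eps
})
max(abs(num_grad - gradient_gamma1))  # Should be near 0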
# Compute variance of gamma_1 using the Delta Method
var_gamma1 <- t(gradient_gamma1) %*% asymptotic_var %*% gradient_gamma1
sd_gamma1 <- sqrt(var_gamma1) # Standard deviation of gamma_1
# Simulate the asymptotic distribution of gamma_1
set.seed(123)
asymptotic_samples_gamma1 <- rnorm(1000, mean = pnorm(sum(x_60 * beta_hat)) - pnorm(sum(x_30 * beta_hat)), sd = sd_gamma1)
# Plot the asymptotic distribution
print(ggplot(data.frame(gamma1 = asymptotic_samples_gamma1), aes(x = gamma1)) +
        geom_histogram(bins = 30, fill = "green", alpha = 0.6) +
        labs(
          title = "Asymptotic Distribution of Gamma1 (Delta Method)",
          x = expression(gamma[1](hat(beta))),
          y = "Frequency"
        ))
# Compare with bootstrap results
cat("### Comparing Asymptotic and Bootstrap Distributions for Gamma1\n")
cat("Mean of asymptotic distribution:", mean(asymptotic_samples_gamma1), "\n")
cat("Standard deviation of asymptotic distribution:", sd(asymptotic_samples_gamma1), "\n")
cat("Mean of bootstrap distribution:", mean(bootstrap_results$gamma1), "\n")
cat("Standard deviation of bootstrap distribution:", sd(bootstrap_results$gamma1), "\n")
# Part (j)
# Define key values
gamma1_hat <- pnorm(sum(x_60 * beta_hat)) - pnorm(sum(x_30 * beta_hat)) # Point estimate
var_gamma1 <- as.numeric(t(gradient_gamma1) %*% asymptotic_var %*% gradient_gamma1) # Variance
sd_gamma1 <- sqrt(var_gamma1) # Standard deviation
# Perform the Wald test (a z-test, since the estimator is asymptotically normal)
t_stat <- gamma1_hat / sd_gamma1          # Test statistic
critical_value <- qnorm(0.975)            # Critical value for a 5% two-sided test
p_value <- 2 * (1 - pnorm(abs(t_stat)))   # Two-tailed p-value
# Construct 95% confidence interval
conf_int <- c(
  gamma1_hat - critical_value * sd_gamma1,
  gamma1_hat + critical_value * sd_gamma1
)
# Display results
cat("### Hypothesis Test Results\n")
cat("Test Statistic (t):", t_stat, "\n")
cat("Critical Value (95% level):", critical_value, "\n")
cat("P-Value:", p_value, "\n")
cat("\n### 95% Confidence Interval for \\(\\gamma_1(\\beta)\\):\n")
cat("[", conf_int[1], ",", conf_int[2], "]\n")
# Conclusion
if (p_value < 0.05) {
  cat("\nWe reject the null hypothesis at the 5% significance level. There is evidence that \\(\\gamma_1(\\beta)\\) is significantly different from 0.\n")
} else {
  cat("\nWe fail to reject the null hypothesis at the 5% significance level. There is insufficient evidence to conclude that \\(\\gamma_1(\\beta)\\) is different from 0.\n")
}
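# Optional cross-check (a sketch): a percentile bootstrap confidence interval
# for gamma_1 from Part (g), to compare with the delta-method interval above.
quantile(bootstrap_results$gamma1, probs = c(0.025, 0.975))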