I would be incredibly grateful if someone is able to just send me a PDF of my Rmd code.
# Part (a)
# Check for missing values in specified columns
# Guard the installation so knitting does not stall on an interactive prompt
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(dplyr)
library(rmarkdown)
library(readr)
library(ggplot2)
library(knitr)
data <- read_csv("dat_SalesCustomers.csv")
summary(data)
colnames(data)
missingvalues_my <- sapply(data[, c("category", "price", "gender", "age", "payment_method")],
                           function(x) sum(is.na(x)))
# Print the count of missing values for each column
cat("Missing values in each column:\n")
print(missingvalues_my)
cleaned_data <- data %>%
  filter(!is.na(category),
         !is.na(price),
         !is.na(gender),
         !is.na(age),
         !is.na(payment_method))
# Count the remaining observations
remaining_observations <- nrow(cleaned_data)
# Print the number of remaining observations
cat("Number of observations left after removing missing values:", remaining_observations, "\n")
# Part (b)
# Create a dummy variable for cash payment; tolower() guards against
# capitalisation differences such as "Cash" vs "cash"
cleaned_data <- cleaned_data %>%
  mutate(paid_in_cash = ifelse(tolower(payment_method) == "cash", 1, 0))
table(cleaned_data$payment_method)
summary(cleaned_data$price)
nrow(cleaned_data)
head(cleaned_data)
# Create a dummy variable for males (same tolower() guard for consistency)
cleaned_data <- cleaned_data %>%
  mutate(male = ifelse(tolower(gender) == "male", 1, 0))
# Fraction of transactions in cash
fraction_cash_transactions <- mean(cleaned_data$paid_in_cash)
# Fraction of overall sales (in TRY) carried out in cash
total_sales <- sum(cleaned_data$price, na.rm = TRUE)
cash_sales <- sum(cleaned_data$price[cleaned_data$paid_in_cash == 1], na.rm = TRUE)
fraction_cash_sales <- cash_sales / total_sales
# Print results
cat("Fraction of transactions carried out in cash:", fraction_cash_transactions, "\n")
cat("Fraction of overall sales (in TRY) carried out in cash:", fraction_cash_sales, "\n")
cat("Fraction of transactions carried out in cash:", fraction_cash_transactions, "\n")
cat("This represents the proportion of transactions in the dataset where cash payment was used.\n")
# Subset to the first 1000 observations; the paid_in_cash and male dummies
# carry over from cleaned_data, so they do not need to be re-created here
subset_data <- cleaned_data[1:1000, ]
cat("Generated dummy variables:\n",
"'paid_in_cash' (1 for cash payments, 0 otherwise) and 'male' (1 for male customers, 0 otherwise).\n")
fraction_cash_transactions <- mean(subset_data$paid_in_cash)
total_sales <- sum(subset_data$price, na.rm = TRUE)
cash_sales <- sum(subset_data$price[subset_data$paid_in_cash == 1], na.rm = TRUE)
fraction_cash_sales <- cash_sales / total_sales
cat("Fraction of transactions carried out in cash:", fraction_cash_transactions, "\n")
cat("Fraction of overall sales (in TRY) carried out in cash:", fraction_cash_sales, "\n")
# Part (c)
subset_data <- subset_data %>%
  mutate(category = tolower(trimws(category)))
subset_data <- subset_data %>%
  mutate(
    clothes_shoes = ifelse(category %in% c("clothing", "shoes"), 1, 0),
    cosmetics = ifelse(category == "cosmetics", 1, 0),
    food = ifelse(category == "food & beverage", 1, 0),
    technology = ifelse(category == "technology", 1, 0),
    other = ifelse(!category %in% c("clothing", "shoes", "cosmetics",
                                    "food & beverage", "technology"), 1, 0)
  )
cat("Created dummy variables for categories:\n",
    "'clothes_shoes' (clothing and shoes), 'cosmetics', 'food' (food & beverage), 'technology', and 'other' (all other categories).\n")
# Transaction split
transaction_split <- colSums(subset_data[, c("clothes_shoes", "cosmetics", "food", "technology", "other")])
total_transactions <- sum(transaction_split)
transaction_proportions <- transaction_split / total_transactions
# Sales split across categories (in TRY)
sales_split <- subset_data %>%
  summarise(
    clothes_shoes = sum(price[clothes_shoes == 1], na.rm = TRUE),
    cosmetics = sum(price[cosmetics == 1], na.rm = TRUE),
    food = sum(price[food == 1], na.rm = TRUE),
    technology = sum(price[technology == 1], na.rm = TRUE),
    other = sum(price[other == 1], na.rm = TRUE)
  )
total_sales_split <- sum(sales_split)
sales_proportions <- sales_split / total_sales_split
# Display results
cat("Transaction split across categories:\n")
print(transaction_split)
print(transaction_proportions)
cat("\nSales split across categories (in TRY):\n")
print(sales_split)
print(sales_proportions)
# Theoretical intuition
cat("### Interpretation of Proportions:\n")
cat("\n**Transaction Proportions**:\n")
cat("- The largest share of transactions (", round(transaction_proportions['clothes_shoes'] * 100, 2), "%) occurred in the 'clothes_shoes' category.\n")
cat("- ", round(transaction_proportions['technology'] * 100, 2), "% of transactions occurred in the 'technology' category.\n")
cat("- The remaining categories ('cosmetics', 'food', and 'other') accounted for smaller shares of transactions.\n\n")
# Part (d)
# Guard the installation so knitting does not stall on an interactive prompt
if (!requireNamespace("maxLik", quietly = TRUE)) install.packages("maxLik")
library(maxLik)
### Estimating the Probit Model
# Subset data to ensure no missing values in the relevant variables;
# 'male' must be kept here because it enters the likelihood below
probit_data <- subset_data %>%
  select(paid_in_cash, price, male, age, clothes_shoes, cosmetics, food, technology)
# Define the log-likelihood function for the probit model
log_likelihood <- function(beta, data) {
  # Extract the outcome and the design matrix (intercept in the first column)
  y <- data$paid_in_cash
  X <- as.matrix(cbind(1, data$price, data$male, data$age,
                       data$clothes_shoes, data$cosmetics, data$food, data$technology))
  xb <- X %*% beta
  # Log-likelihood using pnorm(..., log.p = TRUE) for log Phi(x), which is
  # numerically safer than log(pnorm(x)) in the tails
  ll <- y * pnorm(xb, log.p = TRUE) + (1 - y) * pnorm(-xb, log.p = TRUE)
  return(sum(ll))
}
# Initial guesses for beta (can start with zeros)
initial_beta <- rep(0, 8)
# Run numerical optimization using maxLik
probit_model <- maxLik(logLik = log_likelihood, start = initial_beta, data = probit_data)
# Display results
summary(probit_model)
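# Cross-check (a sketch): glm() maximizes the same probit likelihood, so its
# coefficients should agree with the maxLik estimates up to numerical tolerance.
glm_check <- glm(paid_in_cash ~ price + male + age + clothes_shoes +
                   cosmetics + food + technology,
                 family = binomial(link = "probit"), data = probit_data)
cbind(maxLik = coef(probit_model), glm = coef(glm_check))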
# Part (e)
# Effect of age doubling
categories <- list(
  clothes_shoes = c(1, 500, 1, 30, 1, 0, 0, 0),  # Clothes and shoes
  cosmetics = c(1, 500, 1, 30, 0, 1, 0, 0),      # Cosmetics
  food = c(1, 500, 1, 30, 0, 0, 1, 0),           # Food & beverage
  technology = c(1, 500, 1, 30, 0, 0, 0, 1),     # Technology
  other = c(1, 500, 1, 30, 0, 0, 0, 0)           # Other (baseline)
)
# Define covariate values for a 30-year-old male buying clothes/shoes for 500 TRY
x_30 <- c(1, 500, 1, 30, 1, 0, 0, 0) # Intercept, price, male, age, clothes_shoes, cosmetics, food, technology
x_60 <- c(1, 500, 1, 60, 1, 0, 0, 0) # Same covariates but age = 60
# Re-estimate the probit with glm(), using the same dummy-variable
# specification as the maxLik model so the coefficient vector lines up with
# x_30 and x_60 (a 'category' factor would produce different columns)
probit_model <- glm(
  formula = paid_in_cash ~ price + male + age + clothes_shoes + cosmetics + food + technology,
  family = binomial(link = "probit"),
  data = probit_data
)
# Extract coefficients from the estimated model
beta_hat <- coef(probit_model)
# Compute linear predictor values
xb_30 <- sum(x_30 * beta_hat) # For 30-year-old
xb_60 <- sum(x_60 * beta_hat) # For 60-year-old
# Compute expected probabilities using Φ(x)
p_30 <- pnorm(xb_30) # Expected probability for 30-year-old
p_60 <- pnorm(xb_60) # Expected probability for 60-year-old
# Effect of age doubling (γ₁(β̂))
gamma_1 <- p_60 - p_30
cat("Effect of age doubling on expected probability (γ₁):", gamma_1, "\n")
# Weighted effects
# Compute the age-doubling effect (p_60 - p_30) within each category
effects <- sapply(categories, function(x) {
  xb_30 <- sum(x * beta_hat)                      # Linear predictor at age 30
  xb_60 <- sum(c(x[1:3], 60, x[5:8]) * beta_hat)  # Same covariates, age = 60
  pnorm(xb_60) - pnorm(xb_30)                     # Effect for this category
})
# Weight each category's effect by its transaction share from Part (c)
proportions <- transaction_proportions
# Compute weighted average effect (γ₂)
gamma_2 <- sum(effects * proportions)
cat("Weighted effect without conditioning on categories (γ₂):", gamma_2, "\n")
# Part (f)
# Create an explanation dynamically
cat("
Yes, the estimator \\( \\hat{\\beta} \\) is consistent under the following conditions, based on a simplified version of **extremum estimator theory**:
---
### Key Conditions for Consistency:
1. **Correct Specification**:
- The probit model is assumed to be correctly specified, meaning the true data-generating process aligns with the functional form of the model (probit). This ensures that the true parameter \\( \\beta_0 \\) lies within the parameter space.
2. **Identification**:
- The function \\( Q_n(\\beta; Z_n) = -\\frac{1}{n} \\ell(\\beta; Z_n) \\) has a unique global minimum at the true parameter value \\( \\beta_0 \\).
- For probit, the log-likelihood function is:
\\[
\\ell(\\beta; Z_n) = \\sum_{i=1}^n \\left[ y_i \\log \\Phi(X_i \\beta) + (1 - y_i) \\log (1 - \\Phi(X_i \\beta)) \\right]
\\]
- \\( \\Phi(X_i \\beta) \\) is strictly increasing and continuous, ensuring identification.
3. **Continuity**:
- The objective function \\( Q_n(\\beta; Z_n) \\) is continuous in \\( \\beta \\).
- The log-likelihood components \\( \\log \\Phi(X_i \\beta) \\) and \\( \\log (1 - \\Phi(X_i \\beta)) \\) are continuous and differentiable, satisfying this condition.
4. **Compact Parameter Space**:
- The parameter space for \\( \\beta \\) is assumed to be compact, ensuring the existence of a minimum.
5. **Law of Large Numbers (LLN)**:
- As \\( n \\to \\infty \\), the sample objective function \\( Q_n(\\beta; Z_n) \\) converges uniformly to its population counterpart:
\\[
Q(\\beta) = \\mathbb{E} \\left[ - y \\log \\Phi(X \\beta) - (1 - y) \\log (1 - \\Phi(X \\beta)) \\right]
\\]
### Conclusion:
Under these assumptions, the Maximum Likelihood Estimator (MLE) for the probit model is consistent. This means:
1. The true parameter \\( \\beta_0 \\) minimizes the population objective function \\( Q(\\beta) \\).
2. The sample objective function \\( Q_n(\\beta; Z_n) \\) converges uniformly to \\( Q(\\beta) \\) as the sample size increases.
Thus, if the probit model is correctly specified, \\( \\hat{\\beta} \\) will converge in probability to the true parameter \\( \\beta_0 \\).
")
# Part (g)
# Set up parameters
M <- 100 # Number of bootstrap samples
n <- nrow(probit_data) # Sample size
bootstrap_results <- data.frame(beta_age = numeric(M), gamma1 = numeric(M), gamma2 = numeric(M))
# Define covariate values for gamma1
x_30 <- c(1, 500, 1, 30, 1, 0, 0, 0) # Covariates for 30-year-old
x_60 <- c(1, 500, 1, 60, 1, 0, 0, 0) # Covariates for 60-year-old
# Define category proportions from Part (c)
proportions <- transaction_proportions
# Perform bootstrapping
set.seed(123) # For reproducibility
for (i in 1:M) {
  # Resample data with replacement
  boot_sample <- probit_data[sample(1:n, size = n, replace = TRUE), ]
  # Refit the probit model on the bootstrap sample
  boot_model <- glm(
    paid_in_cash ~ price + male + age + clothes_shoes + cosmetics + food + technology,
    data = boot_sample,
    family = binomial(link = "probit")
  )
  # Extract the coefficient vector
  beta_hat <- coef(boot_model)
  bootstrap_results$beta_age[i] <- beta_hat["age"]
  # Compute gamma1 (effect of age doubling)
  xb_30 <- sum(x_30 * beta_hat)
  xb_60 <- sum(x_60 * beta_hat)
  bootstrap_results$gamma1[i] <- pnorm(xb_60) - pnorm(xb_30)
  # Compute gamma2 (weighted effect across categories), reusing the covariate
  # profiles in 'categories' from Part (e) rather than redefining them here
  effects <- sapply(categories, function(x) {
    xb_30 <- sum(x * beta_hat)
    xb_60 <- sum(c(x[1:3], 60, x[5:8]) * beta_hat)
    pnorm(xb_60) - pnorm(xb_30)
  })
  bootstrap_results$gamma2[i] <- sum(effects * proportions)
}
# Summarise and plot the finite-sample (bootstrap) distributions
cat("### Summary of Bootstrap Results\n\n")
cat("Mean and Standard Deviation of Bootstrap Estimates:\n")
summary_table <- data.frame(
  Statistic = c("Mean", "Std. Dev."),
  Beta_Age = c(mean(bootstrap_results$beta_age), sd(bootstrap_results$beta_age)),
  Gamma1 = c(mean(bootstrap_results$gamma1), sd(bootstrap_results$gamma1)),
  Gamma2 = c(mean(bootstrap_results$gamma2), sd(bootstrap_results$gamma2))
)
print(summary_table)
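# Optional extra (a sketch): percentile-based 95% bootstrap confidence
# intervals; keep in mind that M = 100 draws is fairly coarse.
apply(bootstrap_results, 2, quantile, probs = c(0.025, 0.975))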
cat("\n\n### Distribution of Coefficient for Age (\\(\\hat{\\beta}_{\\text{age}}\\))\n")
ggplot(bootstrap_results, aes(x = beta_age)) +
geom_histogram(bins = 20, fill = "skyblue", alpha = 0.6) +
labs(title = "Bootstrap Distribution of Coefficient on Age", x = expression(hat(beta)["age"]), y = "Frequency")
cat("\n\n### Distribution of \\(\\gamma_1(\\hat{\\beta})\\)\n")
ggplot(bootstrap_results, aes(x = gamma1)) +
geom_histogram(bins = 20, fill = "green", alpha = 0.6) +
labs(title = "Bootstrap Distribution of Gamma1", x = expression(gamma[1](hat(beta))), y = "Frequency")
cat("\n\n### Distribution of \\(\\gamma_2(\\hat{\\beta})\\)\n")
ggplot(bootstrap_results, aes(x = gamma2)) +
geom_histogram(bins = 20, fill = "maroon", alpha = 0.6) +
labs(title = "Bootstrap Distribution of Gamma2", x = expression(gamma[2](hat(beta))), y = "Frequency")
# Part (h)
# Load necessary libraries (already installed above, so no install.packages() call here)
library(ggplot2)
library(dplyr)
n <- nrow(probit_data)          # Number of observations
beta_hat <- coef(probit_model)  # Estimated coefficients
x_matrix <- model.matrix(paid_in_cash ~ price + male + age + clothes_shoes +
                           cosmetics + food + technology, data = probit_data)
# Estimated information matrix of the probit MLE: plain t(X) %*% X / n ignores
# the probit weights, so weight observation i by phi(xb_i)^2 / (Phi(xb_i) * (1 - Phi(xb_i)))
xb <- as.vector(x_matrix %*% beta_hat)
w <- dnorm(xb)^2 / (pnorm(xb) * (1 - pnorm(xb)))
H_matrix <- t(x_matrix * w) %*% x_matrix / n
if (abs(det(H_matrix)) < 1e-10) {
  cat("H_matrix is singular or nearly singular. Adding regularization...\n")
  H_matrix <- H_matrix + diag(1e-6, ncol(H_matrix))
}
# Asymptotic variance of beta_hat: inverse information, scaled by 1/n
asymptotic_var <- solve(H_matrix) / n
set.seed(123)
asymptotic_samples <- rnorm(1000, mean = beta_hat["age"],
                            sd = sqrt(asymptotic_var["age", "age"]))
ggplot(data.frame(beta_age = asymptotic_samples), aes(x = beta_age)) +
  geom_histogram(bins = 30, fill = "darkblue", alpha = 0.6) +
  labs(
    title = "Asymptotic Distribution of Coefficient on Age",
    x = expression(hat(beta)["age"]),
    y = "Frequency"
  )
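# Quick comparison (a sketch): the asymptotic standard error of beta_age
# against its bootstrap counterpart from Part (g).
cat("Asymptotic sd of beta_age:", sqrt(asymptotic_var["age", "age"]), "\n")
cat("Bootstrap sd of beta_age:", sd(bootstrap_results$beta_age), "\n")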
# Part (i)
library(ggplot2)
library(dplyr)
beta_hat <- coef(probit_model) # Estimated coefficients
x_30 <- c(1, 500, 1, 30, 1, 0, 0, 0) # Covariates for 30-year-old
x_60 <- c(1, 500, 1, 60, 1, 0, 0, 0) # Covariates for 60-year-old
compute_gradient <- function(beta, x_30, x_60) {
  xb_30 <- sum(x_30 * beta)        # Linear predictor for 30-year-old
  xb_60 <- sum(x_60 * beta)        # Linear predictor for 60-year-old
  grad_30 <- dnorm(xb_30) * x_30   # Gradient of Phi(x'beta) at age 30 (chain rule)
  grad_60 <- dnorm(xb_60) * x_60   # Gradient of Phi(x'beta) at age 60
  return(grad_60 - grad_30)
}
# Calculate the gradient
gradient_gamma1 <- compute_gradient(beta_hat, x_30, x_60)
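# Numerical check (a sketch): a forward-difference gradient of gamma_1(beta)
# should be close to the analytic gradient above.
g1 <- function(beta) pnorm(sum(x_60 * beta)) - pnorm(sum(x_30 * beta))
eps <- 1e-6
num_grad <- sapply(seq_along(beta_hat), function(j) {
  b <- beta_hat
  b[j] <- b[j] + eps
  (g1(b) - g1(beta_hat)) / eps
})
max(abs(num_grad - gradient_gamma1))  # Should be near 0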
# Compute variance of gamma_1 using the Delta Method
var_gamma1 <- t(gradient_gamma1) %*% asymptotic_var %*% gradient_gamma1
sd_gamma1 <- sqrt(var_gamma1) # Standard deviation of gamma_1
# Simulate the asymptotic distribution of gamma_1
set.seed(123)
asymptotic_samples_gamma1 <- rnorm(1000, mean = pnorm(sum(x_60 * beta_hat)) - pnorm(sum(x_30 * beta_hat)), sd = sd_gamma1)
# Plot the asymptotic distribution
print(ggplot(data.frame(gamma1 = asymptotic_samples_gamma1), aes(x = gamma1)) +
        geom_histogram(bins = 30, fill = "green", alpha = 0.6) +
        labs(
          title = "Asymptotic Distribution of Gamma1 (Delta Method)",
          x = expression(gamma[1](hat(beta))),
          y = "Frequency"
        ))
# Compare with bootstrap results
cat("### Comparing Asymptotic and Bootstrap Distributions for Gamma1\n")
cat("Mean of asymptotic distribution:", mean(asymptotic_samples_gamma1), "\n")
cat("Standard deviation of asymptotic distribution:", sd(asymptotic_samples_gamma1), "\n")
cat("Mean of bootstrap distribution:", mean(bootstrap_results$gamma1), "\n")
cat("Standard deviation of bootstrap distribution:", sd(bootstrap_results$gamma1), "\n")
# Part (j)
# Define key values
gamma1_hat <- pnorm(sum(x_60 * beta_hat)) - pnorm(sum(x_30 * beta_hat)) # Point estimate
var_gamma1 <- as.numeric(t(gradient_gamma1) %*% asymptotic_var %*% gradient_gamma1) # Variance
sd_gamma1 <- sqrt(var_gamma1) # Standard deviation
# Perform the Wald test (a z-test, since the estimator is asymptotically normal)
t_stat <- gamma1_hat / sd_gamma1          # Test statistic
critical_value <- qnorm(0.975)            # Critical value for a 5% two-sided test
p_value <- 2 * (1 - pnorm(abs(t_stat)))   # Two-tailed p-value
# Construct 95% confidence interval
conf_int <- c(
  gamma1_hat - critical_value * sd_gamma1,
  gamma1_hat + critical_value * sd_gamma1
)
# Display results
cat("### Hypothesis Test Results\n")
cat("Test Statistic (t):", t_stat, "\n")
cat("Critical Value (95% level):", critical_value, "\n")
cat("P-Value:", p_value, "\n")
cat("\n### 95% Confidence Interval for \\(\\gamma_1(\\beta)\\):\n")
cat("[", conf_int[1], ",", conf_int[2], "]\n")
# Conclusion
if (p_value < 0.05) {
  cat("\nWe reject the null hypothesis at the 5% significance level. There is evidence that \\(\\gamma_1(\\beta)\\) is significantly different from 0.\n")
} else {
  cat("\nWe fail to reject the null hypothesis at the 5% significance level. There is insufficient evidence to conclude that \\(\\gamma_1(\\beta)\\) is different from 0.\n")
}
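# Optional cross-check (a sketch): a percentile bootstrap confidence interval
# for gamma_1 from Part (g), to compare with the delta-method interval above.
quantile(bootstrap_results$gamma1, probs = c(0.025, 0.975))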