Hi! I am running some R code that is not very complicated, and my data frame is not very large. I am applying a dictionary that I downloaded from GitHub (the SADCAT package) to my text data. Every time I run one particular chunk, the session stops with the message that R encountered a fatal error and the session has to abort. I have already updated R and RStudio, restarted my session, and cleared my environment. The code is below.
---
title: "hcmst_c2_04v2_SADCATdesc01"
output: html_document
date: "2024-07-12"
---
Set up libraries
```{r}
# set up SADCAT library
#install.packages("devtools")
library(devtools)
#install_github("gandalfnicolas/SADCAT")
library(SADCAT)
#install.packages("quanteda")
library(quanteda)
#install.packages("tidyverse")
library(tidyverse)
#install.packages("stringr")
library(stringr)
#install.packages("psych")
library(psych)
# load the SADCAT dictionaries; note that the _FT assignments below overwrite
# the two non-FT assignments above them, so the _FT versions are what get used
Full_Dictionaries = SADCAT::Dictionaries
Pre_Dictionaries = SADCAT::All.steps_Dictionaries
Full_Dictionaries = SADCAT::Dictionaries_FT
Pre_Dictionaries = SADCAT::All.steps_Dictionaries_FT
# Include only dictionary entries that are single words
Pre_Dictionaries$values0 = as.character(Pre_Dictionaries$values0)
Pre_Dictionaries = filter(Pre_Dictionaries, str_count(values0, " ") == 0)
# for each dimension, keep the stemmed word when it belongs to that dictionary ("" otherwise)
Pre_Dictionaries2 = Pre_Dictionaries %>%
  transmute(values0 = char_wordstem(values0, language = quanteda_options("language_stemmer")),
            Sociability_dic = ifelse(Sociability_dict == 1, values0, ""),
            Morality_dic = ifelse(Morality_dict == 1, values0, ""),
            Ability_dic = ifelse(Ability_dict == 1, values0, ""),
            Assertiveness_dic = ifelse(Agency_dict == 1, values0, ""),
            Status_dic = ifelse(Status_dict == 1, values0, ""),
            Warmth_dic = ifelse(Warmth_dict == 1, values0, ""),
            Competence_dic = ifelse(Competence_dict == 1, values0, ""),
            Beliefs_dic = ifelse(Beliefs_dict == 1, values0, ""),
            Sociability_dic_hi = ifelse(Sociability_dict_hi == 1, values0, ""),
            Morality_dic_hi = ifelse(Morality_dict_hi == 1, values0, ""),
            Ability_dic_hi = ifelse(Ability_dict_hi == 1, values0, ""),
            Assertiveness_dic_hi = ifelse(Agency_dict_hi == 1, values0, ""),
            Status_dic_hi = ifelse(Status_dict_hi == 1, values0, ""),
            Warmth_dic_hi = ifelse(Warmth_dict_hi == 1, values0, ""),
            Competence_dic_hi = ifelse(Competence_dict_hi == 1, values0, ""),
            Beliefs_dic_hi = ifelse(Beliefs_dict_hi == 1, values0, ""),
            Sociability_dic_lo = ifelse(Sociability_dict_lo == 1, values0, ""),
            Morality_dic_lo = ifelse(Morality_dict_lo == 1, values0, ""),
            Ability_dic_lo = ifelse(Ability_dict_lo == 1, values0, ""),
            Assertiveness_dic_lo = ifelse(Agency_dict_lo == 1, values0, ""),
            Status_dic_lo = ifelse(Status_dict_lo == 1, values0, ""),
            Warmth_dic_lo = ifelse(Warmth_dict_lo == 1, values0, ""),
            Competence_dic_lo = ifelse(Competence_dict_lo == 1, values0, ""),
            Beliefs_dic_lo = ifelse(Beliefs_dict_lo == 1, values0, ""),
            Health_dic = ifelse(health_dict == 1, values0, ""),
            Family_dic = ifelse(relative_dict == 1, values0, ""),
            Emotion_dic = ifelse(feeling_dict == 1, values0, ""),
            Nationality_dic = ifelse(Geography_dict == 1, values0, ""),
            Occupation_dic = ifelse(work_dict == 1, values0, ""),
            Appearance_dic = ifelse(Appearance_dict == 1, values0, "")) %>%
  dplyr::select(-values0)
# turn each dictionary column into a character vector of its words,
# dropping the blank placeholders and any NAs
Dicts_v2 = lapply(seq_along(Pre_Dictionaries2),
                  function(x) Pre_Dictionaries2[[x]][Pre_Dictionaries2[[x]] != ""])
names(Dicts_v2) = names(Pre_Dictionaries2)
Dicts_v2 = lapply(Dicts_v2, function(x) x[!is.na(x)])
#create quanteda dictionaries
Dicts_v2 = quanteda::dictionary(Dicts_v2)
# Load in text data
hcmst_c2_04v2 <- readRDS("~/Desktop/qp/R/hcmst_c2_04v2.RDS")
#Preprocess target text
# tokenize the "q24" column of the hcmst_c2_04v2 data frame, removing numbers, punctuation, and symbols
# this creates a quanteda tokens object called q24 (one element per response)
q24 <- quanteda::tokens(hcmst_c2_04v2$q24, remove_numbers = T, remove_punct = T, remove_symbols = T)
# stem words within q24 data
q24 <- tokens_wordstem(q24, language = quanteda_options("language_stemmer"))
# Take processed and stemmed text data and put it back into a string of text
string_stemmed <- vector("character", length(q24))
for (i in seq_along(q24)) {
  string_stemmed[i] <- paste(q24[[i]], collapse = ' ')
}
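# An equivalent vectorized alternative to the loop above (added as a note and
# kept commented out; it produces the same strings):
# string_stemmed <- vapply(q24, paste, character(1), collapse = " ", USE.NAMES = FALSE)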
# Match target text to dictionaries
# this looks up each token against the dictionaries and records, for each response,
# which dictionary categories were matched
q24_dict_pre <- tokens_lookup(q24, dictionary = Dicts_v2, levels = 1)
# Convert to a document-feature matrix (dfm), then to a data frame
q24_dict_pre = convert(dfm(q24_dict_pre), to = "data.frame")
## add a doc_id column to hcmst_c2_04v2 so it can be merged with the dfm output below
# Generate the sequence of doc_id values
doc_id_sequence <- paste0("text", seq_len(nrow(hcmst_c2_04v2)))
# Add the new column to the dataframe
hcmst_c2_04v2$doc_id <- doc_id_sequence
# Combine q24_dict_pre, ntoken(q24), ntype(q24), and string_stemmed into one dataframe
q24_dict <- cbind(q24_dict_pre,
                  ntoken(q24), # raw count
                  ntype(q24),  # distinct count
                  string_stemmed)
# Merge the combined dataframe with hcmst_c2_04v2 based on the 'doc_id' column
final_dataframe <- merge(q24_dict, hcmst_c2_04v2, by = "doc_id")
```
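
A quick way to check whether memory is the likely culprit is to look at the sizes of the objects that feed this final `cbind()`/`merge()` step. The chunk below is an added diagnostic sketch using only base R; it is not part of the original pipeline.

```{r}
# Added diagnostic: inspect the objects going into the final cbind()/merge()
print(object.size(hcmst_c2_04v2), units = "MB")
print(object.size(q24_dict_pre), units = "MB")
print(object.size(string_stemmed), units = "MB")
nrow(q24_dict_pre) == nrow(hcmst_c2_04v2)  # should be TRUE for a clean 1:1 merge
gc()  # current memory use
```

If these objects are only a few megabytes, an out-of-memory crash is unlikely and the fatal error more plausibly comes from a package or native-code problem.
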
I also tried an alternate approach: instead of running the final `cbind()`/`merge()` step above, I ran the code below, which switches to data.table and is meant to do the merge in smaller chunks. Same issue.

```{r}
# Load necessary libraries
library(quanteda)
library(dplyr)
library(pryr)
library(data.table)
# Convert to data.table for efficient operations
q24_dict_pre_dt <- as.data.table(q24_dict_pre)
hcmst_c2_04v2_dt <- as.data.table(hcmst_c2_04v2)
# Convert doc_id to character to ensure proper merging
hcmst_c2_04v2_dt[, doc_id := as.character(doc_id)]
q24_dict_pre_dt[, doc_id := as.character(doc_id)]
# Process in smaller chunks
chunk_size <- 500
num_chunks <- ceiling(nrow(q24_dict_pre_dt) / chunk_size)
# Directory to save chunks
chunk_dir <- tempdir()
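# Sketch (added for illustration; the chunk-processing loop itself is not shown
# above) of how a chunk-wise merge could use the objects defined so far: merge
# each slice of q24_dict_pre_dt with hcmst_c2_04v2_dt, write it to chunk_dir,
# then bind the pieces back together. Object names here are hypothetical.
chunk_files <- character(num_chunks)
for (i in seq_len(num_chunks)) {
  rows <- ((i - 1) * chunk_size + 1):min(i * chunk_size, nrow(q24_dict_pre_dt))
  piece <- merge(q24_dict_pre_dt[rows], hcmst_c2_04v2_dt, by = "doc_id")
  chunk_files[i] <- file.path(chunk_dir, paste0("merged_chunk_", i, ".rds"))
  saveRDS(piece, chunk_files[i])
  rm(piece)
  gc()  # release memory between chunks
}
final_dataframe_dt <- rbindlist(lapply(chunk_files, readRDS))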