I am trying my hand on categorising sentences into Product category using a RF model. The dataset (US consumer complaints; smaller datasets for reprex here) is in my instance provided separately for train and test set, precluding the use of rsample::initial_split()
.
Clumsily or not, I managed to successfully tokenize and manipulate the training data into a document-term matrix for fitting in tidymodels
. I repeated the steps on my test set before using predict()
. But because the content of the complaint sentences are different, the tokenized words and dtm columns are different, resulting in this error:
Error: The following required columns are missing: 'bought', 'breach', 'cfpb', 'counti', 'default', ...
How can I go about it?
library(tidyverse)
library(tidymodels)
library(tidytext)
library(tm)
# Import data
trainsmall_tbl <- readr::read_csv("complaints_train.csv")
test_org <- readr::read_csv("complaints_test.csv")
# Take only Product and Consumer.complaint.narrative
trainsmall_tbl <- trainsmall_tbl %>%
select(Product, Consumer.complaint.narrative) %>%
mutate(complaint_id = row_number())
# Add numeric labels for Product
product_numlevels <- tibble(
product_label = c(1:4),
Product = c("Mortgage", "Student loan",
"Credit card or prepaid card",
"Vehicle loan or lease") )
trainsmall_tbl <- trainsmall_tbl %>% left_join(product_numlevels)
#head(trainsmall_tbl)
# Add labels to product_label
trainsmall_tbl$product_label <- factor(trainsmall_tbl$product_label,
levels = c(1, 2, 3, 4),
labels = c("Mortgage",
"Student loan",
"Credit card or prepaid card",
"Vehicle loan or lease"))
#### Create tidytext data frame -----------------------
## Tokenize and clean --------
train_tokens <- trainsmall_tbl %>%
unnest_tokens(output = word, input = Consumer.complaint.narrative) %>%
filter(!str_detect(word, "[[:punct:]]")) %>% # remove punc
filter(!str_detect(word, "^[0-9]*$")) %>% # remove numbers
filter(!str_detect(word, "xx")) %>% # remove contains "xx"
anti_join(tidytext::stop_words) %>% # remove stop words
mutate(word = SnowballC::wordStem(word)) %>% # stem the words
mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>% # remove whitespace
mutate_at(.vars = c("word"), .funs = funs(tolower)) # to lowercase
#head(train_tokens)
#### Create document-term matrix (dtm) ---------------
train_dtm <- train_tokens %>%
count(complaint_id, word) %>%
tidytext::cast_dtm(document = complaint_id, term = word, value = n)
## Remove sparse terms
train_dtm <- tm::removeSparseTerms(train_dtm, sparse = .99)
## Convert dtm to matrix, then df/tibble
train_mat <- as.matrix(train_dtm)
train_df <- as.data.frame(cbind(trainsmall_tbl$product_label, train_mat))
train_tbl <- as_tibble(train_df)
## Convert product labels to factor again -----
train_tbl$V1 <- as.factor(train_tbl$V1)
train_tbl$V1 <- factor(train_tbl$V1,
levels = c(1, 2, 3, 4),
labels = c("Mortgage",
"Student loan",
"Credit card or prepaid card",
"Vehicle loan or lease"))
# Convert `V1` (`product_label`) to `dv`
names(train_tbl)[1] <- "dv"
head(train_tbl)
# A tibble: 6 x 877
dv ago appli attach bought breach cfpb charg check claim complaint continu contract
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Mort~ 1 1 3 1 1 1 2 2 1 2 1 1
2 Cred~ 0 0 0 0 0 0 0 0 0 0 0 0
3 Stud~ 0 0 0 0 0 0 0 1 0 0 0 2
4 Stud~ 0 0 0 0 0 0 0 0 0 0 0 0
5 Mort~ 0 0 0 0 0 0 0 0 0 0 0 0
6 Mort~ 0 0 0 0 0 0 0 0 0 0 0 0
tidymodels
## Make a recipe
simple_rec <- train_tbl %>% recipes::recipe(dv ~ .)
## Specifying the model
# Specified hyperparameters
rf_model <- parsnip::rand_forest(mtry = 65, min_n = 30) %>%
parsnip::set_engine("randomForest") %>%
parsnip::set_mode("classification")
## Create workflow
rf_wflow <- workflows::workflow() %>%
workflows::add_recipe(simple_rec) %>%
workflows::add_model(rf_model)
## Fit data to model --------
rf_wflow_fit <- parsnip::fit(rf_wflow, data = train_tbl)
Prepare test set for predict()
?
# Take only Consumer.complaint.narrative (Product not available)
test_tbl <- test_org %>%
select(Consumer.complaint.narrative) %>%
mutate(complaint_id = row_number())
# Tokenizing and cleaning of tokens
test_tokens <- test_tbl %>%
unnest_tokens(output = word, input = Consumer.complaint.narrative) %>%
filter(!str_detect(word, "[[:punct:]]")) %>%
filter(!str_detect(word, "^[0-9]*$")) %>%
filter(!str_detect(word, "xx")) %>%
anti_join(tidytext::stop_words) %>%
mutate(word = SnowballC::wordStem(word)) %>%
mutate_at(.vars = c("word"), .funs = funs(str_squish)) %>%
mutate_at(.vars = c("word"), .funs = funs(tolower))
## Create document matrix
test_dtm <- test_tokens %>%
count(complaint_id, word) %>%
tidytext::cast_dtm(document = complaint_id, term = word, value = n)
## Remove sparse terms
test_dtm <- tm::removeSparseTerms(test_dtm, sparse = .99)
## Convert dtm to matrix, then to tibble
test_mat <- as.matrix(test_dtm)
test_tbl <- as_tibble(test_mat)
names(test_tbl)[1:5] # [1] "contact" "credit" "defer" "due" "ecmc"
Predict on test set?
predict(final_rf_wflow_fit, test_tbl)
Error: The following required columns are missing: 'bought', 'breach', 'cfpb', 'counti', 'default', ...