Hi,
I have a list of words that i wish to tokenize using the tokenizers
package
I want to then get the result as a vector of characters which i then convert to a single comma seperated string so i can bind the column to my original data frame
Below i have it working for a single item in my dataframe but am not sure how to get it to work for the other items in my dataframe
Can anyone help?
library(tokenizers)
library(stringi)
library(tidyverse)
library(glue)
#>
#> Attaching package: 'glue'
#> The following object is masked from 'package:dplyr':
#>
#> collapse
# Taken from https://stackoverflow.com/questions/42734547/generating-random-strings
random_letters <- sprintf("%s%s%s", stri_rand_strings(5, 5, '[A-Z]'),
stri_rand_strings(5, 4, '[0-9]'), stri_rand_strings(5, 1, '[A-Z]')) %>%
as_tibble()
# Get the tokenizing to work for one
one_item_example <- random_letters$value[1]
test_col <- tokenize_character_shingles(one_item_example, n=4, n_min=4) %>%
unlist()
result <- glue_collapse(glue("'{test_col}'"), sep = ',')
result
#> 'cnvk','nvkz','vkz9','kz97','z978','9784','784x'
# Tokenize - BROKEN
tokenize_character_shingles(random_letters$value, n=4, n_min=4) %>%
bind_rows()
#> Error in bind_rows_(x, .id): Argument 1 must have names
Created on 2019-01-11 by the reprex package (v0.2.1)