Here you are.
# we will develop a model similar to those used at the core of the optical character recognition (OCR) software
# often bundled with desktop document scanners or in smartphone applications.
install.packages('kernlab')
#>
#> The downloaded binary packages are in
#> /var/folders/0q/pl2zs3cj2_l6fl59mkfk1x3c0000gn/T//RtmpoEnmNl/downloaded_packages
library(kernlab)
library(e1071)
library(tidyverse)
library(janitor)
#>
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#>
#> chisq.test, fisher.test
library(skimr)
# we'll use a dataset donated to the UCI Machine Learning Repository
# he dataset contains 20,000 examples of 26 English alphabet capital letters
# as printed using 20 different randomly reshaped and distorted black-and-white fonts.
letters <- read_csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/letterdata.csv')
#> Rows: 20000 Columns: 17
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (1): letter
#> dbl (16): xbox, ybox, width, height, onpix, xbar, ybar, x2bar, y2bar, xybar,...
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(letters)
#> spec_tbl_df [20,000 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ letter: chr [1:20000] "T" "I" "D" "N" ...
#> $ xbox : num [1:20000] 2 5 4 7 2 4 4 1 2 11 ...
#> $ ybox : num [1:20000] 8 12 11 11 1 11 2 1 2 15 ...
#> $ width : num [1:20000] 3 3 6 6 3 5 5 3 4 13 ...
#> $ height: num [1:20000] 5 7 8 6 1 8 4 2 4 9 ...
#> $ onpix : num [1:20000] 1 2 6 3 1 3 4 1 2 7 ...
#> $ xbar : num [1:20000] 8 10 10 5 8 8 8 8 10 13 ...
#> $ ybar : num [1:20000] 13 5 6 9 6 8 7 2 6 2 ...
#> $ x2bar : num [1:20000] 0 5 2 4 6 6 6 2 2 6 ...
#> $ y2bar : num [1:20000] 6 4 6 6 6 9 6 2 6 2 ...
#> $ xybar : num [1:20000] 6 13 10 4 6 5 7 8 12 12 ...
#> $ x2ybar: num [1:20000] 10 3 3 4 5 6 6 2 4 1 ...
#> $ xy2bar: num [1:20000] 8 9 7 10 9 6 6 8 8 9 ...
#> $ xedge : num [1:20000] 0 2 3 6 1 0 2 1 1 8 ...
#> $ xedgey: num [1:20000] 8 8 7 10 7 8 8 6 6 1 ...
#> $ yedge : num [1:20000] 0 4 3 2 5 9 7 2 1 1 ...
#> $ yedgex: num [1:20000] 8 10 9 8 10 7 10 7 7 8 ...
#> - attr(*, "spec")=
#> .. cols(
#> .. letter = col_character(),
#> .. xbox = col_double(),
#> .. ybox = col_double(),
#> .. width = col_double(),
#> .. height = col_double(),
#> .. onpix = col_double(),
#> .. xbar = col_double(),
#> .. ybar = col_double(),
#> .. x2bar = col_double(),
#> .. y2bar = col_double(),
#> .. xybar = col_double(),
#> .. x2ybar = col_double(),
#> .. xy2bar = col_double(),
#> .. xedge = col_double(),
#> .. xedgey = col_double(),
#> .. yedge = col_double(),
#> .. yedgex = col_double()
#> .. )
#> - attr(*, "problems")=<externalptr>
# Just to make sure there's consistancy in features' names:
clean_names(letters)
#> # A tibble: 20,000 × 17
#> letter xbox ybox width height onpix xbar ybar x2bar y2bar xybar x2ybar
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 T 2 8 3 5 1 8 13 0 6 6 10
#> 2 I 5 12 3 7 2 10 5 5 4 13 3
#> 3 D 4 11 6 8 6 10 6 2 6 10 3
#> 4 N 7 11 6 6 3 5 9 4 6 4 4
#> 5 G 2 1 3 1 1 8 6 6 6 6 5
#> 6 S 4 11 5 8 3 8 8 6 9 5 6
#> 7 B 4 2 5 4 4 8 7 6 6 7 6
#> 8 A 1 1 3 2 1 8 2 2 2 8 2
#> 9 J 2 2 4 4 2 10 6 2 6 12 4
#> 10 M 11 15 13 9 7 13 2 6 2 12 1
#> # … with 19,990 more rows, and 5 more variables: xy2bar <dbl>, xedge <dbl>,
#> # xedgey <dbl>, yedge <dbl>, yedgex <dbl>
view(letters)
# TO check for missingness:
letters_na <- letters %>%
filter(!complete.cases(.))
# TO confirm if there are any duplicates:
letters_dup <- letters %>%
filter(duplicated(letters)) # Oops!! There are over 1300 duplicates.
# Let's get rid of duplicates:
letters <- letters %>%
distinct()
# SVM learners require all features to be numeric, and moreover,
# that each feature is scaled to a fairly small interval. But we don't have to normalize or standardize the data manually
# bc the model the R package we'll use for fitting SVM model will perform the rescalling automattically.
# Given that there is no data preparation left to perform,
# we can move directly to the training and testing phases of the machine learning process.
samp <- sample(2, nrow(letters), replace = TRUE, prob = c(0.8, 0.2))
train <- letters[samp == 1,]
test <- letters[samp == 2, ]
# We'll use ksvm() from kernlab package to train the model>
# To provide a baseline measure of SVM performance, let's begin by training a simple linear SVM classifier
classifier <- ksvm(letter ~ ., data = train, kernel = 'vanilladot')
#> Setting default kernel parameters
#> Warning in .local(x, ...): NAs introduced by coercion
#> Error in .local(x, ...): No Support Vectors found. You may want to change your parameters
Created on 2022-06-07 by the reprex package (v2.0.1)