Hello,
I have been working with a dataset that has many binary variables and a few numerical variables. To reproduce a similar situation, I have used mtcars from the built-in datasets.
I need to find the correlation between all the variables in mt_cars at once without changing the data type of any variable.
All the binary variables are treated as factors, and when I use the ggpairs() or pairs() function, it only gives me the correlation values between the numerical variables, which I understand is to be expected.
I am trying to find ways to generate the correlation between all these variables at once. Is there any way to do that?
#DATA#
# Load the built-in mtcars data set and keep only the columns used below.
data(mtcars)
# View() opens a data-viewer window, which is only meaningful in an
# interactive session; skip it when the script is run non-interactively
# (e.g. via Rscript or reprex).
if (interactive()) {
  View(mtcars)
}
# Select the wanted columns by name instead of dropping the others by
# negative position, so the subset is readable and does not depend on the
# column order of the source data.
mt_cars <- mtcars[, c("mpg", "disp", "vs", "am", "gear")]
head(mt_cars)
#> mpg disp vs am gear
#> Mazda RX4 21.0 160 0 1 4
#> Mazda RX4 Wag 21.0 160 0 1 4
#> Datsun 710 22.8 108 1 1 4
#> Hornet 4 Drive 21.4 258 1 0 3
#> Hornet Sportabout 18.7 360 0 0 3
#> Valiant 18.1 225 1 0 3
#CONVERSION OF DATA TYPES#
# Store the categorical columns (vs, am, gear) as factors in one pass.
mt_cars[c("vs", "am", "gear")] <- lapply(
  mt_cars[c("vs", "am", "gear")],
  as.factor
)
# Inspect the resulting column types.
str(mt_cars)
#> 'data.frame': 32 obs. of 5 variables:
#> $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
#> $ disp: num 160 160 108 258 360 ...
#> $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#> $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
#> $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
#RESHAPING THE DATA FOR QUERY#
# Expand the gear factor into indicator columns. Removing the intercept
# (`0 +`) yields one 0/1 column per level (gear3, gear4, gear5) instead of
# treating the first level as the baseline.
gear.matrix <- model.matrix(~ 0 + gear, data = mt_cars)
# Append the indicator columns to the data frame.
mt_cars <- cbind(mt_cars, gear.matrix)
head(mt_cars)
#> mpg disp vs am gear gear3 gear4 gear5
#> Mazda RX4 21.0 160 0 1 4 0 1 0
#> Mazda RX4 Wag 21.0 160 0 1 4 0 1 0
#> Datsun 710 22.8 108 1 1 4 0 1 0
#> Hornet 4 Drive 21.4 258 1 0 3 1 0 0
#> Hornet Sportabout 18.7 360 0 0 3 1 0 0
#> Valiant 18.1 225 1 0 3 1 0 0
# The original gear column (the 5th column) is now represented by the
# gear3/gear4/gear5 indicators, so remove it.
mt_cars <- mt_cars[, setdiff(names(mt_cars), "gear")]
head(mt_cars)
#> mpg disp vs am gear3 gear4 gear5
#> Mazda RX4 21.0 160 0 1 0 1 0
#> Mazda RX4 Wag 21.0 160 0 1 0 1 0
#> Datsun 710 22.8 108 1 1 0 1 0
#> Hornet 4 Drive 21.4 258 1 0 1 0 0
#> Hornet Sportabout 18.7 360 0 0 1 0 0
#> Valiant 18.1 225 1 0 1 0 0
# The indicator columns come out of model.matrix() as numeric 0/1 values;
# store them as two-level factors like the other binary variables.
mt_cars[c("gear3", "gear4", "gear5")] <- lapply(
  mt_cars[c("gear3", "gear4", "gear5")],
  as.factor
)
# Inspect the resulting column types.
str(mt_cars)
#> 'data.frame': 32 obs. of 7 variables:
#> $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
#> $ disp : num 160 160 108 258 360 ...
#> $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#> $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
#> $ gear3: Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 1 1 1 ...
#> $ gear4: Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 2 2 2 ...
#> $ gear5: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#correlation using ggpairs() and pairs()
library(GGally)
#> Warning: package 'GGally' was built under R version 3.6.3
#> Loading required package: ggplot2
#> Registered S3 method overwritten by 'GGally':
#> method from
#> +.gg ggplot2
# Build the GGally pairs plot for every column of mt_cars, then display it
# by auto-printing the object at top level.
x <- ggpairs(mt_cars)
x
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Base-graphics scatterplot matrix of the same data, for comparison with
# the ggpairs() output.
graphics::pairs(mt_cars)
Created on 2020-05-01 by the reprex package (v0.3.0)