In this data, 98 corresponds to "No response". I want to create a new variable that calculates the proportion of No response values as a percentage of the total number of questions. In the present data, I want a variable to appear that calculates the number of "No response" divided by number of question. The data in this reprex has only few variables. The entire dataset has more than 400 variables. It would be great if I can get a solution for this.
library(tidyverse)
library(janitor)
#>
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#>
#> chisq.test, fisher.test
nithin<-tibble::tribble(
~enumerator, ~l1el4_identify_letter3, ~l1el4_identify_letter4, ~l1el4_identify_letter5, ~l1el4_total, ~l1el5_dummy, ~l1el5_match_pic1,
"PEN008", 0L, 98L, 1L, 0L, 0L, 0L,
"PEN001", 1L, 1L, 98L, 1L, 0L, 98L,
"PEN006", 0L, 1L, 98L, 1L, 1L, 0L,
"PEN006", 0L, 0L, 0L, 0L, 0L, 0L,
"PEN010", 1L, 0L, 0L, 1L, 98L, 0L,
"PEN010", 98L, 1L, 0L, 0L, 0L, 0L,
"PEN003", 1L, 0L, 0L, 98L, 0L, 0L,
"PEN003", 1L, 1L, 98L, 0L, 0L, 1L
)
Created on 2021-11-02 by the reprex package (v2.0.1)
It's unclear whether you're looking to do this row-wise or by columns.
d <- data.frame(
V1 =
c("PEN008", "PEN001", "PEN006", "PEN006", "PEN010", "PEN010", "PEN003", "PEN003"),
V2 =
c(0, 1, 0, 0, 1, 98, 1, 1),
V3 =
c(98, 1, 1, 0, 0, 1, 0, 1),
V4 =
c(1, 98, 98, 0, 0, 0, 0, 98),
V5 =
c(0, 1, 1, 0, 1, 0, 98, 0),
V6 =
c(0, 0, 1, 0, 98, 0, 0, 0),
V7 =
c(0, 98, 0, 0, 0, 0, 0, 1)
)
# rowwise: count by variables of values equal to 98 divided by number of rows
colSums(d[,2:7] == 98) / nrow(d)
#> V2 V3 V4 V5 V6 V7
#> 0.125 0.125 0.375 0.125 0.125 0.125
# column wise
rowSums(d[,2:7] == 98) / nrow(d)
#> [1] 0.125 0.250 0.125 0.000 0.125 0.125 0.125 0.125
It is row-wise I want to do. Like for each enumerator, I want to see the ratio of "No response" like the data frame here below which I made from Excel. I want to create it in R as our survey dataset is quite big.
library(tidyverse)
library(janitor)
#>
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#>
#> chisq.test, fisher.test
nithin<-tibble::tribble(
~enumerator, ~l1el4_identify_letter3, ~l1el4_identify_letter4, ~l1el4_identify_letter5, ~l1el4_total, ~l1el5_dummy, ~l1el5_match_pic1, ~No.response.ratio,
"PEN008", 0L, 98L, 1L, 0L, 0L, 0L, 0.142857143,
"PEN001", 1L, 1L, 98L, 1L, 0L, 98L, 0.428571429,
"PEN006", 0L, 1L, 98L, 1L, 1L, 0L, 0.142857143,
"PEN006", 0L, 0L, 0L, 0L, 98L, 0L, 0.285714286,
"PEN010", 1L, 0L, 0L, 1L, 98L, 0L, 0.142857143,
"PEN010", 98L, 1L, 0L, 0L, 0L, 0L, 0.142857143,
"PEN003", 1L, 0L, 0L, 98L, 0L, 0L, 0.142857143,
"PEN003", 1L, 1L, 98L, 0L, 0L, 1L, 0.285714286
)
Created on 2021-11-02 by the reprex package (v2.0.1)
I flipped row/col wise in the comment lines. The quoted code gets the percentage of each observation (row) equal to 98 divided by the total number of rows (8). The provides the answer to the question, for each observation, what is the percentage the nominator of whichis the sum of the value of 98 in any of V_1\dots V_6 and denominator of which is the number of observations? If that is the correct question, use rowSums
.
If the correct question is, rather for each variable what is the percentage the numerator of which is the sum of all observations in each of V_1\dots V_6 that equal 98 and the denominator of which is the number of observations? then use colSums
.
Is there some way to put the variable names rather than numbers (like 2:7) as it will be difficult to get the numbers?
d <- data.frame(
V1 =
c("PEN008", "PEN001", "PEN006", "PEN006", "PEN010", "PEN010", "PEN003", "PEN003"),
V2 =
c(0, 1, 0, 0, 1, 98, 1, 1),
V3 =
c(98, 1, 1, 0, 0, 1, 0, 1),
V4 =
c(1, 98, 98, 0, 0, 0, 0, 98),
V5 =
c(0, 1, 1, 0, 1, 0, 98, 0),
V6 =
c(0, 0, 1, 0, 98, 0, 0, 0),
V7 =
c(0, 98, 0, 0, 0, 0, 0, 1)
)
pick_list <- c("V2","V3","V5")
colSums(d[,pick_list] == 98) / nrow(d)
#> V2 V3 V5
#> 0.125 0.125 0.125
I think this is what you're looking for:
library(tidyverse)
library(janitor)
#> Error in library(janitor): there is no package called 'janitor'
nithin<-tibble::tribble(
~enumerator, ~l1el4_identify_letter3, ~l1el4_identify_letter4, ~l1el4_identify_letter5, ~l1el4_total, ~l1el5_dummy, ~l1el5_match_pic1,
"PEN008", 0L, 98L, 1L, 0L, 0L, 0L,
"PEN001", 1L, 1L, 98L, 1L, 0L, 98L,
"PEN006", 0L, 1L, 98L, 1L, 1L, 0L,
"PEN006", 0L, 0L, 0L, 0L, 0L, 0L,
"PEN010", 1L, 0L, 0L, 1L, 98L, 0L,
"PEN010", 98L, 1L, 0L, 0L, 0L, 0L,
"PEN003", 1L, 0L, 0L, 98L, 0L, 0L,
"PEN003", 1L, 1L, 98L, 0L, 0L, 1L
)
nithin %>%
rowwise() %>%
mutate(
N98=sum(across(where(is.numeric), ~.x==98)),
NDenom=sum(across(where(is.numeric), ~!is.na(.x))),
No.response.ratio=N98/NDenom
) %>%
ungroup()
#> # A tibble: 8 x 10
#> enumerator l1el4_identify_le~ l1el4_identify_le~ l1el4_identify_l~ l1el4_total
#> <chr> <int> <int> <int> <int>
#> 1 PEN008 0 98 1 0
#> 2 PEN001 1 1 98 1
#> 3 PEN006 0 1 98 1
#> 4 PEN006 0 0 0 0
#> 5 PEN010 1 0 0 1
#> 6 PEN010 98 1 0 0
#> 7 PEN003 1 0 0 98
#> 8 PEN003 1 1 98 0
#> # ... with 5 more variables: l1el5_dummy <int>, l1el5_match_pic1 <int>,
#> # N98 <int>, NDenom <int>, No.response.ratio <dbl>
Created on 2021-11-02 by the reprex package (v2.0.1)
This code is good. But I came across a new issue now. Even where rows contain "98", the N98 variable does not show the exact number. In the reprex below, the last student has "98" values, but that is not shown in the N98 variable and the No.response.ratio is also showing NA.
library(tidyverse)
nithin<-tibble::tribble(
~Student, ~l1c1_identify_pic1, ~l1c1_identify_pic2, ~l1c1_identify_pic3, ~l1c1_identify_pic4, ~l1c1_identify_pic5, ~l1c1_identify_pic6, ~l1c1_classify_bird, ~l1c1_classify_animal, ~l1c1_identification, ~l1c1_classification, ~l1c1_total, ~l1c2_identify_col1, ~l1c2_identify_col2, ~l1c2_identify_col3, ~l1c2_sort_col1, ~l1c2_identify, ~l1c2_sort, ~l1c2_total, ~l1c3_identify_shape1, ~l1c3_identify_shape2, ~l1c3_identify_shape3, ~l1c3_sort_shape1, ~l1c3_identify, ~l1c3_sort, ~l1c3_total, ~l1c4_corr_card, ~l1c4_total, ~l1c5_card_order,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 14L, 1L, 0L, 1L, 1L, 1, 0.5, 1.5, 1L, 0L, 98L, 1L, 0.5, 0.5, 1, 0L, 0L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 14L, 98L, 98L, 1L, 1L, 0.5, 0.5, 1, 98L, 98L, 98L, 1L, 0, 0.5, 0.5, 0L, 0L, 1L,
3L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 10L, 1L, 11L, 0L, 1L, 1L, 1L, 1, 0.5, 1.5, 1L, 1L, 98L, 1L, 1, 0.5, 1.5, 2L, 2L, 0L,
4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 14L, 0L, 1L, 0L, 1L, 0.5, 0.5, 1, 1L, 1L, 0L, 1L, 1, 0.5, 1.5, 1L, 1L, 0L,
5L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 10L, 0L, 10L, 0L, 1L, 1L, 1L, 1, 0.5, 1.5, 0L, 0L, 0L, 1L, 0, 0.5, 0.5, 1L, 1L, 0L,
6L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 12L, 0L, 12L, 0L, 1L, 1L, 1L, 1, 0.5, 1.5, 1L, 0L, 0L, 1L, 0.5, 0.5, 1, 1L, 1L, 1L,
7L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 10L, 2L, 12L, 0L, 0L, 0L, 1L, 0, 0.5, 0.5, 0L, 0L, 0L, 1L, 0, 0.5, 0.5, 2L, 2L, 1L,
8L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 14L, 1L, 1L, 1L, 1L, 1.5, 0.5, 2, 1L, 1L, 1L, 1L, 1.5, 0.5, 2, 2L, 2L, 1L,
NA, 98L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 4L, 2L, 6L, 0L, 0L, 0L, 1L, 0, 0.5, 0.5, 98L, 98L, 98L, 1L, 0, 0.5, 0.5, 0L, 0L, 0L
)
nithin<-nithin %>%
rowwise() %>%
mutate(
N98=sum(across(where(is.numeric), ~.x==98)),
NDenom=sum(across(where(is.numeric), ~!is.na(.x))),
No.response.ratio=N98/NDenom
) %>%
ungroup()
nithin
#> # A tibble: 9 x 32
#> Student l1c1_identify_pic1 l1c1_identify_pi~ l1c1_identify_p~ l1c1_identify_p~
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 1 1 1 1
#> 3 3 1 1 0 1
#> 4 4 1 1 1 1
#> 5 5 1 1 1 1
#> 6 6 1 1 1 1
#> 7 7 1 1 0 1
#> 8 8 1 1 1 1
#> 9 NA 98 1 0 1
#> # ... with 27 more variables: l1c1_identify_pic5 <int>,
#> # l1c1_identify_pic6 <int>, l1c1_classify_bird <int>,
#> # l1c1_classify_animal <int>, l1c1_identification <int>,
#> # l1c1_classification <int>, l1c1_total <int>, l1c2_identify_col1 <int>,
#> # l1c2_identify_col2 <int>, l1c2_identify_col3 <int>, l1c2_sort_col1 <int>,
#> # l1c2_identify <dbl>, l1c2_sort <dbl>, l1c2_total <dbl>,
#> # l1c3_identify_shape1 <int>, l1c3_identify_shape2 <int>, ...
Created on 2021-11-03 by the reprex package (v2.0.1)
The NA that you see for the last row is because you have NA values in the sum
function. You need to add na.rm = TRUE
in the function.
nithin<-nithin %>%
rowwise() %>%
mutate(
N98=sum(across(where(is.numeric), ~.x==98), na.rm = TRUE),
NDenom=sum(across(where(is.numeric), ~!is.na(.x))),
No.response.ratio=N98/NDenom
) %>%
ungroup()
nithin
Thanks a lot for the help.
This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.
If you have a query related to it or one of the replies, start a new topic and refer back with a link.