# how to subset a dataset properly (incorrect result or bad code?)

I can't figure out why the number of outliers is always 0 when the number of non-outliers is not zero. The size of the dataset df2 makes no difference; the end result is always the same. Am I reading this wrong?

I am following an example:

``````# create a new dataframe that contains only those rows
# that have a z-score of below 3
new_data <- subset(data, data$zscore < 3)
``````

My data sets:

structure(list(id = 1:6, id2 = 0:5, z = c(0L, 0L, 0L, 0L, 0L,
0L), x1 = c(7.4, 7.8, 7.8, 11.2, 7.4, 7.4), x2 = c(0.7, 0.88,
0.76, 0.28, 0.7, 0.66), x3 = c(0, 0, 0.04, 0.56, 0, 0), x4 = c(1.9,
2.6, 2.3, 1.9, 1.9, 1.8), x5 = c(0.076, 0.098, 0.092, 0.075,
0.076, 0.075), x6 = c(11, 25, 15, 17, 11, 13), x7 = c(34, 67,
54, 60, 34, 40), x8 = c(0.9978, 0.9968, 0.997, 0.998, 0.9978,
0.9978), x9 = c(3.51, 3.2, 3.26, 3.16, 3.51, 3.51), x10 = c(0.56,
0.68, 0.65, 0.58, 0.56, 0.56), x11 = c(9.4, 9.8, 9.8, 9.8, 9.4,
9.4), y = c(5L, 5L, 5L, 6L, 5L, 5L), y2 = c(0L, 0L, 0L, 1L, 0L,
0L), y3 = c(1L, 1L, 1L, 2L, 1L, 1L)), row.names = c(NA, 6L), class = "data.frame")

structure(list(z = c(-1.75005514316603, -1.75005514316603, -1.75005514316603,
-1.75005514316603, -1.75005514316603, -1.75005514316603), x1 = c(0.142462300205994,
0.451001010798382, 0.451001010798382, 3.07358005083368, 0.142462300205994,
0.142462300205994), x2 = c(2.18866446400268, 3.28198233904062,
2.55310375568199, -0.362410577752516, 2.18866446400268, 1.94570493621647
), x3 = c(-2.19266375510471, -2.19266375510471, -1.91740510037435,
1.6609574111204, -2.19266375510471, -2.19266375510471), x4 = c(-0.744720785192258,
-0.597594077620892, -0.660648380865763, -0.744720785192258, -0.744720785192258,
-0.765738886273882), x5 = c(0.569913952190335, 1.19788250632519,
1.0266183551975, 0.541369927002388, 0.569913952190335, 0.541369927002388
), x6 = c(-1.10005519223097, -0.311296125454904, -0.87469545886638,
-0.762015592184085, -1.10005519223097, -0.987375325548675), x7 = c(-1.44624721020492,
-0.862402248309921, -1.09240177875341, -0.986248149317952, -1.44624721020492,
-1.34009358076947), x8 = c(1.03491316497404, 0.701432322361402,
0.768128490883923, 1.10160933349656, 1.03491316497404, 1.03491316497404
), x9 = c(1.81294997139708, -0.11506417365602, 0.258099854418771,
-0.36384019237255, 1.81294997139708, 1.81294997139708), x10 = c(0.193081910246498,
0.999501691167798, 0.797896745937473, 0.327485207066714, 0.193081910246498,
0.193081910246498), x11 = c(-0.915393708652846, -0.58002349000728,
-0.58002349000728, -0.58002349000728, -0.915393708652846, -0.915393708652846
), y = c(-0.937157483579359, -0.937157483579359, -0.937157483579359,
0.207983041305932, -0.937157483579359, -0.937157483579359)), row.names = c(NA,
6L), class = "data.frame")

I have 11 variables which I am checking one at a time for outliers. I am repeating this type of statement.

``````df  <- read.csv("df.csv",header=TRUE)

# Work on a copy so that df2 itself is not modified.
data <- df2
names(data)

# Add one "<name>z" column per variable, copied from the matching column of
# df3 (presumably the standardized version of df2 -- confirm).
data$x1z <- df3$x1
data$x2z <- df3$x2
data$x3z <- df3$x3
data$x4z <- df3$x4
data$x5z <- df3$x5
data$x6z <- df3$x6
data$x7z <- df3$x7
data$x8z <- df3$x8
data$x9z <- df3$x9
data$x10z <- df3$x10
data$x11z <- df3$x11
names(data)
dim(data)

# keep only rows with no outliers
# Each filter runs on the result of the previous one, so a row is kept only
# if abs(z) <= 3 holds for all eleven variables.
datakeep <- data
datakeep <- subset(datakeep, abs(datakeep$x1z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x2z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x3z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x4z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x5z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x6z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x7z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x8z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x9z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x10z) <= 3)
datakeep <- subset(datakeep, abs(datakeep$x11z) <= 3)

dim(datakeep) #6009 x 28 or 938 outliers
names(datakeep)
``````

Next, the rows that have a z-score > 3:

``````

# Work on a copy so that df2 itself is not modified.
data <- df2
names(data)
# Add one "<name>z" column per variable, copied from the matching column of
# df3 (presumably the standardized version of df2 -- confirm).
data$x1z <- df3$x1
data$x2z <- df3$x2
data$x3z <- df3$x3
data$x4z <- df3$x4
data$x5z <- df3$x5
data$x6z <- df3$x6
data$x7z <- df3$x7
data$x8z <- df3$x8
data$x9z <- df3$x9
data$x10z <- df3$x10
data$x11z <- df3$x11
names(data)
dim(data)

# keep only rows with outliers
# NOTE(review): these filters are chained, so each one narrows the result of
# the previous one. A row survives only if abs(z) > 3 in ALL eleven variables
# at once, not in ANY one of them -- which is how this can end up with 0 rows.
datakeep2 <- data
datakeep2 <- subset(datakeep2, abs(datakeep2$x1z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x2z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x3z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x4z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x5z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x6z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x7z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x8z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x9z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x10z) > 3)
datakeep2 <- subset(datakeep2, abs(datakeep2$x11z) > 3)

dim(datakeep2) # 0 x 28
names(datakeep2)
``````

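You are asking for rows where abs(x1z) > 3 AND abs(x2z) > 3 AND abs(x3z) > 3, etc. In other words, you only keep rows that are outliers in every variable at once, which is why you end up with zero rows.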
If you post the output of

``````dput(head(df2))
``````

and

``````dput(head(df3))
``````

someone can probably suggest a good solution to select the rows with outliers.

Does this help?

``````# Define outliers as is done in default boxplots
#' Flag outliers with the quartile-fence rule
#'
#' @param x A numeric vector.
#' @param k Multiplier applied to the interquartile range to place the
#'   fences. The default of 1.5 is the usual boxplot convention.
#' @param na.rm If `TRUE`, missing values are ignored when the quartiles are
#'   computed; they still come back as `NA` in the result.
#' @return A logical vector the same length as `x`, `TRUE` where a value lies
#'   below Q1 - k * IQR or above Q3 + k * IQR.
is_outlier_q <- function(x, k = 1.5, na.rm = FALSE) {
  # A single quantile() call gives both quartiles; their difference is the IQR.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  fence <- k * (quartiles[2] - quartiles[1])
  x < quartiles[1] - fence | x > quartiles[2] + fence
}

# Define outliers using standard scores
#' Flag outliers by standard score
#'
#' @param x Values to standardize with `scale()`.
#' @param z_lim Cut-off on the absolute standard score (default 3).
#' @return A logical vector, `TRUE` where the absolute standard score of the
#'   value exceeds `z_lim`.
is_outlier_z <- function(x, z_lim = 3) {
  # Take the first column of the scaled result as the standard scores.
  standardized <- scale(x)
  abs(standardized[, 1]) > z_lim
}

# Generate example data
# Example data: two columns of 1000 random normal draws each
my_data <- data.frame(x = rnorm(1000), y = rnorm(1000))

# Rows flagged by the quartile rule: in x alone, then in x or y
subset(my_data, subset = is_outlier_q(x))
subset(my_data, subset = is_outlier_q(x) | is_outlier_q(y))

# Rows flagged by the standard-score rule: in x alone, then in x or y
subset(my_data, subset = is_outlier_z(x))
subset(my_data, subset = is_outlier_z(x) | is_outlier_z(y))
``````

...and be aware that the standard-score approach may not give you what you think it does. For example, if you remove all observations with `abs(z) > 3` and then re-scale the remaining data, new observations can end up with `abs(z) > 3`, because the remaining data now have a smaller variance. I have seen people employ an iterative approach, where they repeat the procedure until all `abs(z) <= z_lim`.

In any case, it is important to think about what an outlier is in the context of your data, and why you want to remove outliers at all.

I like it. I am using set.seed(12345).
You say rescaling is a problem. Can you expand on that? Why would I want to rescale?
How would you recommend I find the nonoutliers? I just changed the return statements.