Hi all,
So I think this problem might be to do with how I'm using rvest
rather than str_extract()
but I'm not sure as this is the first time I've used either... I think maybe it's some sort of encoding problem but I have no idea.
So I'm pulling temperature values from wikipedia. The temperature column has both celsius and fahrenheit. I'm trying to extract the celsius component. I've written a regex expression that works as expected when applied using str_extract()
to data which is not pulled from wikipedia. When I apply str_extract()
to the data from wikipedia, negative values are lost.
Here's a reprex that I hope illustrates the problem:
library(rvest)
#> Loading required package: xml2
library(tidyverse)
#> Warning: package 'forcats' was built under R version 3.6.3
# url of temperatures
url <- "https://en.wikipedia.org/wiki/List_of_cities_by_average_temperature"
# Import and clean data
temps <-
url %>%
read_html() %>%
html_nodes(xpath = '//*[@id="mw-content-text"]/div/table') %>%
map_dfr(html_table) %>%
janitor::clean_names() %>%
select(-ref, -year) %>%
pivot_longer(
-c(country, city),
values_to = "temp",
names_to = "month"
)
# Select some rows so I have a mix of +ve and -ve values in temp
test <-
temps %>%
filter(country == "Afghanistan") %>%
slice(1:4)
### Problem starts here ###
# Regex for pulling out temperature in C
temp_regex <- "^(|-)\\d{1,2}([.]\\d{1,2}|)"
# This doesn't work. Note the str_extract returns NA for rows 1 and 2
test %>%
mutate(temp_extract = str_extract(temp, temp_regex))
#> # A tibble: 4 x 5
#> country city month temp temp_extract
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Afghanistan Kabul jan -2.3(27.9) <NA>
#> 2 Afghanistan Kabul feb -0.7(30.7) <NA>
#> 3 Afghanistan Kabul mar 6.3(43.3) 6.3
#> 4 Afghanistan Kabul apr 12.8(55.0) 12.8
# This does work. Compare row 1 and 2 with above.
#dput(test)
structure(list(country = c("Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan"), city = c("Kabul", "Kabul", "Kabul", "Kabul"),
month = c("jan", "feb", "mar", "apr"), temp = c("-2.3(27.9)",
"-0.7(30.7)", "6.3(43.3)", "12.8(55.0)")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L)) %>%
mutate(temp_extract = str_extract(temp, temp_regex))
#> # A tibble: 4 x 5
#> country city month temp temp_extract
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Afghanistan Kabul jan -2.3(27.9) -2.3
#> 2 Afghanistan Kabul feb -0.7(30.7) -0.7
#> 3 Afghanistan Kabul mar 6.3(43.3) 6.3
#> 4 Afghanistan Kabul apr 12.8(55.0) 12.8
# Pull temperature values out from df
pulled_val <- test %>%
pull(temp)
# This fails in same way as above
str_extract(pulled_val, temp_regex)
#> [1] NA NA "6.3" "12.8"
# Copy/pasted pulled_val output
copy_pasted <- c("-2.3(27.9)", "-0.7(30.7)", "6.3(43.3)", "12.8(55.0)")
# This now works...
str_extract(copy_pasted, temp_regex)
#> [1] "-2.3" "-0.7" "6.3" "12.8"