Chapter 4 Clean Data

4.1 Replace a value with NA

Use dplyr::na_if() if you have a value coded in your data (e.g., 999) that you want to convert to NA

example_data=dplyr::tribble(~name, ~bday_month,
                            "Ryan", 10,
                            "Z", 3,
                            "Jen", 999, 
                            "Tristin", 999,
                            "Cassidy", 6)

example_data
## # A tibble: 5 x 2
##   name    bday_month
##   <chr>        <dbl>
## 1 Ryan            10
## 2 Z                3
## 3 Jen            999
## 4 Tristin        999
## 5 Cassidy          6
example_data$bday_month=na_if(example_data$bday_month, 999) #example doing one column at a time 

example_data
## # A tibble: 5 x 2
##   name    bday_month
##   <chr>        <dbl>
## 1 Ryan            10
## 2 Z                3
## 3 Jen             NA
## 4 Tristin         NA
## 5 Cassidy          6
example_data %>% # can also pass the data to mutate and do it the tidyverse way
  mutate(bday_month=na_if(bday_month, 999)) 
## # A tibble: 5 x 2
##   name    bday_month
##   <chr>        <dbl>
## 1 Ryan            10
## 2 Z                3
## 3 Jen             NA
## 4 Tristin         NA
## 5 Cassidy          6

4.2 Replace NA’s with a value

tidyr::replace_na() is very useful if you have some NA’s in your data and you want to fill them in with some value.

example_data=tibble::tribble(~name, ~fav_color, ~fav_food,
                             "Ryan", "green", "Mexican",
                             "Cassidy", "blue", NA,
                             "Z", NA, NA,
                             "Tristin", "purple", NA,
                             "Tarika", NA, NA,
                             "Jen", NA, "Italian")

example_data
## # A tibble: 6 x 3
##   name    fav_color fav_food
##   <chr>   <chr>     <chr>   
## 1 Ryan    green     Mexican 
## 2 Cassidy blue      <NA>    
## 3 Z       <NA>      <NA>    
## 4 Tristin purple    <NA>    
## 5 Tarika  <NA>      <NA>    
## 6 Jen     <NA>      Italian
# replace NA's in one col
tidyr::replace_na(example_data$fav_food, "MISSING")
## [1] "Mexican" "MISSING" "MISSING" "MISSING" "MISSING" "Italian"
# replace in multiple columns
example_data %>% mutate(across(c(fav_color, fav_food), replace_na, "MISSING"))
## # A tibble: 6 x 3
##   name    fav_color fav_food
##   <chr>   <chr>     <chr>   
## 1 Ryan    green     Mexican 
## 2 Cassidy blue      MISSING 
## 3 Z       MISSING   MISSING 
## 4 Tristin purple    MISSING 
## 5 Tarika  MISSING   MISSING 
## 6 Jen     MISSING   Italian

4.3 Identify columns or rows with Missing values

is.na() is the base R way to identify, in a TRUE/FALSE manner, whether or not there are missing values in a vector

y <- c(1,2,3,NA)

is.na(y) # returns a vector (F F F T)
## [1] FALSE FALSE FALSE  TRUE

4.4 Find the percentage of a variable that is missing

Sometimes necessary to check before conducting an analysis. This requires my package, legaldmlab

?legaldmlab::count_missing

mtcars %>% 
  select(hp:drat) %>% 
  legaldmlab::count_missing()
## # A tibble: 2 x 3
##   variable missing_count percent_missing
##   <chr>            <int> <chr>          
## 1 hp                   0 0.0%           
## 2 drat                 0 0.0%

4.5 Exclude Missing values from analysis

4.6 Dropping Missing values from the data set

Use tidyr::drop_na() to remove rows with missing values.

example_data=dplyr::tribble(~name, ~bday_month, ~car,
                            "Ryan", 10, "kia",
                            "Z", NA, "toyota",
                            "Jen", NA, NA,
                            "Tristin", 999, NA,
                            "Cassidy", 6, "honda")


knitr::kable(example_data)
name bday_month car
Ryan 10 kia
Z NA toyota
Jen NA NA
Tristin 999 NA
Cassidy 6 honda
example_data %>% drop_na() # with nothing specified, it drops ALL variables that have >=1 missing value
## # A tibble: 2 x 3
##   name    bday_month car  
##   <chr>        <dbl> <chr>
## 1 Ryan            10 kia  
## 2 Cassidy          6 honda
example_data %>% drop_na(car) # drops only rows with values missing in the specified column
## # A tibble: 3 x 3
##   name    bday_month car   
##   <chr>        <dbl> <chr> 
## 1 Ryan            10 kia   
## 2 Z               NA toyota
## 3 Cassidy          6 honda