suppressPackageStartupMessages({
suppressWarnings({
library(assertr)
library(assertive)
library(assertthat)
library(palmerpenguins)
library(janitor)
})
})
Data validation is performed using the assertr package, with extra checks provided by the assertive package. The assertthat package can also be used as an alternative to stopifnot(). All examples perform checks on the palmerpenguins dataset; each returns TRUE if the validation check passes and stops with an error if the validation check fails.
The file path to the validation checks markdown document exists
# Halt with a custom message unless the validation document is on disk.
assert_that(
  file.exists("validation_checks.Rmd"),
  msg = "Path to validation check markdown document is UNSUCCESSFUL"
)
#> [1] TRUE
The penguins dataset is not empty
The penguins dataset contains 344 rows
# Row-count check. `.` refers to the piped data frame, so the pipe is required
# here; the handlers make the call yield TRUE on success and stop on failure.
penguins %>%
  verify(
    nrow(.) == 344,
    error_fun = error_stop,
    success_fun = success_logical
  )
#> [1] TRUE
The penguins dataset contains eight named columns. This uses a functional sequence to store verify functions for each column name and number of columns.
# Reusable functional sequence (`. %>% ...`) holding the column checks.
# One verify() per expected column keeps each missing name reportable on its
# own; the final verify() pins the total column count. The checks sit between
# chain_start() and chain_end(), where the success/error handlers are supplied
# once for the whole chain.
check_penguins_column_names <- . %>%
  chain_start() %>%
  verify(has_all_names("species")) %>%
  verify(has_all_names("island")) %>%
  verify(has_all_names("bill_length_mm")) %>%
  verify(has_all_names("bill_depth_mm")) %>%
  verify(has_all_names("flipper_length_mm")) %>%
  verify(has_all_names("body_mass_g")) %>%
  verify(has_all_names("sex")) %>%
  verify(has_all_names("year")) %>%
  verify(ncol(.) == 8) %>%
  chain_end(
    error_fun = error_stop,
    success_fun = success_logical
  )
# A functional sequence is an ordinary function, so it can be called directly.
check_penguins_column_names(penguins)
#> [1] TRUE
The date_egg column of the penguins raw dataset is a date type
# Clean the raw column names first (so the column is addressable as
# `date_egg`), then check that column with assertive's date predicate.
penguins_raw %>%
  clean_names() %>%
  assert(
    assertive::is_date,
    date_egg,
    error_fun = error_stop,
    success_fun = success_logical
  )
#> [1] TRUE
Combinations of sample number and species name in the penguins raw dataset are unique
# Row-wise check: col_concat reduces sample_number and species to one value
# per row, and is_uniq is then applied to those per-row values.
penguins_raw %>%
  clean_names() %>%
  assert_rows(
    col_concat,
    is_uniq,
    sample_number,
    species,
    error_fun = error_stop,
    success_fun = success_logical
  )
#> [1] TRUE
All penguins are assigned to a species and an island (there is no missing data in the species or island columns)
# Apply the not_na predicate to both the species and island columns.
assert(
  penguins,
  not_na,
  species,
  island,
  error_fun = error_stop,
  success_fun = success_logical
)
#> [1] TRUE
All penguins with a recorded sex are either male or female (in_set() lets missing values pass because allow.na = TRUE is its default; set allow.na = FALSE to flag NAs, or inverse = TRUE to check instead that no penguin is recorded as male or female)
# Check the sex column against the set of permitted values.
assert(
  penguins,
  in_set("male", "female"),
  sex,
  error_fun = error_stop,
  success_fun = success_logical
)
#> [1] TRUE
All penguin weights are positive values
# Check that every recorded body mass is strictly positive.
#
# assert() expects a predicate that yields TRUE/FALSE for each value.
# assertive::assert_all_are_positive is an assertion, not a predicate: it does
# not return a logical and raises its own error on failure, so violations would
# never reach `error_fun`. within_bounds() with an exclusive lower bound of 0
# is the assertr predicate for "positive".
#
# No NA filtering is needed beforehand: within_bounds() lets missing values
# pass by default (allow.na = TRUE).
penguins %>%
  assert(
    within_bounds(0, Inf, include.lower = FALSE),
    body_mass_g,
    success_fun = success_logical,
    error_fun = error_stop
  )
#> [1] TRUE
All penguins weigh between 2700 and 6300 grams, inclusively
# Range check on body mass; within_bounds() includes both end points unless
# told otherwise, matching the "inclusively" wording of this check.
assert(
  penguins,
  within_bounds(2700, 6300),
  body_mass_g,
  error_fun = error_stop,
  success_fun = success_logical
)
#> [1] TRUE
Find the rows in the penguins dataset that are missing all four of the bill length, bill depth, flipper length and body mass measurements. Row numbers are displayed in the index column.
# Count the NAs in each row across the four measurement columns
# (bill_length_mm through body_mass_g) and flag rows where the count is 4.
# just_warn reports the offending rows as a warning instead of stopping, and
# success_continue passes the data through, so the data frame is still printed.
penguins %>%
  assert_rows(
    num_row_NAs,
    function(.x){.x != 4},
    bill_length_mm:body_mass_g,
    error_fun = just_warn,
    success_fun = success_continue
  )
#> Data frame row reduction 'num_row_NAs' violates predicate 'function(.x) {
#> .x != 4
#> }' 2 times
#> verb redux_fn predicate
#> 1 assert_rows num_row_NAs function(.x) {\n .x != 4\n}
#> 2 assert_rows num_row_NAs function(.x) {\n .x != 4\n}
#> column index value
#> 1 ~bill_length_mm:body_mass_g 4 4
#> 2 ~bill_length_mm:body_mass_g 272 4
#> Warning: assertr encountered errors
#> # A tibble: 344 x 8
#> species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#> <fct> <fct> <dbl> <dbl> <int> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750
#> 2 Adelie Torgersen 39.5 17.4 186 3800
#> 3 Adelie Torgersen 40.3 18 195 3250
#> 4 Adelie Torgersen NA NA NA NA
#> 5 Adelie Torgersen 36.7 19.3 193 3450
#> 6 Adelie Torgersen 39.3 20.6 190 3650
#> 7 Adelie Torgersen 38.9 17.8 181 3625
#> 8 Adelie Torgersen 39.2 19.6 195 4675
#> 9 Adelie Torgersen 34.1 18.1 193 3475
#> 10 Adelie Torgersen 42 20.2 190 4250
#> # ... with 334 more rows, and 2 more variables: sex <fct>, year <int>