Writing a Data Dictionary

We’ll do this with a smaller dataset than what we’re working with in this workshop.

install.packages("palmerpenguins")
library(palmerpenguins)
penguins_data <- palmerpenguins::penguins

First, we build each piece of metadata we’d like, including:

  • variable names
  • data types
  • data ranges
  • observation counts
# Build each piece of metadata as its own vector, one entry per column.
varNames <- names(penguins_data) # we've seen this before
dataClass <- sapply(penguins_data, class) # we haven't seen sapply before -- see the note box below
# count the values in each column that are not missing
observationsWithValues <- sapply(penguins_data, function(x) sum(!is.na(x)))
# this might need a bit more of a breakdown. Basically,
# for the range of numbers 1:n, where n is the number of columns in the dataset,
# pass that number into the anonymous function, which then passes that number into
# each index subset. See also the note box below on subsetting by index.
# seq_along() generates 1:n, but safely yields nothing when there are no columns.
# is.factor() asks whether the column's class includes "factor" and always
# answers with a single TRUE/FALSE, even when class() returns several values.
dataRanges <- sapply(seq_along(penguins_data), function(x) ifelse(
        is.factor(penguins_data[[x]]),
        paste0(levels(penguins_data[[x]]), collapse = ", "),
        paste0(range(penguins_data[[x]], na.rm = TRUE), collapse = " : ")))
# this is similar to above, see if you can figure out what's happening!
# An ordered factor has class c("ordered", "factor"), so comparing class(x)
# to "factor" gives two values and breaks `&&`; is.factor() avoids that.
dataType <- sapply(penguins_data, function(x) dplyr::case_when(
  is.factor(x) && is.ordered(x) ~ "ordered",
  is.factor(x) && !is.ordered(x) ~ "nominal",
  .default = typeof(x)
))

sapply is one member of the apply family of functions. These functions take a list and apply a function to each element of that list. sapply returns a vector when each result is a single value. In the case of dataClass above, for example, we pass in the data set, which is treated as a list of its columns, and sapply runs class() on each of those columns.

We’ve been introduced to filter and select to subset data. Subsetting can also be done by index location with []. There is a slight difference between using [] and [[]]. The former preserves the original data structure (here, a tibble); the latter extracts the element itself and drops the surrounding structure. In this example, we don’t want the class of the tibble, but of the individual column, hence the need for [[]].

# Single brackets keep the tibble wrapper, so class() describes the container.
class(penguins_data[1]) # reports tbl_df, tbl, data.frame
## [1] "tbl_df"     "tbl"        "data.frame"
# Double brackets pull out the column itself, so class() describes the column.
class(penguins_data[[1]]) # reports factor
## [1] "factor"

We can then wrap this in a call to data.frame() to stitch it together. This requires a few small tweaks to the syntax.

# Assemble the data dictionary: one row per column of penguins_data.
penguins_dd <- data.frame(
  varNames = names(penguins_data),
  dataClass = sapply(penguins_data, class),
  # is.factor() and is.ordered() each return a single TRUE/FALSE. Comparing
  # class(x) to "factor" does not: an ordered factor has class
  # c("ordered", "factor"), which yields two values and breaks `&&`.
  dataType = sapply(penguins_data, function(x) dplyr::case_when(
    is.factor(x) && is.ordered(x) ~ "ordered",
    is.factor(x) && !is.ordered(x) ~ "nominal",
    .default = typeof(x)
  )),
  # seq_along() is the safe form of 1:length(): it is empty for zero columns.
  # Factors list their levels; every other column shows its min : max.
  dataRanges = sapply(seq_along(penguins_data), function(x) ifelse(
          is.factor(penguins_data[[x]]),
          paste0(levels(penguins_data[[x]]), collapse = ", "),
          paste0(range(penguins_data[[x]], na.rm = TRUE), collapse = " : "))),
  observationsWithValues = sapply(penguins_data, function(x) sum(!is.na(x)))
)
# The sapply() results are named after the columns, and data.frame() picks
# those names up as row names; reset them to the default numbering.
rownames(penguins_dd) <- NULL

Lastly, we can generalize, building a function that we can pass any data set into.

# Build a data dictionary for a data set: one row per column.
#
# Arguments:
#   dataset  A data frame or tibble (anything whose elements are its columns).
#
# Returns a data.frame with the columns:
#   varNames                column name
#   dataClass               class(es) of the column, joined with ", "
#   dataType                "ordered" or "nominal" for factors, otherwise typeof()
#   dataRanges              factor levels joined with ", ", or "min : max";
#                           NA when the column has no non-missing values
#   observationsWithValues  number of non-missing values in the column
create_datadictionary <- function(dataset) {
  # A column can carry several classes (an ordered factor is
  # c("ordered", "factor")); collapse them so each column yields one string.
  describe_class <- function(x) {
    paste(class(x), collapse = ", ")
  }

  # is.factor()/is.ordered() always return a single TRUE/FALSE, unlike
  # class(x) == "factor", which has one value per class and breaks `&&`.
  describe_type <- function(x) {
    if (is.factor(x) && is.ordered(x)) {
      "ordered"
    } else if (is.factor(x)) {
      "nominal"
    } else {
      typeof(x)
    }
  }

  describe_range <- function(x) {
    if (is.factor(x)) {
      return(paste0(levels(x), collapse = ", "))
    }
    # With nothing but missing values there is no range to report.
    if (all(is.na(x))) {
      return(NA_character_)
    }
    paste0(range(x, na.rm = TRUE), collapse = " : ")
  }

  count_values <- function(x) {
    sum(!is.na(x))
  }

  # vapply() fixes the result type, so a data set with zero columns gives
  # empty vectors (and a zero-row dictionary) instead of an error.
  # USE.NAMES = FALSE keeps the column names out of the row names.
  data.frame(
    varNames = names(dataset),
    dataClass = vapply(dataset, describe_class, character(1), USE.NAMES = FALSE),
    dataType = vapply(dataset, describe_type, character(1), USE.NAMES = FALSE),
    dataRanges = vapply(dataset, describe_range, character(1), USE.NAMES = FALSE),
    observationsWithValues = vapply(dataset, count_values, integer(1), USE.NAMES = FALSE)
  )
}

Let’s test it out!

# Build the dictionary for the penguins data with the new function ...
penguins_dd_2 <- create_datadictionary(penguins_data)
# ... and print it to inspect the result.
penguins_dd_2
varNames dataClass dataType dataRanges observationsWithValues
species factor nominal Adelie, Chinstrap, Gentoo 344
island factor nominal Biscoe, Dream, Torgersen 344
bill_length_mm numeric double 32.1 : 59.6 342
bill_depth_mm numeric double 13.1 : 21.5 342
flipper_length_mm integer integer 172 : 231 342
body_mass_g integer integer 2700 : 6300 342
sex factor nominal female, male 333
year integer integer 2007 : 2009 344
LS0tCnRpdGxlOiAiV3JpdGluZyBhIGRhdGEgZGljdGlvbmFyeSIKcGFnZXRpdGxlOiAiV3JpdGluZyBhIGRhdGEgZGljdGlvbmFyeSIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICBjb2RlX2ZvbGRpbmc6IHNob3cgIyBhbGxvd3MgdG9nZ2xpbmcgb2Ygc2hvd2luZyBhbmQgaGlkaW5nIGNvZGUuIFJlbW92ZSBpZiBub3QgdXNpbmcgY29kZS4KICAgIGNvZGVfZG93bmxvYWQ6IHRydWUgIyBhbGxvd3MgdGhlIHVzZXIgdG8gZG93bmxvYWQgdGhlIHNvdXJjZSAuUm1kIGZpbGUuIFJlbW92ZSBpZiBub3QgdXNpbmcgY29kZS4KICAgIGluY2x1ZGVzOgogICAgICBhZnRlcl9ib2R5OiBmb290ZXIuaHRtbCAjIGluY2x1ZGUgYSBjdXN0b20gZm9vdGVyLgogICAgdG9jOiB0cnVlCiAgICB0b2NfZGVwdGg6IDMKICAgIHRvY19mbG9hdDoKICAgICAgY29sbGFwc2VkOiBmYWxzZQogICAgICBzbW9vdGhfc2Nyb2xsOiBmYWxzZQotLS0KCiMjIFdyaXRpbmcgYSBEYXRhIERpY3Rpb25hcnkKCldlJ2xsIGRvIHRoaXMgd2l0aCBhIHNtYWxsZXIgZGF0YXNldCB0aGFuIHdoYXQgd2UncmUgd29ya2luZyB3aXRoIGluIHRoaXMgd29ya3Nob3AuCgpgYGB7ciwgZXZhbCA9IEZBTFNFfQppbnN0YWxsLnBhY2thZ2VzKCJwYWxtZXJwZW5ndWlucyIpCmxpYnJhcnkocGFsbWVycGVuZ3VpbnMpCmBgYAoKYGBge3IsIGVjaG8gPSBGQUxTRX0KbGlicmFyeShwYWxtZXJwZW5ndWlucykKYGBgCgpgYGB7cn0KcGVuZ3VpbnNfZGF0YSA8LSBwYWxtZXJwZW5ndWluczo6cGVuZ3VpbnMKYGBgCgpGaXJzdCwgd2UgYnVpbGQgZWFjaCBwaWVjZSBvZiBtZXRhZGF0YSB3ZSdkIGxpa2UsIGluY3VsZGluZzoKCiogdmFyaWFibGUgbmFtZXMKKiBkYXRhIHR5cGVzCiogZGF0YSByYW5nZXMKKiBvYnNlcnZhdGlvbiBjb3VudHMKCmBgYHtyfQp2YXJOYW1lcyA8LSBuYW1lcyhwZW5ndWluc19kYXRhKSAjIHdlJ3ZlIHNlZW4gdGhpcyBiZWZvcmUKZGF0YUNsYXNzIDwtIHNhcHBseShwZW5ndWluc19kYXRhLCBjbGFzcykgIyB3ZSBoYXZlbid0IHNlZW4gc2FwcGx5IGJlZm9yZSAtLSBzZWUgdGhlIG5vdGUgYm94IGJlbG93Cm9ic2VydmF0aW9uc1dpdGhWYWx1ZXMgPC0gc2FwcGx5KHBlbmd1aW5zX2RhdGEsIGZ1bmN0aW9uKHgpIHN1bSghaXMubmEoeCkpKQojIHRoaXMgbWlnaHQgbmVlZCBhIGJpdCBtb3JlIG9mIGEgYnJlYWsgZG93bi4gQmFzaWNhbGx5LAojIGZvciB0aGUgcmFuZ2Ugb2YgbnVtYmVycyAxOm4sIHdoZXJlIG4gaXMgdGhlIG51bWJlciBvZiBjb2x1bW5zIGluIHRoZSBkYXRhc2V0CiMgcGFzcyB0aGF0IG51bWJlciBpbnRvIHRoZSBhbnlubW91cyBmdW5jdGlvbiwgd2hpY2ggdGhlbiBwYXNzZXMgdGhhdCBudW1iZXIgaW50bwojIGVhY2ggaW5kZXggc3Vic2V0LiBTZWUgYWxvcyB0aGUgbm90ZSBib3ggYmVsb3cgb24gbWF0cml4IHN1YnNldHRpbmcuCmRhdGFSYW5nZXMgPC0gc2FwcGx5KDE6bGVuZ3RoKHBlbmd1aW5zX2RhdGEpLCBmdW5jdGlvbih4KSBpZmVsc2Uo
CiAgICAgICAgY2xhc3MocGVuZ3VpbnNfZGF0YVtbeF1dKSA9PSAiZmFjdG9yIiwKICAgICAgICBwYXN0ZTAobGV2ZWxzKHBlbmd1aW5zX2RhdGFbW3hdXSksIGNvbGxhcHNlID0gIiwgIiksCiAgICAgICAgcGFzdGUwKHJhbmdlKHBlbmd1aW5zX2RhdGFbW3hdXSwgbmEucm0gPSBUUlVFKSwgY29sbGFwc2UgPSAiIDogIikpKQojIHRoaXMgaXMgc2ltaWxhciB0byBhYm92ZSwgc2VlIGlmIHlvdSBjYW4gZmlndXJlIG91dCB3aGF0J3MgaGFwcGVuaW5nIQpkYXRhVHlwZSA8LSBzYXBwbHkocGVuZ3VpbnNfZGF0YSwgZnVuY3Rpb24oeCkgZHBseXI6OmNhc2Vfd2hlbigKICBjbGFzcyh4KSA9PSAiZmFjdG9yIiAmJiBpcy5vcmRlcmVkKHgpID09IFRSVUUgfiAib2RlcmVkIiwKICBjbGFzcyh4KSA9PSAiZmFjdG9yIiAmJiBpcy5vcmRlcmVkKHgpID09IEZBTFNFIH4gIm5vbWluYWwiLAogIC5kZWZhdWx0ID0gdHlwZW9mKHgpCikpCmBgYAoKOjo6bm90ZQpgc2FwcGx5YCBpcyBvbmUgbWVtYmVyIG9mIHRoZSBgYXBwbHlgIGZhbWlseSBvZiBmdW5jdGlvbnMuIFRoZXNlIGZ1bmN0aW9ucyB0YWtlIGEgbGlzdCwgYW5kIGFwcGx5IGEgZnVuY3Rpb24gdG8gZWFjaCBlbGVtZW50IGluIHRoYXQgbGlzdC4gYHNhcHBseWAgcmV0dXJucyBhIHZlY3Rvci4gSW4gdGhlIGFib3ZlIGNhc2UsIGZvciBleGFtcGxlIHdpdGggYGRhdGFDbGFzc2AsIHdlIHBhc3MgdGhlIGRhdGEgc2V0IGFzIHRoZSBsaXN0LCB3aGljaCBpcyBpbnRlcnByZXRlZCBhcyBlYWNoIG9mIGl0cyBjb2x1bW5zLCBhbmQgdGhlbiBydW5zIGBjbGFzcygpYCBvbiBlYWNoIG9mIHRoZXNlIGNvbHVtbnMuCjo6OgoKOjo6bm90ZQpXZSd2ZSBiZWVuIGludHJvZHVjZWQgdG8gYGZpbHRlcmAgYW5kIGBzZWxlY3RgIHRvIHN1YnNldCBkYXRhLiBTdWJzZXR0aW5nIGNhbiBhbHNvIGJlIGRvbmUgYnkgaW5kZXggbG9jYXRpb24gd2l0aCBgW11gLiBUaGVyZSBpcyBhIHNsaWdodCBkaWZmZXJlbmNlIGJldHdlZW4gdXNpbmcgYFtdYCBhbmQgYFtbXV1gLiBUaGUgZm9ybWVyIHByZXNlcnZlcyB0aGUgb3JpZ2luYWwgZGF0YSBzdHJ1Y3R1cmUsIHRoZSBsYXR0ZXIgZHJvcHMgdGhvc2UgYXR0cmlidXRlcy4gSW4gdGhpcyBleGFtcGxlLCB3ZSBkb24ndCB3YW50IHRoZSBjbGFzcyBvZiB0aGUgdGliYmxlLCBidXQgb2YgdGhlIGluZGl2aWR1YWwgY29sdW1uLCBoZW5jZSB0aGUgbmVlZCBmb3IgYFtbXV1gLgoKYGBge3J9CmNsYXNzKHBlbmd1aW5zX2RhdGFbMV0pICMgcmVwb3J0cyB0YmxfZGYsIHRibCwgZGF0YS5mcmFtZQpjbGFzcyhwZW5ndWluc19kYXRhW1sxXV0pICMgcmVwb3J0cyBmYWN0b3IKYGBgCjo6OgoKV2UgY2FuIHRoZW4gd3JhcCB0aGlzIGluIGEgY2FsbCB0byBgZGF0YS5mcmFtZSgpYCB0byBzdGl0Y2ggaXQgdG9nZXRoZXIuIFRoaXMgcmVxdWlyZXMgYSBmZXcgc21hbGwgdHdlYWtzIHRvIHRoZSBzeW50YXguCgpgYGB7cn0KcGVuZ3VpbnNfZGQgPC0gZGF0YS5mcmFtZSgKICB2YXJOYW1l
cyA9IG5hbWVzKHBlbmd1aW5zX2RhdGEpLAogIGRhdGFDbGFzcyA9IHNhcHBseShwZW5ndWluc19kYXRhLCBjbGFzcyksCiAgZGF0YVR5cGUgPSBzYXBwbHkocGVuZ3VpbnNfZGF0YSwgZnVuY3Rpb24oeCkgZHBseXI6OmNhc2Vfd2hlbigKICAgIGNsYXNzKHgpID09ICJmYWN0b3IiICYmIGlzLm9yZGVyZWQoeCkgPT0gVFJVRSB+ICJvZGVyZWQiLAogICAgY2xhc3MoeCkgPT0gImZhY3RvciIgJiYgaXMub3JkZXJlZCh4KSA9PSBGQUxTRSB+ICJub21pbmFsIiwKICAgIC5kZWZhdWx0ID0gdHlwZW9mKHgpCiAgKSksCiAgZGF0YVJhbmdlcyA9IHNhcHBseSgxOmxlbmd0aChwZW5ndWluc19kYXRhKSwgZnVuY3Rpb24oeCkgaWZlbHNlKAogICAgICAgICAgY2xhc3MocGVuZ3VpbnNfZGF0YVtbeF1dKSA9PSAiZmFjdG9yIiwKICAgICAgICAgIHBhc3RlMChsZXZlbHMocGVuZ3VpbnNfZGF0YVtbeF1dKSwgY29sbGFwc2UgPSAiLCAiKSwKICAgICAgICAgIHBhc3RlMChyYW5nZShwZW5ndWluc19kYXRhW1t4XV0sIG5hLnJtID0gVFJVRSksIGNvbGxhcHNlID0gIiA6ICIpKSksCiAgb2JzZXJ2YXRpb25zV2l0aFZhbHVlcyA9IHNhcHBseShwZW5ndWluc19kYXRhLCBmdW5jdGlvbih4KSBzdW0oIWlzLm5hKHgpKSkKKQpyb3duYW1lcyhwZW5ndWluc19kZCkgPC0gTlVMTApgYGAKCkxhc3RseSwgd2UgY2FuIGdlbmVyYWxpemUsIGJ1aWxkaW5nIGEgZnVuY3Rpb24gdGhhdCB3ZSBjYW4gcGFzcyBhbnkgZGF0YSBzZXQgaW50by4KCmBgYHtyfQpjcmVhdGVfZGF0YWRpY3Rpb25hcnkgPC0gZnVuY3Rpb24oZGF0YXNldCl7CiAgZGYgPC0gZGF0YS5mcmFtZSgKICB2YXJOYW1lcyA9IG5hbWVzKGRhdGFzZXQpLAogIGRhdGFDbGFzcyA9IHNhcHBseShkYXRhc2V0LCBjbGFzcyksCiAgZGF0YVR5cGUgPSBzYXBwbHkoZGF0YXNldCwgZnVuY3Rpb24oeCkgZHBseXI6OmNhc2Vfd2hlbigKICAgIGNsYXNzKHgpID09ICJmYWN0b3IiICYmIGlzLm9yZGVyZWQoeCkgPT0gVFJVRSB+ICJvZGVyZWQiLAogICAgY2xhc3MoeCkgPT0gImZhY3RvciIgJiYgaXMub3JkZXJlZCh4KSA9PSBGQUxTRSB+ICJub21pbmFsIiwKICAgIC5kZWZhdWx0ID0gdHlwZW9mKHgpCiAgKSksCiAgZGF0YVJhbmdlcyA9IHNhcHBseSgxOmxlbmd0aChkYXRhc2V0KSwgZnVuY3Rpb24oeCkgaWZlbHNlKAogICAgICAgICAgY2xhc3MoZGF0YXNldFtbeF1dKSA9PSAiZmFjdG9yIiwKICAgICAgICAgIHBhc3RlMChsZXZlbHMoZGF0YXNldFtbeF1dKSwgY29sbGFwc2UgPSAiLCAiKSwKICAgICAgICAgIHBhc3RlMChyYW5nZShkYXRhc2V0W1t4XV0sIG5hLnJtID0gVFJVRSksIGNvbGxhcHNlID0gIiA6ICIpKSksCiAgb2JzZXJ2YXRpb25zV2l0aFZhbHVlcyA9IHNhcHBseShkYXRhc2V0LCBmdW5jdGlvbih4KSBzdW0oIWlzLm5hKHgpKSkKKQpyb3duYW1lcyhkZikgPC0gTlVMTApyZXR1cm4oZGYpCn0KYGBgCgpMZXQncyB0ZXN0IGl0IG91dCEKCmBgYHtyLCBldmFsID0gRkFMU0V9CnBlbmd1aW5zX2Rk
XzIgPC0gY3JlYXRlX2RhdGFkaWN0aW9uYXJ5KHBlbmd1aW5zX2RhdGEpCnBlbmd1aW5zX2RkXzIKYGBgCgpgYGB7ciwgZWNobyA9IEZBTFNFfQpwZW5ndWluc19kZF8yIDwtIGNyZWF0ZV9kYXRhZGljdGlvbmFyeShwZW5ndWluc19kYXRhKQpwZW5ndWluc19kZF8yIHw+CiAga2FibGVFeHRyYTo6a2FibGUoKQpgYGAKCgoKCgoKCg==