Writing a Data Dictionary
We’ll do this with a smaller dataset than what we’re working with in
this workshop.
# Install palmerpenguins only when it is missing, so that re-running this
# code does not download and reinstall the package every time.
if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  install.packages("palmerpenguins")
}
library(palmerpenguins)
# Store the package's penguins dataset under our own name.
penguins_data <- palmerpenguins::penguins
First, we build each piece of metadata we’d like, including:
- variable names
- data types
- data ranges
- observation counts
varNames <- names(penguins_data) # we've seen this before
dataClass <- sapply(penguins_data, class) # we haven't seen sapply before -- see the note box below
# count the non-missing values in each column
observationsWithValues <- sapply(penguins_data, function(x) sum(!is.na(x)))
# This might need a bit more of a breakdown. Basically, for each number in
# 1:n, where n is the number of columns in the dataset, pass that number into
# the anonymous function, which then uses it to pull out one column with
# [[x]]. See also the note box below on subsetting.
# seq_along() builds that 1:n sequence safely (it is empty when n is 0).
dataRanges <- sapply(seq_along(penguins_data), function(x) {
  column <- penguins_data[[x]]
  # is.factor() also recognises ordered factors, whose class() has two values
  if (is.factor(column)) {
    paste0(levels(column), collapse = ", ")
  } else {
    paste0(range(column, na.rm = TRUE), collapse = " : ")
  }
})
# this is similar to above, see if you can figure out what's happening!
dataType <- sapply(penguins_data, function(x) {
  if (is.ordered(x)) {
    "ordered"
  } else if (is.factor(x)) {
    "nominal"
  } else {
    typeof(x)
  }
})
sapply
is one member of the apply
family of
functions. These functions take a list and apply a function to each
element in that list. sapply
simplifies the results to a vector where it can. In the above
case, for example with dataClass
, we pass the data set as
the list (a data frame is a list of its columns), and sapply then runs
class()
on each of these columns.
We’ve been introduced to filter
and select
to subset data. Subsetting can also be done by index location with
[]
. There is a slight difference between using
[]
and [[]]
. The former preserves the original
data structure (we get back a one-column tibble); the latter extracts the column itself. In this example, we
don’t want the class of the tibble, but of the individual column, hence
the need for [[]]
.
# Single brackets keep the container: the result is still a tibble.
class(penguins_data[1]) # reports tbl_df, tbl, data.frame
## [1] "tbl_df" "tbl" "data.frame"
# Double brackets pull out the column itself.
class(penguins_data[[1]]) # reports factor
## [1] "factor"
We can then wrap this in a call to data.frame()
to
stitch it together. This requires a few small tweaks to the syntax.
# Assemble the data dictionary: one row per column of penguins_data.
penguins_dd <- data.frame(
  varNames = names(penguins_data),
  dataClass = sapply(penguins_data, class),
  # "ordered" / "nominal" for factors, otherwise the storage type
  dataType = sapply(penguins_data, function(x) {
    if (is.ordered(x)) {
      "ordered"
    } else if (is.factor(x)) {
      "nominal"
    } else {
      typeof(x)
    }
  }),
  # factor levels, or "min : max" of the non-missing values
  dataRanges = sapply(seq_along(penguins_data), function(x) {
    column <- penguins_data[[x]]
    if (is.factor(column)) {
      paste0(levels(column), collapse = ", ")
    } else {
      paste0(range(column, na.rm = TRUE), collapse = " : ")
    }
  }),
  # number of non-missing values
  observationsWithValues = sapply(penguins_data, function(x) sum(!is.na(x)))
)
# sapply() over a data frame returns named vectors, and data.frame() turns
# those names into row names; drop them in favour of plain row numbers.
rownames(penguins_dd) <- NULL
Lastly, we can generalize, building a function that we can pass any
data set into.
# Build a data dictionary (one row per column) for any data frame.
#
# Arguments:
#   dataset: a data frame or tibble.
#
# Returns a data.frame with one row per column of `dataset` and the columns:
#   varNames               - the column name
#   dataClass              - the column's class(es), comma-separated
#   dataType               - "ordered" or "nominal" for factors, otherwise
#                            the result of typeof()
#   dataRanges             - the factor levels, or "min : max" of the
#                            non-missing values (NA when there are none)
#   observationsWithValues - the number of non-missing values
create_datadictionary <- function(dataset) {
  # class() can return several values (e.g. c("ordered", "factor")), so
  # collapse them into a single string per column.
  describe_class <- function(x) paste(class(x), collapse = ", ")

  describe_type <- function(x) {
    if (is.ordered(x)) {
      "ordered"
    } else if (is.factor(x)) {
      "nominal"
    } else {
      typeof(x)
    }
  }

  describe_range <- function(x) {
    if (is.factor(x)) {
      return(paste0(levels(x), collapse = ", "))
    }
    if (all(is.na(x))) {
      # No non-missing values: range() would warn and return Inf / -Inf.
      return(NA_character_)
    }
    paste0(range(x, na.rm = TRUE), collapse = " : ")
  }

  count_values <- function(x) sum(!is.na(x))

  # vapply() works like sapply() but checks that every result has the stated
  # type and length, so the output is a plain vector even when the dataset
  # has no columns. USE.NAMES = FALSE keeps column names out of the row names.
  data.frame(
    varNames = names(dataset),
    dataClass = vapply(dataset, describe_class, character(1), USE.NAMES = FALSE),
    dataType = vapply(dataset, describe_type, character(1), USE.NAMES = FALSE),
    dataRanges = vapply(dataset, describe_range, character(1), USE.NAMES = FALSE),
    observationsWithValues = vapply(dataset, count_values, integer(1), USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
}
Let’s test it out!
# Build the data dictionary for the penguins data with the new function,
# then print it.
penguins_dd_2 <- create_datadictionary(penguins_data)
penguins_dd_2
species |
factor |
nominal |
Adelie, Chinstrap, Gentoo |
344 |
island |
factor |
nominal |
Biscoe, Dream, Torgersen |
344 |
bill_length_mm |
numeric |
double |
32.1 : 59.6 |
342 |
bill_depth_mm |
numeric |
double |
13.1 : 21.5 |
342 |
flipper_length_mm |
integer |
integer |
172 : 231 |
342 |
body_mass_g |
integer |
integer |
2700 : 6300 |
342 |
sex |
factor |
nominal |
female, male |
333 |
year |
integer |
integer |
2007 : 2009 |
344 |
LS0tCnRpdGxlOiAiV3JpdGluZyBhIGRhdGEgZGljdGlvbmFyeSIKcGFnZXRpdGxlOiAiV3JpdGluZyBhIGRhdGEgZGljdGlvbmFyeSIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICBjb2RlX2ZvbGRpbmc6IHNob3cgIyBhbGxvd3MgdG9nZ2xpbmcgb2Ygc2hvd2luZyBhbmQgaGlkaW5nIGNvZGUuIFJlbW92ZSBpZiBub3QgdXNpbmcgY29kZS4KICAgIGNvZGVfZG93bmxvYWQ6IHRydWUgIyBhbGxvd3MgdGhlIHVzZXIgdG8gZG93bmxvYWQgdGhlIHNvdXJjZSAuUm1kIGZpbGUuIFJlbW92ZSBpZiBub3QgdXNpbmcgY29kZS4KICAgIGluY2x1ZGVzOgogICAgICBhZnRlcl9ib2R5OiBmb290ZXIuaHRtbCAjIGluY2x1ZGUgYSBjdXN0b20gZm9vdGVyLgogICAgdG9jOiB0cnVlCiAgICB0b2NfZGVwdGg6IDMKICAgIHRvY19mbG9hdDoKICAgICAgY29sbGFwc2VkOiBmYWxzZQogICAgICBzbW9vdGhfc2Nyb2xsOiBmYWxzZQotLS0KCiMjIFdyaXRpbmcgYSBEYXRhIERpY3Rpb25hcnkKCldlJ2xsIGRvIHRoaXMgd2l0aCBhIHNtYWxsZXIgZGF0YXNldCB0aGFuIHdoYXQgd2UncmUgd29ya2luZyB3aXRoIGluIHRoaXMgd29ya3Nob3AuCgpgYGB7ciwgZXZhbCA9IEZBTFNFfQppbnN0YWxsLnBhY2thZ2VzKCJwYWxtZXJwZW5ndWlucyIpCmxpYnJhcnkocGFsbWVycGVuZ3VpbnMpCmBgYAoKYGBge3IsIGVjaG8gPSBGQUxTRX0KbGlicmFyeShwYWxtZXJwZW5ndWlucykKYGBgCgpgYGB7cn0KcGVuZ3VpbnNfZGF0YSA8LSBwYWxtZXJwZW5ndWluczo6cGVuZ3VpbnMKYGBgCgpGaXJzdCwgd2UgYnVpbGQgZWFjaCBwaWVjZSBvZiBtZXRhZGF0YSB3ZSdkIGxpa2UsIGluY3VsZGluZzoKCiogdmFyaWFibGUgbmFtZXMKKiBkYXRhIHR5cGVzCiogZGF0YSByYW5nZXMKKiBvYnNlcnZhdGlvbiBjb3VudHMKCmBgYHtyfQp2YXJOYW1lcyA8LSBuYW1lcyhwZW5ndWluc19kYXRhKSAjIHdlJ3ZlIHNlZW4gdGhpcyBiZWZvcmUKZGF0YUNsYXNzIDwtIHNhcHBseShwZW5ndWluc19kYXRhLCBjbGFzcykgIyB3ZSBoYXZlbid0IHNlZW4gc2FwcGx5IGJlZm9yZSAtLSBzZWUgdGhlIG5vdGUgYm94IGJlbG93Cm9ic2VydmF0aW9uc1dpdGhWYWx1ZXMgPC0gc2FwcGx5KHBlbmd1aW5zX2RhdGEsIGZ1bmN0aW9uKHgpIHN1bSghaXMubmEoeCkpKQojIHRoaXMgbWlnaHQgbmVlZCBhIGJpdCBtb3JlIG9mIGEgYnJlYWsgZG93bi4gQmFzaWNhbGx5LAojIGZvciB0aGUgcmFuZ2Ugb2YgbnVtYmVycyAxOm4sIHdoZXJlIG4gaXMgdGhlIG51bWJlciBvZiBjb2x1bW5zIGluIHRoZSBkYXRhc2V0CiMgcGFzcyB0aGF0IG51bWJlciBpbnRvIHRoZSBhbnlubW91cyBmdW5jdGlvbiwgd2hpY2ggdGhlbiBwYXNzZXMgdGhhdCBudW1iZXIgaW50bwojIGVhY2ggaW5kZXggc3Vic2V0LiBTZWUgYWxvcyB0aGUgbm90ZSBib3ggYmVsb3cgb24gbWF0cml4IHN1YnNldHRpbmcuCmRhdGFSYW5nZXMgPC0gc2FwcGx5KDE6bGVuZ3RoKHBlbmd1aW5zX2RhdGEpLCBmdW5jdGlvbih4KSBpZmVsc2Uo
CiAgICAgICAgY2xhc3MocGVuZ3VpbnNfZGF0YVtbeF1dKSA9PSAiZmFjdG9yIiwKICAgICAgICBwYXN0ZTAobGV2ZWxzKHBlbmd1aW5zX2RhdGFbW3hdXSksIGNvbGxhcHNlID0gIiwgIiksCiAgICAgICAgcGFzdGUwKHJhbmdlKHBlbmd1aW5zX2RhdGFbW3hdXSwgbmEucm0gPSBUUlVFKSwgY29sbGFwc2UgPSAiIDogIikpKQojIHRoaXMgaXMgc2ltaWxhciB0byBhYm92ZSwgc2VlIGlmIHlvdSBjYW4gZmlndXJlIG91dCB3aGF0J3MgaGFwcGVuaW5nIQpkYXRhVHlwZSA8LSBzYXBwbHkocGVuZ3VpbnNfZGF0YSwgZnVuY3Rpb24oeCkgZHBseXI6OmNhc2Vfd2hlbigKICBjbGFzcyh4KSA9PSAiZmFjdG9yIiAmJiBpcy5vcmRlcmVkKHgpID09IFRSVUUgfiAib2RlcmVkIiwKICBjbGFzcyh4KSA9PSAiZmFjdG9yIiAmJiBpcy5vcmRlcmVkKHgpID09IEZBTFNFIH4gIm5vbWluYWwiLAogIC5kZWZhdWx0ID0gdHlwZW9mKHgpCikpCmBgYAoKOjo6bm90ZQpgc2FwcGx5YCBpcyBvbmUgbWVtYmVyIG9mIHRoZSBgYXBwbHlgIGZhbWlseSBvZiBmdW5jdGlvbnMuIFRoZXNlIGZ1bmN0aW9ucyB0YWtlIGEgbGlzdCwgYW5kIGFwcGx5IGEgZnVuY3Rpb24gdG8gZWFjaCBlbGVtZW50IGluIHRoYXQgbGlzdC4gYHNhcHBseWAgcmV0dXJucyBhIHZlY3Rvci4gSW4gdGhlIGFib3ZlIGNhc2UsIGZvciBleGFtcGxlIHdpdGggYGRhdGFDbGFzc2AsIHdlIHBhc3MgdGhlIGRhdGEgc2V0IGFzIHRoZSBsaXN0LCB3aGljaCBpcyBpbnRlcnByZXRlZCBhcyBlYWNoIG9mIGl0cyBjb2x1bW5zLCBhbmQgdGhlbiBydW5zIGBjbGFzcygpYCBvbiBlYWNoIG9mIHRoZXNlIGNvbHVtbnMuCjo6OgoKOjo6bm90ZQpXZSd2ZSBiZWVuIGludHJvZHVjZWQgdG8gYGZpbHRlcmAgYW5kIGBzZWxlY3RgIHRvIHN1YnNldCBkYXRhLiBTdWJzZXR0aW5nIGNhbiBhbHNvIGJlIGRvbmUgYnkgaW5kZXggbG9jYXRpb24gd2l0aCBgW11gLiBUaGVyZSBpcyBhIHNsaWdodCBkaWZmZXJlbmNlIGJldHdlZW4gdXNpbmcgYFtdYCBhbmQgYFtbXV1gLiBUaGUgZm9ybWVyIHByZXNlcnZlcyB0aGUgb3JpZ2luYWwgZGF0YSBzdHJ1Y3R1cmUsIHRoZSBsYXR0ZXIgZHJvcHMgdGhvc2UgYXR0cmlidXRlcy4gSW4gdGhpcyBleGFtcGxlLCB3ZSBkb24ndCB3YW50IHRoZSBjbGFzcyBvZiB0aGUgdGliYmxlLCBidXQgb2YgdGhlIGluZGl2aWR1YWwgY29sdW1uLCBoZW5jZSB0aGUgbmVlZCBmb3IgYFtbXV1gLgoKYGBge3J9CmNsYXNzKHBlbmd1aW5zX2RhdGFbMV0pICMgcmVwb3J0cyB0YmxfZGYsIHRibCwgZGF0YS5mcmFtZQpjbGFzcyhwZW5ndWluc19kYXRhW1sxXV0pICMgcmVwb3J0cyBmYWN0b3IKYGBgCjo6OgoKV2UgY2FuIHRoZW4gd3JhcCB0aGlzIGluIGEgY2FsbCB0byBgZGF0YS5mcmFtZSgpYCB0byBzdGl0Y2ggaXQgdG9nZXRoZXIuIFRoaXMgcmVxdWlyZXMgYSBmZXcgc21hbGwgdHdlYWtzIHRvIHRoZSBzeW50YXguCgpgYGB7cn0KcGVuZ3VpbnNfZGQgPC0gZGF0YS5mcmFtZSgKICB2YXJOYW1l
cyA9IG5hbWVzKHBlbmd1aW5zX2RhdGEpLAogIGRhdGFDbGFzcyA9IHNhcHBseShwZW5ndWluc19kYXRhLCBjbGFzcyksCiAgZGF0YVR5cGUgPSBzYXBwbHkocGVuZ3VpbnNfZGF0YSwgZnVuY3Rpb24oeCkgZHBseXI6OmNhc2Vfd2hlbigKICAgIGNsYXNzKHgpID09ICJmYWN0b3IiICYmIGlzLm9yZGVyZWQoeCkgPT0gVFJVRSB+ICJvZGVyZWQiLAogICAgY2xhc3MoeCkgPT0gImZhY3RvciIgJiYgaXMub3JkZXJlZCh4KSA9PSBGQUxTRSB+ICJub21pbmFsIiwKICAgIC5kZWZhdWx0ID0gdHlwZW9mKHgpCiAgKSksCiAgZGF0YVJhbmdlcyA9IHNhcHBseSgxOmxlbmd0aChwZW5ndWluc19kYXRhKSwgZnVuY3Rpb24oeCkgaWZlbHNlKAogICAgICAgICAgY2xhc3MocGVuZ3VpbnNfZGF0YVtbeF1dKSA9PSAiZmFjdG9yIiwKICAgICAgICAgIHBhc3RlMChsZXZlbHMocGVuZ3VpbnNfZGF0YVtbeF1dKSwgY29sbGFwc2UgPSAiLCAiKSwKICAgICAgICAgIHBhc3RlMChyYW5nZShwZW5ndWluc19kYXRhW1t4XV0sIG5hLnJtID0gVFJVRSksIGNvbGxhcHNlID0gIiA6ICIpKSksCiAgb2JzZXJ2YXRpb25zV2l0aFZhbHVlcyA9IHNhcHBseShwZW5ndWluc19kYXRhLCBmdW5jdGlvbih4KSBzdW0oIWlzLm5hKHgpKSkKKQpyb3duYW1lcyhwZW5ndWluc19kZCkgPC0gTlVMTApgYGAKCkxhc3RseSwgd2UgY2FuIGdlbmVyYWxpemUsIGJ1aWxkaW5nIGEgZnVuY3Rpb24gdGhhdCB3ZSBjYW4gcGFzcyBhbnkgZGF0YSBzZXQgaW50by4KCmBgYHtyfQpjcmVhdGVfZGF0YWRpY3Rpb25hcnkgPC0gZnVuY3Rpb24oZGF0YXNldCl7CiAgZGYgPC0gZGF0YS5mcmFtZSgKICB2YXJOYW1lcyA9IG5hbWVzKGRhdGFzZXQpLAogIGRhdGFDbGFzcyA9IHNhcHBseShkYXRhc2V0LCBjbGFzcyksCiAgZGF0YVR5cGUgPSBzYXBwbHkoZGF0YXNldCwgZnVuY3Rpb24oeCkgZHBseXI6OmNhc2Vfd2hlbigKICAgIGNsYXNzKHgpID09ICJmYWN0b3IiICYmIGlzLm9yZGVyZWQoeCkgPT0gVFJVRSB+ICJvZGVyZWQiLAogICAgY2xhc3MoeCkgPT0gImZhY3RvciIgJiYgaXMub3JkZXJlZCh4KSA9PSBGQUxTRSB+ICJub21pbmFsIiwKICAgIC5kZWZhdWx0ID0gdHlwZW9mKHgpCiAgKSksCiAgZGF0YVJhbmdlcyA9IHNhcHBseSgxOmxlbmd0aChkYXRhc2V0KSwgZnVuY3Rpb24oeCkgaWZlbHNlKAogICAgICAgICAgY2xhc3MoZGF0YXNldFtbeF1dKSA9PSAiZmFjdG9yIiwKICAgICAgICAgIHBhc3RlMChsZXZlbHMoZGF0YXNldFtbeF1dKSwgY29sbGFwc2UgPSAiLCAiKSwKICAgICAgICAgIHBhc3RlMChyYW5nZShkYXRhc2V0W1t4XV0sIG5hLnJtID0gVFJVRSksIGNvbGxhcHNlID0gIiA6ICIpKSksCiAgb2JzZXJ2YXRpb25zV2l0aFZhbHVlcyA9IHNhcHBseShkYXRhc2V0LCBmdW5jdGlvbih4KSBzdW0oIWlzLm5hKHgpKSkKKQpyb3duYW1lcyhkZikgPC0gTlVMTApyZXR1cm4oZGYpCn0KYGBgCgpMZXQncyB0ZXN0IGl0IG91dCEKCmBgYHtyLCBldmFsID0gRkFMU0V9CnBlbmd1aW5zX2Rk
XzIgPC0gY3JlYXRlX2RhdGFkaWN0aW9uYXJ5KHBlbmd1aW5zX2RhdGEpCnBlbmd1aW5zX2RkXzIKYGBgCgpgYGB7ciwgZWNobyA9IEZBTFNFfQpwZW5ndWluc19kZF8yIDwtIGNyZWF0ZV9kYXRhZGljdGlvbmFyeShwZW5ndWluc19kYXRhKQpwZW5ndWluc19kZF8yIHw+CiAga2FibGVFeHRyYTo6a2FibGUoKQpgYGAKCgoKCgoKCg==