# Collect every file in ./data matching *.csv, read them all with a single
# read_csv() call, and pass the result through clean_names() (presumably
# janitor's column-name normaliser -- confirm against the library() calls).
df <- fs::dir_ls("./data", glob = "*.csv") |>
  read_csv() |>
  clean_names()
# Fix the mix of numeric and character values in the killed and injured
# columns: the placeholder strings "Multiple" and "Unknown" carry no usable
# count, so they become NA and everything else is converted to integer.
#
# The placeholders are blanked *before* as.integer() runs. Doing the
# conversion inside case_when() evaluates as.integer() on the whole column
# (placeholders included), which raises a misleading "NAs introduced by
# coercion" warning on every run. With this form, any warning that remains
# points at a genuinely unexpected value.
df <- df |>
  mutate(
    across(
      c(number_killed, number_injured),
      \(x) {
        # Only a character column can contain the placeholder strings; a
        # column that was already parsed as numeric is converted directly.
        if (is.character(x)) {
          x[x %in% c("Multiple", "Unknown")] <- NA_character_
        }
        as.integer(x)
      }
    )
  )
# Add missing case names: title-case name_of_case, assign fixed names to
# three specific case IDs, and turn the "Data Not Available" placeholder
# into "Unknown".
#
# A single case_when() replaces the chain of if_else() calls. if_else()
# returns NA wherever its condition is NA, so a row with a missing case_id
# would have had its existing name_of_case overwritten with NA. case_when()
# treats an NA condition as "no match" and falls through to the final
# TRUE branch, which keeps the current name.
df <- df |>
  mutate(
    name_of_case = str_to_title(name_of_case),
    name_of_case = case_when(
      case_id == "08112021_JMW_WEBB3" ~ "State Of Michigan V. Justen Watkins",
      case_id == "08112021_TFD_WEBB2" ~ "State Of Michigan V. Thomas Denton",
      case_id == "08112021_TW_WEBB1" ~ "State Of Michigan V. Tristan Webb",
      name_of_case == "Data Not Available" ~ "Unknown",
      TRUE ~ name_of_case
    )
  )
# Keep only the rows whose location_country is exactly "United States".
df <- filter(df, location_country == "United States")
# Fix case name typos and normalise the party prefix.
# The replacements are given as one named vector (pattern = replacement);
# str_replace_all() applies them one after another in the order listed, so
# later patterns see the output of earlier ones. Order matters: the
# "People Of " prefix is stripped before the "^The State", "^Virginia",
# "^New York", ... rules are tried against the start of the string.
df <- df |>
  mutate(
    name_of_case = str_replace_all(
      name_of_case,
      c(
        # misspellings
        "Ofmassachusetts" = "Of Massachusetts",
        "United Staters" = "United States",
        "United Staers" = "United States",
        "United Sates" = "United States",
        "United Staes" = "United States",
        "\\bUsa\\b" = "United States",
        "\\bUnited State\\b" = "United States",
        "\\bUnites States\\b" = "United States",
        # drop a leading "People Of " / "The People Of "
        "^(The )?People Of " = "",
        # normalise how the prosecuting party is written
        "^The State" = "State",
        "^Virginia\\b" = "Commonwealth Of Virginia",
        "^New York\\b" = "State Of New York",
        "^Ten(n)?essee\\b" = "State Of Tennessee",
        "^West Virginia\\b" = "State Of West Virginia",
        # two cases whose name omits the state
        "^State V\\. Christopher\\b" = "State Of Nevada V. Christopher",
        "^State V\\. Finley\\b" = "State Of Nevada V. Finley"
      )
    )
  )
# Move the appearance number -- a trailing " (<digits>)" on full_legal_name --
# into its own integer column, and strip it from the name.
#
# appearance_number is derived first, while full_legal_name still carries
# the suffix. The column is left as integer: a final str_trim() on it would
# coerce it back to character, and is unnecessary because the parentheses
# and whitespace are already removed before the conversion.
df <- df |>
  mutate(
    appearance_number = full_legal_name |>
      str_extract("\\s\\(\\d+\\)$") |>
      str_remove_all("[\\(\\)\\s]") |>
      as.integer(),
    full_legal_name = full_legal_name |>
      str_remove("\\s\\(\\d+\\)$") |>
      str_trim()
  )
# Add a finer-grained jurisdiction type: call jurisdiction_extract() once per
# row with that row's jurisdiction and name_of_case, and store the resulting
# string in jurisdiction_level. (jurisdiction_extract is defined elsewhere.)
df <- mutate(
  df,
  jurisdiction_level = map2_chr(
    .x = jurisdiction,
    .y = name_of_case,
    .f = jurisdiction_extract
  )
)