Code
::opts_chunk$set(comment = ">")
knitr
library(tidyverse)
library(ggrepel)
library(ggthemes)
load(url("https://github.com/vsass/CSSS508/raw/main/Lectures/Lecture7/data/restaurants.Rdata"))
What is the shortest restaurant name? What is the longest restaurant name?
Read the documentation for separate_longer_delim()
. Use this function to figure out what 10 “words”1 are most frequently used for the restaurant names in this data set.
restaurants |>
distinct(Name) |>
separate_longer_delim(Name, delim = " ") |>
count(Name) |>
arrange(desc(n))
> # A tibble: 9,370 × 2
> Name n
> <chr> <int>
> 1 & 692
> 2 LLC 619
> 3 CAFE 409
> 4 RESTAURANT 380
> 5 SCHOOL 360
> 6 THE 353
> 7 COFFEE 315
> 8 @ 306
> 9 - 303
> 10 BAR 260
> # ℹ 9,360 more rows
Which zip code has the highest (meaning worst) mean Grade
?2
restaurants |>
mutate(avg_score = mean(Grade, na.rm = TRUE),
.by = c(Name)) |>
summarize(avg_score_zip = mean(avg_score, na.rm = TRUE),
.by = c(`Zip Code`)) |>
slice_max(avg_score_zip)
> # A tibble: 1 × 2
> `Zip Code` avg_score_zip
> <dbl> <dbl>
> 1 98115 2.02
For that worst-rated zip code, how has its average rating changed by year?3
zip_98115_year <- restaurants |>
filter(`Zip Code` == "98115") |>
separate_wider_delim(`Inspection Date`,
delim = "/",
names = c("month", "day", "year")) |>
mutate(avg_score = mean(Grade, na.rm = TRUE),
.by = c(Name)) |>
summarize(avg_score_zip = mean(avg_score, na.rm = TRUE),
.by = year) |>
arrange(desc(year)) |>
filter(!is.na(year))
zip_98115_year
> # A tibble: 18 × 2
> year avg_score_zip
> <chr> <dbl>
> 1 2023 2.30
> 2 2022 2.35
> 3 2021 2.23
> 4 2020 2.09
> 5 2019 2.08
> 6 2018 2.03
> 7 2017 2.12
> 8 2016 2.15
> 9 2015 2.10
> 10 2014 2.00
> 11 2013 2.01
> 12 2012 2.01
> 13 2011 2.00
> 14 2010 1.89
> 15 2009 1.97
> 16 2008 1.93
> 17 2007 1.91
> 18 2006 1.99
zip_98115_year2 <- restaurants |>
filter(`Zip Code` == "98115") |>
mutate(inspection_date = mdy(`Inspection Date`),
year = year(inspection_date)) |>
mutate(avg_score = mean(Grade, na.rm = TRUE),
.by = c(Name)) |>
summarize(avg_score_zip = mean(avg_score, na.rm = TRUE),
.by = c(`Zip Code`, year)) |>
arrange(desc(year)) |>
filter(!is.na(year))
zip_98115_year2
> # A tibble: 18 × 3
> `Zip Code` year avg_score_zip
> <dbl> <dbl> <dbl>
> 1 98115 2023 2.30
> 2 98115 2022 2.35
> 3 98115 2021 2.23
> 4 98115 2020 2.09
> 5 98115 2019 2.08
> 6 98115 2018 2.03
> 7 98115 2017 2.12
> 8 98115 2016 2.15
> 9 98115 2015 2.10
> 10 98115 2014 2.00
> 11 98115 2013 2.01
> 12 98115 2012 2.01
> 13 98115 2011 2.00
> 14 98115 2010 1.89
> 15 98115 2009 1.97
> 16 98115 2008 1.93
> 17 98115 2007 1.91
> 18 98115 2006 1.99
Separate the variable Phone
into three variables that only contain numerical digits: area_code
, first_3
, and last_4
.
I came up with 4 different approaches to this but there may be more!
separate_wider_delim()
restaurants |>
distinct(Phone) |>
separate_wider_delim(cols = Phone,
delim = " ",
names = c("area_code", "phone_number")) |>
separate_wider_delim(cols = phone_number,
delim = "-",
names = c("first_3", "last_4")) |>
mutate(area_code = str_remove(area_code, pattern = "\\("),
area_code = str_remove(area_code, pattern = "\\)"))
> # A tibble: 7,384 × 3
> area_code first_3 last_4
> <chr> <chr> <chr>
> 1 206 722 6400
> 2 206 491 4694
> 3 952 649 8629
> 4 206 369 2978
> 5 206 612 2660
> 6 758 753 4760
> 7 425 251 8803
> 8 425 486 5555
> 9 206 363 8950
> 10 425 545 5552
> # ℹ 7,374 more rows
separate_wider_delim()
(w/ regex)restaurants |>
distinct(Phone) |>
separate_wider_delim(cols = Phone,
delim = regex("[ |-]"),
names = c("area_code", "first_3", "last_4")) |>
mutate(area_code = str_remove(area_code, pattern = "\\("),
area_code = str_remove(area_code, pattern = "\\)"))
> # A tibble: 7,384 × 3
> area_code first_3 last_4
> <chr> <chr> <chr>
> 1 206 722 6400
> 2 206 491 4694
> 3 952 649 8629
> 4 206 369 2978
> 5 206 612 2660
> 6 758 753 4760
> 7 425 251 8803
> 8 425 486 5555
> 9 206 363 8950
> 10 425 545 5552
> # ℹ 7,374 more rows
str_extract()
restaurants |>
distinct(Phone) |>
mutate(area_code = str_extract(Phone, "[:digit:]{3}"),
first_3 = str_extract(Phone, "(?<= )[:digit:]{3}(?=-)"),
last_4 = str_extract(Phone, "(?<=-)[:graph:]{4,}$"))
> # A tibble: 7,384 × 4
> Phone area_code first_3 last_4
> <chr> <chr> <chr> <chr>
> 1 (206) 722-6400 206 722 6400
> 2 (206) 491-4694 206 491 4694
> 3 (952) 649-8629 952 649 8629
> 4 (206) 369-2978 206 369 2978
> 5 (206) 612-2660 206 612 2660
> 6 (758) 753-4760 758 753 4760
> 7 (425) 251-8803 425 251 8803
> 8 (425) 486-5555 425 486 5555
> 9 (206) 363-8950 206 363 8950
> 10 (425) 545-5552 425 545 5552
> # ℹ 7,374 more rows
separate_wider_regex()
restaurants |>
distinct(Phone) |>
separate_wider_regex(cols = Phone,
patterns = c("^\\(",
area_code = "\\d{3}",
"\\) ",
first_3 = "\\d{3}",
"-",
last_4 = ".{4,}$"))
separate_wider_regex
, any unnamed regex pattern is matched but not saved into a named variable. This regex matches and discards the first open parentheses.
str_extract
that captures three digits in a row. (i.e. \\d
is equivalent to [:digit:]
)
> # A tibble: 7,384 × 3
> area_code first_3 last_4
> <chr> <chr> <chr>
> 1 206 722 6400
> 2 206 491 4694
> 3 952 649 8629
> 4 206 369 2978
> 5 206 612 2660
> 6 758 753 4760
> 7 425 251 8803
> 8 425 486 5555
> 9 206 363 8950
> 10 425 545 5552
> # ℹ 7,374 more rows
Anything that is separated by spaces counts as a word for this problem.↩︎
Note: To accurately reflect the true average you’ll want to weight this by restaurant: i.e. get the average grade by restaurant first, then get the average of those averages by zip code.↩︎
There are a few ways to do this. Use the method that makes most sense to you!↩︎