# Do not modify the code in this chunk.
library(dplyr)
library(ggplot2)
# Titanic data used in last week's assignment: a semicolon-delimited file,
# read with leading/trailing whitespace trimmed from each field.
titanic <- readr::read_delim("titanic.csv",
delim = ";",
escape_double = FALSE,
trim_ws = TRUE)
# Result of last week's grouped summary: for every Sex/Pclass combination,
# the proportion of rows whose Survived value is "Yes". Despite the name
# `died_summary`, the stored column is a survival proportion.
died_summary <- titanic %>%
group_by(Sex, Pclass) %>%
summarise(percent_survived = mean(Survived == "Yes")) %>%
ungroup()
# Print the summary table.
died_summary #> # A tibble: 6 x 3
#> Sex Pclass percent_survived
#> <chr> <dbl> <dbl>
#> 1 female 1 0.968
#> 2 female 2 0.921
#> 3 female 3 0.5
#> 4 male 1 0.369
#> 5 male 2 0.157
#> 6 male 3 0.135
此題延續上週作業關於鐵達尼號乘客存活率的分組摘要。上方的程式碼即是上週分組摘要的答案,儲存於 died_summary。你的任務是使用 ggplot2 將 died_summary 繪製成此長條圖:
# Write your code here
# Bar chart of the survival proportion per passenger class, with one panel
# per sex. The bar heights are taken directly from percent_survived.
ggplot(died_summary, aes(x = Pclass, y = percent_survived)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Sex)
geom_bar() 或是 geom_col()facet_wrap()輸出結果應要與此圖相同
請自行尋找一份資料 (不得使用 titanic.csv 或內建資料),將其放在此次作業的 repo 並命名為 mydata.csv (副檔名請根據自己的資料而定, e.g., 若為 tab 分隔檔,請命名為 mydata.tsv)。你的任務是將這份資料讀入並使用 ggplot2 視覺化這份資料。
(10 分) 資料讀取與清理
將 mydata.csv 讀入並進行資料清理 (如果需要的話),以利接下來的資料視覺化
(30 分) 資料視覺化
請依這份資料的特性以及你想觀察的現象,對這份資料進行視覺化。依據你的喜好,你可以畫任意多張圖,但其中一張圖裡「必須」使用到 2 種或 2 種以上的 geom_*() 函數 (助教也只會依據這張圖評分)。這些 geom_*() 的使用需合理。例如,下方的例子雖然仍畫得出圖,但顯然是不合理的,這種情況將不予給分:
# Deliberately unreasonable example: a count bar chart of Species combined
# with a scatter of Sepal.Length against Petal.Width in the same plot.
ggplot(data = iris) +
  geom_bar(mapping = aes(x = Species)) +
  geom_point(mapping = aes(x = Sepal.Length, y = Petal.Width))
(10 分) Tweak the plot
請依據你的個人偏好「修改」於 2. 所繪製出來的圖。例如,你可以使用某個 coord_*() 將圖的 x、y 軸對調;使用其它的風格;或是修改與新增圖的座標軸名稱與標題等。
若覺得題目說明不夠清楚,可以參考此題的範例。
# Write your code here
# 請務必印出 data frame
library(readr)
library(tidyverse)#> -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
#> √ tibble 3.1.0 √ stringr 1.4.0
#> √ tidyr 1.1.3 √ forcats 0.5.1
#> √ purrr 0.3.4
#> -- Conflicts ------------------------------------------ tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#' Collapse free-text genre labels into a small set of broad categories
#'
#' Each label is assigned the first category whose keyword it contains,
#' checked in this priority order: thriller, horror, fiction, drama, sitcom,
#' comedy. A keyword matches in all-lowercase form or with a capitalised
#' first letter (e.g. "drama" or "Drama"). Labels that match no keyword,
#' including `NA`, become "Other".
#'
#' @param genre A character vector (or factor) of genre labels.
#' @return An unnamed character vector with the same length as `genre`; a
#'   zero-length input yields `character(0)`.
genre_brief <- function(genre) {
  # Pattern -> category, listed in priority order (earlier rules win).
  rules <- c(
    "[tT]hriller" = "Horror/Thriller",
    "[hH]orror" = "Horror/Thriller",
    "[fF]iction" = "Fiction",
    "[dD]rama" = "Drama",
    "[sS]itcom" = "Sitcom",
    "[cC]omedy" = "Comedy"
  )

  brief <- rep("Other", length(genre))
  unassigned <- rep(TRUE, length(genre))

  for (i in seq_along(rules)) {
    # grepl() returns FALSE for NA, so missing labels stay "Other".
    hit <- unassigned & grepl(names(rules)[[i]], genre)
    brief[hit] <- rules[[i]]
    unassigned <- unassigned & !hit
  }

  brief
}
# Read the data set, drop the Premiere and Finale columns, give the
# per-season average a syntactic column name, and collapse Genre into the
# broad categories produced by genre_brief().
mydata <- read_csv("mydata.csv") %>%
  select(-c(Premiere, Finale)) %>%
  rename(Avr_ep_in_sn = "avr. Ep/S") %>%
  mutate(Genre = genre_brief(Genre))
#>
#> -- Column specification --------------------------------------------------------
#> cols(
#> Title = col_character(),
#> Genre = col_character(),
#> Premiere = col_character(),
#> Finale = col_character(),
#> Seasons = col_double(),
#> Episodes = col_double(),
#> `avr. Ep/S` = col_double()
#> )
# Per-genre statistics of the average episodes per season: the mean, the
# maximum and the minimum over all titles in each genre.
mydata1 <- mydata %>%
  group_by(Genre) %>%
  summarise(
    Genre_avr_ep = mean(Avr_ep_in_sn),
    Genre_max_ep = max(Avr_ep_in_sn),
    Genre_min_ep = min(Avr_ep_in_sn)
  )
# Long format: one row per genre and statistic, with the statistic's name in
# data_type and its value in data. pivot_longer() replaces the superseded
# gather(); arrange() restores gather()'s statistic-by-statistic row order.
mydata2 <- mydata1 %>%
  pivot_longer(
    cols = c(Genre_avr_ep, Genre_max_ep, Genre_min_ep),
    names_to = "data_type",
    values_to = "data"
  ) %>%
  arrange(data_type)
# Print all three data frames.
mydata
mydata1
mydata2 #> # A tibble: 144 x 5
#> Title Genre Seasons Episodes Avr_ep_in_sn
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Bad Samaritans Comedy 1 5 5
#> 2 Richie Rich Sitcom 2 21 10.5
#> 3 Hemlock Grove Horror/Thri~ 3 33 11
#> 4 W/ Bob & David Comedy 1 5 5
#> 5 Netflix Presents: The Characters Comedy 1 8 8
#> 6 Marco Polo Drama 2 20 10
#> 7 Trailer Park Boys Out of the Park~ Other 1 8 8
#> 8 The Get Down Drama 2 11 5.5
#> 9 Girlboss Comedy 1 13 13
#> 10 Master of None Comedy 2 20 10
#> # ... with 134 more rows
#> # A tibble: 6 x 4
#> Genre Genre_avr_ep Genre_max_ep Genre_min_ep
#> <chr> <dbl> <dbl> <dbl>
#> 1 Comedy 8.21 13 4
#> 2 Drama 9.52 13 5.5
#> 3 Fiction 10.5 13 8
#> 4 Horror/Thriller 10.2 13 8
#> 5 Other 8.83 17 3.5
#> 6 Sitcom 10.1 15 6.67
#> # A tibble: 18 x 3
#> Genre data_type data
#> <chr> <chr> <dbl>
#> 1 Comedy Genre_avr_ep 8.21
#> 2 Drama Genre_avr_ep 9.52
#> 3 Fiction Genre_avr_ep 10.5
#> 4 Horror/Thriller Genre_avr_ep 10.2
#> 5 Other Genre_avr_ep 8.83
#> 6 Sitcom Genre_avr_ep 10.1
#> 7 Comedy Genre_max_ep 13
#> 8 Drama Genre_max_ep 13
#> 9 Fiction Genre_max_ep 13
#> 10 Horror/Thriller Genre_max_ep 13
#> 11 Other Genre_max_ep 17
#> 12 Sitcom Genre_max_ep 15
#> 13 Comedy Genre_min_ep 4
#> 14 Drama Genre_min_ep 5.5
#> 15 Fiction Genre_min_ep 8
#> 16 Horror/Thriller Genre_min_ep 8
#> 17 Other Genre_min_ep 3.5
#> 18 Sitcom Genre_min_ep 6.67
# Write your code here
# Be sure to print the plot
# Box plot of each title's average episodes per season by genre, overlaid
# with one line per summary statistic in mydata2 drawn across the genres.
my_plot <- ggplot() +
  geom_boxplot(aes(x = Genre, y = Avr_ep_in_sn), data = mydata) +
  geom_line(
    aes(x = Genre, y = data, group = data_type, color = data_type),
    data = mydata2,
    size = 0.8
  )
my_plot
# Write your code here
# Add a title, a subtitle, a y-axis label and a legend title to the plot.
my_plot +
  ggtitle(
    "The relationship between genre and average episodes per season",
    subtitle = "Take TV series as example"
  ) +
  ylab("Average episodes/season") +
  labs(color = "Data type")
請使用 ggplot2 中的 mpg 這份資料繪製圖表。 (可使用 ?mpg 查看這份資料的說明)
class 是否為 SUV。(6分)
displ 和「每加侖可高速行駛英里」hwy 的線性回歸線,並將「年分」year 以不同線條類型標示,且不須繪製信心區間 (請使用 geom_smooth())。(6分)
displ 的平均值。(6分)
SUV 和 Year。(2分)
進階題輸出結果
# Modify the code below
# Flag SUVs and turn year into a character column so it can be mapped to a
# discrete line type.
mpg1 <- mpg %>%
  mutate(
    SUV_check = class == "suv",
    year = as.character(year)
  )
# Scatter of hwy against displ coloured by the SUV flag, plus one linear
# fit per year drawn without a confidence band.
mpg_plot <- ggplot(mpg1, aes(x = displ, y = hwy)) +
  geom_point(aes(color = SUV_check)) +
  geom_smooth(aes(linetype = year), method = lm, se = FALSE) +
  labs(
    x = "Engine displacement (litres)",
    y = "Highway miles (per gallon)",
    color = "SUV",
    linetype = "Year"
  )
# Print the plot with a vertical line at the mean of displ.
mpg_plot + geom_vline(xintercept = mean(mpg1$displ)) #> `geom_smooth()` using formula 'y ~ x'