1 基本題 (共 100 分)

1.1 視覺化死亡率 (共 40 分)

# 請勿更動此 code chunk 程式碼
library(dplyr)
library(ggplot2)

# 上週作業使用到的 titanic data
titanic <- readr::read_delim("titanic.csv", 
                             delim = ";", 
                             escape_double = FALSE, 
                             trim_ws = TRUE)
# 上週作業 `分組摘要` 的結果
died_summary <- titanic %>% 
  group_by(Sex, Pclass) %>%
  summarise(percent_survived = mean(Survived == "Yes")) %>%
  ungroup()
died_summary

#> # A tibble: 6 x 3
#>   Sex    Pclass percent_survived
#>   <chr>   <dbl>            <dbl>
#> 1 female      1            0.968
#> 2 female      2            0.921
#> 3 female      3            0.5  
#> 4 male        1            0.369
#> 5 male        2            0.157
#> 6 male        3            0.135

此題延續上週作業關於鐵達尼號乘客死亡率的分組摘要。上方的程式碼即是上週分組摘要的答案，儲存於 died_summary。你的任務是使用 ggplot2 將 died_summary 繪製成此長條圖:

# Write your code here

ggplot(data = died_summary ) + 
  geom_bar(mapping = aes(x = Pclass, y = percent_survived), stat = "identity") + 
  facet_wrap(~Sex)

如果不知道如何下手，請先閱讀 R for Data Science 第三章
你應該會使用到 geom_bar() 或是 geom_col()
你應該會使用到 facet_wrap()
輸出結果應與下圖相同：

輸出結果應要與此圖相同

1.2 自己的資料自己畫 (共 50 分)

請自行尋找一份資料 (不得使用 titanic.csv 或內建資料)，將其放在此次作業的 repo 並命名為 mydata.csv (副檔名請根據自己的資料而定, e.g., 若為 tab 分隔檔，請命名為 mydata.tsv)。你的任務是將這份資料讀入並使用 ggplot2 視覺化這份資料。

(10 分) 資料讀取與清理
將 mydata.csv 讀入並進行資料清理 (如果需要的話)，以利接下來的資料視覺化
(30 分) 資料視覺化
請依這份資料的特性以及你想觀察的現象，對這份資料進行視覺化。依據你的喜好，你可以畫任意多張圖，但其中一張圖裡「必須」使用到 2 種或 2 種以上的 geom_*() 函數 (助教也只會依據這張圖評分)。這些 geom_*() 的使用需合理。例如，下方的例子雖然仍畫得出圖，但顯然是不合理的，這種情況將不予給分：
```
ggplot(iris) +
  geom_bar(aes(x = Species)) +
  geom_point(aes(Sepal.Length, Petal.Width))
```
(10 分) Tweak the plot
請依據你的個人偏好「修改」於 2. 所繪製出來的圖。例如，你可以使用某個 coord_*() 將圖的 x、y 軸對調；使用其它的風格；或是修改與新增圖的座標軸名稱與標題等。
- 評分標準：你只要合理地使用到 https://ggplot2.tidyverse.org/reference 當中 Scales, Coordinate systems 或是 Themes 這三個段落之內的任意一個函數即可得分。

若覺得題目說明不夠清楚，可以參考此題的範例。

資料讀取與清理 (10 分)

# Write your code here
# 請務必印出 data frame

library(tibble)
library(dplyr)
library(lubridate)

#> 
#> Attaching package: 'lubridate'

#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

ubike <- as_tibble(read.csv("mydata.csv"))
#view(ubike)

# date_time <- ymd_hm(ubike$time[9])
# date_time
# hour(date_time)
NTUlatitude <- 25.014947
NTUlongitude <- 121.535549
latitude_KM <- 110.9362
longitude_KM <- 101.77545


ubikeClean <- ubike %>%
  
# 整理出距離台大一公里以內的站
  mutate(nearNTU = if_else( ( (abs(latitude - NTUlatitude) * latitude_KM) ** 2 + (abs(longitude - NTUlongitude) * longitude_KM) ** 2) ** 1/2 < 1, 1, 0)) %>% 
  
# 整理出：可借車%、月份、日、時間(24小時制)
  mutate(bikePCT = bike / lot, month = month(ymd_hm(time)), mday = mday(ymd_hm(time)), hour = hour(ymd_hm(time)))


#view(ubikeClean)

ubike24hour <- ubikeClean %>%
  group_by(station, hour) %>%
  # bike_avg：當小時有幾台車可借(一個月平均)
  # bikePCT_avg：當小時的可借車率(總可借車數/總車柱數)(一個月平均)
  summarise(bikePCT_avg = mean(bikePCT), bike_avg = mean(bike), nearNTU) %>% 
  group_by(hour, nearNTU) %>% 
  # hour_bikePCT_avg：(near NTU vs. far NTU)當小時的平均可借車率(一個月平均)
  summarise(hour_bikePCT_avg = mean(bikePCT_avg))

#> `summarise()` regrouping output by 'station', 'hour' (override with `.groups` argument)

#> `summarise()` regrouping output by 'hour' (override with `.groups` argument)

view(ubike24hour)

ubikeNTU <- ubikeClean %>%
  filter(nearNTU == 1) %>% 
  group_by(station, hour) %>%
  # bike_avg：當小時有幾台車可借(一個月平均)
  # bikePCT_avg：當小時的可借車率(總可借車數/總車柱數)(一個月平均)
  summarise(bikePCT_avg = mean(bikePCT), bike_avg = mean(bike)) %>% 
  group_by(hour) %>% 
  # hour_bikePCT_avg：所有近台大站當小時的平均可借車率(一個月平均)
  summarise(station, hour_bikePCT_avg = mean(bikePCT_avg), bikePCT_avg, bike_avg)

#> `summarise()` regrouping output by 'station' (override with `.groups` argument)

#> `summarise()` regrouping output by 'hour' (override with `.groups` argument)

#view(ubikeNTU)
ubikeNTU

#> # A tibble: 216 x 5
#> # Groups:   hour [24]
#>     hour station                           hour_bikePCT_avg bikePCT_avg bike_avg
#>    <int> <fct>                                        <dbl>       <dbl>    <dbl>
#>  1     0 JianGuo & Heping Intersection                0.197      0.150     7.77 
#>  2     0 Keelung & Changxing Intersection             0.197      0.372    27.5  
#>  3     0 MRT Gongguan Sta.(Exit 2)                    0.197      0.0312    0.935
#>  4     0 MRT Taipower Building Sta. (Exit~            0.197      0.398    15.9  
#>  5     0 N.T.U.S.T                                    0.197      0.176     8.10 
#>  6     0 NTU Information Bldg.                        0.197      0.0771    5.55 
#>  7     0 Roosevelt & Xinsheng S. Intersec~            0.197      0.0436    3.84 
#>  8     0 Xinhai & Xinsheng Intersection               0.197      0.3       9    
#>  9     0 Xinsheng & Heping Intersection               0.197      0.228    10.5  
#> 10     1 JianGuo & Heping Intersection                0.270      0.176     9.13 
#> # ... with 206 more rows

資料視覺化 (30 分)

ubikeGraphCompare <- ggplot(data = ubike24hour, aes(x = hour, y = hour_bikePCT_avg, group = nearNTU, color = factor(nearNTU))) +
  geom_point() + 
  geom_line() +
  scale_color_discrete(name = "distance", labels = c("far NTU", "near NTU")) + 
  labs(title = "bike pct compare between near and far NTU") + ylab("avg. bike pct (total bike/total lot)") + xlab("hour")

ubikeGraphCompare

# Write your code here
# 請務必印出圖片

library(ggplot2)
library(RColorBrewer)

ubikeGraph <- ggplot(data = ubikeNTU) + 
  # 主坐標軸：(各站一個月內)當小時的平均車數(堆疊長條圖)
  geom_bar(aes(x = hour, y = bike_avg, fill = station), stat = "identity", position = "fill") + 
  # 修改標籤
  labs(title = "bike amount at stations near NTU") + ylab("bike amount") + xlab("hour")+ 
  # 改變配色
  scale_fill_brewer(palette="Spectral") + 
  # 副座標軸：所有站(平均一個月內)當小時的可借車率(總可借車數/總車柱數)
  geom_point(aes(x = hour, y = hour_bikePCT_avg)) + 
  geom_line(aes(x = hour, y = hour_bikePCT_avg)) + 
  # 修改副座標軸標籤(百分比化)
  scale_y_continuous(sec.axis = sec_axis(~. , name = "avg. bike pct (total bike/total lot) ", labels = scales::percent))


ubikeGraph

Tweak the plot (10 分)

# Write your code here

library(extrafont)

#> Registering fonts with R

#font_import()
#fonts()  # show all fonts import

# 要設 device 才不會出現 error:font family not found in Windows font database
loadfonts(device = "win", quiet = T)


ubikeGraphTweak <- ubikeGraph + 

  theme(text = element_text(family="Lucida Handwriting", size = 10, color = "steelblue4"),
        panel.background = element_rect(fill = "lightcyan", colour = "steelblue4", size = 2,linetype = "dotted"),
        panel.grid.major = element_line(colour = "lightcyan3", size = 0.75, linetype = "solid"),
        panel.grid.minor = element_line(colour = "lightcyan3", size = 0.375, linetype = "dashed")) 

ubikeGraphTweak

1.3 線上實習課調查問卷 (10 分)

請填寫此份課程調查問卷，並將下方 我，未命名，已完成問卷填答 內的 未命名 更改為自己的姓名：

我，余孟琦，已完成問卷填答

2 進階選答題 (共 20 分)

請使用 ggplot2 中的 mpg 這份資料繪製圖表。 (可使用 ?mpg 查看這份資料的說明)

請以顏色標示「車種」 class 是否為 SUV。 (6分)
請繪製「引擎排氣量」 displ 和「每加侖可高速行駛英里」 hwy 的線性回歸線，並將「年分」 year 以不同線條類型標示，且不須繪製信心區間 (請使用 geom_smooth())。(6分)
請以黑色直線標示「引擎排氣量」 displ 的平均值。(6分)
請將顏色和線條類型的圖例名稱分別設定為 SUV 和 Year 。(2分)

本題輸出結果應如下圖：

進階題輸出結果

# Modify the code below

ggplot(data = mpg, mapping = aes(displ, hwy)) +
  geom_point(aes(color = ifelse(class == "suv","#F8766D", "#00BFC4"))) +
  geom_smooth(aes(group = year, linetype = factor(year)), method = lm, se = F) +
  geom_vline(aes(xintercept = mean(displ))) + 
  guides(color = guide_legend("SUV"))+
  scale_colour_discrete(labels = c("False", "True"))+
  guides(linetype = guide_legend("Year"))+
  labs(x = "Engine displacement (litres)", y = "Highway miles (per gallon)")

#> `geom_smooth()` using formula 'y ~ x'

# 找預設顏色
# library(scales)
# show_col(hue_pal()(2))

HW 6: 資料視覺化

余孟琦 B06702049 會計系

2021-04-08

Rmd Source (for TAs)

1 基本題 (共 100 分)

1.1 視覺化死亡率 (共 40 分)

1.2 自己的資料自己畫 (共 50 分)

資料讀取與清理 (10 分)

資料視覺化 (30 分)

Tweak the plot (10 分)

1.3 線上實習課調查問卷 (10 分)

2 進階選答題 (共 20 分)

HW 6: 資料視覺化

余孟琦 B06702049 會計系

2021-04-08 Rmd Source (for TAs)

1 基本題 (共 100 分)

1.1 視覺化死亡率 (共 40 分)

1.2 自己的資料自己畫 (共 50 分)

資料讀取與清理 (10 分)

資料視覺化 (30 分)

Tweak the plot (10 分)

1.3 線上實習課調查問卷 (10 分)

2 進階選答題 (共 20 分)

2021-04-08

Rmd Source (for TAs)