1 Encoding New Documents (60 points)

library(quanteda)
library(quanteda.textstats)
library(jiebaR)

# Document-term matrix 
q_dfm <- readRDS("samesex_marriage.rds") %>%
  corpus(docid_field = "id", text_field = "content") %>%
  tokenizers::tokenize_regex(pattern = "\u3000") %>%  # split on the full-width space (U+3000)
  tokens() %>%
  dfm() %>%
  dfm_remove(pattern = readLines("stopwords.txt", encoding = "UTF-8"),
             valuetype = "fixed") %>%                               # drop stopwords
  dfm_select(pattern = "[\u4E00-\u9FFF]", valuetype = "regex") %>%  # keep features containing Chinese characters
  dfm_trim(min_termfreq = 5) %>%                                    # drop rare terms
  dfm_tfidf()                                                       # tf-idf weighting
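
A quick sanity check on the weighted dfm can catch problems early (e.g., an empty feature set after trimming). This is a minimal sketch, not part of the assignment, using standard quanteda/base accessors:

dim(q_dfm)                  # number of documents x number of retained features
head(featnames(q_dfm), 10)  # a peek at the retained features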

# LSA Model
lsa_model <- quanteda.textmodels::textmodel_lsa(q_dfm, nd = 15)
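
The fitted object stores, among other things, the corpus documents projected into the reduced space ($docs) and the singular values ($sk), as documented for quanteda.textmodels::textmodel_lsa. A quick look:

dim(lsa_model$docs)  # ndoc(q_dfm) x 15: corpus documents in the LSA space
lsa_model$sk         # singular values from the truncated SVD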


###### Convert new document to vector ######

# New document
doc <- readLines("sample_post.txt", encoding = "UTF-8") %>% 
  paste(collapse = "\n")

# Convert raw text to document term matrix
seg <- worker(user = "user_dict.txt")        # jiebaR segmenter with a user dictionary
new_doc_dtm <- list(segment(doc, seg)) %>%   # segment, then wrap as a one-document list
  tokens() %>%
  dfm() %>%
  dfm_match(features = featnames(q_dfm))     # align features with the corpus dfm

# Dimensionality reduction with LSA
p <- predict(lsa_model, newdata = new_doc_dtm)
p$docs_newspace
#> 1 x 15 Matrix of class "dgeMatrix"
#>             [,1]         [,2]       [,3]      [,4]        [,5]       [,6]
#> text1 0.01082797 -0.005043434 0.01906469 0.0211677 -0.01264663 0.01741631
#>             [,7]       [,8]        [,9]       [,10]       [,11]       [,12]
#> text1 0.03035976 0.02448952 -0.01062937 -0.00538418 -0.03365499 0.005100957
#>             [,13]        [,14]       [,15]
#> text1 -0.01506238 -0.007130964 0.001976622

The code above is what we used in lab to convert a new document (i.e., a document not in the corpus) into a vector, so that new documents can also be compared with the documents in the corpus (through vector operations).
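
For example, one such vector operation is cosine similarity. The sketch below (an illustration, not part of the assignment) ranks the corpus documents by their similarity to the new document, using base R and the $docs matrix of the fitted LSA model:

# Cosine similarity between the new document and every corpus document,
# both represented in the 15-dimensional LSA space
v <- as.numeric(p$docs_newspace[1, ])
corpus_vecs <- as.matrix(lsa_model$docs)
cos_sim <- apply(corpus_vecs, 1, function(u)
  sum(u * v) / (sqrt(sum(u^2)) * sqrt(sum(v^2))))
head(sort(cos_sim, decreasing = TRUE))  # most similar corpus documents first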

In this problem, your task is to wrap the second half of the code above into a function, encode_document(), which converts the documents passed to it (possibly more than one) into vector representations.

Requirements:

  1. encode_document() takes 3 arguments, in order:
    • docs: a character vector. Each element is one (unsegmented) document.
    • dtm: the document-term matrix built from the corpus, e.g., q_dfm in the code above.
    • lsa: the LSA model obtained from dimensionality reduction on dtm, e.g., lsa_model in the code above.
  2. encode_document() returns a matrix ("dgeMatrix") whose number of rows (nrow) equals the length of docs (the number of documents) and whose number of columns (ncol) equals the dimensionality nd set for the LSA model. The first row vector of the matrix corresponds to the first element of docs, the second row vector to the second element, and so on.
# jiebaR worker; bylines = TRUE segments each element of `docs` separately
seg <- worker(user = "user_dict.txt", bylines = TRUE)

encode_document <- function(docs, dtm, lsa) {
  # Write your code here
  
  # Convert raw texts to a document-term matrix aligned with the corpus dtm;
  # segment() returns one word vector per document (bylines = TRUE above),
  # which tokens() accepts directly
  newdocs_dtm <- segment(docs, seg) %>%
    tokens() %>%
    dfm() %>%
    dfm_match(features = featnames(dtm))
  
  # Project the new documents into the LSA space
  predict(lsa, newdata = newdocs_dtm)$docs_newspace
}

#### Do not modify the code below ####
doc1 <- paste(readLines("sample_post.txt", encoding = "UTF-8"), collapse = "\n")
doc2 <- paste(readLines("sample_post2.txt", encoding = "UTF-8"), collapse = "\n")
encode_document(docs = c(doc1, doc2), dtm = q_dfm, lsa = lsa_model)
# Should print out:
#> 2 x 15 Matrix of class "dgeMatrix"
#>             [,1]         [,2]       [,3]       [,4]        [,5]       [,6]
#> text1 0.01082797 -0.005043434 0.01906469 0.02116770 -0.01264663 0.01741631
#> text2 0.04291259 -0.008224614 0.04674972 0.05099859 -0.02992219 0.03286311
#>             [,7]       [,8]        [,9]        [,10]        [,11]       [,12]
#> text1 0.03035976 0.02448952 -0.01062937 -0.005384180 -0.033654985 0.005100957
#> text2 0.06443163 0.03061845  0.01812084  0.002242561 -0.008611309 0.036203892
#>              [,13]        [,14]       [,15]
#> text1 -0.015062376 -0.007130964 0.001976622
#> text2 -0.009293649  0.026068611 0.002363140

2 LSA Dimensions & Classification (60 points)

The function svm_classifer_train() in svc.R is a simple machine-learning training function¹ for building classifiers that predict whether a document comes from 下福盟 (anti) or 伴侶盟 (pro).

  1. svm_classifer_train() has three parameters:
    • docs_df: the document data frame; here we use samesex_marriage.rds
    • dtm: the document-term matrix built from the corpus in docs_df; here we use q_dfm from above
    • nd: the dimensionality nd for the LSA model
  2. svm_classifer_train() returns a list with 4 elements (see the example below):
    • $model: the classifier model
    • $train_acc: prediction accuracy on the training data
    • $test_acc: prediction accuracy on the test data
    • $nd: the dimensionality of the LSA reduction
    source("svc.R")
    docs_df <- readRDS("samesex_marriage.rds")
    
    # Train and return classifier
    m <- svm_classifer_train(docs_df = docs_df, dtm = q_dfm, nd = 5)
    
    # Print model's prediction accuracy
    cat("Test acc.:", m$test_acc, '\n\n')
    
    # Classifying new (unseen) documents with the model
    docs_vec <- encode_document(c(doc1, doc2), q_dfm, lsa_model) %>%
        as.matrix() %>%
        as.data.frame()
    cat("======= Predict new documents ========\n")
    predict(m$model, newdata = docs_vec)
    #> ======== Model Performance: nd = 5 ========
    #> Train accuracy: 0.7333   Test accuracy: 0.65
    #> 
    #> Test acc.: 0.65 
    #> 
    #> ======= Predict new documents ========
    #> text1 text2 
    #>   pro  anti 
    #> Levels: anti pro

In this problem, your task is to examine how different LSA dimensionality settings affect the classifier's performance. Among nd values of 10, 25, 50, and 75, find the one for which the classifier performs best (test_acc). Use a for loop or lapply() to find and return the best-performing classifier, and store it in the variable best_model. (This computation takes a while.)

source("svc.R")
docs_df <- readRDS("samesex_marriage.rds")
lsa_dims <- c(10, 25, 50, 75)

# Write your code here
# Train one classifier per candidate dimensionality
models <- lapply(lsa_dims, function(nd) svm_classifer_train(docs_df, q_dfm, nd))
# Keep the model with the highest test accuracy
test_acc <- sapply(models, function(m) m$test_acc)
best_model <- models[[which.max(test_acc)]]
#> ======== Model Performance: nd = 10 ========
#> Train accuracy: 0.7667   Test accuracy: 0.6667
#> 
#> ======== Model Performance: nd = 25 ========
#> Train accuracy: 0.9  Test accuracy: 0.7542
#> 
#> ======== Model Performance: nd = 50 ========
#> Train accuracy: 0.9  Test accuracy: 0.7083
#> 
#> ======== Model Performance: nd = 75 ========
#> Train accuracy: 0.9333   Test accuracy: 0.7125
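
To compare the four settings at a glance, an optional one-liner (not required by the assignment):

data.frame(nd = lsa_dims, test_acc = test_acc)  # nd vs. test accuracy
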
###### Do not modify the code below ######
best_model
# Should print out:
#> $model
#> 
#> Call:
#> best.tune(method = svm, train.x = topic ~ ., data = d_train, ranges = list(cost = seq(0.01, 
#>     5, by = 0.05)), kernel = "linear")
#> 
#> 
#> Parameters:
#>    SVM-Type:  C-classification 
#>  SVM-Kernel:  linear 
#>        cost:  2.41 
#> 
#> Number of Support Vectors:  27
#> 
#> 
#> $train_acc
#> [1] 0.9
#> 
#> $test_acc
#> [1] 0.7541667
#> 
#> $nd
#> [1] 25

  1. For those interested in what the code inside svm_classifer_train() does, look up explanations of the Support Vector Machine, e.g., https://rpubs.com/skydome20/R-Note14-SVM-SVR