1 Encoding New Documents (60 分)

library(quanteda)
library(quanteda.textstats)
library(jiebaR)

# Document-term matrix 
q_dfm <- readRDS("samesex_marriage.rds") %>%
  corpus(docid_field = "id", text_field = "content") %>%
  tokenizers::tokenize_regex(pattern = "\u3000") %>%
  tokens() %>%
  dfm() %>% 
  dfm_remove(pattern = readLines("stopwords.txt", encoding = "UTF-8"), 
             valuetype = "fixed") %>%
  dfm_select(pattern = "[\u4E00-\u9FFF]", valuetype = "regex") %>%
  dfm_trim(min_termfreq = 5) %>%
  dfm_tfidf()

# LSA Model
lsa_model <- quanteda.textmodels::textmodel_lsa(q_dfm, nd = 15)

###### Convert New document to vector #######
# New document
doc <- readLines("sample_post.txt", encoding = "UTF-8") %>% 
  paste(collapse = "\n")

# Convert raw text to document term matrix
seg <- worker(user = "user_dict.txt")
new_doc_dtm <- list(segment(doc, seg)) %>%
  tokens() %>%
  dfm() %>%
  dfm_match(features = featnames(q_dfm))

# Dimensionality reduction with LSA
p <- predict(lsa_model, newdata = new_doc_dtm)
p$docs_newspace

#> 1 x 15 Matrix of class "dgeMatrix"
#>             [,1]         [,2]       [,3]      [,4]        [,5]       [,6]
#> text1 0.01082797 -0.005043434 0.01906469 0.0211677 -0.01264663 0.01741631
#>             [,7]       [,8]        [,9]       [,10]       [,11]       [,12]
#> text1 0.03035976 0.02448952 -0.01062937 -0.00538418 -0.03365499 0.005100957
#>             [,13]        [,14]       [,15]
#> text1 -0.01506238 -0.007130964 0.001976622

上方的程式碼是實習課上用來將新文本 (i.e., 不在語料庫內的文本) 轉換成向量的程式碼，目的是讓新的文本也能與語料庫內的文本進行比較 (透過向量運算)。

在這題，你的任務是將上方程式碼的後半部包成一個函數 encode_document()，用來將傳入的文本 (可有多篇) 轉換成向量表徵。

要求：

encode_document() 需有 3 個參數，依序為：
- docs: character vector。每個元素為一篇 (未斷詞的) 文本。
- dtm: 語料庫所製作出來的 document-term matrix。例如，上方程式碼的 q_dfm
- lsa: 使用 dtm 進行降維所得到的 LSA Model。例如，上方程式碼的 lsa_model
encode_document() 的回傳值為一個矩陣 ("dgeMatrix")，其列數 (nrow) 等同於 docs 的長度 (文本數)，行數 (ncol) 為 lsa_model 設置的維度 nd。矩陣的第一個 row vector 對應到 docs 的第一個元素，第二個 row vector 對應到 docs 的第二個元素，依此類推。

seg <- worker(user = "user_dict.txt", bylines = T)

encode_document <- function(docs, dtm, lsa) {
  # Write your code here
}

#### Do not modify the code below ####
doc1 <- paste(readLines("sample_post.txt", encoding = "UTF-8"), collapse = "\n")
doc2 <- paste(readLines("sample_post2.txt", encoding = "UTF-8"), collapse = "\n")
encode_document(docs = c(doc1, doc2), dtm = q_dfm, lsa = lsa_model)
# Should print out:
#> 2 x 15 Matrix of class "dgeMatrix"
#>             [,1]         [,2]       [,3]       [,4]        [,5]       [,6]
#> text1 0.01082797 -0.005043434 0.01906469 0.02116770 -0.01264663 0.01741631
#> text2 0.04291259 -0.008224614 0.04674972 0.05099859 -0.02992219 0.03286311
#>             [,7]       [,8]        [,9]        [,10]        [,11]       [,12]
#> text1 0.03035976 0.02448952 -0.01062937 -0.005384180 -0.033654985 0.005100957
#> text2 0.06443163 0.03061845  0.01812084  0.002242561 -0.008611309 0.036203892
#>              [,13]        [,14]       [,15]
#> text1 -0.015062376 -0.007130964 0.001976622
#> text2 -0.009293649  0.026068611 0.002363140

#> NULL

2 LSA Dimensions & Classification (60 分)

svc.R 內的函數 svm_classifer_train() 是一個簡單的機器學習分類模型訓練函數¹，用來訓練出能預測文本是來自於下福盟 (anti) 或伴侶盟 (pro) 的模型。

svm_classifer_train() 有三個參數：
- docs_df: Document data frame。在此我們使用 samesex_marriage.rds
- dtm: 由 docs_df 的語料製作而成的 document term matrix。在此我們使用上方的 q_dfm
- nd: LSA 模型設置的維度 nd

svm_classifer_train() 回傳的是一個 list，裡面有 4 個元素 (見下方範例)：

$model: 分類器模型
$train_acc: 訓練資料預測準確率
$test_acc: 測試資料預測準確率
$nd: LSA 降維之維度

source("svc.R")
docs_df <- readRDS("samesex_marriage.rds")

# Train and return classifier
m <- svm_classifer_train(docs_df = docs_df, dtm = q_dfm, nd = 5)

# Print model's prediction accuracy
cat("Test acc.:", m$test_acc, '\n\n')

# Classifying new (unseen) documents with the model
docs_vec <- encode_document(c(doc1, doc2), q_dfm, lsa_model) %>%
    as.matrix() %>%
    as.data.frame()

#> Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), : 'data' must be of a vector type, was 'NULL'

cat("======= Predict new documents ========\n")
predict(m$model, newdata = docs_vec)

#> Error in predict.svm(m$model, newdata = docs_vec): object 'docs_vec' not found

# Should print out:
#> ======== Model Performance: nd = 5 ========
#> Train accuracy: 0.7333   Test accuracy: 0.65
#> 
#> Test acc.: 0.65 
#> 
#> ======= Predict new documents ========
#> text1 text2 
#>   pro  anti 
#> Levels: anti pro

#> ======== Model Performance: nd = 5 ========
#> Train accuracy: 0.7333   Test accuracy: 0.65
#> 
#> Test acc.: 0.65 
#> 
#> ======= Predict new documents ========

在這題，你的任務是去檢視透過 LSA 降維時，不同的維度設定對於分類器表現的影響。請試試 nd 為 10, 25, 50, 75 之中的何者時，分類器的表現 (test_acc) 會最佳。請透過 for loop 或 lapply() 去找出並回傳表現最佳的分類器，並將此分類器儲存於變數 best_model (這題在運算時會花上一些時間)

source("svc.R")
docs_df <- readRDS("samesex_marriage.rds")
lsa_dims <- c(10, 25, 50, 75)

# Write your code here

###### Do not modify the code below ######
best_model

#> Error in eval(expr, envir, enclos): object 'best_model' not found

# Should print out:
#> $model
#> 
#> Call:
#> best.tune(method = svm, train.x = topic ~ ., data = d_train, ranges = list(cost = seq(0.01, 
#>     5, by = 0.05)), kernel = "linear")
#> 
#> 
#> Parameters:
#>    SVM-Type:  C-classification 
#>  SVM-Kernel:  linear 
#>        cost:  2.41 
#> 
#> Number of Support Vectors:  27
#> 
#> 
#> $train_acc
#> [1] 0.9
#> 
#> $test_acc
#> [1] 0.7541667
#> 
#> $nd
#> [1] 25

關於 svm_classifer_train() 裡面的程式碼在做什麼事情，有興趣的同學可以去搜尋 Support Vector Machine 的相關說明，例如 https://rpubs.com/skydome20/R-Note14-SVM-SVR。↩︎

HW 10: 向量表徵

未命名 B01201001 一般系

2021-05-13

Rmd Source (for TAs)

1 Encoding New Documents (60 分)

2 LSA Dimensions & Classification (60 分)

HW 10: 向量表徵

未命名 B01201001 一般系

2021-05-13 Rmd Source (for TAs)

1 Encoding New Documents (60 分)

2 LSA Dimensions & Classification (60 分)

2021-05-13

Rmd Source (for TAs)