library(quanteda)
#> Warning in stringi::stri_info(): Your current locale is not in the list
#> of available locales. Some functions may not work properly. Refer to
#> stri_locale_list() for more details on known locale specifiers.
#> Warning in stringi::stri_info(): Your current locale is not in the list
#> of available locales. Some functions may not work properly. Refer to
#> stri_locale_list() for more details on known locale specifiers.
library(quanteda.textstats)
library(jiebaR)
# Document-term matrix
<- readRDS("samesex_marriage.rds") %>%
q_dfm corpus(docid_field = "id", text_field = "content") %>%
::tokenize_regex(pattern = "\u3000") %>%
tokenizerstokens() %>%
dfm() %>%
dfm_remove(pattern = readLines("stopwords.txt", encoding = "UTF-8"),
valuetype = "fixed") %>%
dfm_select(pattern = "[\u4E00-\u9FFF]", valuetype = "regex") %>%
dfm_trim(min_termfreq = 5) %>%
dfm_tfidf()
# LSA Model
<- quanteda.textmodels::textmodel_lsa(q_dfm, nd = 15)
lsa_model
###### Convert New document to vector #######
# New document
<- readLines("sample_post.txt", encoding = "UTF-8") %>%
doc paste(collapse = "\n")
# Convert raw text to document term matrix
<- worker(user = "user_dict.txt")
seg <- list(segment(doc, seg)) %>%
new_doc_dtm tokens() %>%
dfm() %>%
dfm_match(features = featnames(q_dfm))
# Dimensionality reduction with LSA
<- predict(lsa_model, newdata = new_doc_dtm)
p $docs_newspace p
#> 1 x 15 Matrix of class "dgeMatrix"
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> text1 0.01082797 -0.005043434 0.01906469 0.0211677 -0.01264663 0.01741631
#> [,7] [,8] [,9] [,10] [,11] [,12]
#> text1 0.03035976 0.02448952 -0.01062937 -0.00538418 -0.03365499 0.005100957
#> [,13] [,14] [,15]
#> text1 -0.01506238 -0.007130964 0.001976622
上方的程式碼是實習課上用來將新文本 (i.e., 不在語料庫內的文本) 轉換成向量的程式碼,目的是讓新的文本也能與語料庫內的文本進行比較 (透過向量運算)。
在這題,你的任務是將上方程式碼的後半部包成一個函數 encode_document()
,用來將傳入的文本 (可有多篇) 轉換成向量表徵。
要求:
encode_document()
需有 3 個參數,依序為:
docs
: character vector。每個元素為一篇 (未斷詞的) 文本。dtm
: 語料庫所製作出來的 document-term matrix。例如,上方程式碼的 q_dfm
lsa
: 使用 dtm
進行降維所得到的 LSA Model。例如,上方程式碼的 lsa_model
encode_document()
的回傳值為一個矩陣 ("dgeMatrix"
),其列數 (nrow) 等同於 docs
的長度 (文本數),行數 (ncol) 為 lsa_model
設置的維度 nd
。矩陣的第一個 row vector 對應到 docs
的第一個元素,第二個 row vector 對應到 docs
的第二個元素,依此類推。<- worker(user = "user_dict.txt", bylines = T)
seg
<- function(docs, dtm, lsa) {
encode_document # Write your code here
<- segment(docs, seg) %>%
n tokens() %>%
dfm() %>%
dfm_match(features = featnames(dtm))
<- predict(lsa, newdata = n)
prediction return(prediction$docs_newspace)
}
#### Do not modify the code below ####
<- paste(readLines("sample_post.txt", encoding = "UTF-8"), collapse = "\n")
doc1 <- paste(readLines("sample_post2.txt", encoding = "UTF-8"), collapse = "\n")
doc2 encode_document(docs = c(doc1, doc2), dtm = q_dfm, lsa = lsa_model)
# Should print out:
#> 2 x 15 Matrix of class "dgeMatrix"
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> text1 0.01082797 -0.005043434 0.01906469 0.02116770 -0.01264663 0.01741631
#> text2 0.04291259 -0.008224614 0.04674972 0.05099859 -0.02992219 0.03286311
#> [,7] [,8] [,9] [,10] [,11] [,12]
#> text1 0.03035976 0.02448952 -0.01062937 -0.005384180 -0.033654985 0.005100957
#> text2 0.06443163 0.03061845 0.01812084 0.002242561 -0.008611309 0.036203892
#> [,13] [,14] [,15]
#> text1 -0.015062376 -0.007130964 0.001976622
#> text2 -0.009293649 0.026068611 0.002363140
#> 2 x 15 Matrix of class "dgeMatrix"
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> text1 0.01082797 -0.005043434 0.01906469 0.02116770 -0.01264663 0.01741631
#> text2 0.04291259 -0.008224614 0.04674972 0.05099859 -0.02992219 0.03286311
#> [,7] [,8] [,9] [,10] [,11] [,12]
#> text1 0.03035976 0.02448952 -0.01062937 -0.005384180 -0.033654985 0.005100957
#> text2 0.06443163 0.03061845 0.01812084 0.002242561 -0.008611309 0.036203892
#> [,13] [,14] [,15]
#> text1 -0.015062376 -0.007130964 0.001976622
#> text2 -0.009293649 0.026068611 0.002363140
svc.R
內的函數 svm_classifer_train()
是一個簡單的機器學習分類模型訓練函數1,用來訓練出能預測文本是來自於下福盟 (anti
) 或伴侶盟 (pro
) 的模型。
svm_classifer_train()
有三個參數:
docs_df
: Document data frame。在此我們使用 samesex_marriage.rds
dtm
: 由 docs_df
的語料製作而成的 document term matrix。 在此我們使用上方的 q_dfm
nd
: LSA 模型設置的維度 nd
svm_classifer_train()
回傳的是一個 list,裡面有 4 個元素 (見下方範例):
$model
: 分類器模型$train_acc
: 訓練資料預測準確率$test_acc
: 測試資料預測準確率$nd
: LSA 降維之維度source("svc.R")
<- readRDS("samesex_marriage.rds")
docs_df
# Train and return classifier
<- svm_classifer_train(docs_df = docs_df, dtm = q_dfm, nd = 5)
m
# Print model's prediction accuracy
cat("Test acc.:", m$test_acc, '\n\n')
# Classifying new (unseen) documents with the model
<- encode_document(c(doc1, doc2), q_dfm, lsa_model) %>%
docs_vec as.matrix() %>%
as.data.frame()
cat("======= Predict new documents ========\n")
predict(m$model, newdata = docs_vec)
# Should print out:
#> ======== Model Performance: nd = 5 ========
#> Train accuracy: 0.7333 Test accuracy: 0.65
#>
#> Test acc.: 0.65
#>
#> ======= Predict new documents ========
#> text1 text2
#> pro anti
#> Levels: anti pro
#> ======== Model Performance: nd = 5 ========
#> Train accuracy: 0.7333 Test accuracy: 0.65
#>
#> Test acc.: 0.65
#>
#> ======= Predict new documents ========
#> text1 text2
#> pro anti
#> Levels: anti pro
在這題,你的任務是去檢視透過 LSA 降維時,不同的維度設定對於分類器表現的影響。請試試 nd
為 10, 25, 50, 75 之中的何者時,分類器的表現 (test_acc
) 會最佳。請透過 for
loop 或 lapply()
去找出並回傳表現最佳的分類器,並將此分類器儲存於變數 best_model
(這題在運算時會花上一些時間)
source("svc.R")
<- readRDS("samesex_marriage.rds")
docs_df <- c(10, 25, 50, 75)
lsa_dims
# Write your code here
= 0
max for (i in seq_along(lsa_dims)){
<- svm_classifer_train(docs_df = docs_df, dtm = q_dfm, nd = lsa_dims[i])
svm_df if (svm_df$test_acc >= max){
<- svm_df$test_acc
max <- svm_df
best_model
} }
#> ======== Model Performance: nd = 10 ========
#> Train accuracy: 0.7667 Test accuracy: 0.6667
#>
#> ======== Model Performance: nd = 25 ========
#> Train accuracy: 0.9 Test accuracy: 0.7542
#>
#> ======== Model Performance: nd = 50 ========
#> Train accuracy: 0.9 Test accuracy: 0.7083
#>
#> ======== Model Performance: nd = 75 ========
#> Train accuracy: 0.9333 Test accuracy: 0.7125
###### Do not modify the code below ######
best_model# Should print out:
#> $model
#>
#> Call:
#> best.tune(method = svm, train.x = topic ~ ., data = d_train, ranges = list(cost = seq(0.01,
#> 5, by = 0.05)), kernel = "linear")
#>
#>
#> Parameters:
#> SVM-Type: C-classification
#> SVM-Kernel: linear
#> cost: 2.41
#>
#> Number of Support Vectors: 27
#>
#>
#> $train_acc
#> [1] 0.9
#>
#> $test_acc
#> [1] 0.7541667
#>
#> $nd
#> [1] 25
#> $model
#>
#> Call:
#> best.tune(method = svm, train.x = topic ~ ., data = d_train, ranges = list(cost = seq(0.01,
#> 5, by = 0.05)), kernel = "linear")
#>
#>
#> Parameters:
#> SVM-Type: C-classification
#> SVM-Kernel: linear
#> cost: 2.41
#>
#> Number of Support Vectors: 27
#>
#>
#> $train_acc
#> [1] 0.9
#>
#> $test_acc
#> [1] 0.7541667
#>
#> $nd
#> [1] 25
關於 svm_classifer_train()
裡面的程式碼在做什麼事情,有興趣的同學可以去搜尋 Support Vector Machine 的相關說明,例如 https://rpubs.com/skydome20/R-Note14-SVM-SVR。↩︎