利用R语言进行LDA主题建模
本次推送第二篇为纯代码,方便直接copy啦~
准备:
每个文档以txt形式保存在E:/1
1.设置工作路径
library("tm")
setwd(“E:/1”)
corpus<-Corpus(DirSource(directory="E:/1",encoding="UTF-8",recursive=TRUE,mode="text"))
corpus
2.预处理语料库
library("SnowballC")
myStopwords <- c(stopwords("english"), "SalesForce",”null”)
myStopwords <- c(stopwords("english"), stopwords("SMART"))
your_corpus <- tm_map(corpus, content_transformer(tolower))
your_corpus <- tm_map(your_corpus, removeWords, myStopwords)
your_corpus <- tm_map(your_corpus, removeNumbers)
your_corpus <- tm_map(your_corpus, removePunctuation)
your_corpus <- tm_map(your_corpus, stripWhitespace)
your_corpus <- tm_map(your_corpus, stemDocument)
3.构建文档-词矩阵
#这只是一个矩阵,其中文档是行,单词是列,矩阵单元包含单词的频率计数(weightTf),其中wordLengths=c(3,Inf)指单词长度从3到无限大;结果中Sparsity指稀疏性
myDtm<-DocumentTermMatrix(your_corpus,control=list(wordLengths=c(3,Inf)))
myDtm
#超过100次的术语列表
findFreqTerms(myDtm, 100)
#加载slam包,计算TF-IDF,将值较高的保留下来
library("slam")
term_tfidf <-tapply(myDtm$v/row_sums(myDtm)[myDtm$i], myDtm$j, mean)* log2(nDocs(myDtm)/col_sums(myDtm > 0))
summary(term_tfidf)
# term_tfidf是Median值,保留TF-IDF较高的值,使用中值是因为它不受数据中较大的TF-IDF值的影响,而平均值会受到更大的影响。
myDtm <- myDtm[,term_tfidf >= 0.22240]
myDtm <- myDtm[row_sums(myDtm) > 0,]
summary(col_sums(myDtm))
save(myDtm, file = "E:/my_Dtm.Rdata")
4.构建单词词云
library("wordcloud")
#将文档-术语矩阵转换为术语-文档矩阵(t函数为矩阵转置)
myTdm <- t(myDtm)
#将tdm定义为矩阵
m = as.matrix(myTdm)
#按降序获取字数
word_freqs = sort(rowSums(m), decreasing=TRUE)
#创建一个包含单词及其频率的数据帧
dm = data.frame(word=names(word_freqs), freq=word_freqs)
#用前200词作词云
wordcloud(dm$word, dm$freq, max.words=200, random.order=FALSE, rot.per=.2, colors=brewer.pal(9, "Dark2"))
5.使用困惑度确定主题个数
困惑度越大,表示模型解释性越强,所以选择困惑度更大的主题数
burnin = 1000
#迭代次数
iter = 1000
#保存记录的步长
keep = 50
#主题范围(从5到50,以步长5进行递增)
sequ <- seq(5, 50, 5)
#迭代进行试验
fitted_many <- lapply(sequ, function(k) LDA(myDtm, k = k, method = "Gibbs",control = list(burnin = burnin, iter = iter, keep = keep) ))
#抽取每个主题的对数似然估计值
logLiks_many <- lapply(fitted_many, function(L) L [-c(1:(burnin/keep))])
#定义计算调和平均值的函数
harmonicMean <- function(logLikelihoods, precision=2000L) {
library("Rmpfr")
llMed <- median(logLikelihoods)
as.double(llMed - log(mean(exp(-mpfr(logLikelihoods,
prec = precision) + llMed))))
}
#计算各个主题的调和平均数,将其最为模型的最大似然估计
#需加载程序包gmp、Rmpfr
library("gmp")
library("Rmpfr")
hm_many <- sapply(logLiks_many, function(h) harmonicMean(h))
#画出主题数-似然估计曲线图,用于观察
plot(sequ, hm_many, type = "l")
# 计算最佳主题个数
sequ[which.max(hm_many)]
6.构建吉布斯抽样的LDA模型
library("topicmodels")
load("my_Dtm.Rdata")
SEED <- 20080809
BURNIN = 1000
ITER = 1000
k = 20 #之前得出的最优主题数
model_lda <- LDA(myDtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = BURNIN, iter = ITER))
print(model_lda)
save(model_lda, file = "LDA_model.RData")
#看一下每个主题中频率最高的十个数
terms(model_lda, 10)
#将每个主题出现的频率最高的100个词汇导入csv.
write.csv(terms(model_lda, 100), file = "E:/model_mini_news.csv")
#将主题分布导入进 csv
lda_terms <- posterior(model_lda)$terms
write.csv(lda_terms, file = " E:/LDA_TERMS_mini_news.csv")
#将主题导入csv
lda_topics <- posterior(model_lda)$topics
write.csv(lda_topics, file = " E:/LDA_TOPICS_mini_news.csv")
参考资料:
(1)主体部分
Text Mining and Visualization: Case Studies Using Open-Source Tools
作者 Markus Hofmann,Andrew Chisholm
(2)困惑度部分:
https://www.cnblogs.com/deeplearningfans/p/4114892.html