利用R语言进行LDA主题建模
本次推送第二篇为纯代码,方便直接copy啦~
准备:
每个文档以txt形式保存在E:/1
1.设置工作路径
library("tm")setwd(“E:/1”)corpus<-Corpus(DirSource(directory="E:/1",encoding="UTF-8",recursive=TRUE,mode="text"))corpus
2.预处理语料库
library("SnowballC")myStopwords <- c(stopwords("english"), "SalesForce",”null”)myStopwords <- c(stopwords("english"), stopwords("SMART"))your_corpus <- tm_map(corpus, content_transformer(tolower))your_corpus <- tm_map(your_corpus, removeWords, myStopwords)your_corpus <- tm_map(your_corpus, removeNumbers)your_corpus <- tm_map(your_corpus, removePunctuation)your_corpus <- tm_map(your_corpus, stripWhitespace)your_corpus <- tm_map(your_corpus, stemDocument)
3.构建文档-词矩阵
#这只是一个矩阵,其中文档是行,单词是列,矩阵单元包含单词的频率计数(weightTf),其中wordLengths=c(3,Inf)指单词长度从3到无限大;结果中Sparsity指稀疏性myDtm<-DocumentTermMatrix(your_corpus,control=list(wordLengths=c(3,Inf)))myDtm#超过100次的术语列表findFreqTerms(myDtm, 100)#加载slam包,计算TF-IDF,将值较高的保留下来library("slam")term_tfidf <-tapply(myDtm$v/row_sums(myDtm)[myDtm$i], myDtm$j, mean)* log2(nDocs(myDtm)/col_sums(myDtm > 0))summary(term_tfidf)# term_tfidf是Median值,保留TF-IDF较高的值,使用中值是因为它不受数据中较大的TF-IDF值的影响,而平均值会受到更大的影响。myDtm <- myDtm[,term_tfidf >= 0.22240]myDtm <- myDtm[row_sums(myDtm) > 0,]summary(col_sums(myDtm))save(myDtm, file = "E:/my_Dtm.Rdata")
4.构建单词词云
library("wordcloud")#将文档-术语矩阵转换为术语-文档矩阵(t函数为矩阵转置)myTdm <- t(myDtm)#将tdm定义为矩阵m = as.matrix(myTdm)#按降序获取字数word_freqs = sort(rowSums(m), decreasing=TRUE)#创建一个包含单词及其频率的数据帧dm = data.frame(word=names(word_freqs), freq=word_freqs)#用前200词作词云wordcloud(dm$word, dm$freq, max.words=200, random.order=FALSE, rot.per=.2, colors=brewer.pal(9, "Dark2"))
5.使用困惑度确定主题个数
困惑度越大,表示模型解释性越强,所以选择困惑度更大的主题数
burnin = 1000#迭代次数iter = 1000#保存记录的步长keep = 50#主题范围(从5到50,以步长5进行递增)sequ <- seq(5, 50, 5)#迭代进行试验fitted_many <- lapply(sequ, function(k) LDA(myDtm, k = k, method = "Gibbs",control = list(burnin = burnin, iter = iter, keep = keep) ))#抽取每个主题的对数似然估计值logLiks_many <- lapply(fitted_many, function(L) L[-c(1:(burnin/keep))])#定义计算调和平均值的函数harmonicMean <- function(logLikelihoods, precision=2000L) {library("Rmpfr")llMed <- median(logLikelihoods)as.double(llMed - log(mean(exp(-mpfr(logLikelihoods,prec = precision) + llMed))))}#计算各个主题的调和平均数,将其最为模型的最大似然估计#需加载程序包gmp、Rmpfrlibrary("gmp")library("Rmpfr")hm_many <- sapply(logLiks_many, function(h) harmonicMean(h))#画出主题数-似然估计曲线图,用于观察plot(sequ, hm_many, type = "l")# 计算最佳主题个数sequ[which.max(hm_many)]
6.构建吉布斯抽样的LDA模型
library("topicmodels")load("my_Dtm.Rdata")SEED <- 20080809BURNIN = 1000ITER = 1000k = 20 #之前得出的最优主题数model_lda <- LDA(myDtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = BURNIN, iter = ITER))print(model_lda)save(model_lda, file = "LDA_model.RData")#看一下每个主题中频率最高的十个数terms(model_lda, 10)#将每个主题出现的频率最高的100个词汇导入csv.write.csv(terms(model_lda, 100), file = "E:/model_mini_news.csv")#将主题分布导入进 csvlda_terms <- posterior(model_lda)$termswrite.csv(lda_terms, file = " E:/LDA_TERMS_mini_news.csv")#将主题导入csvlda_topics <- posterior(model_lda)$topicswrite.csv(lda_topics, file = " E:/LDA_TOPICS_mini_news.csv")
参考资料:
(1)主体部分
Text Mining and Visualization: Case Studies Using Open-Source Tools
作者 Markus Hofmann,Andrew Chisholm
(2)困惑度部分:
https://www.cnblogs.com/deeplearningfans/p/4114892.html
