vlambda博客
学习文章列表

利用R语言进行LDA主题建模

本次推送第二篇为纯代码,方便直接copy啦~


准备:

每个文档以txt形式保存在E:/1


1.设置工作路径


# Step 1: load the tm (text mining) package and build the corpus.
library("tm")
# Set the working directory; every document is a .txt file under E:/1.
# NOTE: the original used curly "smart quotes" around the path, which is a
# syntax error in R — they must be plain ASCII double quotes.
setwd("E:/1")
# Read every text file under the directory (recursively) as one document.
corpus <- Corpus(DirSource(directory = "E:/1", encoding = "UTF-8",
                           recursive = TRUE, mode = "text"))
# Print the corpus to show how many documents it contains.
corpus



2.预处理语料库


# Step 2: preprocess the corpus (lowercase, stopwords, numbers, punctuation,
# whitespace, stemming).
library("SnowballC")
# Stopword list: English + SMART stopword sets plus custom domain terms.
# NOTE: the original assigned myStopwords twice, so the second assignment
# silently discarded the custom words ("SalesForce", "null"); it also used a
# curly quote around "null", which is a syntax error. Both are fixed by
# building the full list in one assignment.
myStopwords <- c(stopwords("english"), stopwords("SMART"), "SalesForce", "null")
# Each tm_map() call applies one transformation to every document in turn.
your_corpus <- tm_map(corpus, content_transformer(tolower))
your_corpus <- tm_map(your_corpus, removeWords, myStopwords)
your_corpus <- tm_map(your_corpus, removeNumbers)
your_corpus <- tm_map(your_corpus, removePunctuation)  # strip punctuation
your_corpus <- tm_map(your_corpus, stripWhitespace)    # collapse whitespace
your_corpus <- tm_map(your_corpus, stemDocument)       # Porter stemming



3.构建文档-词矩阵


# Step 3: build the document-term matrix and prune low-information terms.
#
# Rows are documents, columns are terms, and each cell holds the raw term
# frequency (weightTf). wordLengths = c(3, Inf) keeps only words of length 3
# or more; "Sparsity" in the printed summary is the fraction of zero cells.
myDtm <- DocumentTermMatrix(your_corpus,
                            control = list(wordLengths = c(3, Inf)))
myDtm
# List terms that occur at least 100 times across the corpus.
findFreqTerms(myDtm, 100)
# Use the slam package's sparse-matrix helpers to compute a mean TF-IDF
# score per term, keeping only the higher-scoring (more informative) terms.
library("slam")
term_tfidf <- tapply(myDtm$v / row_sums(myDtm)[myDtm$i], myDtm$j, mean) *
  log2(nDocs(myDtm) / col_sums(myDtm > 0))
summary(term_tfidf)
# The cutoff below is roughly the median TF-IDF; the median is used rather
# than the mean because it is robust to a few very large TF-IDF values.
myDtm <- myDtm[, term_tfidf >= 0.22240]
# Drop documents left with no terms after pruning.
myDtm <- myDtm[row_sums(myDtm) > 0, ]
summary(col_sums(myDtm))
save(myDtm, file = "E:/my_Dtm.Rdata")


4.构建单词词云


# Step 4: draw a word cloud of the most frequent terms.
library("wordcloud")
# Transpose the document-term matrix into a term-document matrix, then
# convert it to a dense matrix so row sums give total term frequencies.
myTdm <- t(myDtm)
m <- as.matrix(myTdm)
# Total frequency per term, sorted from most to least frequent.
word_freqs <- sort(rowSums(m), decreasing = TRUE)
# Assemble a data frame of (word, frequency) pairs for wordcloud().
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
# Plot the top 200 terms; most frequent terms are placed first
# (random.order = FALSE), 20% of words rotated, Dark2 palette.
wordcloud(dm$word, dm$freq, max.words = 200, random.order = FALSE,
          rot.per = .2, colors = brewer.pal(9, "Dark2"))



5.使用困惑度确定主题个数


困惑度(perplexity)越小,即对数似然值越大,表示模型对数据的解释性越强;下面的代码用对数似然的调和平均数近似模型证据,并选择使其最大的主题数

# Step 5: choose the number of topics by maximizing the harmonic mean of the
# per-model log-likelihoods (Griffiths & Steyvers' estimator of P(data | k)).
#
# NOTE: the original script only loaded topicmodels in step 6, AFTER calling
# LDA() here — run top-to-bottom that fails. Load it up front.
library("topicmodels")
burnin <- 1000  # Gibbs burn-in iterations (records from this phase are dropped)
iter <- 1000    # Gibbs sampling iterations
keep <- 50      # record the log-likelihood every `keep` iterations
# Candidate topic counts: 5 to 50 in steps of 5.
sequ <- seq(5, 50, 5)
# Fit one Gibbs-sampled LDA model per candidate topic count.
fitted_many <- lapply(sequ, function(k) {
  LDA(myDtm, k = k, method = "Gibbs",
      control = list(burnin = burnin, iter = iter, keep = keep))
})
# Extract each model's log-likelihood trace, discarding the burn-in records.
logLiks_many <- lapply(fitted_many,
                       function(L) L@logLiks[-c(1:(burnin / keep))])
# Harmonic mean of the likelihoods, computed in high precision with Rmpfr to
# avoid overflow when exponentiating large negative log-likelihoods.
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  library("Rmpfr")
  llMed <- median(logLikelihoods)
  as.double(llMed -
              log(mean(exp(-mpfr(logLikelihoods, prec = precision) + llMed))))
}
# Rmpfr requires gmp; load both before use.
library("gmp")
library("Rmpfr")
hm_many <- sapply(logLiks_many, function(h) harmonicMean(h))
# Plot topic count vs harmonic-mean log-likelihood for visual inspection.
plot(sequ, hm_many, type = "l")
# The best topic count is the one maximizing the harmonic mean.
sequ[which.max(hm_many)]



6.构建吉布斯抽样的LDA模型


# Step 6: fit the final Gibbs-sampled LDA model and export the results.
library("topicmodels")
# Load the document-term matrix saved in step 3.
# NOTE: the original loaded "my_Dtm.Rdata" (relative path) although step 3
# saved to "E:/my_Dtm.Rdata" — the paths must match.
load("E:/my_Dtm.Rdata")
SEED <- 20080809  # fixed seed so the Gibbs run is reproducible
BURNIN <- 1000
ITER <- 1000
k <- 20  # optimal topic count found in step 5
model_lda <- LDA(myDtm, k = k, method = "Gibbs",
                 control = list(seed = SEED, burnin = BURNIN, iter = ITER))
print(model_lda)
save(model_lda, file = "LDA_model.RData")
# Show the ten most probable terms of each topic.
terms(model_lda, 10)
# Export the top 100 terms per topic to CSV.
write.csv(terms(model_lda, 100), file = "E:/model_mini_news.csv")
# Export the per-topic term distributions.
# NOTE: the original paths began with a space (" E:/..."), producing a
# wrong/invalid filename on Windows — the spaces are removed here.
lda_terms <- posterior(model_lda)$terms
write.csv(lda_terms, file = "E:/LDA_TERMS_mini_news.csv")
# Export the per-document topic distributions.
lda_topics <- posterior(model_lda)$topics
write.csv(lda_topics, file = "E:/LDA_TOPICS_mini_news.csv")




参考资料:

(1)主体部分

Text Mining and Visualization: Case Studies Using Open-Source Tools

作者 Markus Hofmann,Andrew Chisholm

(2)困惑度部分:

https://www.cnblogs.com/deeplearningfans/p/4114892.html