
Naive Bayes Classification in R (Filtering Spam SMS)

ham denotes legitimate (non-spam) messages

spam denotes junk (spam) messages


# Example: Filtering spam SMS messages ----

## Step 1: Exploring and preparing the data




#read the sms data into the sms data frame

sms_raw <- read.csv("F:\\rwork\\Machine Learning with R (2nd Ed.)\\Chapter 04\\sms_spam.csv", stringsAsFactors = FALSE)


#examine the structure of the sms data

str(sms_raw)



Convert the type column to a factor

sms_raw$type <- factor(sms_raw$type)


Examine the type variable

str(sms_raw$type)
table(sms_raw$type)



Build a corpus from the SMS message text

library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
head(sms_corpus)


Print the corpus and inspect a summary of the first two messages

print(sms_corpus)
inspect(sms_corpus[1:2])



#view the text of the first message (and of the first two with lapply)

as.character(sms_corpus[[1]])
lapply(sms_corpus[1:2], as.character)



Convert all text to lowercase

sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))


#show the difference between sms_corpus and sms_corpus_clean

as.character(sms_corpus[[1]])
as.character(sms_corpus_clean[[1]])



Remove numbers

sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers) # remove numbers


Remove stop words (filler words)

stopwords() returns a built-in list of common filler words such as to, and, and but

sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) 
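For a quick look at what is being removed (a small sketch; the exact list depends on the tm version, and defaults to English):

head(stopwords(), 5)   # e.g. "i" "me" "my" "myself" "we"
length(stopwords())    # roughly 170 common English filler words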


Remove punctuation

sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation)


Define a custom replacement function

removePunctuation("hello...world")   # simply deletes punctuation, giving "helloworld"
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) }

gsub replaces any run of punctuation with a single space, so words separated only by punctuation are not glued together

replacePunctuation("hello...world")



wordStem reduces words to their root form

library(SnowballC)
wordStem(c("learn", "learned", "learning", "learns"))



To apply stemming to the entire text corpus

sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)


Strip extra whitespace

sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace)


Compare a few messages before and after the transformations

lapply(sms_corpus[1:3], as.character)
lapply(sms_corpus_clean[1:3], as.character)




Create a document-term sparse matrix

sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
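To get a feel for the sparse matrix (a minimal sketch; the exact dimensions depend on the cleaning steps above):

dim(sms_dtm)                  # rows = messages, columns = distinct terms
inspect(sms_dtm[1:3, 1:5])    # raw counts for the first 3 messages and 5 terms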


#alternative solution: create a document-term sparse matrix directly from the SMS corpus

sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))


#sms_dtm and sms_dtm2 differ slightly, mainly because the two approaches handle stop-word removal differently


#alternative solution: using custom stop words function ensures identical result

sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))

#passing a custom stopwords function makes the stop-word removal match the step-by-step cleaning, giving the same result as sms_dtm


#compare the result

sms_dtm
sms_dtm2
sms_dtm3



#creating training and test datasets

Use one part of the data for training and the other part for testing

sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
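A quick sanity check of the split (a sketch; this assumes the messages are already in random order, so taking the first 4,169 rows gives roughly a 75/25 split):

nrow(sms_dtm_train) / nrow(sms_dtm)   # about 0.75 (4169 of 5559 messages)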


#also save the labels

sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels <- sms_raw[4170:5559, ]$type


#check that the proportion of spam is similar

prop.table(table(sms_train_labels))   # proportion of ham vs. spam
prop.table(table(sms_test_labels))



#word cloud visualization

library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)

Draw the word cloud

min.freq = 50 keeps only words that appear at least 50 times in the corpus; random.order = FALSE places the most frequent words in the center



Take subsets of the spam and ham messages

spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))   # the 40 most common words
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))


Drop words that appear in fewer than 0.1% of the training messages (i.e. keep only terms with at most 99.9% sparsity)

sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train




Find the words that appear at least 5 times

sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)



Keep only the columns for the frequent words selected above

sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]


If a count is greater than 0, convert it to "Yes", otherwise to "No"

convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}
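A quick check of the helper on a toy vector (a minimal sketch):

convert_counts(c(0, 1, 3))   # returns "No" "Yes" "Yes"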


Replace every non-zero count in the matrices with "Yes" (MARGIN = 2 applies the function column by column)

sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)


## Step 2: Training the classification model


library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
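To peek inside the fitted model (a sketch: e1071's naiveBayes object stores the class priors in $apriori and one conditional probability table per feature in $tables; "call" is used here only as an example of a stemmed term that likely survived the frequency filter):

sms_classifier$apriori            # counts of ham and spam in the training data
sms_classifier$tables[["call"]]   # P(call = No/Yes | ham) and P(call = No/Yes | spam)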


## Step 3: Evaluating model performance

sms_test_pred <- predict(sms_classifier, sms_test)
library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))


The cross table shows that 6 + 34 = 40 messages were misclassified


## Step 4: Improving model performance


sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))



With the Laplace value (laplace) set to 1, 6 + 32 = 38 messages are misclassified.

Going from 40 to 38 may look like a small change, but it is a meaningful improvement given that the model's accuracy was already quite good.

In the end the model correctly classifies more than 97% of the messages as spam or ham.
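To connect those counts to the accuracy figure (a quick worked check using the 1,390 test messages from the split above):

(1390 - 40) / 1390   # about 0.971 accuracy without Laplace smoothing
(1390 - 38) / 1390   # about 0.973 accuracy with laplace = 1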


If you spot any mistakes, corrections are welcome!

(Feel free to message me for the data.)