Text classification with naive Bayes
Text classification
Word de-duplication
import random
import jieba
import sklearn
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
def set_word(filename):
    # Read one word per line and de-duplicate into a set
    word_set = set()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            word = line.strip()
            if len(word) > 0 and word not in word_set:
                word_set.add(word)
    return word_set
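As a quick check, the function can be pointed at a stopword file with one word per line; a minimal usage sketch, where './stopwords.txt' is a hypothetical path and not something fixed by the code above.
# Minimal usage sketch; the path is an assumption
stopwords = set_word('./stopwords.txt')
print(len(stopwords))          # number of distinct stopwords
print(list(stopwords)[:5])     # peek at a few of them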
Text processing
import os
def text_Processing(file_path, test_size=0.2):
    folder_list = os.listdir(file_path)
    data_list = []
    class_list = []
    # Each sub-folder under file_path is one class; read at most 100 documents per class
    for folder in folder_list:
        new_folder_path = os.path.join(file_path, folder)
        files = os.listdir(new_folder_path)
        j = 1
        for file_ in files:
            if j > 100:
                break
            with open(os.path.join(new_folder_path, file_), 'r', encoding='utf-8') as f:
                raw = f.read()
            # jieba parallel segmentation (not available on Windows); the argument is the number of processes
            # jieba.enable_parallel(4)
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode
            word_list = list(word_cut)
            # jieba.disable_parallel()  # release the worker processes
            data_list.append(word_list)
            class_list.append(folder)  # the folder name is the class label
            j += 1
    ## Split into training and test sets
    data_class_list = list(zip(data_list, class_list))
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
    # Count word frequencies over the training set
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    # Sort words by frequency, highest first
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = list(list(zip(*all_words_tuple_list))[0])
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
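For reference, a minimal call sketch; the directory layout shown in the comment (one sub-folder per class, in the style of the Sogou corpus categories) is an assumption, not taken from the code above.
# Hypothetical layout: ./testdata/C000008/*.txt, ./testdata/C000010/*.txt, ...
# all_words comes back sorted by frequency, most frequent first
all_words, train_x, test_x, train_y, test_y = text_Processing('./testdata', test_size=0.2)
print(len(train_x), len(test_x))
print(all_words[:10])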
Selecting feature words
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    # Skip the deleteN most frequent words, then collect feature words
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:  # cap the feature dimension at 1000
            break
        # Keep words that are not pure digits, not stopwords, and 2 to 4 characters long
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words
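Continuing the sketch above, deleteN skips the head of the frequency-sorted list, which tends to hold words that appear in every class; the numbers here are purely illustrative.
# Drop the 100 most frequent words, then keep up to 1000 feature words
feature_words = words_dict(all_words, 100, stopwords_set)
print(len(feature_words))
print(feature_words[:10])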
Text features
def text_features(train_data_list, test_data_list, feature_words, flag='sklearn'):
    def wrapper(text, feature_words):
        text_words = set(text)
        if flag == 'nltk':
            # nltk expects a dict of {feature: value}
            features = {word: 1 if word in text_words else 0 for word in feature_words}
        elif flag == 'sklearn':
            # sklearn expects a plain 0/1 feature vector
            features = [1 if word in text_words else 0 for word in feature_words]
        else:
            features = []
        return features
    train_feature_list = [wrapper(text, feature_words) for text in train_data_list]
    test_feature_list = [wrapper(text_, feature_words) for text_ in test_data_list]
    return train_feature_list, test_feature_list
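To make the 0/1 bag-of-words encoding concrete, a toy example with made-up words (not from the dataset):
# Toy data to illustrate the sklearn-style feature vectors
toy_feature_words = ['apple', 'banana', 'cat']
toy_train = [['apple', 'dog'], ['banana', 'cat']]
toy_test = [['cat']]
tr, te = text_features(toy_train, toy_test, toy_feature_words, flag='sklearn')
print(tr)  # [[1, 0, 0], [0, 1, 1]]
print(te)  # [[0, 0, 1]]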
Classification, with test-set accuracy
import nltk
def text_classifier(train_feature_list, test_feature_list, train_class_list, test_class_list, flag='sklearn'):
    if flag == 'nltk':
        train_flist = list(zip(train_feature_list, train_class_list))
        test_flist = list(zip(test_feature_list, test_class_list))
        classifier = nltk.classify.NaiveBayesClassifier.train(train_flist)
        test_accuracy = nltk.classify.accuracy(classifier, test_flist)
    elif flag == 'sklearn':
        classifier = MultinomialNB().fit(train_feature_list, train_class_list)  # fit a multinomial naive Bayes model
        test_accuracy = classifier.score(test_feature_list, test_class_list)  # accuracy on the test set
    else:
        test_accuracy = []
    return test_accuracy
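The sklearn branch is just MultinomialNB fit/score; a standalone sketch on made-up 0/1 vectors, independent of the text pipeline:
from sklearn.naive_bayes import MultinomialNB
X_train = [[1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]]  # toy 0/1 feature vectors
y_train = ['sports', 'finance', 'finance', 'sports']    # toy labels
X_test = [[1, 0, 0], [0, 1, 1]]
y_test = ['sports', 'finance']
clf = MultinomialNB().fit(X_train, y_train)
print(clf.score(X_test, y_test))  # fraction of test samples classified correctly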
Putting it into practice
print('start')
## Text preprocessing
file_path = './testdata'
all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = text_Processing(file_path, test_size=0.2)
# Load the stopword set
stopwords_file = './stopwords.txt'
stopwords_set = set_word(stopwords_file)
## Feature extraction and classification
flag = 'sklearn'
deleteNs = range(0, 1000, 20)
test_accuracy_list = []
for deleteN in deleteNs:
    feature_words = words_dict(all_words_list, deleteN, stopwords_set)
    train_feature_list, test_feature_list = text_features(train_data_list, test_data_list, feature_words, flag)
    test_accuracy = text_classifier(train_feature_list, test_feature_list, train_class_list, test_class_list, flag)
    test_accuracy_list.append(test_accuracy)
print(test_accuracy_list)
print('finished')
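The matplotlib/pylab imports at the top suggest the accuracy curve was meant to be plotted; a minimal sketch, with assumed title and axis labels:
# Plot test accuracy against the number of skipped high-frequency words
plt.figure()
plt.plot(list(deleteNs), test_accuracy_list)
plt.title('deleteNs vs test_accuracy')
plt.xlabel('deleteNs')
plt.ylabel('test_accuracy')
plt.show()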
The dataset can be downloaded from the Sogou corpus (搜狗语料库).