中文情感分类 | 易学教程

本小节对中文评论进行处理，利用word2vec工具获得特征数据，进而达到情感分析的目的

注意：本文重点是如何获得特征向量
根据项目本身的语料情况，一条评论就是一个txt文档，共有两个语料文件夹：pos文件夹下包含1000条积极的评论，neg文件夹下包含1000条消极的评论

1-初始语料的预处理-把正向和负向评论分别规整到一个txt文件中，实施代码如下

# Step 1: merge the one-review-per-file corpus into one text file per
# sentiment folder ("2000_neg.txt" and "2000_pos.txt"), one review per line.
import codecs
import logging
import os
import os.path
import sys

# Root folder of the corpus; it holds one sub-folder per sentiment class.
# Built with os.path.join so the script is not tied to Windows separators.
CORPUS_DIR = os.path.join("data", "ChnSentiCorp_htl_ba_2000")
# Sub-folders to merge: negative and positive reviews.
FOLDERS = ("neg", "pos")
# The segmentation step reads the merged files as UTF-8, so write them as
# UTF-8 explicitly instead of relying on the platform default.
OUTPUT_ENCODING = "utf-8"


def getContent(fullname, encoding=None):
    """Return the first line of the file at ``fullname``.

    Every .txt file of the source corpus holds a single review, so only the
    first line is read.

    Args:
        fullname: Path of the file to read.
        encoding: Text encoding of the file. ``None`` (the default) keeps the
            original behaviour of using the platform default encoding.

    Returns:
        The first line, including its line terminator when the file has one;
        an empty string for an empty file.
    """
    # The context manager closes the file even if decoding fails.
    with codecs.open(fullname, "r", encoding=encoding) as f:
        return f.readline()


def mergeFolder(rootdir, outp, encoding=None):
    """Write the first line of every file below ``rootdir`` to ``outp``.

    Args:
        rootdir: Folder that is walked recursively.
        outp: Path of the merged UTF-8 output file (overwritten).
        encoding: Encoding of the input files, forwarded to ``getContent``.

    Returns:
        The number of reviews written to ``outp``.
    """
    saved = 0
    with codecs.open(outp, "w", encoding=OUTPUT_ENCODING) as output:
        # os.walk yields (parent directory, sub-directory names, file names).
        for parent, dirnames, filenames in os.walk(rootdir):
            # Sort both lists so the merged file is reproducible across runs.
            dirnames.sort()
            for filename in sorted(filenames):
                # Join with `parent`, not `rootdir`: a file found in a
                # sub-directory does not live directly under `rootdir`.
                review = getContent(os.path.join(parent, filename), encoding).strip()
                if not review:
                    # Nothing to write for an empty file.
                    continue
                # Always terminate the review ourselves; a source file without
                # a trailing newline would otherwise run into the next review.
                output.write(review + "\n")
                saved += 1
    return saved


def main():
    """Merge each sentiment folder of the corpus into "2000_<folder>.txt"."""
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    for foldername in FOLDERS:
        logging.info("running" + foldername + "files.")

        outp = "2000_" + foldername + ".txt"
        saved = mergeFolder(os.path.join(CORPUS_DIR, foldername), outp)

        logger.info("saved" + str(saved) + "files")


if __name__ == "__main__":
    main()

经过上一阶段的处理现在得到了两个txt文件，一个正向的txt文件和一个负向的txt文件，每一个txt文件下都包含1000条语句，下一步我们要将两个txt文件的中文数据做分词处理,这里我们使用结巴分词

2-逐行读取数据进行jieba分词-实施代码如下

# Step 2: clean every review of the merged files and segment it with jieba.
# Each output line holds the words of one review separated by single spaces.
import jieba
import jieba.analyse
import codecs
import sys
import string
import re

# Deletion table for ASCII punctuation and digits (applied with str.translate).
_ASCII_PUNCT_DIGITS = str.maketrans("", "", string.punctuation + string.digits)
# Latin letters and digits.
_LATIN_ALNUM = re.compile(r"[a-zA-Z0-9]")
# Whitespace plus the listed ASCII and full-width punctuation marks.
_SYMBOLS = re.compile(
    r"""[\s+\.\!\/_,$%^*(+"'；：“”．]+|[+――！，。？?、~@#￥%……&*（）]+"""
)


def prepareData(sourceFile, targetFile):
    """Clean and segment ``sourceFile`` line by line into ``targetFile``.

    Args:
        sourceFile: Path of a UTF-8 text file with one review per line.
        targetFile: Path of the UTF-8 output file (overwritten); it receives
            one line of space-separated words per input line.
    """
    print("open source file:" + sourceFile)
    # Print the target *path*; the original concatenated the file object.
    print("open target file:" + targetFile)

    # Context managers close both files even if segmentation raises.
    with codecs.open(sourceFile, "r", encoding="utf-8") as f:
        with codecs.open(targetFile, "w", encoding="utf-8") as target:
            for lineNum, line in enumerate(f, 1):
                # str() is required: a str cannot be concatenated with an int.
                print("---------processing" + str(lineNum) + "article--------")
                seg_line = sent2word(clearText(line))
                target.write(seg_line + "\n")
    print("done")


def clearText(line):
    """Strip whitespace, punctuation, digits and Latin letters from ``line``.

    Args:
        line: One review as a ``str``.

    Returns:
        The cleaned text; an empty string is returned unchanged.
    """
    if line == "":
        return line

    line = line.strip()
    # One pass over the string deletes ASCII punctuation and digits. This
    # replaces the Python 2 bytes-based encode/translate/decode sequence.
    line = line.translate(_ASCII_PUNCT_DIGITS)
    # Drop the remaining Latin letters and digits.
    line = _LATIN_ALNUM.sub("", line)
    # Drop whitespace and the Chinese/English symbols listed in the pattern.
    return _SYMBOLS.sub("", line)


def sent2word(line):
    """Segment ``line`` with jieba and return the words joined by spaces.

    Args:
        line: Cleaned review text.

    Returns:
        The words separated by single spaces; whitespace-only tokens
        (including the tab the original filtered out) are dropped.
    """
    words = (word for word in jieba.cut(line, cut_all=False) if word.strip())
    return " ".join(words)


def main():
    """Segment the negative and the positive review file."""
    prepareData("2000_neg.txt", "2000_neg_cut.txt")
    prepareData("2000_pos.txt", "2000_pos_cut.txt")


if __name__ == "__main__":
    main()

分词完成后，即可读取停用词表中的停用词，对分词后的正负语料进行去除停用词
1-读取停用词表
2-遍历分词后的句子，将每个词丢到停用词表中进行匹配，若存在则替换为空

3-去除停用词，具体实施代码如下

 # Step 3: remove stop words from the segmented review files.
import codecs
import os
import sys


def stopWord(sourceFile, targetFile, stopkey):
    """Copy ``sourceFile`` to ``targetFile`` with all stop words removed.

    Args:
        sourceFile: Path of a UTF-8 file with space-separated words, one
            review per line.
        targetFile: Path of the UTF-8 output file (overwritten).
        stopkey: Iterable of stop words.
    """
    print("open source file:" + sourceFile)
    print("open target file:" + targetFile)

    # Build the lookup set once so every membership test is O(1).
    stopkey = frozenset(stopkey)

    # Context managers close both files even if a line fails to decode.
    with codecs.open(sourceFile, "r", encoding="utf-8") as sourcef:
        with codecs.open(targetFile, "w", encoding="utf-8") as targetf:
            for lineNum, line in enumerate(sourcef, 1):
                # str() is required: a str cannot be concatenated with an int.
                print("---------processing" + str(lineNum) + "article--------")
                targetf.write(delstopword(line, stopkey) + "\n")
    print("done")


def delstopword(line, stopkey):
    """Return ``line`` without the words contained in ``stopkey``.

    Args:
        line: Space-separated words, optionally ending with a newline.
        stopkey: Container of stop words (anything supporting ``in``).

    Returns:
        The remaining words joined by single spaces, without surrounding
        whitespace.
    """
    # split() without an argument also discards the trailing newline and any
    # tab or empty token, so the last word of a line is compared cleanly.
    kept = [word for word in line.split() if word not in stopkey]
    # Keep the words space-separated: the feature-extraction step splits the
    # line on spaces again.
    return " ".join(kept)


def main():
    """Remove stop words from the segmented negative and positive files."""
    with codecs.open(os.path.join("data", "stopWord.txt"), "r", encoding="utf-8") as f:
        stopkey = {w.strip() for w in f}

    stopWord("2000_neg_cut.txt", "2000_neg_cut_stopword.txt", stopkey)
    stopWord("2000_pos_cut.txt", "2000_pos_cut_stopword.txt", stopkey)


if __name__ == "__main__":
    main()

4-获得特征向量

经过分词处理后我们就得到了可以训练word2vec的语料，下面要进行词向量模型的训练
从wiki中文语料生成的词向量中抽取本文语料的特征词向量
即本文从文章最后得到的wiki.zh.text.vector中抽取特征词向量作为模型的输入

获取特征词向量的主要步骤如下：
1-读取模型词向量矩阵
2-遍历语句中的每一个词，从模型词向量矩阵中抽取当前词的数值向量，一条语句即可得到一个二维矩阵，行数为词的个数，列数为模型设定的维数
3-根据得到的矩阵计算矩阵均值作为当前语句的特征词向量
4-全部语句计算完成后，拼接语句类别代表的值，写入csv文件

# Step 4: turn every review into one feature vector (the mean of its word
# vectors) and write the labelled vectors to a CSV file.
import warnings
# Ignore the UserWarnings raised from the gensim module.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import logging
import os.path
import codecs
import sys
# Numerical / tabular tooling used for the vector matrix.
import numpy as np
import pandas as pd

# Module-level logger so buildVecs also works when this file is imported;
# the name is the script name, as in the original.
logger = logging.getLogger(os.path.basename(sys.argv[0]))


def getWordVecs(wordList, model):
    """Look up the vector of every word of ``wordList`` in ``model``.

    Args:
        wordList: Iterable of words; surrounding whitespace is ignored.
        model: Mapping-like object returning a vector for ``model[word]`` and
            raising ``KeyError`` for unknown words.

    Returns:
        A float array with one row per known word (empty if none is known).
    """
    vecs = []
    for word in wordList:
        # Strip instead of replacing "\n" with a space: the last word of a
        # line would otherwise become "word " and never match the vocabulary.
        word = word.strip()
        if not word:
            continue
        try:
            vecs.append(model[word])
        except KeyError:
            # Out-of-vocabulary words contribute nothing to the sentence.
            continue
    return np.array(vecs, dtype="float")


def buildVecs(filename, model):
    """Build one averaged word vector per line of ``filename``.

    Args:
        filename: Path of a UTF-8 file with space-separated words, one
            review per line.
        model: Word-vector lookup passed on to ``getWordVecs``.

    Returns:
        A list with one vector per line that contains at least one known
        word; lines without any known word are skipped.
    """
    fileVecs = []
    with codecs.open(filename, "rb", encoding="utf-8") as contents:
        for line in contents:
            logger.info("Start line: " + line)
            # Matrix of this review: one row per known word.
            vecs = getWordVecs(line.split(), model)
            if len(vecs) > 0:
                # Column-wise mean collapses the matrix into a single vector.
                fileVecs.append(vecs.mean(axis=0))
    return fileVecs


def buildDataFrame(posInput, negInput):
    """Combine positive and negative vectors into one labelled table.

    Args:
        posInput: List of vectors of the positive reviews (label 1).
        negInput: List of vectors of the negative reviews (label 0).

    Returns:
        A DataFrame whose first column is the label and whose remaining
        columns are the vector components; positive rows come first.
    """
    Y = np.concatenate((np.ones(len(posInput)), np.zeros(len(negInput))))
    # np.array stacks the vectors; the original np.append(X) call was missing
    # its required second argument and raised a TypeError.
    X = np.array(list(posInput) + list(negInput))
    return pd.concat([pd.DataFrame(Y), pd.DataFrame(X)], axis=1)


def main():
    """Load the word vectors, vectorise both review files, write the CSV."""
    # Imported here because only the model loading needs gensim; the helper
    # functions above stay importable without it.
    import gensim

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # Working directory holding the model and the pre-processed corpora.
    fdir = '/Users/sy/Desktop/pyRoot/SentimentAnalysis/'
    inp = fdir + 'wiki.zh.text.vector'
    # Load the pre-trained word vectors (text word2vec format).
    model = gensim.models.KeyedVectors.load_word2vec_format(inp, binary=False)

    posInput = buildVecs(fdir + '2000_pos_cut_stopword.txt', model)
    negInput = buildVecs(fdir + '2000_neg_cut_stopword.txt', model)

    data = buildDataFrame(posInput, negInput)
    data.to_csv(fdir + "2000_data.csv")


if __name__ == "__main__":
    main()

来源：51CTO

作者：无敌小熊猫

链接：https://blog.csdn.net/qq_41609475/article/details/100165829

标签

词向量