Split the corpus by punctuation
src = ''         ## input corpus
tgt = ''         ## output file
temp = ",.!?;"   ## punctuation marks that end a segment

def fun(file1, file2, temp):
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                for word in line:
                    if word not in temp:
                        fl2.write(word)
                    else:
                        ## a punctuation mark closes the segment: keep it and break the line
                        fl2.write(word + '\n')

fun(src, tgt, temp)
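An equivalent way to express the same split, shown here only as a sketch, is to let the re module insert the line breaks; the character class mirrors temp above, and split_by_punct is a hypothetical name, not part of the original script:

import re

def split_by_punct(file1, file2):
    ## append a newline after each of , . ! ? ; -- same effect as the character loop above
    with open(file1, 'r', encoding='utf-8') as fl1, open(file2, 'w', encoding='utf-8') as fl2:
        for line in fl1:
            fl2.write(re.sub(r'([,.!?;])', r'\1\n', line))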
Find foreign-language sentences in the corpus

### This code handles a source-language file that contains target-language sentences: it locates them, swaps them out, and writes new files
### This code uses the English file to locate the misplaced Chinese lines
import langid
import codecs
import tensorflow as tf
from langdetect import detect        ## detect() returns the detected language
from langdetect import detect_langs  ## detect_langs() returns all detected languages and their probabilities

src = ''     ## English
tgt = ''     ## Chinese
file1 = ''
file2 = ''

def fun1(seq):
    temp = langid.classify(seq)   ## returns (language, score)
    return temp[0]

def fun(seq):
    lemp = detect(seq)
    return lemp

### This function locates the foreign-language sentences and returns their line numbers
def fun2(src):
    k = 0
    temp = []
    ## tf.gfile.GFile was moved to tf.io.gfile.GFile in TensorFlow 2.x
    with codecs.getreader('utf-8')(tf.io.gfile.GFile(src, 'rb')) as fl:
        for line in fl.readlines():
            k += 1
            try:
                temp1 = fun(line)    ## normally use langdetect
            except:
                temp1 = fun1(line)   ## fall back to langid when langdetect raises
            ## langdetect reports 'zh-cn'/'zh-tw' while langid reports 'zh', so match the prefix
            if temp1.startswith('zh'):
                temp.append(k)
    return temp

### This function swaps the sentences
### src (English): file1 is the split English sentences, file2 the split Chinese sentences
### tgt (Chinese): file1 is the split Chinese sentences, file2 the split English sentences
def fun3(temp, src, tgt, file1, file2):
    num = 0
    #s_file = open(src, 'r', encoding='utf-8')
    s_file = open(tgt, 'r', encoding='utf-8')
    fl1 = open(file1, 'w', encoding='utf-8')
    fl2 = open(file2, 'w', encoding='utf-8')
    for line in s_file.readlines():
        num += 1
        if num in temp:
            fl2.write(line)
        else:
            fl1.write(line)
    s_file.close()
    fl1.close()
    fl2.close()
if __name__ == "__main__":
    temp = fun2(src)
    fun3(temp, src, tgt, file1, file2)
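For reference, a quick sanity check of the two detectors. The outputs shown in the comments are typical rather than guaranteed; langdetect is non-deterministic on short strings unless its seed is fixed, which is why DetectorFactory.seed is set here:

import langid
from langdetect import detect, detect_langs, DetectorFactory

DetectorFactory.seed = 0                   ## make langdetect deterministic

print(detect('今天天气很好'))               ## e.g. 'zh-cn' (note: not plain 'zh')
print(detect_langs('今天天气很好'))         ## e.g. [zh-cn:0.9999...]
print(langid.classify('今天天气很好'))      ## e.g. ('zh', ...) -- (language, score)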
Word segmentation
import jieba

src = ''
tgt = ''

def cut(file1, file2):
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                ## seq = jieba.cut(line, cut_all=True)   ## full mode
                ## seq = jieba.cut_for_search(line)      ## search-engine mode
                seq = jieba.cut(line, cut_all=False)     ## precise mode
                seq = ' '.join(seq)
                fl2.write(seq)

cut(src, tgt)
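To see how the three modes differ, here is a small check using the example sentence from jieba's own README; the exact segmentation can vary with jieba's dictionary version, so treat the commented outputs as indicative:

import jieba

s = '我来到北京清华大学'
print('/'.join(jieba.cut(s, cut_all=False)))   ## precise: 我/来到/北京/清华大学
print('/'.join(jieba.cut(s, cut_all=True)))    ## full:    我/来到/北京/清华/清华大学/华大/大学
print('/'.join(jieba.cut_for_search(s)))       ## search:  also emits the short sub-words inside 清华大学

Precise mode is the right choice here because each character sequence must appear exactly once in the training corpus; full mode emits overlapping words.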
Restore sentences
### Restore a segmented result file back into plain sentences
file = ''
tgt_file = ''

def fun(file1, file2):
    with open(file1, 'r', encoding='utf-8') as fl1:
        fl2 = open(file2, 'w', encoding='utf-8')
        for line in fl1.readlines():
            line = line.replace(' ', '')   ## remove the spaces inserted by segmentation
            fl2.write(line)
        fl2.close()

fun(file, tgt_file)
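Note that replace(' ', '') removes every space, which is only safe for text with no native spaces, i.e. the Chinese side; running it on the segmented English side would glue words together. A quick illustration:

line = '我 爱 自然 语言 处理\n'
print(line.replace(' ', ''))              ## 我爱自然语言处理 -- correct restoration
print('I love NLP\n'.replace(' ', ''))    ## IloveNLP -- why this must not be run on English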
Randomly generate a test set
### This code randomly samples a test set and writes the remaining lines out as the training set
import random

src_en = ''
src_ch = ''
cut_num = 3000       ## number of sentences to sample
tgt_train_en = ''
tgt_train_ch = ''
tgt_dev_en = ''
tgt_dev_ch = ''

## Generate sorted random line numbers
def random_num():
    temp = []
    for i in range(cut_num):
        a = random.randint(1, 25000)   ## sampling range (should match the corpus line count)
        if a not in temp:
            temp.append(a)
    print(len(temp))                   ## actual number sampled (duplicate draws are skipped)
    temp = sorted(temp, reverse=False) ## ascending
    return temp

## src-en(1)/src-zh(2)/train-en(3)/dev-en(4)/train-zh(5)/dev-zh(6)
def new_file(file1, file2, file3, file4, file5, file6):
    temp = random_num()
    fl1 = open(file1, 'r', encoding='utf-8')
    fl2 = open(file2, 'r', encoding='utf-8')
    fl3 = open(file3, 'w', encoding='utf-8')   ## output files must be opened for writing
    fl4 = open(file4, 'w', encoding='utf-8')
    fl5 = open(file5, 'w', encoding='utf-8')
    fl6 = open(file6, 'w', encoding='utf-8')
    def fun(f1, f2, f3):
        num = 0
        i = 0
        for line1 in f1.readlines():
            num += 1
            if i < len(temp) and num == temp[i]:
                f3.write(line1)   ## sampled line goes to the dev set
                i += 1
            else:
                f2.write(line1)   ## every other line goes to the training set
    ## the same index list is used for both sides, so sentence pairs stay aligned
    fun(fl1, fl3, fl4)
    fun(fl2, fl5, fl6)
    fl1.close()
    fl2.close()
    fl3.close()
    fl4.close()
    fl5.close()
    fl6.close()
new_file(src_en,src_ch,tgt_train_en,tgt_dev_en,tgt_train_ch,tgt_dev_ch)
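Because random.randint can repeat, the code above usually draws slightly fewer than cut_num lines. A sketch of an exact-size variant uses random.sample, which draws without replacement; random_num_exact is a hypothetical name and 25000 mirrors the range hard-coded above:

import random

def random_num_exact(cut_num=3000, total_lines=25000):
    ## draw cut_num distinct line numbers in one call, then sort ascending
    return sorted(random.sample(range(1, total_lines + 1), cut_num))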