Split the corpus by punctuation
src = ''         ## input corpus
tgt = ''         ## output file
temp = ",.!?;"   ## punctuation marks that end a segment

def fun(file1, file2, temp):
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                for word in line:
                    if word not in temp:
                        fl2.write(word)
                    else:
                        ## a punctuation mark closes the segment: keep it and break the line
                        fl2.write(word + '\n')

fun(src, tgt, temp)
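An equivalent way to express the same split, shown here only as a sketch, is to let the re module insert the line breaks; the character class mirrors temp above, and split_by_punct is a hypothetical name, not part of the original script:

import re

def split_by_punct(file1, file2):
    ## append a newline after each of , . ! ? ; -- same effect as the character loop above
    with open(file1, 'r', encoding='utf-8') as fl1, open(file2, 'w', encoding='utf-8') as fl2:
        for line in fl1:
            fl2.write(re.sub(r'([,.!?;])', r'\1\n', line))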
Find foreign-language sentences in the corpus

### This code handles a source-language file that contains target-language sentences: it locates them, swaps them out, and writes new files
### This code uses the English file to locate the misplaced Chinese lines
import langid
import codecs
import tensorflow as tf
from langdetect import detect        ## detect() returns the detected language
from langdetect import detect_langs  ## detect_langs() returns all detected languages and their probabilities

src = ''     ## English
tgt = ''     ## Chinese
file1 = ''
file2 = ''

def fun1(seq):
    temp = langid.classify(seq)   ## returns (language, score)
    return temp[0]

def fun(seq):
    lemp = detect(seq)
    return lemp

### This function locates the foreign-language sentences and returns their line numbers
def fun2(src):
    k = 0
    temp = []
    ## tf.gfile.GFile was moved to tf.io.gfile.GFile in TensorFlow 2.x
    with codecs.getreader('utf-8')(tf.io.gfile.GFile(src, 'rb')) as fl:
        for line in fl.readlines():
            k += 1
            try:
                temp1 = fun(line)    ## normally use langdetect
            except:
                temp1 = fun1(line)   ## fall back to langid when langdetect raises
            ## langdetect reports 'zh-cn'/'zh-tw' while langid reports 'zh', so match the prefix
            if temp1.startswith('zh'):
                temp.append(k)
    return temp

### This function swaps the sentences
### src (English): file1 is the split English sentences, file2 the split Chinese sentences
### tgt (Chinese): file1 is the split Chinese sentences, file2 the split English sentences
def fun3(temp, src, tgt, file1, file2):
    num = 0
    #s_file = open(src, 'r', encoding='utf-8')
    s_file = open(tgt, 'r', encoding='utf-8')
    fl1 = open(file1, 'w', encoding='utf-8')
    fl2 = open(file2, 'w', encoding='utf-8')
    for line in s_file.readlines():
        num += 1
        if num in temp:
            fl2.write(line)
        else:
            fl1.write(line)
    s_file.close()
    fl1.close()
    fl2.close()
if __name__ == "__main__":
    temp = fun2(src)
    fun3(temp, src, tgt, file1, file2)
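For reference, a quick sanity check of the two detectors. The outputs shown in the comments are typical rather than guaranteed; langdetect is non-deterministic on short strings unless its seed is fixed, which is why DetectorFactory.seed is set here:

import langid
from langdetect import detect, detect_langs, DetectorFactory

DetectorFactory.seed = 0                   ## make langdetect deterministic

print(detect('今天天气很好'))               ## e.g. 'zh-cn' (note: not plain 'zh')
print(detect_langs('今天天气很好'))         ## e.g. [zh-cn:0.9999...]
print(langid.classify('今天天气很好'))      ## e.g. ('zh', ...) -- (language, score)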
Word segmentation
import jieba

src = ''
tgt = ''

def cut(file1, file2):
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                ## seq = jieba.cut(line, cut_all=True)   ## full mode
                ## seq = jieba.cut_for_search(line)      ## search-engine mode
                seq = jieba.cut(line, cut_all=False)     ## precise mode
                seq = ' '.join(seq)
                fl2.write(seq)

cut(src, tgt)
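To see how the three modes differ, here is a small check using the example sentence from jieba's own README; the exact segmentation can vary with jieba's dictionary version, so treat the commented outputs as indicative:

import jieba

s = '我来到北京清华大学'
print('/'.join(jieba.cut(s, cut_all=False)))   ## precise: 我/来到/北京/清华大学
print('/'.join(jieba.cut(s, cut_all=True)))    ## full:    我/来到/北京/清华/清华大学/华大/大学
print('/'.join(jieba.cut_for_search(s)))       ## search:  also emits the short sub-words inside 清华大学

Precise mode is the right choice here because each character sequence must appear exactly once in the training corpus; full mode emits overlapping words.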
Restore sentences
### Restore a segmented result file back into plain sentences
file = ''
tgt_file = ''

def fun(file1, file2):
    with open(file1, 'r', encoding='utf-8') as fl1:
        fl2 = open(file2, 'w', encoding='utf-8')
        for line in fl1.readlines():
            line = line.replace(' ', '')   ## remove the spaces inserted by segmentation
            fl2.write(line)
        fl2.close()

fun(file, tgt_file)
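Note that replace(' ', '') removes every space, which is only safe for text with no native spaces, i.e. the Chinese side; running it on the segmented English side would glue words together. A quick illustration:

line = '我 爱 自然 语言 处理\n'
print(line.replace(' ', ''))              ## 我爱自然语言处理 -- correct restoration
print('I love NLP\n'.replace(' ', ''))    ## IloveNLP -- why this must not be run on English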
Randomly generate a test set
### This code randomly samples a test set and writes the remaining lines out as the training set
import random

src_en = ''
src_ch = ''
cut_num = 3000       ## number of sentences to sample
tgt_train_en = ''
tgt_train_ch = ''
tgt_dev_en = ''
tgt_dev_ch = ''

## Generate sorted random line numbers
def random_num():
    temp = []
    for i in range(cut_num):
        a = random.randint(1, 25000)   ## sampling range (should match the corpus line count)
        if a not in temp:
            temp.append(a)
    print(len(temp))                   ## actual number sampled (duplicate draws are skipped)
    temp = sorted(temp, reverse=False) ## ascending
    return temp

## src-en(1)/src-zh(2)/train-en(3)/dev-en(4)/train-zh(5)/dev-zh(6)
def new_file(file1, file2, file3, file4, file5, file6):
    temp = random_num()
    fl1 = open(file1, 'r', encoding='utf-8')
    fl2 = open(file2, 'r', encoding='utf-8')
    fl3 = open(file3, 'w', encoding='utf-8')   ## output files must be opened for writing
    fl4 = open(file4, 'w', encoding='utf-8')
    fl5 = open(file5, 'w', encoding='utf-8')
    fl6 = open(file6, 'w', encoding='utf-8')
    def fun(f1, f2, f3):
        num = 0
        i = 0
        for line1 in f1.readlines():
            num += 1
            if i < len(temp) and num == temp[i]:
                f3.write(line1)   ## sampled line goes to the dev set
                i += 1
            else:
                f2.write(line1)   ## every other line goes to the training set
    ## the same index list is used for both sides, so sentence pairs stay aligned
    fun(fl1, fl3, fl4)
    fun(fl2, fl5, fl6)
    fl1.close()
    fl2.close()
    fl3.close()
    fl4.close()
    fl5.close()
    fl6.close()
new_file(src_en,src_ch,tgt_train_en,tgt_dev_en,tgt_train_ch,tgt_dev_ch)
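Because random.randint can repeat, the code above usually draws slightly fewer than cut_num lines. A sketch of an exact-size variant uses random.sample, which draws without replacement; random_num_exact is a hypothetical name and 25000 mirrors the range hard-coded above:

import random

def random_num_exact(cut_num=3000, total_lines=25000):
    ## draw cut_num distinct line numbers in one call, then sort ascending
    return sorted(random.sample(range(1, total_lines + 1), cut_num))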