Some Handy Code Snippets

Posted by 大兔子大兔子 on 2019-11-30 19:46:16

Splitting a corpus on punctuation

src = ''         ## path of the input file
tgt = ''         ## path of the output file
temp = ",.!?;"   ## punctuation marks to break lines on

def fun(file1, file2, temp):
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                for word in line:
                    if word not in temp:
                        fl2.write(word)
                    else:
                        ## write the punctuation mark, then break the line
                        fl2.write(word + '\n')

fun(src, tgt, temp)
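For a quick sanity check, the same per-character logic can be run on an in-memory string; the sample sentence below is made up for illustration:

text = "Hello, world! How are you?"   ## hypothetical sample input
marks = ",.!?;"
out = ''.join(ch + '\n' if ch in marks else ch for ch in text)
print(out)
## Hello,
##  world!
##  How are you?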

Finding foreign-language lines in a corpus

This code deals with a source-language file that contains target-language sentences: it locates those lines, swaps them out into a separate file, and generates new files.

### This code uses the English file to locate the Chinese sentences
import codecs
import langid
import tensorflow as tf   ## tf.gfile is the TF1 API; TF2 renamed it tf.io.gfile
from langdetect import detect        ## detect() returns the single most likely language
from langdetect import detect_langs  ## detect_langs() returns every candidate language with its probability

src = ''     ## English file
tgt = ''     ## Chinese file
file1 = ''
file2 = ''

def fun1(seq):
    temp = langid.classify(seq)
    return temp[0]

def fun(seq):
    lemp = detect(seq)
    return lemp

### Locate the foreign-language sentences and return their line numbers
def fun2(src):
    k = 0
    temp = []
    with codecs.getreader('utf-8')(tf.gfile.GFile(src, 'rb')) as fl:
        for line in fl.readlines():
            k += 1
            try:
                temp1 = fun(line)   ### use langdetect in the normal case
            except Exception:
                temp1 = fun1(line)  ### fall back to langid when langdetect fails
            ## langdetect labels Chinese 'zh-cn'/'zh-tw' while langid labels it 'zh',
            ## so match on the prefix rather than on 'zh' exactly
            if temp1.startswith('zh'):
                temp.append(k)
    return temp

### Swap the located sentences out into a separate file
### src (English): file1 gets the split-out English sentences, file2 the split-out Chinese ones
### tgt (Chinese): file1 gets the split-out Chinese sentences, file2 the split-out English ones
def fun3(temp, src, tgt, file1, file2):
    num = 0
    # s_file = open(src, 'r', encoding='utf-8')   ## switch to this line to filter the src side instead
    s_file = open(tgt, 'r', encoding='utf-8')
    fl1 = open(file1, 'w', encoding='utf-8')
    fl2 = open(file2, 'w', encoding='utf-8')
    for line in s_file.readlines():
        num += 1
        if num in temp:
            fl2.write(line)
        else:
            fl1.write(line)
    s_file.close()
    fl1.close()
    fl2.close()

if __name__ == "__main__":
    temp = fun2(src)
    fun3(temp, src, tgt, file1, file2)
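One caveat: langdetect is non-deterministic by default, so repeated runs can classify borderline lines differently. Its documented fix is to pin DetectorFactory.seed before calling detect(); a minimal sketch:

from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0        ## make langdetect deterministic across runs
print(detect("今天天气不错"))    ## typically 'zh-cn' for a short Chinese line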

Word segmentation

import jieba

src = ''   ## path of the input file
tgt = ''   ## path of the output file

def cut(file1, file2):
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                ## seq = jieba.cut(line, cut_all=True)   ## full mode
                ## seq = jieba.cut_for_search(line)      ## search-engine mode
                seq = jieba.cut(line, cut_all=False)     ## accurate mode
                seq = ' '.join(seq)
                fl2.write(seq)

cut(src, tgt)
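The three modes trade precision against recall of candidate words. A small sketch of the difference, using the example sentence from jieba's README (the exact segmentation can vary with the jieba version and dictionary):

import jieba

sent = "我来到北京清华大学"
print('/'.join(jieba.cut(sent, cut_all=False)))   ## accurate: 我/来到/北京/清华大学
print('/'.join(jieba.cut(sent, cut_all=True)))    ## full: 我/来到/北京/清华/清华大学/华大/大学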

Restoring sentences

### Restore a word-segmented result file back into sentences
file = ''       ## segmented input file
tgt_file = ''   ## restored output file

def fun(file1, file2):
    with open(file1, 'r', encoding='utf-8') as fl1:
        with open(file2, 'w', encoding='utf-8') as fl2:
            for line in fl1.readlines():
                line = line.replace(" ", '')   ## strip the spaces the segmenter inserted
                fl2.write(line)

fun(file, tgt_file)
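Since segmentation only inserts spaces, stripping them is an exact inverse as long as the original text contained no spaces of its own. A one-line round-trip check (the sample sentence is made up):

import jieba

sent = "今天天气不错"                   ## hypothetical sample with no spaces of its own
seg = ' '.join(jieba.cut(sent))        ## e.g. "今天 天气 不错"
print(seg.replace(" ", "") == sent)    ## True: stripping spaces restores the original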

Randomly generating a test set

### This code randomly extracts a test (dev) set, and writes whatever text is left over as the training set
import random

src_en = ''
src_ch = ''
cut_num = 3000     ## number of sentences to extract
tgt_train_en = ''
tgt_train_ch = ''
tgt_dev_en = ''
tgt_dev_ch = ''

## generate sorted random line numbers
def random_num():
    temp = []
    for i in range(cut_num):
        a = random.randint(1, 25000)   ## sampling range; the corpus is assumed to have 25000 lines
        if a not in temp:
            temp.append(a)
    print(len(temp))                   ## actual count drawn: duplicates are skipped, so it can fall short of cut_num
    temp = sorted(temp, reverse=False) ## ascending order
    return temp

## src-en(1) / src-zh(2) / train-en(3) / dev-en(4) / train-zh(5) / dev-zh(6)
def new_file(file1, file2, file3, file4, file5, file6):
    temp = random_num()
    fl1 = open(file1, 'r', encoding='utf-8')
    fl2 = open(file2, 'r', encoding='utf-8')
    fl3 = open(file3, 'w', encoding='utf-8')   ## the four output files must be opened for writing
    fl4 = open(file4, 'w', encoding='utf-8')
    fl5 = open(file5, 'w', encoding='utf-8')
    fl6 = open(file6, 'w', encoding='utf-8')

    def fun(f1, f2, f3):
        num = 0
        i = 0
        for line1 in f1.readlines():
            num += 1
            if i < len(temp) and num == temp[i]:
                f3.write(line1)    ## dev-set line
                i += 1
            else:
                f2.write(line1)    ## everything else goes to the training set

    fun(fl1, fl3, fl4)
    fun(fl2, fl5, fl6)
    fl1.close()
    fl2.close()
    fl3.close()
    fl4.close()
    fl5.close()
    fl6.close()

new_file(src_en, src_ch, tgt_train_en, tgt_dev_en, tgt_train_ch, tgt_dev_ch)
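Because duplicate draws are skipped, random_num usually returns somewhat fewer than cut_num line numbers. If exactly cut_num distinct lines are wanted, here is a sketch of a variant built on random.sample (total_lines is an assumed corpus size you would replace with the real line count):

import random

cut_num = 3000
total_lines = 25000   ## assumption: replace with the actual number of lines in the corpus
dev_lines = sorted(random.sample(range(1, total_lines + 1), cut_num))   ## exactly cut_num distinct, sorted line numbers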

 
