import os import re import sys import glob import xml.etree.ElementTree as ET def xml_to_txt(indir,outdir): os.chdir(indir) annotations = os.listdir('.') annotations = glob.glob(str(annotations)+'*.xml') pat = re.compile('(?<=\>).*?(?=\<)') for i, file in enumerate(annotations): file_save = file.split('.')[0]+'.txt' file_txt=os.path.join(outdir,file_save) f_w = open(file_txt,'w',encoding="utf-8") tree=ET.parse(file) root = tree.getroot() for obj in root.iter('PostItem'): current = list() for ele in obj.iter(): if "content" in ele.tag: content = obj.find('content').text if content: content = re.sub(r'</?\w+[^>]*>','',content).replace(" "," ").strip() print(content) f_w.write(content) f_w.write("\n") if "caption" in ele.tag: caption = obj.find('caption').text if caption: caption = re.sub(r'</?\w+[^>]*>','',caption).replace(" "," ").strip() f_w.write(caption) f_w.write("\n") print(caption) indir='E:\Data\demo-xml' #打开目录 outdir='E:\Data\demo-txt' #保存目录 xml_to_txt(indir,outdir)
来源:https://www.cnblogs.com/hankleo/p/11727219.html