Python 在不同类型的文件中查找字符串
TXT文件:
def txt_handler(self, f_name, find_str):
    """
    Search a UTF-8 text file for the first line containing ``find_str``.

    :param f_name: path of the text file to scan
    :param find_str: substring to look for in each line
    :return: ``{'file_name': f_name, 'line_count': n}`` where ``n`` is the
        1-based number of the first matching line; an empty dict when the
        file does not exist or no line contains ``find_str``
    """
    if not os.path.exists(f_name):
        return {}
    # The context manager guarantees the handle is closed even though we
    # return from inside the loop on the first match.
    with open(f_name, 'r', encoding='utf-8') as f:
        for line_count, line in enumerate(f, start=1):
            if find_str in line:
                return {'file_name': f_name, 'line_count': line_count}
    return {}
docx文件
需要用到 python-docx 包(安装名为 python-docx,导入名为 docx)
pip install python-docx
参考https://python-docx.readthedocs.io/en/latest/
from docx import Document
def docx_handler(self, f_name, find_str):
    """
    Report whether a Word .docx file contains ``find_str``.

    Body paragraphs are checked first, then the text of every cell of the
    document's top-level tables.

    :param f_name: path of the .docx file to scan
    :param find_str: substring to look for
    :return: ``{'file_name': f_name}`` when the text is found; an empty
        dict when the file does not exist or nothing matches
    """
    if not os.path.exists(f_name):
        return {}
    document = Document(f_name)
    if any(find_str in paragraph.text for paragraph in document.paragraphs):
        return {'file_name': f_name}
    # Document.paragraphs does not include text held in tables, so the
    # table cells have to be scanned separately.
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                if find_str in cell.text:
                    return {'file_name': f_name}
    return {}
doc文件:
python-docx 不能直接读取旧版的 .doc 文件,需要先把 doc 转换成 docx,再按 docx 文件的方式进行处理。下面的转换方法依赖 pywin32(win32com),只能在安装了 Microsoft Word 的 Windows 上运行。
from win32com import client as wc
def doc_to_docx(self, fileName):
    """
    Convert a legacy .doc file to .docx by automating Word over COM.

    The converted file is written next to the source file, with the
    extension replaced by ``.docx``.

    NOTE(review): Word probably needs an absolute path here, since it may
    not share this process's working directory - confirm with callers.

    :param fileName: path of the .doc file to convert
    :return: path of the .docx file that was written
    """
    import os  # local import: this block did not previously depend on os

    docx_path = os.path.splitext(fileName)[0] + '.docx'
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # Format code 16 makes Word save the document as .docx.
            doc.SaveAs(docx_path, 16)
        finally:
            doc.Close()
    finally:
        # Always quit, otherwise a failed conversion leaves the Word
        # process started by Dispatch running.
        word.Quit()
    return docx_path
pdf文件:
这里使用PDFMiner包
python3安装
python -m pip install pdfminer.six
参考文章
https://dzone.com/articles/exporting-data-from-pdfs-with-python
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def pdf_handler(self, f_name, find_str):
    """
    Report whether any single page of a PDF file contains ``find_str``.

    Pages are taken one at a time from ``self.extract_text_by_page`` and
    the scan stops at the first page whose text contains ``find_str``.
    A string that is split across two pages is therefore not detected.

    :param f_name: path of the PDF file to scan
    :param find_str: substring to look for in each page's text
    :return: ``{'file_name': f_name}`` when the text is found; an empty
        dict when the file does not exist or no page matches
    """
    if not os.path.exists(f_name):
        return {}
    pages = self.extract_text_by_page(f_name)
    try:
        found = any(find_str in page_text for page_text in pages)
    finally:
        # Stopping early leaves the generator suspended with the PDF file
        # still open; closing it releases the file deterministically.
        pages.close()
    return {'file_name': f_name} if found else {}
@staticmethod
def extract_text_by_page(pdf_path):
    """
    Yield the extracted text of a PDF file one page at a time.

    This is a generator: each page is interpreted only when the consumer
    asks for it, and the PDF file stays open until the generator is
    exhausted or closed.

    :param pdf_path: path of the PDF file to read
    :return: generator yielding one text string per page
    """
    # A single resource manager serves every page, so it is created once
    # instead of being rebuilt for each page.
    resource_manager = PDFResourceManager()
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                page_interpreter.process_page(page)
                yield fake_file_handle.getvalue()
            finally:
                # Runs even when the consumer stops iterating early or
                # page processing raises, so the handles are not leaked.
                converter.close()
                fake_file_handle.close()
来源:https://www.cnblogs.com/xiao-apple36/p/12020830.html