批量提取word doc文档中的表格

doc格式的文档无法直接通过docx包（python-docx）读取，需要先转换为docx格式。

docx格式的文件本质上是一个ZIP文件。

如果将后缀改为zip就能看清内部结构。

使用docx包（python-docx）提取Word文档中的表格非常方便。

这段代码还可以提取docx中的图片。

import os, shutil, xlwt, re
from docx import Document
from win32com import client as wc
# Source folder that is walked recursively for Word documents.
path=r'C:\Users\Administrator\Desktop\0221\正常发布的'
# Parent of the source folder; the Excel report is saved here later on.
excelpath=os.path.join(path, "..\\")
# Working folder that receives one .docx file per source document.
target = os.path.join(path, "..\\docx")
# Folder that receives the images extracted from the documents.
imgPath = r'C:\Users\Administrator\Desktop\0221\pic'
imgabspath = ''
os.makedirs(target, exist_ok=True)
os.makedirs(imgPath, exist_ok=True)

# Word is started only after the folders exist, and is always shut down in
# the ``finally`` below so that a failing document cannot leave an orphaned
# Word process behind.
word = wc.Dispatch("Word.Application")
try:
    for root, dirs, files in os.walk(path):
        for name in files:
            # "~$..." files are the lock files Word keeps next to an open
            # document; they are not real documents.
            if name.startswith('~$'):
                continue
            stem, ext = os.path.splitext(name)
            ext = ext.lower()
            if ext not in ('.doc', '.docx'):
                continue
            src = os.path.join(root, name)
            # Always use a lower-case ".docx" extension so that the later
            # processing step recognises the file.
            docxabs = os.path.join(target, stem + '.docx')
            if ext == '.doc':
                doc = word.Documents.Open(src)
                try:
                    # 12 is Word's wdFormatXMLDocument (.docx) file format.
                    doc.SaveAs(docxabs, 12)
                finally:
                    doc.Close()
            else:
                shutil.copyfile(src, docxabs)
finally:
    word.Quit()
                                   
# Build the Excel report: one header row, then one row per .docx file found
# in ``target``. Each row is filled from the first table of the document, and
# the document's embedded image is exported to ``imgPath``.
workbook = xlwt.Workbook(encoding = 'GBK')
worksheet = workbook.add_sheet('公司列表')
headers = (
    '单位名称',
    '营业执照（副本）注册号',
    '单位性质',
    '单位地址',
    '联系人',
    '联系电话',
    '单位简介',
    '招聘信息',
    '营业执照',
)
for col, title in enumerate(headers):
    worksheet.write(0, col, title)

# The row counter lives outside the walk so that files found in different
# directories never write to the same row.
c_row = 1
for root, dirs, files in os.walk(target):
    for name in files:
        # Filter before opening: only real .docx files are processed, and
        # Word lock files ("~$...") are ignored.
        if name.startswith('~$') or not name.lower().endswith('.docx'):
            continue
        print(name)
        d = Document(os.path.join(root, name))
        t = d.tables
        if not t:
            print('skipped (no table found): ' + name)
            continue
        info = t[0]

        company = info.cell(0,1).text.replace('\n', '').replace(' ', '')
        worksheet.write(c_row,0,company)
        worksheet.write(c_row,1,info.cell(0,3).text)
        worksheet.write(c_row,2,info.cell(1,1).text)
        worksheet.write(c_row,3,info.cell(2,1).text)
        # The contact cell holds both the name and the phone number: the
        # Chinese characters are taken as the name, 11-digit runs as the
        # phone number. The matches are joined so a plain string is written.
        contact = info.cell(2,3).text
        worksheet.write(c_row,4,''.join(re.findall(r'[\u4E00-\u9FA5]+', contact)))
        worksheet.write(c_row,5,''.join(re.findall("[0-9]{11}", contact)))
        # The introduction row sits at index 3 or 4 depending on the
        # document; the recruitment information is the row right below it.
        for intro_row in (3, 4):
            if info.cell(intro_row,0).text == '单位简介':
                worksheet.write(c_row,6,info.cell(intro_row,1).text)
                worksheet.write(c_row,7,info.cell(intro_row + 1,1).text)
                break

        # The company name is reused as a file name, so characters Windows
        # forbids in file names are replaced; fall back to the document's
        # own name when the company cell is empty.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', company) or os.path.splitext(name)[0]

        imgabspath = ''
        for rel in d.part.rels.values():         # each related resource
            # External relationships have no embedded part to export.
            if rel.is_external or "image" not in rel.target_ref:
                continue
            extension = os.path.splitext(rel.target_ref)[1]
            imgabspath = imgPath + "/" + safe_name + extension
            with open(imgabspath,"wb") as f:
                f.write(rel.target_part.blob)
        if os.path.isfile(imgabspath):
            worksheet.write(c_row,8,imgabspath)

        # Rename the document after its company, but never onto another
        # existing file.
        current = os.path.join(root, name)
        renamed = os.path.join(root, safe_name + '.docx')
        if os.path.normcase(current) != os.path.normcase(renamed):
            if os.path.exists(renamed):
                print('not renamed (target already exists): ' + name)
            else:
                os.rename(current, renamed)

        c_row += 1
workbook.save(os.path.join(excelpath, '公司招聘列表.xls'))

来源：CSDN

作者：梓沂

链接：https://blog.csdn.net/qq_27361945/article/details/104439726

标签

docx