PDFminer: extract text with its font information

后端 未结 6 1259
伪装坚强ぢ
伪装坚强ぢ 2021-02-08 03:26

I found this question, but its answer uses the command line, and I do not want to call a Python script from the command line via subprocess and then parse the resulting HTML files just to get the font information.

6条回答
  •  刺人心
    刺人心 (楼主)
    2021-02-08 03:57

    #!/usr/bin/env python
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.layout import LAParams
    from pdfminer.converter import PDFPageAggregator
    import pdfminer
    
    
    def createPDFDoc(fpath):
        """Open the PDF at *fpath* and return it as a parsed ``PDFDocument``.

        The file is opened in binary mode and handed to ``PDFParser``. On
        success the file object is intentionally left open, because the
        returned document is built on top of the parser that wraps it
        (presumably pdfminer keeps reading from it later -- confirm before
        closing it earlier).

        Args:
            fpath: Path of the PDF file to open.

        Returns:
            The ``PDFDocument`` built from the file, opened with an empty
            password.

        Raises:
            RuntimeError: If ``document.is_extractable`` is false.
        """
        fp = open(fpath, 'rb')
        try:
            parser = PDFParser(fp)
            document = PDFDocument(parser, password='')
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                # A bare string cannot be raised; doing so produced a
                # TypeError that hid the actual reason for the failure.
                raise RuntimeError("Not extractable")
        except Exception:
            # Nothing will be returned, so nobody else can close the file.
            fp.close()
            raise
        return document
    
    
    def createDeviceInterpreter():
        """Create a page-aggregating device and the interpreter that feeds it.

        Both objects share one ``PDFResourceManager``; the device is built
        with default ``LAParams``.

        Returns:
            A ``(device, interpreter)`` tuple: the ``PDFPageAggregator`` and
            the ``PDFPageInterpreter`` constructed around it.
        """
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        return aggregator, PDFPageInterpreter(resource_manager, aggregator)
    
    
    def parse_obj(objs):
        """Print the font name of every character found in *objs*.

        For each ``LTTextBox`` in *objs*, every contained ``LTTextLine`` whose
        text is not blank is scanned, and one ``fontname ...`` line is printed
        per ``LTChar`` in it. ``LTFigure`` objects are searched recursively.
        Any other kind of object is ignored.

        Args:
            objs: Iterable of pdfminer layout objects, e.g. a page layout or
                its list of children.

        Returns:
            None. The font names are written to standard output.
        """
        for obj in objs:
            if isinstance(obj, pdfminer.layout.LTTextBox):
                # Layout containers are iterable over their children, so the
                # private ``_objs`` attribute does not need to be touched.
                for line in obj:
                    if not isinstance(line, pdfminer.layout.LTTextLine):
                        continue
                    if not line.get_text().strip():
                        # Skip lines that contain only whitespace.
                        continue
                    for char in line:
                        if isinstance(char, pdfminer.layout.LTChar):
                            # Parenthesised single-argument form prints the
                            # same text on Python 2 and Python 3.
                            print("fontname %s" % char.fontname)
            # if it's a container, recurse
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj)
    
    
    # Driver: print the font name of each character on the first page of
    # /tmp/simple.pdf.
    document = createPDFDoc("/tmp/simple.pdf")
    device, interpreter = createDeviceInterpreter()
    pages = PDFPage.create_pages(document)
    # Use the next() builtin rather than pages.next(): the method form only
    # exists on Python 2 iterators, the builtin works on Python 2.6+ and 3.
    # Only the first page is processed; a document that yields no pages
    # raises StopIteration here.
    interpreter.process_page(next(pages))
    layout = device.get_result()


    parse_obj(layout._objs)
    

提交回复
热议问题