How to get a bookmark's page number

后端 未结 4 1782
南方客
南方客 2020-12-16 04:52
from typing import List
from PyPDF2 import PdfFileReader
from PyPDF2.generic import Destination


def get_outlines(pdf_filepat         


        
4条回答
  •  离开以前
    2020-12-16 05:29

    As @theta pointed out, "split a pdf based on outline" has the code required to extract page numbers. If you find that complicated, I copied the part of the code that maps page ids to page numbers and made it a function. Here is a working example that prints the page number of bookmark o[0]:

    from PyPDF2 import PdfFileReader
    
    
    def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
        if _result is None:
            _result = {}
        if pages is None:
            _num_pages = []
            pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
        t = pages["/Type"]
        if t == "/Pages":
            for page in pages["/Kids"]:
                _result[page.idnum] = len(_num_pages)
                _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
        elif t == "/Page":
            _num_pages.append(1)
        return _result
    # main
    # Use a context manager so the file handle is closed even if parsing fails.
    # All lookups that touch the reader stay inside the `with`, since PyPDF2
    # may still read from the stream while resolving objects.
    with open('document.pdf', 'rb') as f:
        p = PdfFileReader(f)
        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(p)
        o = p.getOutlines()
        # the map holds 0-based indices, so add 1 for a human-readable page number
        pg_num = pg_id_num_map[o[0].page.idnum] + 1
    print(pg_num)
    

    Probably too late for @theta, but this might help others :) By the way, this is my first post on Stack Overflow, so excuse me if I did not follow the usual format.

    To extend this further: If you are looking to get the exact location on the page for a bookmark this will make your job easier:

    from PyPDF2 import PdfFileReader
    import PyPDF2 as pyPdf
    
    def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
        if _result is None:
            _result = {}
        if pages is None:
            _num_pages = []
            pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
        t = pages["/Type"]
        if t == "/Pages":
            for page in pages["/Kids"]:
                _result[page.idnum] = len(_num_pages)
                _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
        elif t == "/Page":
            _num_pages.append(1)
        return _result
    def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
        """Collect page number and on-page position for each bookmark.

        Args:
            outlines: Either a (possibly nested) list of outline entries, as
                returned by ``getOutlines()``, or a single ``Destination``.
            pg_id_num_map: Mapping from a page's ``idnum`` to its page index;
                1 is added to produce the reported page number.
            result: Dict to add entries to; a new one is created when omitted.

        Returns:
            ``result``, keyed by the first whitespace-separated token of each
            bookmark title (or the title itself when it has no tokens). Each
            value is a dict with ``title``, ``top``, ``left`` and ``page``.
            ``top`` / ``left`` are ``None`` when the destination does not
            carry those entries.

        Raises:
            KeyError: If a bookmark's page id is missing from ``pg_id_num_map``.
        """
        if result is None:
            result = {}
        if isinstance(outlines, list):
            # Nested lists represent child bookmarks; flatten them into `result`.
            for outline in outlines:
                outlines_pg_zoom_info(outline, pg_id_num_map, result)
        elif isinstance(outlines, pyPdf.pdf.Destination):
            title = outlines['/Title']
            tokens = title.split()
            # An empty/whitespace-only title has no first token; key by the title.
            key = tokens[0] if tokens else title
            result[key] = dict(
                title=title,
                # Not every destination type stores /Top and /Left, so avoid KeyError.
                top=outlines.get('/Top'),
                left=outlines.get('/Left'),
                page=pg_id_num_map[outlines.page.idnum] + 1,
            )
        return result
    
    # main
    pdf_name = 'document.pdf'
    # Use a context manager so the file handle is closed even if parsing fails.
    # Everything that touches the reader stays inside the `with`, since PyPDF2
    # may still read from the stream while resolving objects.
    with open(pdf_name, 'rb') as f:
        pdf = PdfFileReader(f)
        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(pdf)
        outlines = pdf.getOutlines()
        bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
    print(bookmarks_info)
    

    Note: My bookmark titles start with section numbers (e.g. "1.1 Introduction"), and I am keying the bookmark info by that section number, i.e. the first whitespace-separated token of the title. If your bookmarks are different, modify this part of the code:

        elif type(outlines) == pyPdf.pdf.Destination:
            title = outlines['/Title']
            result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'], \
            left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))
    

提交回复
热议问题