How to get bookmark's page number

后端 未结 4 1781
南方客
南方客 2020-12-16 04:52
from typing import List
from PyPDF2 import PdfFileReader
from PyPDF2.generic import Destination


def get_outlines(pdf_filepat         


        
相关标签:
4条回答
  • 2020-12-16 05:07

    In 2019, for those who are interested in a faster way, it is possible to use:

    from PyPDF2 import PdfFileReader
    
    def printPageNumberFrom(filename):
        """Print the 1-based page number of every bookmark in a PDF file.

        Args:
            filename: Path of the PDF file to read.

        The outline returned by ``getOutlines()`` is a nested list: the
        children of a bookmark appear as a sub-list. Sub-lists are walked
        recursively so that documents with sub-bookmarks are handled instead
        of failing when a list is passed to ``getDestinationPageNumber()``.
        """
        def _print_page_numbers(pdf, bookmarks):
            # Walk one outline level, descending into nested child lists.
            for bookmark in bookmarks:
                if isinstance(bookmark, list):
                    _print_page_numbers(pdf, bookmark)
                else:
                    # page count starts from 0, so add 1 for display
                    print(pdf.getDestinationPageNumber(bookmark) + 1)

        with open(filename, "rb") as f:
            pdf = PdfFileReader(f)
            _print_page_numbers(pdf, pdf.getOutlines())
    
    0 讨论(0)
  • 2020-12-16 05:28

    Handle bookmarks recursively, following the suggestions from vjayky and Giulio D.

    PyPDF2 >= v1.25

    from PyPDF2 import PdfFileReader
    
    def printBookmarksPageNumbers(pdf):
        """Print each bookmark title with its 1-based page number.

        Nested bookmarks are indented by four extra spaces per nesting level.

        Args:
            pdf: Reader object providing ``getOutlines()`` and
                ``getDestinationPageNumber()`` (e.g. a PyPDF2 >= 1.25
                ``PdfFileReader``).
        """
        def review_and_print_bookmarks(bookmarks, lvl=0):
            # ``lvl`` is the number of spaces printed before the title.
            for b in bookmarks:
                # isinstance (rather than an exact type comparison) also
                # accepts list subclasses as nested outline levels.
                if isinstance(b, list):
                    review_and_print_bookmarks(b, lvl + 4)
                    continue
                # page count starts from 0, so add 1 for display
                pg_num = pdf.getDestinationPageNumber(b) + 1
                print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

        review_and_print_bookmarks(pdf.getOutlines())
    
    # Open the PDF in binary mode and print its bookmarks; the reader is used
    # inside the ``with`` block, while the file is still open.
    with open('document.pdf', "rb") as f:
        pdf = PdfFileReader(f)
        printBookmarksPageNumbers(pdf)
    

    PyPDF2 < v1.25

    from PyPDF2 import PdfFileReader
    
    def printBookmarksPageNumbers(pdf):
        """Print each bookmark title with its 1-based page number.

        Page numbers are resolved through a table built from the id of each
        page's indirect reference, so ``getDestinationPageNumber()`` is not
        needed. Nested bookmarks are indented by four extra spaces per
        nesting level.

        Args:
            pdf: Reader object providing ``getNumPages()``, ``getPage()`` and
                ``getOutlines()`` (e.g. a PyPDF2 ``PdfFileReader``).

        Raises:
            KeyError: If a bookmark refers to a page id that is not among the
                document's pages.
        """
        # Map page ids to (0-based) page numbers
        pg_id_to_num = {
            pdf.getPage(pg_num).indirectRef.idnum: pg_num
            for pg_num in range(pdf.getNumPages())
        }

        def review_and_print_bookmarks(bookmarks, lvl=0):
            # ``lvl`` is the number of spaces printed before the title.
            for b in bookmarks:
                # isinstance (rather than an exact type comparison) also
                # accepts list subclasses as nested outline levels.
                if isinstance(b, list):
                    review_and_print_bookmarks(b, lvl + 4)
                    continue
                # page count starts from 0, so add 1 for display
                pg_num = pg_id_to_num[b.page.idnum] + 1
                print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

        review_and_print_bookmarks(pdf.getOutlines())
    
    # Open the PDF in binary mode and print its bookmarks; the reader is used
    # inside the ``with`` block, while the file is still open.
    with open('document.pdf', "rb") as f:
        pdf = PdfFileReader(f)
        printBookmarksPageNumbers(pdf)
    
    0 讨论(0)
  • 2020-12-16 05:29

    As @theta pointed out, "split a pdf based on outline" has the code required to extract page numbers. If you find that complicated, I copied the part of the code which maps page ids to page numbers and made it a function. Here is a working example that prints the page number of bookmark o[0]:

    from PyPDF2 import PdfFileReader
    
    
    def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
        if _result is None:
            _result = {}
        if pages is None:
            _num_pages = []
            pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
        t = pages["/Type"]
        if t == "/Pages":
            for page in pages["/Kids"]:
                _result[page.idnum] = len(_num_pages)
                _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
        elif t == "/Page":
            _num_pages.append(1)
        return _result
    # main
    # Use a context manager so the file handle is closed once the page number
    # has been computed; the reader is only used while the file is open.
    with open('document.pdf', 'rb') as f:
        p = PdfFileReader(f)
        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(p)
        o = p.getOutlines()
        # the map is 0-based, so add 1 to get the displayed page number
        pg_num = pg_id_num_map[o[0].page.idnum] + 1
        print(pg_num)
    

    Probably too late for @theta, but it might help others :) By the way, this is my first post on Stack Overflow, so excuse me if I did not follow the usual format.

    To extend this further: if you want to get the exact location of a bookmark on the page, this will make your job easier:

    from PyPDF2 import PdfFileReader
    import PyPDF2 as pyPdf
    
    def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
        if _result is None:
            _result = {}
        if pages is None:
            _num_pages = []
            pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
        t = pages["/Type"]
        if t == "/Pages":
            for page in pages["/Kids"]:
                _result[page.idnum] = len(_num_pages)
                _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
        elif t == "/Page":
            _num_pages.append(1)
        return _result
    def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
        """Collect title, position and page number for every bookmark.

        Args:
            outlines: A single destination object or a (possibly nested) list
                of them, as returned by ``getOutlines()``.
            pg_id_num_map: Dict mapping a page's ``idnum`` to its 0-based
                page number.
            result: Dict to add entries to; a new dict is created when None.

        Returns:
            The ``result`` dict. Each entry is keyed by the first
            whitespace-separated token of the bookmark title and holds a dict
            with ``title``, ``top``, ``left`` and 1-based ``page``. ``top``
            and ``left`` are None when the destination has no such entry.

        Raises:
            KeyError: If a destination's page id is not in ``pg_id_num_map``.
        """
        if result is None:
            result = dict()
        if isinstance(outlines, list):
            for outline in outlines:
                result = outlines_pg_zoom_info(outline, pg_id_num_map, result)
        elif isinstance(outlines, pyPdf.pdf.Destination):
            title = outlines['/Title']
            result[title.split()[0]] = dict(
                title=title,
                # Destinations without explicit coordinates (e.g. fit-to-page)
                # have no /Top or /Left entry; report None instead of failing.
                top=outlines.get('/Top'),
                left=outlines.get('/Left'),
                # the map is 0-based, so add 1 for the displayed page number
                page=pg_id_num_map[outlines.page.idnum] + 1,
            )
        return result
    
    # main
    pdf_name = 'document.pdf'
    # Use a context manager so the file handle is closed once the bookmark
    # information has been collected; the reader is only used while the file
    # is open.
    with open(pdf_name, 'rb') as f:
        pdf = PdfFileReader(f)
        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(pdf)
        outlines = pdf.getOutlines()
        bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
        print(bookmarks_info)
    

    Note: My bookmarks are section numbers (e.g. 1.1 Introduction), and I am mapping the bookmark info to the section number. If your bookmarks are different, modify this part of the code:

        elif type(outlines) == pyPdf.pdf.Destination:
            title = outlines['/Title']
            result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'], \
            left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))
    
    0 讨论(0)
  • 2020-12-16 05:31

    I'm not sure, but according to the docs for pyPdf.Destination at http://pybrary.net/pyPdf/pythondoc-pyPdf.pdf.html#pyPdf.pdf.Destination.page-attribute the page number for the bookmark is just Destination.page.

    0 讨论(0)
提交回复
热议问题