Parse annotations from a pdf

前端 未结 8 762
攒了一身酷
攒了一身酷 2020-11-28 21:15

I want a Python function that takes a PDF and returns a list of the text of the note annotations in the document. I have looked at python-poppler (https://code.launchpad.net

8条回答
  •  萌比男神i
    2020-11-28 21:46

    Just in case somebody is looking for some working code, here is a script I use.

    import poppler
    import sys
    import urllib
    import os
    
    def main():
        """Print every non-link annotation of the PDF named on the command line.

        The PDF path is taken from ``sys.argv[1]``. The document is opened
        with ``poppler.document_new_from_file`` through a ``file://`` URI
        built from the absolute path. For each annotation whose type name is
        not ``POPPLER_ANNOT_LINK`` one line is printed with the 1-based page
        number, the value of ``get_modified()``, the type nick and the
        annotation contents. A final line reports the number of annotations
        printed, or that none were found.

        Exits through ``sys.exit`` with a usage message when no path argument
        was supplied.
        """
        if len(sys.argv) < 2:
            # Fail with a readable message instead of an IndexError traceback.
            script_name = sys.argv[0] if sys.argv else 'script'
            sys.exit('usage: %s <file.pdf>' % script_name)

        input_filename = sys.argv[1]
        # The file is handed over as a URI rather than a plain path, see
        # http://blog.hartwork.org/?p=612
        uri = 'file://%s' % urllib.pathname2url(os.path.abspath(input_filename))
        # NOTE(review): the second argument is presumably the document
        # password (None = no password) -- confirm against the binding.
        document = poppler.document_new_from_file(uri, None)
        all_annots = 0

        for i in range(document.get_n_pages()):
            page = document.get_page(i)
            # Iterating an empty mapping list is a no-op, so no explicit
            # length check is needed.
            for annot_mapping in page.get_annot_mapping():
                annot = annot_mapping.annot
                annot_type = annot.get_annot_type()
                if annot_type.value_name == 'POPPLER_ANNOT_LINK':
                    # Link annotations are skipped and not counted.
                    continue
                all_annots += 1
                # Single-argument print(...) prints the same text under the
                # Python 2 print statement.
                print('page: {0:3}, {1:10}, type: {2:10}, content: {3}'.format(
                    i + 1,
                    annot.get_modified(),
                    annot_type.value_nick,
                    annot.get_contents()))

        if all_annots > 0:
            print(str(all_annots) + " annotation(s) found")
        else:
            print("no annotations found")
    
    # Run the script only when this file is executed directly, not when it is
    # imported as a module.
    if __name__ == "__main__":
        main()
    

提交回复
热议问题