I want a Python function that takes a PDF and returns a list of the text of the note annotations in the document. I have looked at python-poppler (https://code.launchpad.net
The author of PyMuPDF, @JorjMcKie,
wrote a snippet for me, and I modified it a bit:
import fitz # to import the PyMuPDF library
# from pprint import pprint
def _parse_highlight(annot: "fitz.Annot", wordlist: list) -> str:
    """Return the page text covered by the quads of an annotation.

    The annotation's ``vertices`` are consumed four at a time; each group is
    turned into a quad, and every word whose bounding box intersects that
    quad's rectangle is collected. Words keep the order they have in
    *wordlist*, and the per-quad fragments are joined with single spaces.

    Args:
        annot: Annotation whose ``vertices`` attribute holds the quad corners.
        wordlist: Word tuples whose first four items form the word's bounding
            box and whose item 4 is the word text.

    Returns:
        The collected text, or ``''`` when the annotation has no (or fewer
        than four) vertices.
    """
    points = annot.vertices
    if not points:
        # Missing or empty vertex data: return early instead of failing on
        # len(None).
        return ''

    # Integer division: any trailing, incomplete group of vertices is ignored.
    quad_count = len(points) // 4
    fragments = []
    for i in range(quad_count):
        quad_rect = fitz.Quad(points[i * 4: i * 4 + 4]).rect
        fragments.append(' '.join(
            w[4] for w in wordlist if fitz.Rect(w[:4]).intersects(quad_rect)
        ))
    return ' '.join(fragments)
def main(path: str = 'path/to/your/file') -> dict:
    """Print and return the highlighted text on the first page of a document.

    Args:
        path: Location of the document to open. Defaults to the placeholder
            path the snippet originally hard-coded.

    Returns:
        Dict mapping a running index (0, 1, ...) to the text extracted for
        each annotation of type 8, in annotation order.
    """
    doc = fitz.open(path)
    try:
        page = doc[0]

        # NOTE(review): newer PyMuPDF releases spell these get_text /
        # first_annot -- confirm against the installed version.
        wordlist = page.getText("words")  # list of words on page
        wordlist.sort(key=lambda w: (w[3], w[0]))  # ascending y, then x

        highlights = {}
        annot = page.firstAnnot
        while annot:
            if annot.type[0] == 8:  # type code 8: highlight annotation
                text = _parse_highlight(annot, wordlist)
                # Store and print under the same index; incrementing a counter
                # between the two steps made the print look up a missing key.
                highlights[len(highlights)] = text
                print('> ' + text + '\n')
            annot = annot.next
        return highlights
    finally:
        # Release the file handle even if extraction fails part-way.
        doc.close()
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
There are still some small glitches in the results, though (stray trailing punctuation and words hyphenated across line breaks):
> system upsets,
> expansion of smart grid monitoring devices that generally provide nodal voltages and power injections at fine spatial resolution,
> hurricanes to indi- vidual lightning strikes),