I am using the Apache PDFBox library to extract the highlighted text (i.e., text with a yellow background) from a PDF file. I am totally new to this library and don't know which c…
I hope this answer helps everyone who is facing the same problem.
// Sections of the PDF specification that this method relies on:
// PDF32000-2008
// 12.5.2 Annotation Dictionaries
// 12.5.6 Annotation Types
// 12.5.6.10 Text Markup Annotations
/**
 * Opens the PDF at {@code filePath}, selects one page, reads that page's
 * annotations, and returns a list of strings built up while looping over them.
 *
 * NOTE(review): the body of this method is incomplete as it stands. The
 * {@code for} header is cut off mid-expression, the braces that follow do not
 * balance, and {@code str} / {@code highlightedText} are used without a visible
 * declaration. It looks like everything between a '<' and a later '>' was
 * stripped when the snippet was copied (the generic type arguments on the
 * collections appear to be missing for the same reason). Restore the original
 * loop before relying on this code.
 *
 * NOTE(review): {@code document.close()} is a plain statement at the end of the
 * method, so it is skipped if anything before it throws. Consider closing the
 * document in a finally block once the body is restored.
 *
 * @param filePath   path handed to {@code PDDocument.load}
 * @param pageNumber index used with {@code List.get} on the page list, so it is
 *                   zero-based (0 selects the first entry of that list)
 * @return the list that the loop appends each accumulated string to
 * @throws IOException declared by the signature; presumably raised while
 *                     loading or reading the document (confirm against the
 *                     PDFBox version in use)
 */
@SuppressWarnings({ "unchecked", "unused" })
public ArrayList getHighlightedText(String filePath, int pageNumber) throws IOException {
// Result accumulator; one entry is added per iteration of the outer loop body below.
ArrayList highlightedTexts = new ArrayList<>();
// In-memory representation of the PDF document,
// loaded from the file at filePath.
PDDocument document = PDDocument.load(filePath);
// All pages of the document, taken from the document catalog.
List allPages = document.getDocumentCatalog().getAllPages();
// The single page selected by pageNumber (index into allPages).
PDPage page = allPages.get(pageNumber);
// Annotation dictionaries attached to that page.
List annotations = page.getAnnotations();
// NOTE(review): this loop header is truncated. The text between "i" and "1) {" is
// missing, and with it presumably the annotation filter, the region extraction
// and an inner loop whose "> 1" test chooses between concat and assignment below.
for(int i=0; i 1) {
// Later pieces of the same highlight are appended to what was collected so far.
str = str.concat(highlightedText);
} else {
// Otherwise the piece becomes the initial value of str.
str = highlightedText;
}
}
// Store the accumulated text in the result list.
highlightedTexts.add(str);
}
}
// Release the document; not reached if an exception was thrown above.
document.close();
return highlightedTexts;
}