I am trying to extract text, together with all of its associated information, from a PDF using PDFBox. I got all the information I want, except the color. I tried different ways to get the font color (incl
I also ended up doing something like this. Pasting code below, hope it helps someone.
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.ResourceLoader;
import org.apache.pdfbox.util.TextPosition;
/**
 * Text stripper that prints the RGB components of the current non-stroking
 * colour for every {@link TextPosition} encountered while processing a PDF's
 * page content streams.
 *
 * <p>Output goes to {@code System.out} as three lines ({@code R = ..},
 * {@code G = ..}, {@code B = ..}) per text position. No text is collected:
 * {@link #processTextPosition(TextPosition)} deliberately does not delegate
 * to the superclass.
 */
public class Parser extends PDFTextStripper {

    /** Path used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_PDF_PATH = "/Users/apple/Desktop/123.pdf";

    /**
     * Creates a parser configured from {@code PageDrawer.properties} with
     * sort-by-position enabled.
     *
     * <p>NOTE(review): the PageDrawer operator table is presumably used so that
     * colour-setting operators update the graphics state - confirm against the
     * PDFBox resources of the version in use.
     *
     * @throws IOException if the properties resource cannot be loaded
     */
    public Parser() throws IOException {
        super(ResourceLoader.loadProperties(
                "org/apache/pdfbox/resources/PageDrawer.properties", true));
        super.setSortByPosition(true);
    }

    /**
     * Loads the PDF at {@code path} and runs every page's content stream
     * through this stripper, triggering {@link #processTextPosition} for each
     * piece of text. The document is always closed before returning.
     *
     * @param path file system path of the PDF to read
     * @throws IOException if the document cannot be loaded or a page stream
     *                     cannot be processed
     */
    public void parse(String path) throws IOException {
        PDDocument doc = PDDocument.load(path);
        try {
            // getAllPages() returns a raw List; iterate as Object and cast so
            // the loop compiles without an unchecked conversion.
            List<?> pages = doc.getDocumentCatalog().getAllPages();
            for (Object entry : pages) {
                PDPage page = (PDPage) entry;
                // A page without a content stream has nothing to process.
                if (page.getContents() == null) {
                    continue;
                }
                this.processStream(page, page.getResources(), page.getContents().getStream());
            }
        } finally {
            // Release the file handle / scratch data even when processing fails.
            doc.close();
        }
    }

    /**
     * Prints the red, green and blue components of the graphics state's
     * current non-stroking colour. The {@code text} argument itself is not
     * inspected.
     *
     * <p>The overridden signature cannot throw {@link IOException}, so a
     * failure to resolve the colour is reported on {@code System.err} and the
     * text position is skipped instead of being silently ignored.
     *
     * @param text the text position being processed (unused)
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        try {
            PDGraphicsState graphicsState = getGraphicsState();
            // Resolve the colour once instead of once per component.
            java.awt.Color color = graphicsState.getNonStrokingColor().getJavaColor();
            if (color == null) {
                // NOTE(review): assumed possible for colour spaces that have no
                // direct java.awt.Color equivalent - confirm.
                System.err.println("No Java color available for current non-stroking color");
                return;
            }
            System.out.println("R = " + color.getRed());
            System.out.println("G = " + color.getGreen());
            System.out.println("B = " + color.getBlue());
        } catch (IOException ioe) {
            System.err.println("Could not determine text color: " + ioe.getMessage());
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args optional; {@code args[0]} is the PDF path. When absent the
     *             original hard-coded default path is used.
     * @throws IOException          if the PDF cannot be read or processed
     * @throws COSVisitorException  kept in the signature for compatibility
     */
    public static void main(String[] args) throws IOException, COSVisitorException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        Parser p = new Parser();
        p.parse(path);
    }
}