Could someone give me an example of how to extract coordinates for a 'word' with PDFBox?
I am using this link to extract positions of individual characters: https:/
You can create CustomPDFTextStripper
which extends PDFTextStripper
and override protected void writeString(String text, List<TextPosition> textPositions)
. In this overridden method you need to split textPositions
by the word separator to get a List<TextPosition>
for each word. After that you can join each character and compute bounding box.
The full example below also draws the resulting bounding boxes.
package com.example;
import lombok.Value;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.junit.Ignore;
import org.junit.Test;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
public class PdfBoxTest {
private static final String BASE_DIR_PATH = "C:\\Users\\Milan\\50330484";
private static final String INPUT_FILE_PATH = "input.pdf";
private static final String OUTPUT_IMAGE_PATH = "output.jpg";
private static final String OUTPUT_BBOX_IMAGE_PATH = "output-bbox.jpg";
private static final float FROM_72_TO_300_DPI = 300.0f / 72.0f;
@Test
public void run() throws Exception {
pdfToImage();
drawBoundingBoxes();
}
@Ignore
@Test
public void pdfToImage() throws IOException {
PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
PDFRenderer renderer = new PDFRenderer(document);
BufferedImage image = renderer.renderImageWithDPI(0, 300);
ImageIO.write(image, "JPEG", new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
}
@Ignore
@Test
public void drawBoundingBoxes() throws IOException {
PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
List words = getWords(document);
draw(words);
}
private List getWords(PDDocument document) throws IOException {
CustomPDFTextStripper customPDFTextStripper = new CustomPDFTextStripper();
customPDFTextStripper.setSortByPosition(true);
customPDFTextStripper.setStartPage(0);
customPDFTextStripper.setEndPage(1);
Writer writer = new OutputStreamWriter(new ByteArrayOutputStream());
customPDFTextStripper.writeText(document, writer);
List words = customPDFTextStripper.getWords();
return words;
}
private void draw(List words) throws IOException {
BufferedImage bufferedImage = ImageIO.read(new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
Graphics2D graphics = bufferedImage.createGraphics();
graphics.setColor(Color.GREEN);
List rectangles = words.stream()
.map(word -> new Rectangle(word.getX(), word.getY(), word.getWidth(), word.getHeight()))
.collect(Collectors.toList());
rectangles.forEach(graphics::draw);
graphics.dispose();
ImageIO.write(bufferedImage, "JPEG", new File(BASE_DIR_PATH, OUTPUT_BBOX_IMAGE_PATH));
}
private class CustomPDFTextStripper extends PDFTextStripper {
private final List words;
public CustomPDFTextStripper() throws IOException {
this.words = new ArrayList<>();
}
public List getWords() {
return new ArrayList<>(words);
}
@Override
protected void writeString(String text, List textPositions) throws IOException {
String wordSeparator = getWordSeparator();
List wordTextPositions = new ArrayList<>();
for (TextPosition textPosition : textPositions) {
String str = textPosition.getUnicode();
if (wordSeparator.equals(str)) {
if (!wordTextPositions.isEmpty()) {
this.words.add(createWord(wordTextPositions));
wordTextPositions.clear();
}
} else {
wordTextPositions.add(textPosition);
}
}
super.writeString(text, textPositions);
}
private WordWithBBox createWord(List wordTextPositions) {
String word = wordTextPositions.stream()
.map(TextPosition::getUnicode)
.collect(Collectors.joining());
int minX = Integer.MAX_VALUE;
int minY = Integer.MAX_VALUE;
int maxX = Integer.MIN_VALUE;
int maxY = Integer.MIN_VALUE;
for (TextPosition wordTextPosition : wordTextPositions) {
minX = Math.min(minX, from72To300Dpi(wordTextPosition.getXDirAdj()));
minY = Math.min(minY, from72To300Dpi(wordTextPosition.getYDirAdj() - wordTextPosition.getHeightDir()));
maxX = Math.max(maxX, from72To300Dpi(wordTextPosition.getXDirAdj() + wordTextPosition.getWidthDirAdj()));
maxY = Math.max(maxY, from72To300Dpi(wordTextPosition.getYDirAdj()));
}
return new WordWithBBox(word, minX, minY, maxX - minX, maxY - minY);
}
}
private int from72To300Dpi(float f) {
return Math.round(f * FROM_72_TO_300_DPI);
}
@Value
private class WordWithBBox {
private final String word;
private final int x;
private final int y;
private final int width;
private final int height;
}
}
Note:
If you are interested in other options, you can also check out Poppler:
PDF to image
pdftoppm -r 300 -jpeg input.pdf output
Generate an XHTML file containing bounding box information for each word in the file.
pdftotext -r 300 -bbox input.pdf