Could someone give me an example of how to extract the coordinates of a 'word' using PDFBox?

后端 未结 2 496
[愿得一人]
[愿得一人] 2020-12-11 13:53

Could someone give me an example of how to extract the coordinates of a 'word' with PDFBox?

I am using this link to extract positions of individual characters: https:/

2条回答
  •  攒了一身酷
    2020-12-11 14:43

    You can create a CustomPDFTextStripper which extends PDFTextStripper and overrides protected void writeString(String text, List<TextPosition> textPositions). In this overridden method you need to split textPositions by the word separator to get a List<TextPosition> for each word. After that you can join the characters of each word and compute its bounding box.

    A full example is given below; it also draws the resulting bounding boxes.

    package com.example;
    
    import lombok.Value;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.rendering.PDFRenderer;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.pdfbox.text.TextPosition;
    import org.junit.Ignore;
    import org.junit.Test;
    
    import javax.imageio.ImageIO;
    import java.awt.*;
    import java.awt.image.BufferedImage;
    import java.io.*;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.stream.Collectors;
    
    public class PdfBoxTest {
    
        private static final String BASE_DIR_PATH = "C:\\Users\\Milan\\50330484";
        private static final String INPUT_FILE_PATH = "input.pdf";
        private static final String OUTPUT_IMAGE_PATH = "output.jpg";
        private static final String OUTPUT_BBOX_IMAGE_PATH = "output-bbox.jpg";
    
        private static final float FROM_72_TO_300_DPI = 300.0f / 72.0f;
    
        @Test
        public void run() throws Exception {
            pdfToImage();
            drawBoundingBoxes();
        }
    
        @Ignore
        @Test
        public void pdfToImage() throws IOException {
            PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
            PDFRenderer renderer = new PDFRenderer(document);
            BufferedImage image = renderer.renderImageWithDPI(0, 300);
            ImageIO.write(image, "JPEG", new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
        }
    
        @Ignore
        @Test
        public void drawBoundingBoxes() throws IOException {
    
            PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
    
            List words = getWords(document);
    
            draw(words);
        }
    
        private List getWords(PDDocument document) throws IOException {
    
            CustomPDFTextStripper customPDFTextStripper = new CustomPDFTextStripper();
            customPDFTextStripper.setSortByPosition(true);
            customPDFTextStripper.setStartPage(0);
            customPDFTextStripper.setEndPage(1);
    
            Writer writer = new OutputStreamWriter(new ByteArrayOutputStream());
            customPDFTextStripper.writeText(document, writer);
    
            List words = customPDFTextStripper.getWords();
    
            return words;
        }
    
        private void draw(List words) throws IOException {
    
            BufferedImage bufferedImage = ImageIO.read(new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
    
            Graphics2D graphics = bufferedImage.createGraphics();
    
            graphics.setColor(Color.GREEN);
    
            List rectangles = words.stream()
                    .map(word -> new Rectangle(word.getX(), word.getY(), word.getWidth(), word.getHeight()))
                    .collect(Collectors.toList());
            rectangles.forEach(graphics::draw);
    
            graphics.dispose();
    
            ImageIO.write(bufferedImage, "JPEG", new File(BASE_DIR_PATH, OUTPUT_BBOX_IMAGE_PATH));
        }
    
        private class CustomPDFTextStripper extends PDFTextStripper {
    
            private final List words;
    
            public CustomPDFTextStripper() throws IOException {
                this.words = new ArrayList<>();
            }
    
            public List getWords() {
                return new ArrayList<>(words);
            }
    
            @Override
            protected void writeString(String text, List textPositions) throws IOException {
    
                String wordSeparator = getWordSeparator();
                List wordTextPositions = new ArrayList<>();
    
                for (TextPosition textPosition : textPositions) {
                    String str = textPosition.getUnicode();
                    if (wordSeparator.equals(str)) {
                        if (!wordTextPositions.isEmpty()) {
                            this.words.add(createWord(wordTextPositions));
                            wordTextPositions.clear();
                        }
                    } else {
                        wordTextPositions.add(textPosition);
                    }
                }
    
                super.writeString(text, textPositions);
            }
    
            private WordWithBBox createWord(List wordTextPositions) {
    
                String word = wordTextPositions.stream()
                        .map(TextPosition::getUnicode)
                        .collect(Collectors.joining());
    
                int minX = Integer.MAX_VALUE;
                int minY = Integer.MAX_VALUE;
                int maxX = Integer.MIN_VALUE;
                int maxY = Integer.MIN_VALUE;
    
                for (TextPosition wordTextPosition : wordTextPositions) {
    
                    minX = Math.min(minX, from72To300Dpi(wordTextPosition.getXDirAdj()));
                    minY = Math.min(minY, from72To300Dpi(wordTextPosition.getYDirAdj() - wordTextPosition.getHeightDir()));
                    maxX = Math.max(maxX, from72To300Dpi(wordTextPosition.getXDirAdj() + wordTextPosition.getWidthDirAdj()));
                    maxY = Math.max(maxY, from72To300Dpi(wordTextPosition.getYDirAdj()));
                }
    
                return new WordWithBBox(word, minX, minY, maxX - minX, maxY - minY);
            }
        }
    
        private int from72To300Dpi(float f) {
            return Math.round(f * FROM_72_TO_300_DPI);
        }
    
        @Value
        private class WordWithBBox {
            private final String word;
            private final int x;
            private final int y;
            private final int width;
            private final int height;
        }
    }
    

    Note:

    If you are interested in other options, you can also check out Poppler.

    PDF to image

    pdftoppm -r 300 -jpeg input.pdf output
    

    Generate an XHTML file containing bounding box information for each word in the file.

    pdftotext -r 300 -bbox input.pdf
    

提交回复
热议问题