Getting PDF TextObjects with PDFBox

守給你的承諾、 提交于 2019-12-03 07:28:52
Phil

Based on the linked question and yesterday's hint from mkl (thanks!), I decided to build something that parses the content stream tokens. One thing to keep in mind is that within a PDF text object the operands (called "attributes" in the code below) precede their operator, so I collect them in a list until I encounter the operator. Once I know which operator they belong to, I move them to their proper fields. This is what I came up with:

import java.io.File;
import java.util.List;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFOperator;

/**
 * Sample that prints the text and position of every text object (BT ... ET) found on one
 * page of a PDF file.
 *
 * <p>Uses the older PDFBox API ({@code org.apache.pdfbox.util.PDFOperator},
 * {@code PDDocumentCatalog.getAllPages()}). Only the {@code Tj} and {@code Tm} operators are
 * interpreted; every other operator is skipped.
 */
public class TextExtractor {
    /**
     * Parses the content stream of a single page and prints one line per text object in the
     * form {@code Text: <text>@<x>,<y>}.
     *
     * @param args ignored; the input path is hard-coded
     */
    public static void main(String[] args) {
        try {
            File input = new File("C:\\some\\file.pdf");
            PDDocument document = PDDocument.load(input);
            try {
                List<?> allPages = document.getDocumentCatalog().getAllPages();
                // Only one page is parsed because this is just a sample. The index is
                // zero-based, so 2 selects the third page of the document.
                PDPage page = (PDPage) allPages.get(2);
                PDStream contents = page.getContents();
                PDFStreamParser parser = new PDFStreamParser(contents.getStream());
                parser.parse();
                List<?> tokens = parser.getTokens();
                // True while the tokens being read lie between a BT and its matching ET.
                boolean parsingTextObject = false;
                PDFTextObject textobj = new PDFTextObject();
                for (Object next : tokens) {
                    if (next instanceof PDFOperator) {
                        PDFOperator op = (PDFOperator) next;
                        switch (op.getOperation()) {
                            case "BT":
                                // BT: Begin Text. Start collecting into a fresh text object.
                                parsingTextObject = true;
                                textobj = new PDFTextObject();
                                break;
                            case "ET":
                                // ET: End Text. Report what was collected for this object.
                                parsingTextObject = false;
                                System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
                                break;
                            case "Tj":
                                textobj.setText();
                                break;
                            case "Tm":
                                textobj.setMatrix();
                                break;
                            default:
                                // Other operators are intentionally ignored in this sample.
                                break;
                        }
                        // Operands belong to the operator that follows them, so whatever was
                        // collected is stale once an operator has been seen.
                        textobj.clearAllAttributes();
                    } else if (parsingTextObject) {
                        textobj.addAttribute(next);
                    }
                }
            } finally {
                // Release the file handle held by the document even if parsing fails.
                document.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

In combination with:

import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;

/**
 * Holds the operands collected for the current operator of a PDF text object, together with
 * the text and the x/y position extracted from them so far.
 *
 * <p>{@code x} and {@code y} start at -1 and the text starts empty.
 */
class PDFTextObject {
    private final List<Object> attributes = new ArrayList<Object>();
    private String text = "";
    private float x = -1;
    private float y = -1;

    /** Forgets every operand collected so far. */
    public void clearAllAttributes() {
        attributes.clear();
    }

    /**
     * Remembers one operand until the operator it belongs to is seen.
     *
     * @param anAttribute the operand token to store
     */
    public void addAttribute(Object anAttribute) {
        attributes.add(anAttribute);
    }

    /**
     * Appends every collected {@code COSString} operand to the text. Any operand of another
     * type is reported on standard output and skipped.
     */
    public void setText() {
        for (Object operand : attributes) {
            if (!(operand instanceof COSString)) {
                System.out.println("Whoops! Wrong type of property...");
                continue;
            }
            text += ((COSString) operand).getString();
        }
    }

    /** @return the text accumulated so far */
    public String getText() {
        return text;
    }

    /**
     * Takes the operands at index 4 and 5 as the x and y position. A coordinate whose operand
     * is missing is left unchanged; one whose operand is not numeric becomes -1.
     */
    public void setMatrix() {
        int operandCount = attributes.size();
        if (operandCount > 4) {
            x = numericOperand(4);
        }
        if (operandCount > 5) {
            y = numericOperand(5);
        }
    }

    /** Converts the operand at {@code index} to a float, or -1 if it is not a COS number. */
    private float numericOperand(int index) {
        Object operand = attributes.get(index);
        if (operand instanceof COSFloat) {
            return ((COSFloat) operand).floatValue();
        }
        if (operand instanceof COSInteger) {
            return ((COSInteger) operand).floatValue();
        }
        return -1;
    }

    /** @return the x position, or -1 if none has been set */
    public float getX() {
        return x;
    }

    /** @return the y position, or -1 if none has been set */
    public float getY() {
        return y;
    }
}

It gives the output:

Text: This page has been intentionally left blank.@70.8661,576.0
Text: 2@45.7136,761.1024

While it does the trick, I'm sure I've broken some conventions and haven't always written the most elegant code. Improvements and alternate solutions are welcome.

Here is a version of Phil's answer adapted to pdfbox-2.0.1:

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;

/**
 * Sample that prints the text and position of every text object (BT ... ET) found on the
 * first page of a PDF file, using the PDFBox 2.0 content stream parser.
 *
 * <p>Only the {@code Tj} and {@code Tm} operators are interpreted; every other operator is
 * reported as unsupported.
 */
public class TextExtractor {
  /**
   * Parses the content stream of the first page and prints one line per text object in the
   * form {@code Text: <text>@<x>,<y>}.
   *
   * @param args ignored; the input path is hard-coded
   */
  public static void main(String[] args) {
    File input = new File("src\\test\\resources\\files\\file1.pdf");
    // try-with-resources closes the document (and its file handle) even if parsing fails.
    try (PDDocument document = PDDocument.load(input)) {
      PDPageTree allPages = document.getDocumentCatalog().getPages();
      // Only the first page (index 0) is parsed, as this is only a sample.
      PDPage page = allPages.get(0);
      PDFStreamParser parser = new PDFStreamParser(page);
      parser.parse();
      List<?> tokens = parser.getTokens();
      // True while the tokens being read lie between a BT and its matching ET.
      boolean parsingTextObject = false;
      PDFTextObject textobj = new PDFTextObject();
      for (Object next : tokens) {
        if (next instanceof Operator) {
          Operator op = (Operator) next;
          switch (op.getName()) {
            case "BT":
              // BT: Begin Text. Start collecting into a fresh text object.
              parsingTextObject = true;
              textobj = new PDFTextObject();
              break;
            case "ET":
              // ET: End Text. Report what was collected for this object.
              parsingTextObject = false;
              System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
              break;
            case "Tj":
              textobj.setText();
              break;
            case "Tm":
              textobj.setMatrix();
              break;
            default:
              System.out.println("unsupported operation " + op);
              break;
          }
          // Operands belong to the operator that follows them, so whatever was collected is
          // stale once an operator has been seen.
          textobj.clearAllAttributes();
        } else if (parsingTextObject) {
          textobj.addAttribute(next);
        } else {
          System.out.println("ignore "+next.getClass()+" -> "+next);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Holds the operands collected for the current operator of a PDF text object, together
   * with the text and the x/y position extracted from them so far.
   *
   * <p>{@code x} and {@code y} start at -1 and the text starts empty.
   */
  static class PDFTextObject {
    private final List<Object> attributes = new ArrayList<Object>();
    private String text = "";
    private float x = -1;
    private float y = -1;

    /** Forgets every operand collected so far. */
    public void clearAllAttributes() {
      attributes.clear();
    }

    /**
     * Remembers one operand until the operator it belongs to is seen.
     *
     * @param anAttribute the operand token to store
     */
    public void addAttribute(Object anAttribute) {
      attributes.add(anAttribute);
    }

    /**
     * Appends every collected {@code COSString} operand to the text. Any operand of another
     * type is reported on standard output and skipped.
     */
    public void setText() {
      for (Object operand : attributes) {
        if (operand instanceof COSString) {
          text = text + ((COSString) operand).getString();
        } else {
          System.out.println("Whoops! Wrong type of property...");
        }
      }
    }

    /** @return the text accumulated so far */
    public String getText() {
      return text;
    }

    /**
     * Takes the operands at index 4 and 5 as the x and y position. A coordinate whose
     * operand is missing is left unchanged; one whose operand is not numeric becomes -1.
     */
    public void setMatrix() {
      int operandCount = attributes.size();
      if (operandCount > 4) {
        x = numericOperand(4);
      }
      if (operandCount > 5) {
        y = numericOperand(5);
      }
    }

    /** Converts the operand at {@code index} to a float, or -1 if it is not a COS number. */
    private float numericOperand(int index) {
      Object operand = attributes.get(index);
      if (operand instanceof COSInteger) {
        return ((COSInteger) operand).floatValue();
      }
      if (operand instanceof COSFloat) {
        return ((COSFloat) operand).floatValue();
      }
      return -1;
    }

    /** @return the x position, or -1 if none has been set */
    public float getX() {
      return x;
    }

    /** @return the y position, or -1 if none has been set */
    public float getY() {
      return y;
    }
  }
}
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!