How to extract text from a PDF file with Apache PDFBox

前端 未结 5 1408
不知归路
不知归路 2020-12-08 05:02

I would like to extract text from a given PDF file with Apache PDFBox.

I wrote this code:

PDFTextStripper pdfStripper = null;
PDDocument pdDoc = null         


        
5条回答
  •  谎友^
    谎友^ (楼主)
    2020-12-08 05:25

    The following code works for extracting text from a PDF file that contains text content (tested with PDFBox 2.0.6):

    import java.io.File;
    import java.io.IOException;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.pdfbox.text.PDFTextStripperByArea;
    
    /**
     * Extracts a passage of text, delimited by a start and an end marker, from a
     * single page of a PDF file using Apache PDFBox.
     */
    public class PDFTextExtractor {

        /** Returned when the PDF cannot be read or the requested passage is not on the page. */
        private static final String NOT_FOUND = "No ParaGraph Found";

        /**
         * Demo entry point: prints the passage found on page 3 of a sample file.
         *
         * @param args unused
         * @throws IOException kept in the signature for compatibility with existing callers
         */
        public static void main(String[] args) throws IOException {
            // Arguments: file path, page number, start marker, end marker.
            System.out.println(readParaFromPDF("C:\\sample1.pdf", 3, "Enter Start Text Here", "Enter Ending Text Here"));
        }

        /**
         * Reads the text of one page and returns the passage running from the first
         * occurrence of the start marker up to and including the end marker.
         *
         * @param pdfPath             path of the PDF file to open
         * @param pageNo              page to read; used as both the start and end page of the stripper
         * @param strStartIndentifier text that marks the beginning of the passage
         * @param strEndIdentifier    text that marks the end of the passage
         * @return the passage including both markers; an empty string if the document
         *         is encrypted; {@code "No ParaGraph Found"} if the file cannot be
         *         read or either marker is missing
         */
        public static String readParaFromPDF(String pdfPath, int pageNo, String strStartIndentifier, String strEndIdentifier) {
            // try-with-resources guarantees the document is closed on every path.
            try (PDDocument document = PDDocument.load(new File(pdfPath))) {
                if (document.isEncrypted()) {
                    return "";
                }
                PDFTextStripper pageStripper = new PDFTextStripper();
                pageStripper.setStartPage(pageNo);
                pageStripper.setEndPage(pageNo);
                String pageText = pageStripper.getText(document);
                return extractBetween(pageText, strStartIndentifier, strEndIdentifier);
            } catch (IOException | RuntimeException e) {
                // Callers rely on the sentinel string rather than an exception, so
                // unreadable files and bad arguments are reported the same way.
                return NOT_FOUND;
            }
        }

        /**
         * Returns the part of {@code text} from {@code startMarker} through {@code endMarker}.
         * The end marker is searched for from the start marker's position onwards, so an
         * earlier occurrence of the end marker does not hide a valid passage.
         */
        private static String extractBetween(String text, String startMarker, String endMarker) {
            int startIndex = text.indexOf(startMarker);
            if (startIndex < 0) {
                return NOT_FOUND;
            }
            int endIndex = text.indexOf(endMarker, startIndex);
            if (endIndex < 0) {
                return NOT_FOUND;
            }
            return text.substring(startIndex, endIndex) + endMarker;
        }
    }
    

提交回复
热议问题