extract images from pdf using pdfbox

前端 未结 8 1997
刺人心
刺人心 2020-11-28 09:22

I'm trying to extract images from a PDF using PDFBox. The example PDF is here.

But I'm getting only blank images.

The code I'm trying:

public st         


        
8条回答
  •  清歌不尽
    2020-11-28 10:06

    For anyone who just wants to copy and paste, here is ready-to-use code:

    import org.apache.pdfbox.contentstream.PDFStreamEngine;
    import org.apache.pdfbox.contentstream.operator.Operator;
    import org.apache.pdfbox.cos.COSBase;
    import org.apache.pdfbox.cos.COSName;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.PDPage;
    import org.apache.pdfbox.pdmodel.graphics.PDXObject;
    import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
    import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
    
    import javax.imageio.ImageIO;
    import java.awt.image.BufferedImage;
    import java.io.File;
    import java.io.IOException;
    import java.util.List;
    import java.util.UUID;
    
    /**
     * Extracts every raster image drawn by a PDF's page content streams and
     * writes each one as a PNG file into an output directory.
     *
     * <p>The class walks the content stream of each page and intercepts the
     * {@code Do} (draw XObject) operator. Image XObjects are written to disk;
     * form XObjects are processed recursively so images nested inside them are
     * found as well.
     */
    public class ExtractImagesUseCase extends PDFStreamEngine{
        private final String filePath;
        private final String outputDir;

        /**
         * @param filePath  path of the PDF file to read
         * @param outputDir directory that receives the extracted PNG files
         */
        public ExtractImagesUseCase(String filePath,
                                    String outputDir){
            this.filePath = filePath;
            this.outputDir = outputDir;
        }

        /**
         * Loads the PDF, processes every page and writes the images found.
         *
         * <p>I/O failures are reported on standard error rather than thrown,
         * so this method never propagates an {@link IOException}.
         */
        public void execute(){
            // try-with-resources guarantees the document (and the file handle /
            // scratch buffers behind it) is released even if a page fails.
            try(PDDocument document = PDDocument.load(new File(filePath))){
                ensureOutputDirectory();

                for(PDPage page : document.getPages()){
                    processPage(page);
                }

            }catch(IOException e){
                e.printStackTrace();
            }
        }

        /**
         * Creates the output directory (including parents) when it is missing.
         *
         * @throws IOException if the directory does not exist and cannot be created
         */
        private void ensureOutputDirectory() throws IOException{
            if(outputDir == null){
                // A null parent makes File(parent, child) resolve against the
                // working directory, so there is nothing to create.
                return;
            }
            File directory = new File(outputDir);
            if(!directory.isDirectory() && !directory.mkdirs()){
                throw new IOException("Cannot create output directory: " + directory);
            }
        }

        /**
         * Intercepts the {@code Do} operator to export images; every other
         * operator is delegated to the superclass.
         *
         * @param operator the content stream operator being processed
         * @param operands the operands that preceded the operator
         * @throws IOException if an image cannot be decoded or written
         */
        @Override
        protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
            if(!"Do".equals(operator.getName())){
                super.processOperator(operator, operands);
                return;
            }

            // A well-formed Do operator has exactly one name operand; skip
            // malformed ones instead of failing the whole extraction.
            if(operands.isEmpty() || !(operands.get(0) instanceof COSName)){
                return;
            }

            COSName objectName = (COSName) operands.get(0);
            PDXObject pdxObject = getResources().getXObject(objectName);

            if(pdxObject instanceof PDImageXObject){
                PDImageXObject image = (PDImageXObject) pdxObject;
                BufferedImage bImage = image.getImage();

                // A random UUID keeps file names unique across pages and runs.
                String randomName = UUID.randomUUID().toString();
                File outputFile = new File(outputDir, randomName + ".png");

                ImageIO.write(bImage, "PNG", outputFile);

            }else if(pdxObject instanceof PDFormXObject){
                // Forms can contain further images; descend into them.
                showForm((PDFormXObject) pdxObject);
            }
        }
    }
    

    Demo

    public class ExtractImageDemo{
        public static void main(String[] args){
            String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
            String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
    
            ExtractImagesUseCase useCase = new ExtractImagesUseCase(
                    filePath,
                    outputDir
            );
            useCase.execute();
        }
    }
    

提交回复
热议问题