Manipulate paths, color etc. in iText

前端 未结 1 1575
长发绾君心
长发绾君心 2020-12-04 04:11

I need to analyze path data of PDF files and manipulate content with iText 7. Manipulations include deletion/replacement and coloring.

I can analyze the graphics al

相关标签:
1条回答
  • 2020-12-04 04:35

    Now, what's the way to go with manipulating things and writing them back to the PDF? Do I have to construct an entirely new PDF document and copy everything over (in manipulated form), or can I somehow manipulate the read PDF data directly?

    In essence you are looking for a class which is not merely parsing a PDF content stream and signaling the instructions in it like the PdfCanvasProcessor (the PdfDocumentContentParser you use is merely a very thin wrapper for PdfCanvasProcessor) but which also creates the content stream anew with the instructions you forward back to it.

    A generic content stream editor class

    For iText 5.5.x a proof-of-concept for such a content stream editor class can be found in this answer (the Java version is a bit further down in the answer text).

    This is a port of that proof-of-concept to iText 7:

    public class PdfCanvasEditor extends PdfCanvasProcessor
    {
        /**
         * This method edits the immediate contents of a page, i.e. its content stream.
         * It explicitly does not descent into form xobjects, patterns, or annotations.
         */
        public void editPage(PdfDocument pdfDocument, int pageNumber) throws IOException
        {
            if ((pdfDocument.getReader() == null) || (pdfDocument.getWriter() == null))
            {
                throw new PdfException("PdfDocument must be opened in stamping mode.");
            }
    
            PdfPage page = pdfDocument.getPage(pageNumber);
            PdfResources pdfResources = page.getResources();
            PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);
            editContent(page.getContentBytes(), pdfResources, pdfCanvas);
            page.put(PdfName.Contents, pdfCanvas.getContentStream());
        }
    
        /**
         * This method processes the content bytes and outputs to the given canvas.
         * It explicitly does not descent into form xobjects, patterns, or annotations.
         */
        public void editContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
        {
            this.canvas = canvas;
            processContent(contentBytes, resources);
            this.canvas = null;
        }
    
        /**
         * <p>
         * This method writes content stream operations to the target canvas. The default
         * implementation writes them as they come, so it essentially generates identical
         * copies of the original instructions the {@link ContentOperatorWrapper} instances
         * forward to it.
         * </p>
         * <p>
         * Override this method to achieve some fancy editing effect.
         * </p> 
         */
        protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
        {
            PdfOutputStream pdfOutputStream = canvas.getContentStream().getOutputStream();
            int index = 0;
    
            for (PdfObject object : operands)
            {
                pdfOutputStream.write(object);
                if (operands.size() > ++index)
                    pdfOutputStream.writeSpace();
                else
                    pdfOutputStream.writeNewLine();
            }
        }
    
        //
        // constructor giving the parent a dummy listener to talk to 
        //
        public PdfCanvasEditor()
        {
            super(new DummyEventListener());
        }
    
        //
        // Overrides of PdfContentStreamProcessor methods
        //
        @Override
        public IContentOperator registerContentOperator(String operatorString, IContentOperator operator)
        {
            ContentOperatorWrapper wrapper = new ContentOperatorWrapper();
            wrapper.setOriginalOperator(operator);
            IContentOperator formerOperator = super.registerContentOperator(operatorString, wrapper);
            return formerOperator instanceof ContentOperatorWrapper ? ((ContentOperatorWrapper)formerOperator).getOriginalOperator() : formerOperator;
        }
    
        //
        // members holding the output canvas and the resources
        //
        protected PdfCanvas canvas = null;
    
        //
        // A content operator class to wrap all content operators to forward the invocation to the editor
        //
        class ContentOperatorWrapper implements IContentOperator
        {
            public IContentOperator getOriginalOperator()
            {
                return originalOperator;
            }
    
            public void setOriginalOperator(IContentOperator originalOperator)
            {
                this.originalOperator = originalOperator;
            }
    
            @Override
            public void invoke(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
            {
                if (originalOperator != null && !"Do".equals(operator.toString()))
                {
                    originalOperator.invoke(processor, operator, operands);
                }
                write(processor, operator, operands);
            }
    
            private IContentOperator originalOperator = null;
        }
    
        //
        // A dummy event listener to give to the underlying canvas processor to feed events to
        //
        static class DummyEventListener implements IEventListener
        {
            @Override
            public void eventOccurred(IEventData data, EventType type)
            { }
    
            @Override
            public Set<EventType> getSupportedEvents()
            {
                return null;
            }
        }
    }
    

    (PdfCanvasEditor.java)

    The explanations from the iText 5 answer still apply, the parsing framework has not changed much from iText 5.5.x to iText 7.0.x.

    Usage examples

    Unfortunately you wrote in very vague terms about how exactly you want to change the contents. Thus I simply ported some iText 5 samples which made use of the original iText 5 content stream editor class:

    Watermark removal

    These are ports of the use cases in this answer.

    testRemoveBoldMTTextDocument

    This example drops all text written in a font the name of which ends with "BoldMT":

    // Copies document.pdf to document-noBoldMTText.pdf, dropping every text showing
    // instruction that is executed while a font whose name ends with "BoldMT" is selected.
    try (   InputStream resource = getClass().getResourceAsStream("document.pdf");
            PdfReader pdfReader = new PdfReader(resource);
            OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-noBoldMTText.pdf"));
            PdfWriter pdfWriter = new PdfWriter(result);
            PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
    {
        PdfCanvasEditor editor = new PdfCanvasEditor()
        {

            @Override
            protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
            {
                String operatorString = operator.toString();

                if (TEXT_SHOWING_OPERATORS.contains(operatorString) && isCurrentFontBoldMT())
                {
                    // Drop the instruction by not forwarding it to the output.
                    return;
                }

                super.write(processor, operator, operands);
            }

            // Tells whether the font in the current graphics state has a name ending with
            // "BoldMT". Missing font, font program, or font name count as "no match" so that
            // unusual content does not abort the whole edit with a NullPointerException.
            private boolean isCurrentFontBoldMT()
            {
                if (getGraphicsState().getFont() == null
                        || getGraphicsState().getFont().getFontProgram() == null
                        || getGraphicsState().getFont().getFontProgram().getFontNames() == null)
                {
                    return false;
                }
                String fontName = getGraphicsState().getFont().getFontProgram().getFontNames().getFontName();
                return fontName != null && fontName.endsWith("BoldMT");
            }

            final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
        };
        for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
        {
            editor.editPage(pdfDocument, i);
        }
    }
    

    (EditPageContent.java test method testRemoveBoldMTTextDocument)

    testRemoveBigTextDocument

    This example drops all text written with a large font size:

    // Copies document.pdf to document-noBigText.pdf, leaving out every text showing
    // instruction that is executed while the current font size exceeds 100.
    try (   InputStream resource = getClass().getResourceAsStream("document.pdf");
            PdfReader pdfReader = new PdfReader(resource);
            OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-noBigText.pdf"));
            PdfWriter pdfWriter = new PdfWriter(result);
            PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
    {
        PdfCanvasEditor editor = new PdfCanvasEditor()
        {
            // The operators which draw text.
            final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");

            @Override
            protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
            {
                boolean showsText = TEXT_SHOWING_OPERATORS.contains(operator.toString());
                // The font size is only looked at for text showing instructions.
                boolean showsBigText = showsText && getGraphicsState().getFontSize() > 100;

                if (!showsBigText)
                {
                    super.write(processor, operator, operands);
                }
            }
        };

        int pageCount = pdfDocument.getNumberOfPages();
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            editor.editPage(pdfDocument, pageNumber);
        }
    }
    

    (EditPageContent.java test method testRemoveBigTextDocument)

    Text color change

    This is a port of the use case in this answer.

    testChangeBlackTextToGreenDocument

    This example changes the color of black text to green.

    // Copies document.pdf to document-blackTextToGreen.pdf. Before a run of text showing
    // instructions drawn with a black fill color, an RGB green fill color is inserted;
    // before the first instruction after that run, black is restored in its original
    // color space.
    try (   InputStream resource = getClass().getResourceAsStream("document.pdf");
            PdfReader pdfReader = new PdfReader(resource);
            OutputStream result = new FileOutputStream(new File(RESULT_FOLDER, "document-blackTextToGreen.pdf"));
            PdfWriter pdfWriter = new PdfWriter(result);
            PdfDocument pdfDocument = new PdfDocument(pdfReader, pdfWriter) )
    {
        PdfCanvasEditor editor = new PdfCanvasEditor()
        {

            @Override
            protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
            {
                String operatorString = operator.toString();

                if (TEXT_SHOWING_OPERATORS.contains(operatorString))
                {
                    if (currentlyReplacedBlack == null)
                    {
                        Color currentFillColor = getGraphicsState().getFillColor();
                        if (isBlack(currentFillColor))
                        {
                            // Remember the replaced color so it can be restored later.
                            // The operand lists built here end with the operator literal
                            // because the base write(...) only outputs the operands.
                            currentlyReplacedBlack = currentFillColor;
                            super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(1), new PdfNumber(0), new PdfLiteral("rg")));
                        }
                    }
                }
                else if (currentlyReplacedBlack != null)
                {
                    // First instruction after the text: restore black with the operator
                    // matching the color space of the replaced color.
                    if (currentlyReplacedBlack instanceof DeviceCmyk)
                    {
                        super.write(processor, new PdfLiteral("k"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfNumber(1), new PdfLiteral("k")));
                    }
                    else if (currentlyReplacedBlack instanceof DeviceGray)
                    {
                        super.write(processor, new PdfLiteral("g"), Arrays.asList(new PdfNumber(0), new PdfLiteral("g")));
                    }
                    else
                    {
                        super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfLiteral("rg")));
                    }
                    currentlyReplacedBlack = null;
                }

                super.write(processor, operator, operands);
            }

            // Tells whether the given fill color is black in gray, CMYK, or RGB.
            // A plain Color.BLACK.equals(...) is not enough: NOTE(review) iText 7's
            // Color.equals appears to compare the concrete class too, so gray or CMYK
            // black would never match the RGB constant -- confirm against the iText
            // version in use. The gray and CMYK values checked here are exactly those
            // the restore code above writes back.
            private boolean isBlack(Color color)
            {
                if (color instanceof DeviceGray)
                {
                    return color.getColorValue()[0] == 0f;
                }
                if (color instanceof DeviceCmyk)
                {
                    float[] cmyk = color.getColorValue();
                    return cmyk[0] == 0f && cmyk[1] == 0f && cmyk[2] == 0f && cmyk[3] == 1f;
                }
                return Color.BLACK.equals(color);
            }

            // The black fill color currently overridden with green, or null if none is.
            Color currentlyReplacedBlack = null;

            final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
        };
        for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++)
        {
            editor.editPage(pdfDocument, i);
        }
    }
    

    (EditPageContent.java test method testChangeBlackTextToGreenDocument)

    0 讨论(0)
提交回复
热议问题