Search and replace text in PDF using JAVA

问题

Need to replace the text in the pdf with different language. In the first step, I was trying to search and replace a text in the pdf file using itextpdf ad pdfbox API.

Use the below code snippet which uses itextpdf api to search and replace the text "Hello" to "Hi" from the source PDF file. The new PDF is created without any text replacements.

public void manipulatePdf(String src, String dest) throws Exception {
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC), new PdfWriter(DEST));
    int noOfPages = pdfDoc.getNumberOfPages();
    for (int i = 1; i < noOfPages; i++) {
        PdfPage page = pdfDoc.getPage(i);
        PdfDictionary dict = page.getPdfObject();
        PdfObject object = dict.get(PdfName.Contents);
        if (object instanceof PdfStream) {
            PdfStream stream = (PdfStream) object;
            byte[] data = stream.getBytes();
            stream.setData(new String(data).replace("Hello", "Hi").getBytes("UTF-8"));
        }
    }
    pdfDoc.close();
}

Also used apache pdfbox to achieve the same thing but no luck in that. Below is the code snippet for the reference.

    public static PDDocument replaceText(PDDocument document, String searchString, String replacement)
        throws IOException {        
    for (PDPage page : document.getPages()) {
        PDFStreamParser parser = new PDFStreamParser(page);
        parser.parse();
        List tokens = parser.getTokens();
        for (int j = 0; j < tokens.size(); j++) {
            Object next = tokens.get(j);
            if (next instanceof Operator) {
                Operator op = (Operator) next;
                // Tj and TJ are the two operators that display strings in a PDF
                if (op.getName().equals("Tj")) {
                    // Tj takes one operator and that is the string to display
                    // so lets update that operator
                    COSString previous = (COSString) tokens.get(j - 1);
                    String string = previous.getString();
                    //System.out.println(new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
                    string = string.replaceFirst(searchString, replacement);
                    previous.setValue(string.getBytes());

                } else if (op.getName().equals("TJ")) {
                    COSArray previous = (COSArray) tokens.get(j - 1);
                    for (int k = 0; k < previous.size(); k++) {
                        Object arrElement = previous.getObject(k);
                        if (arrElement instanceof COSString) {
                            COSString cosString = (COSString) arrElement;
                            String string = cosString.getString();
                            //System.out.println("22::"+new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
                            string = StringUtils.replaceOnce(string, searchString, replacement);
                            cosString.setValue(string.getBytes());
                        }
                    }
                }
            }
        }

        PDStream updatedStream = new PDStream(document);
        OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
        ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
        tokenWriter.writeTokens(tokens);
         // save content
        page.setContents(updatedStream);
        out.close();
    }

Any solution/suggestion is highly appreciated.

回答1:

This is a working version, uses PDFBox

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;

public final class PDFEditor {

    private PDFEditor() {
    }

    public static void main(String[] args) throws IOException {
        PDDocument document = null;
        document = PDDocument.load(new File("src path"));
        document = replaceText(document, "Hello", "Hi");
        document.save("target Path");
        document.close();
    }

    private static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException {
        if (StringUtils.isEmpty(searchString) || StringUtils.isEmpty(replacement)) {
            return document;
        }

        for (PDPage page : document.getPages()) {
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();
            List<?> tokens = parser.getTokens();

            for (int j = 0; j < tokens.size(); j++) {
                Object next = tokens.get(j);
                if (next instanceof Operator) {
                    Operator op = (Operator) next;

                    String pstring = "";
                    int prej = 0;

                    if (op.getName().equals("Tj")) {
                        COSString previous = (COSString) tokens.get(j - 1);
                        String string = previous.getString();
                        string = string.replaceFirst(searchString, replacement);
                        previous.setValue(string.getBytes());
                    } else if (op.getName().equals("TJ")) {
                        COSArray previous = (COSArray) tokens.get(j - 1);
                        for (int k = 0; k < previous.size(); k++) {
                            Object arrElement = previous.getObject(k);
                            if (arrElement instanceof COSString) {
                                COSString cosString = (COSString) arrElement;
                                String string = cosString.getString();

                                if (j == prej) {
                                    pstring += string;
                                } else {
                                    prej = j;
                                    pstring = string;
                                }
                            }
                        }

                        if (searchString.equals(pstring.trim())) {
                            COSString cosString2 = (COSString) previous.getObject(0);
                            cosString2.setValue(replacement.getBytes());

                            int total = previous.size() - 1;
                            for (int k = total; k > 0; k--) {
                                previous.remove(k);
                            }
                        }
                    }
                }
            }
            PDStream updatedStream = new PDStream(document);
            OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(tokens);
            out.close();
            page.setContents(updatedStream);
        }

        return document;
    }

}

Dependencies :

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.0.6</version>
</dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.11</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.0</version>
</dependency>

来源：https://stackoverflow.com/questions/52027733/search-and-replace-text-in-pdf-using-java

标签

java

itext

pdfbox