I just found the Apache POI library very useful for editing Word files using Java. Specifically, I want to edit a DOCX file using Apache POI's XWPF classes.
Here is what we did for text replacement using Apache POI. We found that it was not worth the hassle and simpler to replace the text of an entire XWPFParagraph instead of a run. A run can be randomly split in the middle of a word as Microsoft Word is in charge of where runs are created within the paragraph of a document. Therefore the text you might be searching for could be half in one run and half in another. Using the full text of a paragraph, removing its existing runs, and adding a new run with the adjusted text seems to solve the problem of text replacement.
However, there is a cost to doing the replacement at the paragraph level: you lose the formatting of the runs in that paragraph. For example, if in the middle of your paragraph you had bolded the word "bits", and then when parsing the file you replaced the word "bits" with "bytes", the word "bytes" would no longer be bolded, because the bolding was stored with a run that was removed when the paragraph's entire body of text was replaced. The attached code has a commented-out section that was working for replacement of text at the run level, if you need it.
It should also be noted that the below works if the text you are inserting contains \n return characters. We could not find a way to insert returns without creating a run for each section prior to the return and marking the run addCarriageReturn(). Cheers
package com.healthpartners.hcss.client.external.word.replacement;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
public class TextReplacer {
private String searchValue;
private String replacement;
/**
 * Creates a replacer for a single search/replacement pair.
 *
 * @param searchValue the text stored as the value to search for
 * @param replacement the text stored as the value to substitute
 */
public TextReplacer(String searchValue, String replacement) {
    this.replacement = replacement;
    this.searchValue = searchValue;
}
/**
 * Applies the paragraph-level replacement to each paragraph returned by
 * {@code document.getParagraphs()}.
 *
 * NOTE(review): getParagraphs() presumably returns only the top-level body
 * paragraphs (not table cells, headers or footers) - confirm against the POI docs.
 *
 * @param document the document whose paragraphs are rewritten in place
 */
public void replace(XWPFDocument document) {
    List<XWPFParagraph> bodyParagraphs = document.getParagraphs();
    for (int index = 0; index < bodyParagraphs.size(); index++) {
        replace(bodyParagraphs.get(index));
    }
}
/**
 * Replaces the search value in one paragraph by rebuilding the paragraph from
 * its full text: the existing runs are removed and new runs are inserted for
 * the replaced text. Run-level formatting of the paragraph is therefore lost
 * whenever a replacement is actually performed.
 *
 * @param paragraph the paragraph to rewrite in place
 */
private void replace(XWPFParagraph paragraph) {
    // Read the text once; it is needed for the check, the replace and the comparison.
    String originalText = paragraph.getText();
    if (!hasReplaceableItem(originalText)) {
        return;
    }

    String replacedText = StringUtils.replace(originalText, searchValue, replacement);

    // An empty search value or a null replacement leaves the text untouched.
    // Rebuilding the runs in that case would only throw away the formatting.
    if (replacedText == null || replacedText.equals(originalText)) {
        return;
    }

    removeAllRuns(paragraph);
    insertReplacementRuns(paragraph, replacedText);
}
/**
 * Writes {@code replacedText} into the paragraph as a sequence of new runs,
 * one per line of text. Every run except the last is followed by a carriage
 * return so that "\n" characters in the text become line breaks.
 *
 * @param paragraph    the paragraph that receives the new runs, inserted from index 0
 * @param replacedText the complete new paragraph text, possibly containing "\n"
 */
private void insertReplacementRuns(XWPFParagraph paragraph, String replacedText) {
    // splitPreserveAllTokens keeps empty tokens. Plain split() collapses adjacent
    // separators, which silently dropped blank lines ("a\n\nb") and
    // leading/trailing newlines.
    String[] lines = StringUtils.splitPreserveAllTokens(replacedText, "\n");
    if (lines == null) {
        // null in, null out: nothing to insert
        return;
    }

    int lastIndex = lines.length - 1;
    for (int j = 0; j <= lastIndex; j++) {
        XWPFRun newRun = paragraph.insertNewRun(j);
        newRun.setText(lines[j]);
        if (j < lastIndex) {
            newRun.addCarriageReturn();
        }
    }
}
/**
 * Empties the paragraph by removing each of its runs.
 *
 * @param paragraph the paragraph to clear
 */
private void removeAllRuns(XWPFParagraph paragraph) {
    // The count is taken once up front; each pass then drops the current first run.
    int remaining = paragraph.getRuns().size();
    while (remaining > 0) {
        paragraph.removeRun(0);
        remaining--;
    }
}
/**
 * Tells whether the given text contains the configured search value,
 * as decided by {@code StringUtils.contains}.
 *
 * @param runText the text to inspect
 * @return the result of {@code StringUtils.contains(runText, searchValue)}
 */
private boolean hasReplaceableItem(String runText) {
    boolean containsSearchValue = StringUtils.contains(runText, searchValue);
    return containsSearchValue;
}
//REVISIT The code below can be removed once Michele has tested and approved the less versatile replacement version above
// private void replace(XWPFParagraph paragraph) {
// for (int i = 0; i < paragraph.getRuns().size() ; i++) {
// i = replace(paragraph, i);
// }
// }
// private int replace(XWPFParagraph paragraph, int i) {
// XWPFRun run = paragraph.getRuns().get(i);
//
// String runText = run.getText(0);
//
// if (hasReplaceableItem(runText)) {
// return replace(paragraph, i, run);
// }
//
// return i;
// }
// private int replace(XWPFParagraph paragraph, int i, XWPFRun run) {
// String runText = run.getCTR().getTArray(0).getStringValue();
//
// String beforeSuperLong = StringUtils.substring(runText, 0, runText.indexOf(searchValue));
//
// String[] replacementTextSplitOnCarriageReturn = StringUtils.split(replacement, "\n");
//
// String afterSuperLong = StringUtils.substring(runText, runText.indexOf(searchValue) + searchValue.length());
//
// Counter counter = new Counter(i);
//
// insertNewRun(paragraph, run, counter, beforeSuperLong);
//
// for (int j = 0; j < replacementTextSplitOnCarriageReturn.length; j++) {
// String part = replacementTextSplitOnCarriageReturn[j];
//
// XWPFRun newRun = insertNewRun(paragraph, run, counter, part);
//
// if (j+1 < replacementTextSplitOnCarriageReturn.length) {
// newRun.addCarriageReturn();
// }
// }
//
// insertNewRun(paragraph, run, counter, afterSuperLong);
//
// paragraph.removeRun(counter.getCount());
//
// return counter.getCount();
// }
// private class Counter {
// private int i;
//
// public Counter(int i) {
// this.i = i;
// }
//
// public void increment() {
// i++;
// }
//
// public int getCount() {
// return i;
// }
// }
// private XWPFRun insertNewRun(XWPFParagraph xwpfParagraph, XWPFRun run, Counter counter, String newText) {
// XWPFRun newRun = xwpfParagraph.insertNewRun(counter.i);
// newRun.getCTR().set(run.getCTR());
// newRun.getCTR().getTArray(0).setStringValue(newText);
//
// counter.increment();
//
// return newRun;
// }
If somebody also needs to keep the formatting of the text, this code works better.
/**
 * Builds a lookup from character position to the run that holds that character.
 * Positions count through the concatenation of every run's {@code text()},
 * in run order, starting at 0.
 *
 * @param paragraph the paragraph whose runs are indexed
 * @return a map with one entry per character of the concatenated run texts
 */
private static Map<Integer, XWPFRun> getPosToRuns(XWPFParagraph paragraph) {
    Map<Integer, XWPFRun> runByPosition = new HashMap<Integer, XWPFRun>(10);
    int offset = 0;
    for (XWPFRun current : paragraph.getRuns()) {
        String content = current.text();
        if (content == null) {
            continue;
        }
        // Register this run for every character it contributes.
        int runEnd = offset + content.length();
        while (offset < runEnd) {
            runByPosition.put(offset, current);
            offset++;
        }
    }
    return runByPosition;
}
/**
 * Applies every search/replacement pair of {@code map} to each paragraph
 * returned by {@code document.getParagraphs()}.
 *
 * NOTE(review): getParagraphs() presumably covers only top-level body
 * paragraphs (no table cells, headers or footers) - confirm against the POI docs.
 *
 * @param document the document to modify in place
 * @param map      search text (key) to replacement value (value)
 * @param <V>      type of the replacement values
 */
public static <V> void replace(XWPFDocument document, Map<String, V> map) {
    for (XWPFParagraph current : document.getParagraphs()) {
        for (Map.Entry<String, V> pair : map.entrySet()) {
            replace(current, pair.getKey(), pair.getValue());
        }
    }
}
/**
 * Replaces {@code searchText} with {@code replacement} in each paragraph
 * returned by {@code document.getParagraphs()}.
 *
 * @param document    the document to modify in place
 * @param searchText  the text to look for
 * @param replacement the value to insert
 * @param <V>         type of the replacement value
 */
public static <V> void replace(XWPFDocument document, String searchText, V replacement) {
    List<XWPFParagraph> bodyParagraphs = document.getParagraphs();
    for (int index = 0; index < bodyParagraphs.size(); index++) {
        replace(bodyParagraphs.get(index), searchText, replacement);
    }
}
/**
 * Applies every search/replacement pair of {@code map} to one paragraph,
 * in the iteration order of the map.
 *
 * @param paragraph the paragraph to modify in place
 * @param map       search text (key) to replacement value (value)
 * @param <V>       type of the replacement values
 */
private static <V> void replace(XWPFParagraph paragraph, Map<String, V> map) {
    for (Map.Entry<String, V> pair : map.entrySet()) {
        String searchText = pair.getKey();
        V replacement = pair.getValue();
        replace(paragraph, searchText, replacement);
    }
}
/**
 * Replaces every occurrence of {@code searchText} in the paragraph while keeping
 * the formatting of the runs involved.
 *
 * <p>The search runs over the concatenation of each run's first text element
 * ({@code getText(0)}), and all writes go to that same element via
 * {@code setText(..., 0)}, so reading and writing always agree. A match may span
 * several runs. Text before the match stays in the first run and text after the
 * match stays in the last run, each with its own formatting. Runs lying entirely
 * inside the match are removed.
 *
 * <p>A replacement containing {@code "\n"} is split into lines. Each line after
 * the first goes into a new run that copies the first run's properties, and the
 * preceding run gets a carriage return.
 *
 * <p>Replaced text is never searched again, so a replacement that contains
 * {@code searchText} does not loop forever.
 *
 * <p>NOTE(review): text held in a run's second or later text element, and
 * tab/break elements, are not part of the searched text. Runs shaped like that
 * presumably need separate handling - confirm against real templates.
 *
 * @param paragraph   the paragraph to modify in place
 * @param searchText  the text to look for; null or empty means nothing is done
 * @param replacement the value whose {@code toString()} is inserted; null is
 *                    treated as an empty string
 * @param <V>         type of the replacement value
 */
public static <V> void replace(XWPFParagraph paragraph, String searchText, V replacement) {
    if (searchText == null || searchText.isEmpty()) {
        // An empty needle matches at every position and can never be consumed.
        return;
    }

    String replacementText = replacement == null ? "" : replacement.toString();
    // Limit -1 keeps empty lines. Without it "\n" splits into an empty array
    // and texts[0] would throw.
    String[] texts = replacementText.split("\n", -1);
    // Line breaks become carriage-return elements, not characters, so they do
    // not count towards the length of the text that is searched.
    int insertedLength = replacementText.length() - (texts.length - 1);

    int searchFrom = 0;
    while (true) {
        List<XWPFRun> runs = paragraph.getRuns();
        int runCount = runs.size();

        // runStart[i] is the offset of run i in the joined text; runStart[runCount] is its length.
        int[] runStart = new int[runCount + 1];
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < runCount; i++) {
            runStart[i] = joined.length();
            String runText = runs.get(i).getText(0);
            if (runText != null) {
                joined.append(runText);
            }
        }
        runStart[runCount] = joined.length();

        int pos = joined.indexOf(searchText, searchFrom);
        if (pos < 0) {
            return;
        }
        int end = pos + searchText.length();

        // First run holding the character at pos, and run holding the character at end - 1.
        int runNum = 0;
        while (runStart[runNum + 1] <= pos) {
            runNum++;
        }
        int lastRunNum = runNum;
        while (runStart[lastRunNum + 1] < end) {
            lastRunNum++;
        }

        XWPFRun run = runs.get(runNum);
        XWPFRun lastRun = runs.get(lastRunNum);
        String prefix = run.getText(0).substring(0, pos - runStart[runNum]);
        String suffix = lastRun.getText(0).substring(end - runStart[lastRunNum]);

        if (lastRunNum > runNum) {
            if (suffix.isEmpty()) {
                // The match reaches the end of the last run: nothing of it is left.
                paragraph.removeRun(lastRunNum);
            } else {
                // Keep what follows the match, with the last run's own formatting.
                lastRun.setText(suffix, 0);
            }
            // Remove from the back so the remaining indexes stay valid.
            for (int i = lastRunNum - 1; i > runNum; i--) {
                paragraph.removeRun(i);
            }
            suffix = "";
        }

        if (texts.length == 1) {
            run.setText(prefix + texts[0] + suffix, 0);
        } else {
            run.setText(prefix + texts[0], 0);
            XWPFRun previous = run;
            for (int i = 1; i < texts.length; i++) {
                previous.addCarriageReturn();
                XWPFRun newRun = paragraph.insertNewRun(runNum + i);
                copyRunProperties(run, newRun);
                boolean isLastLine = i == texts.length - 1;
                newRun.setText(isLastLine ? texts[i] + suffix : texts[i], 0);
                previous = newRun;
            }
        }

        // Continue after the inserted text so it is never matched again.
        searchFrom = pos + insertedLength;
    }
}

/**
 * Copies the run properties element of {@code source} onto {@code target} as a
 * whole, when the source has one. Copying attribute by attribute would pass on
 * the getters' "not set" results (such as a font size of -1) as real values.
 */
private static void copyRunProperties(XWPFRun source, XWPFRun target) {
    if (source.getCTR().isSetRPr()) {
        target.getCTR().setRPr(source.getCTR().getRPr());
    }
}
The first chunk of code is giving me a NullPointerException; does anyone know what is wrong?
run.getText(int position) - from documentation: Returns: the text of this text run or null if not set
Just check if it is not null before calling contains() on it
And btw if you want to replace the text you need to set it in position from which you get it, in this case r.setText(text, 0);. Otherwise text will be added not replaced
Based on Dmitry Stolbov's answer here, and the problems and limitations encountered with it and the rest of the responses, I came up with the class below, which implements the method generateDocument that searches in paragraphs and tables.
Here I solved several problems found in the responses like:
This works fine, but I need some insights on how to solve a problem I'm having. Sometimes the value to replace in the file is larger than the tag to replace, and that ends up screwing up the alignments. For example:
the template:
the output file:
What happened is that the {#branch#} and {#insurCompanyCorporateName#} were replaced by larger strings, after the {#branch#} tag there are several "\t" elements and that, combined to the fact that {#insurCompanyCorporateName#} value is also larger that the tag, pushed the contents forward making it split to the next line.
I was wondering if anyone has some insights on how I could understand at runtime whether the values I'm replacing make the document split lines, or mess up the position of further elements in the page. In this case I would like my program to understand that it should remove some "\t" after the branch, for example. Or maybe split the {#insurCompanyCorporateName#} value onto a new line, but make the new line start below the original tag or something.
Thoughts?
The class:
package com.idoine.struts2.action.shared;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.*;
import org.json.JSONObject;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
/**
 * Created by migue on 11/11/2020.
 *
 * Fills a Word template by replacing key tags of the form {#keyTag#} with the
 * values of a JSONObject. Tags are searched in the document's paragraphs and tables.
 */
public class DocumentGeneratorAction {

    /** Separator that starts a key tag. */
    private static final String OPEN_TAG = "{#";

    /** Separator that ends a key tag. */
    private static final String CLOSE_TAG = "#}";

    /**
     * Generates a document from an existing Word template.
     * Used as reference: https://stackoverflow.com/a/49765239/5936443 [at 11/11/2020]
     *
     * Every key tag in the template, written as {#keyTag#}, is replaced by the
     * value stored under keyTag in {@code fields}.
     *
     * @param templatePath path of the Word template to read
     * @param fields       key tag names mapped to their replacement values
     * @return the generated document as a stream, or null when the template
     *         could not be read or written (the stack trace is printed)
     */
    public static ByteArrayInputStream generateDocument(String templatePath, JSONObject fields) {
        OPCPackage templatePackage = null;
        try {
            templatePackage = OPCPackage.open(templatePath);
            XWPFDocument doc = new XWPFDocument(templatePackage);
            // search in paragraphs
            for (XWPFParagraph p : doc.getParagraphs()) {
                replaceFieldsParagraph(p, fields);
            }
            // search in tables
            for (XWPFTable t : doc.getTables()) {
                replaceFieldsTable(t, fields);
            }
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            doc.write(out);
            return new ByteArrayInputStream(out.toByteArray());
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InvalidFormatException e) {
            e.printStackTrace();
        } finally {
            if (templatePackage != null) {
                // Release the template file. revert() is used because it closes the
                // package WITHOUT saving; the package is opened read/write, so close()
                // would write the filled-in content back over the template.
                templatePackage.revert();
            }
        }
        return null;
    }

    /**
     * Replaces, in one paragraph, every occurrence of the key tags present in
     * {@code fields} by the corresponding value.
     *
     * @param paragraph the paragraph to modify in place
     * @param fields    key tag names mapped to their replacement values
     */
    public static void replaceFieldsParagraph(XWPFParagraph paragraph, JSONObject fields) {
        String text = paragraph.getText(); // all the text from each run concatenated
        if (text == null || !text.contains(OPEN_TAG)) { // paragraph doesn't have keys to replace
            return;
        }
        boolean runsMerged = false;
        // for each field to replace, search it in the current paragraph
        for (String key : fields.keySet()) {
            String findStr = OPEN_TAG + key + CLOSE_TAG;
            // if the paragraph doesn't have the current key, skip to the next key
            if (!text.contains(findStr)) {
                continue;
            }
            if (!runsMerged) {
                // Merge once, before any value is written: afterwards every tag is
                // already whole, and inserted values must not be mistaken for tags.
                mergeRunsWithSplittedKeyTags(paragraph);
                runsMerged = true;
            }
            String value = String.valueOf(fields.get(key));
            for (XWPFRun run : paragraph.getRuns()) {
                checkAndReplaceFieldRun(run, findStr, value);
            }
        }
    }

    /**
     * Replaces, in one table, every occurrence of the key tags present in
     * {@code fields} by the corresponding value. Tables nested inside a cell
     * are processed as well.
     *
     * @param table  the table to modify in place
     * @param fields key tag names mapped to their replacement values
     */
    public static void replaceFieldsTable(XWPFTable table, JSONObject fields) {
        for (XWPFTableRow row : table.getRows()) { // iterate over rows
            for (XWPFTableCell cell : row.getTableCells()) { // iterate over columns
                if (cell.getParagraphs() != null) {
                    for (XWPFParagraph paragraph : cell.getParagraphs()) {
                        replaceFieldsParagraph(paragraph, fields);
                    }
                }
                if (cell.getTables() != null) {
                    for (XWPFTable nested : cell.getTables()) {
                        replaceFieldsTable(nested, fields);
                    }
                }
            }
        }
    }

    /**
     * Replaces {@code findStr} by {@code value} in the text of a run, when the run
     * has text and contains it.
     *
     * @param run     the run to modify in place
     * @param findStr the complete key tag to look for
     * @param value   the text that replaces the key tag
     */
    public static void checkAndReplaceFieldRun(XWPFRun run, String findStr, String value) {
        String runText = run.getText(0);
        if (runText != null && runText.contains(findStr)) {
            // setting at position 0 replaces the text; without it the text would be appended
            run.setText(runText.replace(findStr, value), 0);
        }
    }

    /**
     * Merges the runs over which a key tag is split, so that each key tag starting
     * with "{#" and ending with "#}" sits in a single run.
     *
     * A run is a part of the paragraph that has the same formatting. Word separates
     * the text of a paragraph into runs in an almost 'random' way, so the tag we are
     * looking for is sometimes split across multiple runs.
     *
     * A tag that is opened but never closed in the rest of the paragraph is left
     * untouched.
     *
     * @param paragraph the paragraph to modify in place
     */
    public static void mergeRunsWithSplittedKeyTags(XWPFParagraph paragraph) {
        for (int i = 0; i < paragraph.getRuns().size(); i++) {
            List<XWPFRun> runs = paragraph.getRuns();
            XWPFRun run = runs.get(i);
            String runText = run.getText(0);
            if (runText == null) {
                continue;
            }

            // Look ahead without changing anything, until the tag is complete.
            StringBuilder merged = new StringBuilder(runText);
            int last = i;
            boolean closed = true;
            while (tagContinuesInNextRun(merged.toString(), textOfRun(runs, last + 1))) {
                if (last + 1 >= runs.size()) {
                    closed = false; // ran out of runs before the tag was closed
                    break;
                }
                last++;
                String nextText = runs.get(last).getText(0);
                if (nextText != null) { // a run without text contributes nothing
                    merged.append(nextText);
                }
            }
            if (!closed || last == i) {
                continue;
            }

            // Remove from the back so the remaining indexes stay valid.
            for (int j = last; j > i; j--) {
                paragraph.removeRun(j);
            }
            // position 0 replaces the contents; without it the text would be appended
            run.setText(merged.toString(), 0);
        }
    }

    /**
     * Validates if we have a complete run: either no key tags are present, or every
     * key tag is complete. Returns false when only parts of a key tag are present.
     *
     * @param runText the text to check; null counts as having no key tags
     * @return true when the text needs nothing more to complete its key tags
     */
    public static boolean openTagMatchesCloseTag(String runText) {
        if (runText == null) {
            return true;
        }
        int completeOpenTagCount = countOccurrences(runText, OPEN_TAG);
        if (completeOpenTagCount > 0) {
            // open and close tags are present: the counts must agree
            return completeOpenTagCount == countOccurrences(runText, CLOSE_TAG);
        }
        // without any "{#", a lone "{" counts as an incomplete open tag
        return runText.indexOf('{') < 0;
    }

    /** Returns the first text of the run at {@code index}, or null when there is no such run or text. */
    private static String textOfRun(List<XWPFRun> runs, int index) {
        return index < runs.size() ? runs.get(index).getText(0) : null;
    }

    /** Tells whether {@code text} holds a key tag that can only be completed by what follows it. */
    private static boolean tagContinuesInNextRun(String text, String nextText) {
        // the "{#" separator itself is split: "{" ends this run, "#" starts the next
        if (text.endsWith("{") && nextText != null && nextText.startsWith("#")) {
            return true;
        }
        // a tag was opened but not closed yet
        return text.contains(OPEN_TAG) && !openTagMatchesCloseTag(text);
    }

    /** Counts the non-overlapping occurrences of {@code token} in {@code text}. */
    private static int countOccurrences(String text, String token) {
        int count = 0;
        int from = text.indexOf(token);
        while (from >= 0) {
            count++;
            from = text.indexOf(token, from + token.length());
        }
        return count;
    }
}