Unable to read all content in order of a word document (docx) in Apache POI

后端 未结 1 503
广开言路
广开言路 2020-12-21 20:41

I've been trying to read all content (including tables, pictures, and paragraphs) from a Word document. I'm able to read tables and paragraphs using getBodyElementsIterator().

相关标签:
1条回答
  • 2020-12-21 21:34

    As already mentioned in the comments, the question of how to read all content of a Word document (docx) in order using Apache POI is much too broad to be answered completely here. A *.docx file is a ZIP archive in Office Open XML file format. It contains document.xml for the document body. This is very complex XML which needs to be traversed. Moreover, document.xml might contain references to other resources in the *.docx ZIP archive, which then also need to be traversed.

    What I can provide is a template of what this traversal process could look like. It starts at the XWPFDocument and first traverses all the IBodyElements in it. Depending on the type of each IBodyElement found, it then performs further traversal.

    import java.io.FileInputStream;
    
    import org.apache.poi.xwpf.usermodel.*;
    
    import java.util.List;
    
    /**
     * Template that walks the body of a *.docx file in document order and prints
     * every element it encounters to standard output.
     *
     * <p>The traversal starts at the {@link XWPFDocument} body elements and descends
     * into paragraphs (runs and their embedded pictures) and tables (rows, cells and
     * the body elements nested inside each cell). Structured document tags (SDT) are
     * printed but not traversed any further.
     */
    public class WordReadAllContent {
    
     /**
      * Prints each picture together with its picture data.
      *
      * @param pictures the pictures embedded in a single run
      * @throws Exception declared so callers in the traversal chain share one signature
      */
     static void traversePictures(List<XWPFPicture> pictures) throws Exception {
      for (XWPFPicture picture : pictures) {
       System.out.println(picture);
       XWPFPictureData pictureData = picture.getPictureData();
       System.out.println(pictureData);
      }
     }
    
     /**
      * Prints every run element of a paragraph and descends into the pictures
      * embedded in runs.
      *
      * @param runElements the run elements of one paragraph, in document order
      * @throws Exception if traversing the embedded pictures fails
      */
     static void traverseRunElements(List<IRunElement> runElements) throws Exception {
      for (IRunElement runElement : runElements) {
       // XWPFFieldRun and XWPFHyperlinkRun are subclasses of XWPFRun, so they have to be
       // tested before the generic XWPFRun branch or their branches would never be reached.
       if (runElement instanceof XWPFFieldRun) {
        XWPFFieldRun fieldRun = (XWPFFieldRun)runElement;
        System.out.println(fieldRun.getClass().getName());
        System.out.println(fieldRun);
        traversePictures(fieldRun.getEmbeddedPictures());
       } else if (runElement instanceof XWPFHyperlinkRun) {
        XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun)runElement;
        System.out.println(hyperlinkRun.getClass().getName());
        System.out.println(hyperlinkRun);
        traversePictures(hyperlinkRun.getEmbeddedPictures());
       } else if (runElement instanceof XWPFRun) {
        XWPFRun run = (XWPFRun)runElement;
        System.out.println(run.getClass().getName());
        System.out.println(run);
        traversePictures(run.getEmbeddedPictures());
       } else if (runElement instanceof XWPFSDT) {
        XWPFSDT sDT = (XWPFSDT)runElement;
        System.out.println(sDT);
        System.out.println(sDT.getContent());
        //ToDo: The SDT may have traversable content too.
       }
      }
     }
    
     /**
      * Prints every cell of a table row and descends into the body elements of
      * regular table cells.
      *
      * @param tableICells the cells of one table row, in document order
      * @throws Exception if traversing the nested body elements fails
      */
     static void traverseTableCells(List<ICell> tableICells) throws Exception {
      for (ICell tableICell : tableICells) {
       if (tableICell instanceof XWPFSDTCell) {
        XWPFSDTCell sDTCell = (XWPFSDTCell)tableICell;
        System.out.println(sDTCell);
        //ToDo: The SDTCell may have traversable content too.
       } else if (tableICell instanceof XWPFTableCell) {
        XWPFTableCell tableCell = (XWPFTableCell)tableICell;
        System.out.println(tableCell);
        // A cell holds body elements of its own (paragraphs, nested tables), hence the recursion.
        traverseBodyElements(tableCell.getBodyElements());
       }
      }
     }
    
     /**
      * Prints every row of a table and descends into its cells.
      *
      * @param tableRows the rows of one table, in document order
      * @throws Exception if traversing the cells fails
      */
     static void traverseTableRows(List<XWPFTableRow> tableRows) throws Exception {
      for (XWPFTableRow tableRow : tableRows) {
       System.out.println(tableRow);
       traverseTableCells(tableRow.getTableICells());
      }
     }
    
     /**
      * Prints every body element and descends into paragraphs and tables.
      *
      * @param bodyElements the body elements of a document or of a table cell, in document order
      * @throws Exception if traversing a paragraph or table fails
      */
     static void traverseBodyElements(List<IBodyElement> bodyElements) throws Exception {
      for (IBodyElement bodyElement : bodyElements) {
       if (bodyElement instanceof XWPFParagraph) {
        XWPFParagraph paragraph = (XWPFParagraph)bodyElement;
        System.out.println(paragraph);
        traverseRunElements(paragraph.getIRuns());
       } else if (bodyElement instanceof XWPFSDT) {
        XWPFSDT sDT = (XWPFSDT)bodyElement;
        System.out.println(sDT);
        System.out.println(sDT.getContent());
        //ToDo: The SDT may have traversable content too.
       } else if (bodyElement instanceof XWPFTable) {
        XWPFTable table = (XWPFTable)bodyElement;
        System.out.println(table);
        traverseTableRows(table.getRows());
       }
      }
     }
    
     /**
      * Opens {@code ./WordDocument.docx} and prints its content in document order.
      *
      * @param args command line arguments (unused)
      * @throws Exception if the file cannot be opened or the traversal fails
      */
     public static void main(String[] args) throws Exception {
    
      String inFilePath = "./WordDocument.docx";
    
      // try-with-resources closes the document and then the stream even when the
      // traversal throws, so neither resource is leaked on the error path.
      try (FileInputStream in = new FileInputStream(inFilePath);
           XWPFDocument document = new XWPFDocument(in)) {
       traverseBodyElements(document.getBodyElements());
      }
     }
    
    }
    

    This is a working draft. I am sure I forgot something.

    0 讨论(0)
提交回复
热议问题