poi提取docx中的文字和图片

霸气de小男生 提交于 2020-01-10 01:55:36
package com.fry.poiDemo.dao;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.List;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;

/**
 * Extracts the plain text and the embedded pictures of a .docx file using Apache POI.
 */
public class Word {
    /**
     * Pictures whose payload is this many bytes or fewer are skipped, to filter out the
     * tiny placeholder images Word sometimes embeds.
     */
    private static final int MIN_IMAGE_BYTES = 300;

    /** Path prefix under which the extracted text file is written. */
    private static final String TXT_OUTPUT_DIR = "myRes//txt//";

    /**
     * Reads the text of a .docx file, saves it to a .txt file, and dumps the embedded pictures.
     *
     * <p>The text is written to {@code myRes/txt/<name>.txt}, where {@code <name>} is the
     * source file name up to its first dot. Every picture larger than
     * {@link #MIN_IMAGE_BYTES} bytes is written to {@code imageFile + pictureFileName};
     * no separator is inserted, so {@code imageFile} should end with one if it names a directory.
     *
     * @param srcFile   path of the source .docx file
     * @param imageFile path prefix (typically a directory ending with a separator) for the pictures
     * @return the extracted text, or {@code null} if an {@link IOException} occurred
     */
    public String readDocxImage(String srcFile, String imageFile) {
        File file = new File(srcFile);
        // try-with-resources closes the input stream even when extraction fails part-way.
        try (FileInputStream fis = new FileInputStream(file)) {
            XWPFDocument document = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            String text = extractor.getText();

            writeText(text, txtFileNameFor(file));

            for (XWPFPictureData pic : document.getAllPictures()) {
                byte[] data = pic.getData();
                if (data.length > MIN_IMAGE_BYTES) {
                    writeBytes(data, imageFile + pic.getFileName());
                }
            }
            return text;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /** Builds the output .txt path from the source file's name up to its first dot. */
    private static String txtFileNameFor(File source) {
        // File.getName() honours the platform's separators, unlike splitting on "/" only.
        String name = source.getName();
        int dot = name.indexOf('.');
        String baseName = dot >= 0 ? name.substring(0, dot) : name;
        return TXT_OUTPUT_DIR + baseName + ".txt";
    }

    /** Writes the text followed by a line break to the given file, creating its parent directory. */
    private static void writeText(String text, String txtFileName) throws IOException {
        File parent = new File(txtFileName).getParentFile();
        if (parent != null) {
            // Result deliberately ignored: if the directory cannot be created,
            // opening the stream below fails with a FileNotFoundException.
            parent.mkdirs();
        }
        try (PrintStream ps = new PrintStream(txtFileName)) {
            ps.println(text);
        }
    }

    /** Writes the bytes to the given path and closes the stream. */
    private static void writeBytes(byte[] data, String path) throws IOException {
        try (FileOutputStream fos = new FileOutputStream(path)) {
            fos.write(data);
        }
    }
}
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.9</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.9</version>
</dependency>

  

  

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!