How to do CopyMerge in Hadoop 3.0?

前端 未结 4 664
半阙折子戏
半阙折子戏 2020-12-29 09:05

I know Hadoop version 2.7's FileUtil has the copyMerge function that merges multiple files into a new one.

But t

4条回答
  •  -上瘾入骨i
    2020-12-29 09:46

    This should work

    /**
         * Copies every regular file found directly inside {@code srcDir} into one
         * output file (merge), optionally appending {@code addString} after each
         * file's content.
         *
         * <p>The listing is sorted with {@link Arrays#sort(Object[])} before copying,
         * so files are appended in the natural ordering of {@code FileStatus}.
         * Entries for which {@code isFile()} is false (e.g. sub-directories) are
         * skipped; the merge is not recursive.
         *
         * <p>The destination is resolved through {@code checkDest} with
         * {@code overwrite == false}: an existing file at the destination causes an
         * {@link IOException}, and an existing directory causes the output to be
         * created inside it under the name of {@code srcDir}. Note that this
         * resolution happens before {@code srcDir} is checked to be a directory.
         *
         * @param srcFS        file system holding the source directory
         * @param srcDir       directory whose files are merged
         * @param dstFS        file system on which the merged file is created
         * @param dstFile      path of the merged file (or of a directory to place it in)
         * @param deleteSource if {@code true}, {@code srcDir} is deleted recursively
         *                     after a successful merge
         * @param conf         configuration handed to {@code IOUtils.copyBytes}
         * @param addString    text written (UTF-8 encoded) after each copied file,
         *                     or {@code null} to write nothing between files
         * @return {@code false} if {@code srcDir} is not a directory; otherwise
         *         {@code true}, or the result of deleting {@code srcDir} when
         *         {@code deleteSource} is set
         * @throws IOException if the destination already exists as a file, or if any
         *                     file system operation fails
         */
        public static boolean copyMerge(FileSystem srcFS, Path srcDir,
                                        FileSystem dstFS, Path dstFile,
                                        boolean deleteSource,
                                        Configuration conf, String addString) throws IOException {
            dstFile = checkDest(srcDir.getName(), dstFS, dstFile, false);

            if (!srcFS.getFileStatus(srcDir).isDirectory()) {
                return false;
            }

            // Encode the separator once instead of once per merged file. The
            // fully-qualified constant avoids both a charset-name lookup and the
            // need for an extra import in the enclosing file.
            final byte[] separator = (addString == null)
                    ? null
                    : addString.getBytes(java.nio.charset.StandardCharsets.UTF_8);

            // try-with-resources closes both streams on every path and, unlike a
            // manual finally block, keeps the original failure as the primary
            // exception if close() fails as well.
            try (OutputStream out = dstFS.create(dstFile)) {
                FileStatus[] contents = srcFS.listStatus(srcDir);
                Arrays.sort(contents);
                for (FileStatus entry : contents) {
                    if (!entry.isFile()) {
                        continue;
                    }
                    try (InputStream in = srcFS.open(entry.getPath())) {
                        // NOTE(review): the trailing 'false' is presumably the
                        // "close streams when done" flag; closing is handled by
                        // the enclosing try-with-resources blocks.
                        IOUtils.copyBytes(in, out, conf, false);
                        if (separator != null) {
                            out.write(separator);
                        }
                    }
                }
            }

            // Reached only when the merge completed without an exception.
            if (deleteSource) {
                return srcFS.delete(srcDir, true);
            }
            return true;
        }
    
        /**
         * Resolves the path that a copy should actually write to.
         *
         * <p>If {@code dst} does not exist it is returned unchanged. If it is an
         * existing directory, the candidate becomes {@code dst/srcName} and is
         * checked once more; a directory at that second level (or a {@code null}
         * {@code srcName}) is rejected. If the candidate is an existing
         * non-directory, it is accepted only when {@code overwrite} is set.
         *
         * @param srcName   name to append when {@code dst} is a directory; may be
         *                  {@code null}, in which case a directory target is an error
         * @param dstFS     file system on which {@code dst} is looked up
         * @param dst       requested destination path
         * @param overwrite whether an existing non-directory target is acceptable
         * @return the resolved destination path
         * @throws IOException if the target is a directory that cannot be descended
         *                     into, or already exists while {@code overwrite} is false
         */
        private static Path checkDest(String srcName, FileSystem dstFS, Path dst,
                                      boolean overwrite) throws IOException {
            Path candidate = dst;
            String childName = srcName;

            // At most two passes: the requested path, then (if that was a
            // directory) the child path inside it, for which childName is null.
            while (dstFS.exists(candidate)) {
                FileStatus status = dstFS.getFileStatus(candidate);

                if (!status.isDirectory()) {
                    if (!overwrite) {
                        throw new IOException("Target " + candidate + " already exists");
                    }
                    return candidate;
                }

                if (childName == null) {
                    throw new IOException("Target " + candidate + " is a directory");
                }

                candidate = new Path(candidate, childName);
                childName = null;
            }

            return candidate;
        }
    

提交回复
热议问题