I know hadoop
version 2.7
\'s FileUtil
has the copyMerge
function that merges multiple files into a new one.
But t
This should work
/** Copy all files in a directory to one output file (merge). */
public static boolean copyMerge(FileSystem srcFS, Path srcDir,
FileSystem dstFS, Path dstFile,
boolean deleteSource,
Configuration conf, String addString) throws IOException {
dstFile = checkDest(srcDir.getName(), dstFS, dstFile, false);
if (!srcFS.getFileStatus(srcDir).isDirectory())
return false;
OutputStream out = dstFS.create(dstFile);
try {
FileStatus contents[] = srcFS.listStatus(srcDir);
Arrays.sort(contents);
for (int i = 0; i < contents.length; i++) {
if (contents[i].isFile()) {
InputStream in = srcFS.open(contents[i].getPath());
try {
IOUtils.copyBytes(in, out, conf, false);
if (addString!=null)
out.write(addString.getBytes("UTF-8"));
} finally {
in.close();
}
}
}
} finally {
out.close();
}
if (deleteSource) {
return srcFS.delete(srcDir, true);
} else {
return true;
}
}
private static Path checkDest(String srcName, FileSystem dstFS, Path dst,
boolean overwrite) throws IOException {
if (dstFS.exists(dst)) {
FileStatus sdst = dstFS.getFileStatus(dst);
if (sdst.isDirectory()) {
if (null == srcName) {
throw new IOException("Target " + dst + " is a directory");
}
return checkDest(null, dstFS, new Path(dst, srcName), overwrite);
} else if (!overwrite) {
throw new IOException("Target " + dst + " already exists");
}
}
return dst;
}