I have a folder in HDFS which has two subfolders; each one has about 30 subfolders which, in turn, each contain XML files. I want to list all XML files giving only the mai
Here is a code snippet that counts the number of files in a particular HDFS directory (I used this to determine how many reducers to use in a particular ETL job). You can easily modify it to suit your needs.
/**
 * Calculates the number of reducers as the number of files found under the given input.
 *
 * <p>The input is resolved with {@code FileSystem.globStatus}, so it may be a glob pattern.
 * Every matched plain file counts as one; every matched directory contributes the count
 * returned by {@code getNumberOfInputFiles}.
 *
 * @param input path or glob pattern to inspect
 * @return number of files found, or 0 when nothing matches the input
 * @throws IOException if a file system call fails
 */
private int calculateNumberOfReducers(String input) throws IOException {
    int numberOfReducers = 0;
    Path inputPath = new Path(input);
    FileSystem fs = inputPath.getFileSystem(getConf());
    FileStatus[] statuses = fs.globStatus(inputPath);
    // globStatus returns null (not an empty array) when a non-glob path does not exist;
    // treat that the same as "no matches" instead of failing with a NullPointerException.
    if (statuses == null) {
        return numberOfReducers;
    }
    for (FileStatus status : statuses) {
        if (status.isDirectory()) {
            numberOfReducers += getNumberOfInputFiles(status, fs);
        } else if (status.isFile()) {
            numberOfReducers++;
        }
    }
    return numberOfReducers;
}
/**
 * Counts the files reachable from the given entry, descending into subdirectories.
 *
 * @param status entry to inspect; anything that is not a directory counts as one file
 * @param fs file system used to list directory contents
 * @return total number of non-directory entries at or below {@code status}
 * @throws IOException if listing a directory fails
 */
private int getNumberOfInputFiles(FileStatus status, FileSystem fs) throws IOException {
    // Base case: a non-directory entry is counted as a single file.
    if (!status.isDirectory()) {
        return 1;
    }
    int total = 0;
    for (FileStatus child : fs.listStatus(status.getPath())) {
        total += getNumberOfInputFiles(child, fs);
    }
    return total;
}