I have a folder in hdfs which has two subfolders each one has about 30 subfolders which,finally,each one contains xml files. I want to list all xml files giving only the mai
Code snippet for both recursive and non-recursive approaches:
//helper method to get the list of files from the HDFS path
public static List
listFilesFromHDFSPath(Configuration hadoopConfiguration,
String hdfsPath,
boolean recursive) throws IOException,
IllegalArgumentException
{
//resulting list of files
List filePaths = new ArrayList();
//get path from string and then the filesystem
Path path = new Path(hdfsPath); //throws IllegalArgumentException
FileSystem fs = path.getFileSystem(hadoopConfiguration);
//if recursive approach is requested
if(recursive)
{
//(heap issues with recursive approach) => using a queue
Queue fileQueue = new LinkedList();
//add the obtained path to the queue
fileQueue.add(path);
//while the fileQueue is not empty
while (!fileQueue.isEmpty())
{
//get the file path from queue
Path filePath = fileQueue.remove();
//filePath refers to a file
if (fs.isFile(filePath))
{
filePaths.add(filePath.toString());
}
else //else filePath refers to a directory
{
//list paths in the directory and add to the queue
FileStatus[] fileStatuses = fs.listStatus(filePath);
for (FileStatus fileStatus : fileStatuses)
{
fileQueue.add(fileStatus.getPath());
} // for
} // else
} // while
} // if
else //non-recursive approach => no heap overhead
{
//if the given hdfsPath is actually directory
if(fs.isDirectory(path))
{
FileStatus[] fileStatuses = fs.listStatus(path);
//loop all file statuses
for(FileStatus fileStatus : fileStatuses)
{
//if the given status is a file, then update the resulting list
if(fileStatus.isFile())
filePaths.add(fileStatus.getPath().toString());
} // for
} // if
else //it is a file then
{
//return the one and only file path to the resulting list
filePaths.add(path.toString());
} // else
} // else
//close filesystem; no more operations
fs.close();
//return the resulting list
return filePaths;
} // listFilesFromHDFSPath