Spark lists all leaf nodes even in partitioned data

前端 未结 2 1852
盖世英雄少女心
盖世英雄少女心 2020-12-01 16:31

I have parquet data partitioned by date & hour, folder structure:

events_v3
  -- event_date=2015-01-01
    -- event_hour=2015-0         


        
2条回答
  •  死守一世寂寞
    2020-12-01 17:03

    As soon as Spark is given a directory to read from, it issues a call to listLeafFiles (org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala). This in turn calls fs.listStatus, which makes an API call to get the list of files and directories. For each directory found, the method is called again, and this happens recursively until no directories are left. By design this works well on HDFS, but it works badly on S3, because every file listing is a remote call. S3, on the other hand, supports listing all files by prefix, which is exactly what we need.

    So, for example, if we had the above directory structure with 1 year's worth of data, with one directory per hour and 10 subdirectories in each, we would make 365 * 24 * 10 = 87,600 API calls. This can be reduced to 138 API calls, given that there are only 137,000 files and each S3 API call returns up to 1,000 files.

    Code: org/apache/hadoop/fs/s3a/S3AFileSystem.java

    /**
     * Lists every object stored under {@code f} with prefix-based S3 listings instead of
     * walking the directory tree one level at a time.
     *
     * <p>When {@code f} is a directory, its key (with a trailing "/") is used as the listing
     * prefix. No delimiter is set on the request, so the listing is expected to cover all
     * nested keys; each batch holds at most {@code maxKeys} entries and costs one read
     * operation in {@code statistics}. When {@code f} is a file, the result contains only
     * that file's own status.
     *
     * <p>The result may contain directory entries (directory marker objects and common
     * prefixes) in addition to files; callers that need leaf files only must filter them.
     *
     * @param f the path to list
     * @return statuses of the entries found under {@code f}, or of {@code f} itself if it is
     *         not a directory
     * @throws FileNotFoundException declared by the signature; presumably raised by
     *         {@code getFileStatus} when {@code f} does not exist
     * @throws IOException declared by the signature for failures in the underlying calls
     */
    public FileStatus[] listStatusRecursively(Path f) throws FileNotFoundException,
            IOException {
        String key = pathToKey(f);
        if (LOG.isDebugEnabled()) {
            LOG.debug("List status for path: " + f);
        }

        // Typed list: with a raw List, toArray(T[]) is erased to Object[] and the return
        // statement below would not type-check against FileStatus[].
        final List<FileStatus> result = new ArrayList<FileStatus>();
        final FileStatus fileStatus = getFileStatus(f);

        if (fileStatus.isDirectory()) {
            // Terminate the prefix with "/" so that sibling keys sharing the same leading
            // characters are not matched; the empty (root) key is left as is.
            if (!key.isEmpty()) {
                key = key + "/";
            }

            ListObjectsRequest request = new ListObjectsRequest();
            request.setBucketName(bucket);
            request.setPrefix(key);
            request.setMaxKeys(maxKeys);

            if (LOG.isDebugEnabled()) {
                LOG.debug("listStatus: doing listObjects for directory " + key);
            }

            ObjectListing objects = s3.listObjects(request);
            statistics.incrementReadOps(1);

            // Consume the listing batch by batch until S3 reports it is no longer truncated.
            while (true) {
                for (S3ObjectSummary summary : objects.getObjectSummaries()) {
                    Path keyPath = keyToPath(summary.getKey()).makeQualified(uri, workingDir);
                    // Skip over keys that are ourselves and old S3N _$folder$ files
                    if (keyPath.equals(f) || summary.getKey().endsWith(S3N_FOLDER_SUFFIX)) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Ignoring: " + keyPath);
                        }
                        continue;
                    }

                    if (objectRepresentsDirectory(summary.getKey(), summary.getSize())) {
                        result.add(new S3AFileStatus(true, true, keyPath));
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Adding: fd: " + keyPath);
                        }
                    } else {
                        result.add(new S3AFileStatus(summary.getSize(),
                                dateToLong(summary.getLastModified()), keyPath,
                                getDefaultBlockSize(f.makeQualified(uri, workingDir))));
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Adding: fi: " + keyPath);
                        }
                    }
                }

                // NOTE(review): no delimiter is set on the request, so this collection is
                // presumably always empty here; the loop is kept for parity with listStatus.
                for (String prefix : objects.getCommonPrefixes()) {
                    Path keyPath = keyToPath(prefix).makeQualified(uri, workingDir);
                    if (keyPath.equals(f)) {
                        continue;
                    }
                    result.add(new S3AFileStatus(true, false, keyPath));
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Adding: rd: " + keyPath);
                    }
                }

                if (objects.isTruncated()) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("listStatus: list truncated - getting next batch");
                    }

                    objects = s3.listNextBatchOfObjects(objects);
                    statistics.incrementReadOps(1);
                } else {
                    break;
                }
            }
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Adding: rd (not a dir): " + f);
            }
            result.add(fileStatus);
        }

        return result.toArray(new FileStatus[result.size()]);
    }
    

    /org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala

    /**
     * Lists the leaf files under `status`, dropping any path whose name is rejected by
     * `shouldFilterOut` or by `filter`.
     *
     * For an [[S3AFileSystem]] the whole subtree is fetched through the patched
     * `listStatusRecursively` (prefix-based listing) instead of one `listStatus` call per
     * directory. Every other file system is walked recursively, directory by directory.
     *
     * @param fs     file system that owns `status`
     * @param status file or directory to list
     * @param filter optional path filter; may be `null`, in which case nothing extra is filtered
     * @return located statuses of the leaf files; empty if `status` itself is filtered out
     */
    def listLeafFiles(fs: FileSystem, status: FileStatus, filter: PathFilter): Array[FileStatus] = {
      logTrace(s"Listing ${status.getPath}")
      val name = status.getPath.getName.toLowerCase
      if (shouldFilterOut(name)) {
        Array.empty[FileStatus]
      } else {
        val listed: Array[FileStatus] = fs match {
          case s3a: S3AFileSystem =>
            logWarning("Using Monkey patched version of list status")
            // The recursive listing also reports directory entries (marker objects and
            // common prefixes). Drop them so that only leaf files reach the block-location
            // lookup below, as in the recursive branch.
            // NOTE(review): files under a hidden intermediate directory (one whose name
            // `shouldFilterOut` would reject) are not pruned on this path, because only
            // the leaf file name is checked below -- confirm whether that matters.
            s3a.listStatusRecursively(status.getPath).filterNot(_.isDirectory)

          case _ =>
            val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDirectory)
            files ++ dirs.flatMap(dir => listLeafFiles(fs, dir, filter))
        }

        // `filter` comes from Java-style callers and may be null.
        val accepted = Option(filter).fold(listed)(pf => listed.filter(f => pf.accept(f.getPath)))

        // `accepted` does not contain any directories at this point.
        accepted.filterNot(f => shouldFilterOut(f.getPath.getName)).map {
          case f: LocatedFileStatus => f

          // NOTE:
          //
          // - Although S3/S3A/S3N file systems can be quite slow for remote file metadata
          //   operations, calling `getFileBlockLocations` does no harm here since these file
          //   system implementations don't actually issue RPC for this method.
          //
          // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should
          //   not be a big deal since we always use `listLeafFilesInParallel` when the number
          //   of paths exceeds the threshold.
          case f => createLocatedFileStatus(f, fs.getFileBlockLocations(f, 0, f.getLen))
        }
      }
    }
    

提交回复
热议问题