How do I run os.walk in parallel in Python?

执念已碎 2020-12-15 04:32

I wrote a simple app in Java that takes a list of paths and generates a file with all the file paths under that original list.

If I have a paths.txt with a list of starting paths, how can I do the same thing in Python, running os.walk over those paths in parallel?

3 Answers
  • 2020-12-15 05:12

    Here is a multiprocessing solution:

    from multiprocessing.pool import Pool
    from multiprocessing import JoinableQueue as Queue
    import os
    
    def explore_path(path):
        # list one directory: write its files to a .txt, return its subdirectories
        directories = []
        nondirectories = []
        for filename in os.listdir(path):
            fullname = os.path.join(path, filename)
            if os.path.isdir(fullname):
                directories.append(fullname)
            else:
                nondirectories.append(filename)
        outputfile = path.replace(os.sep, '_') + '.txt'
        with open(outputfile, 'w') as f:
            for filename in nondirectories:
                print(filename, file=f)
        return directories
    
    def parallel_worker():
        # pull directories off the shared queue until the program exits
        while True:
            path = unsearched.get()
            dirs = explore_path(path)
            for newdir in dirs:
                unsearched.put(newdir)
            unsearched.task_done()
    
    # acquire the list of paths
    with open('paths.txt') as f:
        paths = f.read().split()
    
    unsearched = Queue()
    for path in paths:
        unsearched.put(path)
    
    pool = Pool(5)
    for i in range(5):
        pool.apply_async(parallel_worker)
    
    unsearched.join()
    print('Done')
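
    Note that pool.apply_async(parallel_worker) only finds the module-level
    unsearched queue because Pool workers inherit globals under the Unix
    "fork" start method. Here is a minimal sketch for the "spawn" start
    method (the default on Windows and macOS), passing the queue through a
    Pool initializer instead; init_worker is my own name, not part of the
    original answer:

    def init_worker(queue):
        # runs once in each child process, right after it starts
        global unsearched
        unsearched = queue

    pool = Pool(5, initializer=init_worker, initargs=(unsearched,))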
    
  • 2020-12-15 05:19

    This is a pattern for threads in Python that has been useful to me. I'm not sure whether threading will actually improve your performance, though, because CPython's GIL limits how much threads can run in parallel.

    import threading
    import queue
    import os
    
    class PathThread (threading.Thread):
        def __init__(self, queue):
            threading.Thread.__init__(self)
            self.queue = queue
    
        def printfiles(self, p):
            for path, dirs, files in os.walk(p):
                for f in files:
                    print(os.path.join(path, f))
    
        def run(self):
            while True:
                path = self.queue.get()
                self.printfiles(path)
                self.queue.task_done()
    
    # threadsafe queue
    pathqueue = queue.Queue()
    paths = ["foo", "bar", "baz"]
    
    # spawn threads
    for i in range(0, 5):
        t = PathThread(pathqueue)
        t.daemon = True  # setDaemon() is deprecated since Python 3.10
        t.start()
    
    # add paths to queue
    for path in paths:
        pathqueue.put(path)
    
    # wait for queue to get empty
    pathqueue.join()
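
    For comparison, here is a minimal sketch of the same fan-out using the
    standard library's concurrent.futures module (Python 3.2+); the helper
    name list_files is mine, not part of the pattern above:

    import os
    from concurrent.futures import ThreadPoolExecutor

    def list_files(root):
        # collect every file path under one root with os.walk
        return [os.path.join(dirpath, name)
                for dirpath, _dirs, files in os.walk(root)
                for name in files]

    paths = ["foo", "bar", "baz"]
    with ThreadPoolExecutor(max_workers=5) as executor:
        # each root is walked in its own thread; map() preserves input order
        for files in executor.map(list_files, paths):
            for name in files:
                print(name)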
    
  • 2020-12-15 05:21

    Even plain threading can be quite helpful for directory traversal. I use the following code to traverse a SharePoint tree, and I get a significant speedup with about 50 threads. This particular program returns (path, data) pairs for all XML files in a directory structure, and can easily be extended for your use. (This is cut and pasted from my program; some additional editing is needed.)

    #imports the original snippet assumes (it was cut from a larger program)
    import sys
    import time
    import traceback
    import threading
    import queue
    from datetime import datetime
    from os import listdir
    from os.path import isdir, join, getmtime

    class ScanError(Exception):
        """Raised in the main thread when a worker reports an error."""

    #unique string for passing error messages out of worker threads
    ERROR = '\xffERROR\xff'
    
    class ScanWorker(threading.Thread):
        """Worker class for scanning directory structures.
        pathQueue: queue for pathnames of directories
        resultQueue: results of processFile, pairs of (path, data) to be updated
        """
        lock = threading.Lock()
        dirCount = 0
        def __init__(self, pathQueue, resultQueue):
            self.pathQueue = pathQueue
            self.resultQueue = resultQueue
            super().__init__()
    
        def run(self):
            """Worker thread.
            Get a directory, process it, and put new directories on the
            queue."""
            try:
                while True:
                    self.processDir(self.pathQueue.get())
                    self.pathQueue.task_done()
            except Exception as e:
                #pass on exception to main thread
                description = traceback.format_exception(*sys.exc_info())
                description.insert(0,
                    "Error in thread {}:\n".format(
                        threading.current_thread().name))
                self.resultQueue.put((ERROR, description))
                self.pathQueue.task_done()
    
        def processDir(self, top):
            """Visit a directory
            Call self.processFile on every file, and queue the directories.
            """
            #Wait and retry a few times in case of network errors.
            #SharePoint is not reliable, gives errors for no reason
            for retryCount in range(30):
                try:
                    names = listdir(top)
                    break
                except OSError as e:
                    if e.errno in (2, 22):  #ENOENT / EINVAL; transient server errors
                        lastError = e
                        print(end="L", flush=True)
                        time.sleep(1)
                    else:
                        raise
            else:
                print("List: too many retries")
                raise lastError
            #it is not important to worry about race conditions here
            self.__class__.dirCount += 1
            #process contents
            for name in names:
                fullname = join(top, name)
                if isdir(fullname):
                    self.pathQueue.put(fullname)
                else:
                    self.processFile(fullname)
    
        def processFile(self, path):
            """Get XML file.
            """
            #only xml files
            if not path.lower().endswith('.xml'): return
            filemtime = datetime.fromtimestamp(getmtime(path))
            #SharePoint is not reliable, gives errors for no reason; just retry
            for retryCount in range(30):
                try:
                    with open(path, 'rb') as f:
                        data = f.read()
                    break
                except OSError as e:
                    if e.errno in (2, 22):  #ENOENT / EINVAL; transient server errors
                        lastError = e
                        print(end="R", flush=True)
                        time.sleep(1)
                    else:
                        raise
            else:
                print("Read: too many retries")
                raise lastError
            self.resultQueue.put((path, data))
    
    class Scanner:
        """Interface to the ScanWorkers
        SharePoint's throughput is high relative to its per-request latency, so it handles about 50 workers well.
        Make sure you only create one instance of Scanner!
        """
        def __init__(self, workers):
            #don't restrict the path queue length; this causes deadlock
            #we use a LIFO queue to get more depth-first like search
            #reducing average queue length and hopefully improving server caching
            self.pathQueue = queue.LifoQueue()
            #this is the output queue to the main thread
            self.resultQueue = queue.Queue(5)
            self.workers = workers
            #start workers
            for i in range(workers):
                t = ScanWorker(self.pathQueue, self.resultQueue)
                t.daemon = True  #setDaemon() is deprecated since Python 3.10
                t.start()
    
        def startWorkers(self, path):
            #add counter
            self.added = 0
            #and go
            self.pathQueue.put(path)
    
        def processResult(self, wait=True):
            """Get an element from the result queue, and add to the zip file."""
            path, data = self.resultQueue.get(block=wait)
            if path == ERROR:
                #process gave alarm; stop scanning
                #pass on description
                raise ScanError(data)
            # <do whatever you want to do with the file>
            self.resultQueue.task_done()
            self.added += 1
    
    #main -- assumes "threads" (worker count) and "rootpath" (top directory)
    #are defined by the surrounding program
    try:
        #set up
        scanner = Scanner(threads)
        scanner.startWorkers(rootpath)
        pathQueue, resultQueue = scanner.pathQueue, scanner.resultQueue
        #scanner is rolling; wait for it to finish
        with pathQueue.all_tasks_done:
            while pathQueue.unfinished_tasks:
                #tasks are still running
                #process results
                while True:
                    try:
                        scanner.processResult(wait=False)
                    except queue.Empty:
                        break
                #no new files found; check if scanner is ready
                done = pathQueue.all_tasks_done.wait(timeout=1)
                if not done:
                    #Not yet; print something while we wait
                    print(
                        "\rProcessed {} files from {} directories [{} {}]  "
                        .format(
                            scanner.added,
                            ScanWorker.dirCount,
                            pathQueue.unfinished_tasks,
                            resultQueue.unfinished_tasks,
                        ), end='\r')
        #just to make sure everybody is ready: join the path queue
        pathQueue.join()
        #process remaining of result queue
        while resultQueue.unfinished_tasks:
            scanner.processResult(wait=True)
        #go to new line to prevent overwriting progress messages
        print()
    except ScanError as e:
        print()
        print(*e.args[0], end='')
        print("Process interrupted.")
    except KeyboardInterrupt:
        print("\nProcess interrupted.")
    print()
    