How to sort huge files with Python?

匿名 (未验证) 提交于 2019-12-03 02:31:01

问题:

I found this promising code on activestate.com for sorting huge files. I'm trying to run it with the default Python 2.6.5 interpreter on Ubuntu 10.04. When I run it on a small test file, I get the error trace below. I asked for help on activestate.com, but that thread has been silent for over 18 months. Does anyone here see an obvious solution?

Thanks.

## {{{ http://code.activestate.com/recipes/576755/ (r3) # based on Recipe 466302: Sorting big files the Python 2.4 way # by Nicolas Lehuen  import os from tempfile import gettempdir from itertools import islice, cycle from collections import namedtuple import heapq  Keyed = namedtuple("Keyed", ["key", "obj"])  def merge(key=None, *iterables):     # based on code posted by Scott David Daniels in c.l.p.     # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d      if key is None:         keyed_iterables = iterables     else:         keyed_iterables = [(Keyed(key(obj), obj) for obj in iterable)                             for iterable in iterables]      for element in heapq.merge(*keyed_iterables):         yield element.obj   def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):     if tempdirs is None:         tempdirs = []     if not tempdirs:         tempdirs.append(gettempdir())      chunks = []     try:         with open(input,'rb',64*1024) as input_file:             input_iterator = iter(input_file)             for tempdir in cycle(tempdirs):                 current_chunk = list(islice(input_iterator,buffer_size))                 if not current_chunk:                     break                 current_chunk.sort(key=key)                 output_chunk = open(os.path.join(tempdir,'%06i'%len(chunks)),'w+b',64*1024)                 chunks.append(output_chunk)                 output_chunk.writelines(current_chunk)                 output_chunk.flush()                 output_chunk.seek(0)         with open(output,'wb',64*1024) as output_file:             output_file.writelines(merge(key, *chunks))     finally:         for chunk in chunks:             try:                 chunk.close()                 os.remove(chunk.name)             except Exception:                 pass

Error trace:

Traceback (most recent call last):
  File "./batch_sort.py", line 108, in <module>
    batch_sort(args[0],args[1],options.key,options.buffer_size,options.tempdirs)
  File "./batch_sort.py", line 54, in batch_sort
    output_file.writelines(merge(key, *chunks))
  File "./batch_sort.py", line 30, in merge
    yield element.obj
AttributeError: 'str' object has no attribute 'obj'

回答1:

The code for merge is incorrect. If you don't provide a key, each element is a string instead of a keyed tuple.

Try this instead:

def merge(key=None, *iterables):
    """Yield the items of several already-sorted iterables in sorted order.

    key       -- one-argument function computing the sort key of an item, or
                 None to compare the items themselves.
    iterables -- the inputs to merge; each must already be sorted by `key`.

    With a key, items whose keys compare equal are emitted in the order of
    the iterables they come from, so merging chunks of a stably sorted file
    gives a stable overall result.  The items themselves are never compared,
    so they do not need to be orderable.
    """
    # based on code posted by Scott David Daniels in c.l.p.
    # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d

    if key is None:
        # Nothing to decorate: merge and yield the raw items.
        for element in heapq.merge(*iterables):
            yield element
        return

    def decorate(source, iterable):
        # A helper (rather than a nested generator expression) binds `source`
        # per iterable instead of late-binding the loop variable.
        for obj in iterable:
            # `source` breaks key ties: the merge only ever compares the
            # current heads of different iterables, so (key, source) always
            # decides and the comparison never reaches `obj`.
            yield key(obj), source, obj

    decorated = [decorate(source, iterable)
                 for source, iterable in enumerate(iterables)]
    for entry in heapq.merge(*decorated):
        yield entry[-1]


标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!