I found some this promising code on activestate.com to sort huge files. I'm trying to run it on the default Python 2.6.5 interpreter on Ubuntu 10.04. When I try running it on a small test file, I get the error trace below. I asked for help on activestate.com, but this thread has been silent for over 18 months. Is there anyone here who sees an obvious solution?
Thanks.
## {{{ http://code.activestate.com/recipes/576755/ (r3) # based on Recipe 466302: Sorting big files the Python 2.4 way # by Nicolas Lehuen import os from tempfile import gettempdir from itertools import islice, cycle from collections import namedtuple import heapq Keyed = namedtuple("Keyed", ["key", "obj"]) def merge(key=None, *iterables): # based on code posted by Scott David Daniels in c.l.p. # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d if key is None: keyed_iterables = iterables else: keyed_iterables = [(Keyed(key(obj), obj) for obj in iterable) for iterable in iterables] for element in heapq.merge(*keyed_iterables): yield element.obj def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None): if tempdirs is None: tempdirs = [] if not tempdirs: tempdirs.append(gettempdir()) chunks = [] try: with open(input,'rb',64*1024) as input_file: input_iterator = iter(input_file) for tempdir in cycle(tempdirs): current_chunk = list(islice(input_iterator,buffer_size)) if not current_chunk: break current_chunk.sort(key=key) output_chunk = open(os.path.join(tempdir,'%06i'%len(chunks)),'w+b',64*1024) chunks.append(output_chunk) output_chunk.writelines(current_chunk) output_chunk.flush() output_chunk.seek(0) with open(output,'wb',64*1024) as output_file: output_file.writelines(merge(key, *chunks)) finally: for chunk in chunks: try: chunk.close() os.remove(chunk.name) except Exception: pass
Error trace:
Traceback (most recent call last): File "./batch_sort.py", line 108, in <module> batch_sort(args[0],args[1],options.key,options.buffer_size,options.tempdirs) File "./batch_sort.py", line 54, in batch_sort output_file.writelines(merge(key, *chunks)) File "./batch_sort.py", line 30, in merge yield element.obj AttributeError: 'str' object has no attribute 'obj'