Python: Choose random line from file, then delete that line

前端 未结 6 1607
礼貌的吻别
礼貌的吻别 2020-11-30 13:16

I'm new to Python (in that I learned it through a Codecademy course) and could use some help with figuring this out.

I have a file, 'TestingDeleteLines.txt', that…

6条回答
  •  臣服心动
    2020-11-30 13:44

    To choose a random line from a file, you could use a space-efficient, single-pass reservoir-sampling algorithm. To delete that line, you could print everything except the chosen line:

    #!/usr/bin/env python3
    import fileinput
    
    # File to pick a random line from; it is rewritten in place below.
    # (Same file name as used by the k-lines example.)
    filename = 'TestingDeleteLines.txt'

    # First pass: choose the index of one random line.
    # enumerate() yields (index, line) pairs, so [0] extracts the index;
    # for an empty file the default [-1] is returned and k becomes -1.
    with open(filename) as file:
        k = select_random_it(enumerate(file), default=[-1])[0]

    if k >= 0: # file is not empty
        # Second pass: rewrite the file without line k.
        # The original content is kept as filename + '.bak'.
        with fileinput.FileInput(filename, inplace=True, backup='.bak') as file:
            for i, line in enumerate(file):
                if i != k: # keep line
                    print(line, end='') # stdout is redirected to filename
    

    where select_random_it() implements the reservoir-sampling algorithm:

    import random
    
    def select_random_it(iterator, default=None, randrange=random.randrange):
        """Pick one element of *iterator* uniformly at random.

        The iterator is consumed completely in a single pass, keeping only
        the current candidate in memory (O(n) time, O(1) space).

        iterator -- iterable to draw from.
        default -- value returned when *iterator* yields nothing.
        randrange -- callable; randrange(n) is called once per item with the
                     running item count n, and a result of 0 selects the item.

        Return the chosen element, or *default* for an empty iterator.
        """
        # from https://stackoverflow.com/a/1456750/4279
        # Reservoir sampling with a reservoir of size one: the n-th item
        # replaces the current choice with probability 1/n, i.e.
        #   1st item: 100% (a single-item input always returns that item)
        #   2nd item: 50%  (otherwise the 1st item is kept)
        #   3rd item: 33.(3)%
        chosen = default
        seen = 0
        for candidate in iterator:
            seen += 1
            if randrange(seen) == 0: # random [0..seen)
                chosen = candidate
        return chosen
    

    To print k random lines from a file and delete them:

    #!/usr/bin/env python3
    import random
    import sys
    
    k = 10 # how many random lines to take
    filename = 'TestingDeleteLines.txt'

    # Single pass over the file: keep up to k randomly chosen lines.
    with open(filename) as file:
        random_lines = reservoir_sample(file, k)

    if random_lines:
        # Show the chosen lines, one per row, without surrounding whitespace...
        print(*(line.strip() for line in random_lines), sep="\n")
        # ...then remove them from the file.
        delete_lines(filename, random_lines)
    else:
        # Empty file: nothing to print or delete, exit immediately.
        sys.exit()
    

    where reservoir_sample() uses the same algorithm as select_random_it() but allows choosing k items instead of one:

    import random
    
    def reservoir_sample(iterable, k,
                         randrange=random.randrange, shuffle=random.shuffle):
        """Select *k* random elements from *iterable*.

        Use O(n) Algorithm R https://en.wikipedia.org/wiki/Reservoir_sampling

        If the number of items is less than *k*, return all items in random
        order.

        iterable -- any iterable; it is consumed in a single pass.
        k -- sample size; must be positive.
        randrange -- callable; randrange(i) is expected to return an integer
                     in [0, i).
        shuffle -- callable that shuffles a list in place.

        Return a list with at most *k* items.
        Raise ValueError if *k* is not positive.
        """
        # Imported locally so the function does not depend on a module-level
        # `islice` name, which the surrounding imports do not provide.
        from itertools import islice

        it = iter(iterable)
        if not (k > 0):
            raise ValueError("sample size must be positive")

        sample = list(islice(it, k)) # fill the reservoir
        shuffle(sample)
        # `it` continues after the first k items; i is the 1-based position
        # of `item` in the original iterable.
        for i, item in enumerate(it, start=k+1):
            j = randrange(i) # random [0..i)
            if j < k:
                sample[j] = item # replace item with gradually decreasing probability
        return sample
    

    and the delete_lines() utility function deletes the chosen random lines from the file:

    import fileinput
    import os
    
    def delete_lines(filename, lines):
        """Delete *lines* from *filename*.

        filename -- path of the text file to rewrite in place.
        lines -- iterable of line strings exactly as read from the file
                 (including their line endings).

        Each entry in *lines* removes one matching line from the file, so
        other lines that merely have the same text are kept; a line is
        removed n times only if it is listed n times. The first matching
        occurrences are the ones removed.

        The file is streamed line by line. A '.bak' backup is created while
        rewriting and removed once the rewrite finishes without an exception.
        """
        from collections import Counter

        # How many copies of each line still have to be dropped.
        pending = Counter(lines)
        with fileinput.FileInput(filename, inplace=True, backup='.bak') as file:
            for line in file:
                # Counter returns 0 for unknown lines without storing them.
                if pending[line] > 0:
                    pending[line] -= 1 # drop this occurrence
                else:
                    print(line, end='') # stdout is redirected to filename
        os.unlink(filename + '.bak') # remove backup if there is no exception
    

    The reservoir_sample() and delete_lines() functions do not load the whole file into memory, and therefore they can work on arbitrarily large files.

提交回复
热议问题