I'm new to Python (in that I learned it through a Codecademy course) and could use some help with figuring this out.
I have a file, 'TestingDeleteLines.txt', tha
To choose a random line from a file, you could use a space-efficient, single-pass reservoir-sampling algorithm. To delete that line, you could print everything except the chosen line:
#!/usr/bin/env python3
import fileinput
# Pass 1: pick the 0-based index of one random line.  Each item fed to
# select_random_it() is an (index, line) pair; the [-1] default yields
# index -1 when the file has no lines at all.
with open(filename) as source:
    k = select_random_it(enumerate(source), default=[-1])[0]

# Pass 2: rewrite the file in place, copying every line except line k.
# backup='.bak' asks fileinput to keep the original under that extension.
if k >= 0:  # file is not empty
    with fileinput.FileInput(filename, inplace=True, backup='.bak') as lines:
        for index, text in enumerate(lines):
            if index == k:
                continue  # this is the chosen line: drop it
            print(text, end='')  # stdout is redirected to filename
where select_random_it() implements the reservoir-sampling algorithm:
import random
def select_random_it(iterator, default=None, randrange=random.randrange):
    """Return a random element from iterator.

    Return default if iterator is empty.

    The iterator is consumed completely by the call.  Only the current
    selection is kept, so this is an O(n)-time, O(1)-space algorithm.

    randrange is called as randrange(i) for i = 1, 2, ..., n and is
    expected to return an integer in [0, i); pass a stub to make the
    choice deterministic.
    """
    # from https://stackoverflow.com/a/1456750/4279
    # select 1st item with probability 100% (if input is one item, return it)
    # select 2nd item with probability 50% (or 50% the selection stays the 1st)
    # select 3rd item with probability 33.(3)%
    # select nth item with probability 1/n
    selection = default
    for i, item in enumerate(iterator, start=1):
        if randrange(i) == 0:  # random [0..i)
            selection = item
    return selection
To print k random lines from a file and delete them:
#!/usr/bin/env python3
import random
import sys
k = 10  # number of random lines to print and delete
filename = 'TestingDeleteLines.txt'

# Sample the lines while streaming over the open file.
with open(filename) as source:
    random_lines = reservoir_sample(source, k)  # get k random lines

if not random_lines:  # file is empty
    sys.exit()  # do nothing, exit immediately

# Show the sampled lines, one per output line, with surrounding
# whitespace (including the trailing newline) stripped from each.
print("\n".join(line.strip() for line in random_lines))

# Remove the sampled lines from the file itself.
delete_lines(filename, random_lines)
where reservoir_sample() uses the same algorithm as select_random_it() but lets you choose k items instead of one:
import random
def reservoir_sample(iterable, k,
                     randrange=random.randrange, shuffle=random.shuffle):
    """Select *k* random elements from *iterable*.

    Use O(n) Algorithm R https://en.wikipedia.org/wiki/Reservoir_sampling

    If number of items less than *k* then return all items in random order.

    *randrange* is called as randrange(i) and is expected to return an
    integer in [0, i); *shuffle* is called once on the initial reservoir
    list and must reorder it in place.  Both can be replaced with stubs
    to make the result deterministic.

    Return a list of at most *k* items.
    Raise ValueError if *k* is not positive.
    """
    # Local import: islice is needed below and must not rely on the
    # surrounding module having imported it.
    from itertools import islice

    it = iter(iterable)
    if not (k > 0):
        raise ValueError("sample size must be positive")

    sample = list(islice(it, k))  # fill the reservoir
    shuffle(sample)
    # Items after the first k are numbered k+1, k+2, ...; item number i
    # enters the reservoir when randrange(i) lands on one of the k slots.
    for i, item in enumerate(it, start=k+1):
        j = randrange(i)  # random [0..i)
        if j < k:
            sample[j] = item  # replace item with gradually decreasing probability
    return sample
and the delete_lines() utility function deletes the chosen random lines from the file:
import fileinput
import os
def delete_lines(filename, lines):
    """Delete *lines* from *filename*.

    Rewrite *filename* in place, keeping only the lines that are not in
    *lines*.  Lines are compared by exact text as read from the file
    (line terminator included), so every copy of a listed line is
    removed, not just one occurrence.

    *filename* may be a str or an os.PathLike object.  *lines* is any
    iterable of hashable line strings.

    While rewriting, the original content is kept in *filename* + '.bak';
    that backup is removed only after the rewrite finished without an
    exception.
    """
    lines = set(lines)  # for amortized O(1) lookup
    # Normalize once so the backup name below can be built for
    # path-like objects too (they do not support "+ '.bak'").
    path = os.fspath(filename)
    with fileinput.FileInput(path, inplace=True, backup='.bak') as file:
        for line in file:
            if line not in lines:
                print(line, end='')  # stdout is redirected to the file
    os.unlink(path + '.bak')  # remove backup if there is no exception
The reservoir_sample() and delete_lines() functions do not load the whole file into memory, so they can work for arbitrarily large files.