Given a string of a million numbers, return all repeating 3 digit numbers

前端 未结 13 1528
误落风尘
误落风尘 2020-12-22 15:41

I had an interview with a hedge fund company in New York a few months ago and unfortunately, I did not get the internship offer as a data/software engineer. (They also asked

13条回答
  •  清歌不尽
    2020-12-22 16:04

    Here's my answer:

    from timeit import timeit
    from collections import Counter
    import types
    import random
    
    def setup_data(n):
        digits = "0123456789"
        return dict(text = ''.join(random.choice(digits) for i in range(n)))
    
    
    def f_counter(text):
        c = Counter()
        for i in range(len(text)-2):
            ss = text[i:i+3]
            c.update([ss])
        return (i for i in c.items() if i[1] > 1)
    
    def f_dict(text):
        d = {}
        for i in range(len(text)-2):
            ss = text[i:i+3]
            if ss not in d:
                d[ss] = 0
            d[ss] += 1
        return ((i, d[i]) for i in d if d[i] > 1)
    
    def f_array(text):
        a = [[[0 for _ in range(10)] for _ in range(10)] for _ in range(10)]
        for n in range(len(text)-2):
            i, j, k = (int(ss) for ss in text[n:n+3])
            a[i][j][k] += 1
        for i, b in enumerate(a):
            for j, c in enumerate(b):
                for k, d in enumerate(c):
                    if d > 1: yield (f'{i}{j}{k}', d)
    
    
    for n in (1E1, 1E3, 1E6):
        n = int(n)
        data = setup_data(n)
        print(f'n = {n}')
        results = {}
        for name, func in list(globals().items()):
            if not name.startswith('f_') or not isinstance(func, types.FunctionType):
                continue
            print("{:16s}{:16.8f} ms".format(name[2:], timeit(
                'results[name] = f(**data)', globals={'f':func, 'data':data, 'results':results, 'name':name}, number=10)*100))
        for r in results:
            print('{:10}: {}'.format(r, sorted(list(results[r]))[:5]))
    

    The array lookup method is very fast (even faster than @paul-panzer's numpy method!). Of course, it cheats since it isn't technicailly finished after it completes, because it's returning a generator. It also doesn't have to check every iteration if the value already exists, which is likely to help a lot.

    n = 10
    counter               0.10595780 ms
    dict                  0.01070654 ms
    array                 0.00135370 ms
    f_counter : []
    f_dict    : []
    f_array   : []
    n = 1000
    counter               2.89462101 ms
    dict                  0.40434612 ms
    array                 0.00073838 ms
    f_counter : [('008', 2), ('009', 3), ('010', 2), ('016', 2), ('017', 2)]
    f_dict    : [('008', 2), ('009', 3), ('010', 2), ('016', 2), ('017', 2)]
    f_array   : [('008', 2), ('009', 3), ('010', 2), ('016', 2), ('017', 2)]
    n = 1000000
    counter            2849.00500992 ms
    dict                438.44007806 ms
    array                 0.00135370 ms
    f_counter : [('000', 1058), ('001', 943), ('002', 1030), ('003', 982), ('004', 1042)]
    f_dict    : [('000', 1058), ('001', 943), ('002', 1030), ('003', 982), ('004', 1042)]
    f_array   : [('000', 1058), ('001', 943), ('002', 1030), ('003', 982), ('004', 1042)]
    

提交回复
热议问题