可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I have a list of different ids, start dates and end dates, let's say :
[ (5, d.datetime(2010, 9, 19, 0, 0, 0), d.datetime(2010, 9, 19, 0, 5, 10)), (6, d.datetime(2010, 9, 19, 0, 0, 0), d.datetime(2010, 9, 19, 12, 59, 59)), (4, d.datetime(2010, 9, 19, 10, 30, 17), d.datetime(2010, 9, 19, 20, 20, 59)), (6, d.datetime(2010, 9, 19, 14, 12, 0), d.datetime(2010, 9, 19, 23, 59, 59)), (5, d.datetime(2010, 9, 19, 17, 0, 22), d.datetime(2010, 9, 19, 19, 14, 20)) ]
I need to find the overlapping time ranges and build a new list showing which ids were covering each resulting time range. For example, for the list above the result should be:
[
    ('5,6', d.datetime(2010, 9, 19, 0, 0, 0), d.datetime(2010, 9, 19, 0, 5, 10)),
    ('6', d.datetime(2010, 9, 19, 0, 5, 10), d.datetime(2010, 9, 19, 10, 30, 17)),
    ('4,6', d.datetime(2010, 9, 19, 10, 30, 17), d.datetime(2010, 9, 19, 12, 59, 59)),
    ('4', d.datetime(2010, 9, 19, 12, 59, 59), d.datetime(2010, 9, 19, 14, 12, 0)),
    ('4,6', d.datetime(2010, 9, 19, 14, 12, 0), d.datetime(2010, 9, 19, 17, 0, 22)),
    ('4,5,6', d.datetime(2010, 9, 19, 17, 0, 22), d.datetime(2010, 9, 19, 19, 14, 20)),
    ('4,6', d.datetime(2010, 9, 19, 19, 14, 20), d.datetime(2010, 9, 19, 20, 20, 59)),
    ('6', d.datetime(2010, 9, 19, 20, 20, 59), d.datetime(2010, 9, 19, 23, 59, 59))
]
Visual concept:

For now my solution works like this: I take the minimum and maximum dates of the whole range, then iterate from min_date to max_date in 1-second steps. When a particular second falls inside some of the intervals from the target list, I save the matched ids as a dictionary key and append the time from the iterator to a list stored as the value, then save the dictionary to a parent list, and so on for each following second. At the end I go over all the dicts in the parent list and take the ids from the key, and the first and last dates in the value list as the range I need. But this solution is very slow when the ranges span a month, because iterating over a month second by second takes too much time.
Here is code:
def delta(start, end, delta):
    """Yield values beginning at ``start`` and stepping by ``delta`` while they are < ``end``."""
    moment = start
    while moment < end:
        yield moment
        moment += delta


# One dict per run of consecutive seconds that share the same id combination:
# {"<comma-joined ids>": [every second at which that combination matched]}.
final_ranges = []
last_result = None

scan_from = sorted_ranges_by_start[0]['start']
scan_to = sorted_ranges_by_end[-1]['end']

for checker_date in delta(scan_from, scan_to, relativedelta(seconds=1)):
    # Ids of every range that contains this second (both bounds inclusive).
    aggregator = [
        str(rng['id'])
        for rng in ranges
        if rng['start'] <= checker_date <= rng['end']
    ]
    if not aggregator:
        continue

    ids = ','.join(set(aggregator))
    if last_result != ids:
        # The id combination changed, so open a new bucket.
        final_ranges.append({})
        last_result = ids

    # The most recently opened bucket is always the current one.
    final_ranges[-1].setdefault(ids, []).append(checker_date)
But as I said, it is very slow for big ranges.
So please help me find an algorithm that can do this without iterating through the whole month, or advise on any way to improve the iteration speed (I'm not sure, maybe try to write this part in C and then embed it in Python).
Thanks.
回答1:
I've made it work with the code below.
The basic idea is to first detect the cut points between the provided periods, that is, every time a period starts or ends. Second, iterate only over the spans between consecutive cut points, and check each period for overlap with the span to see whether it is active between those cut points. Accumulate the active periods.
Process time depends on the number of cutpoints and periods, and not on elapsed time.
from datetime import datetime

periods = [
    (5, datetime(2010, 9, 19, 0, 0, 0), datetime(2010, 9, 19, 0, 5, 10)),
    (6, datetime(2010, 9, 19, 0, 0, 0), datetime(2010, 9, 19, 12, 59, 59)),
    (4, datetime(2010, 9, 19, 10, 30, 17), datetime(2010, 9, 19, 20, 20, 59)),
    (6, datetime(2010, 9, 19, 14, 12, 0), datetime(2010, 9, 19, 23, 59, 59)),
    (5, datetime(2010, 9, 19, 17, 0, 22), datetime(2010, 9, 19, 19, 14, 20)),
]


def overlapping_ranges(periods):
    """Split the timeline at every period boundary and list the ids active in each piece.

    Args:
        periods: iterable of ``(id, start, end)`` tuples; ``start`` and ``end``
            only need to be mutually comparable (datetimes, numbers, ...).

    Returns:
        A list of ``(active_ids, start_cutpoint, end_cutpoint)`` tuples, one per
        pair of consecutive cut points, in chronological order. ``active_ids``
        lists the ids in the order their periods appear in ``periods`` and is
        empty for a gap that no period covers.
    """
    # Every start or end of a period is a cut point; duplicates collapse in the
    # set, and sorting gives chronological order without a third-party
    # sorted container.
    cutpoints = sorted(
        {boundary for period in periods for boundary in (period[1], period[2])}
    )

    ranges = []
    # Pair each cut point with its successor. This replaces the truthiness-based
    # "skip first" sentinel, which mishandled a falsy first cut point such as 0.
    for start_cutpoint, end_cutpoint in zip(cutpoints, cutpoints[1:]):
        cut_point_active_periods = []
        for period in periods:
            # A period is active in this span when the two intervals share a
            # stretch of non-zero length.
            start_overlap = max(start_cutpoint, period[1])
            end_overlap = min(end_cutpoint, period[2])
            if start_overlap < end_overlap:
                cut_point_active_periods.append(period[0])
        ranges.append((cut_point_active_periods, start_cutpoint, end_cutpoint))
    return ranges


ranges = overlapping_ranges(periods)
回答2:
Make two records for every interval: {id, time, start/end}
Sort list of all these records compared by time. If time fields tie, compare start/end field and choose end first.
Walk through the list.
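When you meet a start record, add id to the active list and set last time to time.
When you meet an end record, output the active list labelled with the interval last time..time to the result, then remove id from the active list and set last time to time.
(As the walk-through below shows, a start record that arrives while the active list is non-empty and later than last time also outputs the current active list for last time..time before the new id is added.)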
Let's say we have the intervals:
A: 0..3 B: 1..2 C: 2..4
Records:
(A,0,s), (A,3,e), (B,1,s), (B,2,e), (C,2,s), (C,4,e)
Sorted:
(A,0,s), (B,1,s), (B,2,e), (C,2,s), (A,3,e), (C,4,e)
Walk sorted list:
current   active   output      last time
(A,0,s)   A        -           0
(B,1,s)   A,B      A    0..1   1
(B,2,e)   A        A,B  1..2   2
(C,2,s)   A,C      -           2
(A,3,e)   C        A,C  2..3   3
(C,4,e)   -        C    3..4   4
回答3:
This was quite a programming challenge for me, but I finally managed to do it. Basically, I sorted all the times along with their ID's, and then I ran a for loop to get the results:
from collections import Counter
from datetime import datetime

timelist = [
    (5, datetime(2010, 9, 19, 0, 0, 0), datetime(2010, 9, 19, 0, 5, 10)),
    (6, datetime(2010, 9, 19, 0, 0, 0), datetime(2010, 9, 19, 12, 59, 59)),
    (4, datetime(2010, 9, 19, 10, 30, 17), datetime(2010, 9, 19, 20, 20, 59)),
    (6, datetime(2010, 9, 19, 14, 12, 0), datetime(2010, 9, 19, 23, 59, 59)),
    (5, datetime(2010, 9, 19, 17, 0, 22), datetime(2010, 9, 19, 19, 14, 20)),
]


def aggregate_overlaps(timelist):
    """Sweep over the begin/end events and report which ids are active between them.

    Args:
        timelist: iterable of ``(id, begin, end)`` tuples.

    Returns:
        A list of ``(key, start, end)`` tuples, one per pair of consecutive
        events in time order. ``key`` is the comma-joined, string-sorted ids
        active from ``start`` on. Rows with ``start == end`` are produced when
        several events share a timestamp, and ``key`` is ``''`` for a gap;
        callers can filter both out.
    """
    timelist_new = []
    for entry in timelist:
        timelist_new.append((str(entry[0]), entry[1], 'begin'))
        timelist_new.append((str(entry[0]), entry[2], 'end'))
    timelist_new.sort(key=lambda event: event[1])

    # Number of currently open intervals per id. Counting (instead of a plain
    # set) keeps an id active while any of its own overlapping intervals is
    # still open.
    active = Counter()
    rows = []
    applied = 0  # index of the first event not yet folded into `active`
    for idx in range(len(timelist_new) - 1):
        row_start = timelist_new[idx][1]
        row_end = timelist_new[idx + 1][1]
        # Fold in every event stamped at or before the row start, so that an
        # 'end' and a 'begin' sharing a timestamp are both honoured.
        while applied < len(timelist_new) and timelist_new[applied][1] <= row_start:
            key, _, pointer = timelist_new[applied]
            if pointer == 'begin':
                active[key] += 1
            else:
                active[key] -= 1
                if active[key] <= 0:
                    del active[key]
            applied += 1
        rows.append((','.join(sorted(active)), row_start, row_end))
    return rows


aggregator = aggregate_overlaps(timelist)

for stuff in aggregator:
    print(stuff)
Output:
('5,6', datetime.datetime(2010, 9, 19, 0, 0), datetime.datetime(2010, 9, 19, 0, 0))
('5,6', datetime.datetime(2010, 9, 19, 0, 0), datetime.datetime(2010, 9, 19, 0, 5, 10))
('6', datetime.datetime(2010, 9, 19, 0, 5, 10), datetime.datetime(2010, 9, 19, 10, 30, 17))
('4,6', datetime.datetime(2010, 9, 19, 10, 30, 17), datetime.datetime(2010, 9, 19, 12, 59, 59))
('4', datetime.datetime(2010, 9, 19, 12, 59, 59), datetime.datetime(2010, 9, 19, 14, 12))
('4,6', datetime.datetime(2010, 9, 19, 14, 12), datetime.datetime(2010, 9, 19, 17, 0, 22))
('4,5,6', datetime.datetime(2010, 9, 19, 17, 0, 22), datetime.datetime(2010, 9, 19, 19, 14, 20))
('4,6', datetime.datetime(2010, 9, 19, 19, 14, 20), datetime.datetime(2010, 9, 19, 20, 20, 59))
('6', datetime.datetime(2010, 9, 19, 20, 20, 59), datetime.datetime(2010, 9, 19, 23, 59, 59))
***Repl Closed***
Just remove that first line of the output since the begin and end dates are the same :)
# Keep only the rows whose start and end differ, i.e. drop zero-length ranges.
final_list = [row for row in aggregator if row[1] != row[2]]