问题
Count the number of messages within a date range per interval. I"m using python 2.6.5 only.
For example Start date: 12/11/2014 End date: 12/12/2014 Start time: 02:00 End time: 02:05 Interval: Per 1 min
So this translates to how many messages are between each interval of a minute from start date 12/11 to end date 12/12. So my out put will look like this: (does not need to have strings min and messages)
datetime(2014, 12, 11, 2, 0) min : 0 messages,
datetime(2014, 12, 11, 2, 1) min: 1 message,
datetime(2014, 12, 11, 2, 2) min: 2 messages,
datetime(2014, 12, 11, 2, 3) min: 1 message,
datetime(2014, 12, 11, 2, 4) min : 0 messages,
datetime(2014, 12, 11, 2, 5) min : 0 messages
I believe I accomplish this but its very slow with large datasets. I think because it uses two loops and if the the second loop is extremely large then it takes very long time and does it for each iteration of the first loop. I need better procedure or alrogithm to accomplish this?
Edit: I need to include zero for intervals that do not have messages. I'm also trying to find peak,min and average.
from datetime import date,datetime, timedelta, time
def perdelta(start, end, delta):
curr = start
while curr < end:
yield curr
curr += delta
def rdata(table, fromDate, toDate, fromTime, toTime, interval):
date_to_alert = {}
start_date = datetime(fromDate.year, fromDate.month, fromDate.day, fromTime.hour, fromTime.minute)
end_date = datetime(toDate.year, toDate.month, toDate.day, toTime.hour, toTime.minute)
list_range_of_dates = []
for date_range in perdelta(start_date ,end_date ,interval):
list_range_of_dates.append(date_range)
print list_range_of_dates
index = 0
for date_range in list_range_of_dates:
for row in table:
print('first_alerted_time 1: %s index: %s len: %s' % ( row['first_alerted_time'], index, len(list_range_of_dates)-1))
if row['first_alerted_time'] and row['first_alerted_time'] >= list_range_of_dates[index] and row['first_alerted_time'] < list_range_of_dates[index + 1]:
print('Start date: %s' % list_range_of_dates[index] )
print('first_alerted_time: %s' % row['first_alerted_time'])
print('end date: %s' % list_range_of_dates[index + 1])
if list_range_of_dates[index] in date_to_alert:
date_to_alert[list_range_of_dates[index]].append(row)
else:
date_to_alert[list_range_of_dates[index]] = [row]
elif row['first_alerted_time']:
print('first_alerted_time 2: %s' % row['first_alerted_time'])
index = index + 1
print date_to_alert
for key, value in date_to_alert.items():
date_to_alert[key] = len(value)
print date_to_alert
t1 = []
if date_to_alert:
avg = sum(date_to_alert.values())/len(date_to_alert.keys())
for date_period, num_of_alerts in date_to_alert.items():
#[date_period] = date_to_alert[date_period]
t1.append( [ date_period, num_of_alerts, avg] )
print t1
return t1
def main():
example_table = [
{'first_alerted_time':datetime(2014, 12, 11, 2, 1,45)},
{'first_alerted_time':datetime(2014, 12, 11, 2, 2,33)},
{'first_alerted_time':datetime(2014, 12, 11, 2, 2,45)},
{'first_alerted_time':datetime(2014, 12, 11, 2, 3,45)},
]
example_table.sort()
print example_table
print rdata(example_table, date(2014,12,11), date(2014,12,12), time(00,00,00), time(00,00,00), timedelta(minutes=1))
Update: First attempt for improvement:
Default Dictionary approach
def default_dict_approach(table, fromDate, toDate, fromTime, toTime, interval):
from collections import defaultdict
t1 = []
start_date = datetime.combine(fromDate, fromTime)
end_date = datetime.combine(toDate, toTime)+ interval
times = (d['first_alerted_time'] for d in table)
counter = defaultdict(int)
for dt in times:
if start_date <= dt < end_date:
counter[to_s(dt - start_date) // to_s(interval)] += 1
date_to_alert = {}
date_to_alert = dict((ts*interval + start_date, count) for ts, count in counter.iteritems())
max_num,min_num,avg = 0,0,0
list_of_dates = list(perdelta(start_date,end_date,interval))
if date_to_alert:
freq_values = date_to_alert.values()
size_freq_values = len(freq_values)
avg = sum(freq_values)/ size_freq_values
max_num = max(freq_values)
if size_freq_values == len(list_of_dates):
min_num = min(freq_values)
else:
min_num = 0
for date_period in list_of_dates:
if date_period in date_to_alert:
t1.append([ date_period.strftime("%Y-%m-%d %H:%M"), date_to_alert[date_period], avg, max_num, min_num])
else:
t1.append([ date_period.strftime("%Y-%m-%d %H:%M"), 0, avg, max_num, min_num])
return (t1,max_num,min_num,avg)
numpy approach
def numpy_approach(table, fromDate, toDate, fromTime, toTime, interval):
date_to_alert = {}
start_date = datetime.combine(fromDate, fromTime)
end_date = datetime.combine(toDate, toTime)+ interval
list_range_of_dates = []
for date_range in perdelta(start_date ,end_date ,interval):
list_range_of_dates.append(date_range)
#print list_range_of_dates
index = 0
times = np.fromiter((d['first_alerted_time'] for d in table),
dtype='datetime64[us]', count=len(table))
print times
bins = np.fromiter(list_range_of_dates,
dtype=times.dtype)
print bin
a, bins = np.histogram(times, bins)
print(dict(zip(bins[a.nonzero()].tolist(), a[a.nonzero()])))
回答1:
You want to implement numpy.histogram() for dates:
import numpy as np
times = np.fromiter((d['first_alerted_time'] for d in example_table),
dtype='datetime64[us]', count=len(example_table))
bins = np.fromiter(date_range(start_date, end_date + step, step),
dtype=times.dtype)
a, bins = np.histogram(times, bins)
print(dict(zip(bins[a.nonzero()].tolist(), a[a.nonzero()])))
Output
{datetime.datetime(2014, 12, 11, 2, 0): 3,
datetime.datetime(2014, 12, 11, 2, 3): 1}
numpy.historgram()
works even if the step is not constant and times
array is unsorted. Otherwise the call can be optimized if you decide to use numpy
.
There are two general approaches that you could use on Python 2.6 to implement numpy.historgram
:
itertools.groupby
-based: the input should be sorted but it allows to implement a single-pass, constant memory algorithmcollections.defaultdict
-based: the input may be unsorted and it is also a linear algorithm but it isO(number_of_nonempty_bins)
in memory
groupby()
-based solution:
from itertools import groupby
times = (d['first_alerted_time'] for d in example_table)
bins = date_range(start_date, end_date + step, step)
def key(dt, end=[next(bins)]):
while end[0] <= dt:
end[0] = next(bins)
return end[0]
print dict((end-step, sum(1 for _ in g)) for end, g in groupby(times, key=key))
It produces the same output as histogram()
-based approach.
Note: all dates that are less than start_date
are put in the first bin.
defaultdict()
-based solution
from collections import defaultdict
def to_s(td): # for Python 2.6
return td.days*86400 + td.seconds #NOTE: ignore microseconds
times = (d['first_alerted_time'] for d in example_table)
counter = defaultdict(int)
for dt in times:
if start_date <= dt < end_date:
counter[to_s(dt - start_date) // to_s(step)] += 1
print dict((ts*step + start_date, count) for ts, count in counter.iteritems())
The output is the same as the other two solutions.
回答2:
I'd do it with a single pass on table
, building a dict
. If the table's supposed to be sorted on the first_alerted_time
field, you can locate the lower and upper bounds for the pass using module bisect
, then you don't even need the checks on start_date
and end_date
, just something like:
counts = collections.Counter()
counts.update(datrow['first_alerted_time'].date()
for row in table[start_index:end_index])
or obvious variants (e.g with itertools
) if the slice of table
eats up too much memory.
If table
can't be assumed to be sorted, the approach is not very different, you just need to add an if
clause and loop over the whole table:
counts = collections.Counter()
counts.update(row['first_alerted_time'].date()
for row in table[start_index:end_index]
if start_date < row['first_alerted_time'] < end_date)
If you can clarify what can be assumed about table
then (if applicable, and you have problems doing it) I can show how to use bisect
to find the boundaries on a sorted table.
So this gives you a Counter
with counts by date. If you need a list sorted by date (e.g of (date, count) pairs), sorted(counts.items())
will give you that, &c. Again, if you can clarify exactly what kinds of results are acceptable, and have problems obtaining them, I can show how to do that.
If minutes are also important (as said in the Q's text but not shown in the example code that's said to be working), just replace row['first_alerted_time'].date()
in either of the above snippets with row['first_alerted_time'].replace(second=0,microsecond=0)
. Again, the resulting counter can be sorted, &c.
回答3:
You have so many print
in the loop. I'm not sure they are just debug information or you really wanna print them? I just assume they are debug information.
First, you don't need two loop but one. The first one is totally useless. Because your interval is 1 minute
, what you can do is change
date_to_alert[list_range_of_dates[index]]...
TO
t = datetime(row['first_alerted_time'].year, row['first_alerted_time'].month, .... row.['first_alerted_time'].minute]
if t in data_to_alert:
......
So you can get rid of the first loop.
==EDIT==
If the interval is dynamic, you can still calculate the time.
def calc(alert_time, interval):
t = datetime(row['first_alerted_time'].year, row['first_alerted_time'].month, .... row.['first_alerted_time'].minute]
from_t = datetime(fromDate.year, fromDate.month, fromDate.day, fromTime.hour, fromTime.minute)
return (t - from_t) / interval * interval + from_t
For example, let say the from_time
is 10 and interval
is 5, your bucket will be
10, 15, 20, 25, ....
Then
- 17 shall return (17-10) / 5 * 5 + 10 = 15
- 25 shall return (25-10) / 5 * 5 + 10 = 25
And to call this function:
t = calc_t(row['first_alerted_time'], interval)
if t in data_to_alert:
....
==EDIT END==
What's more, if your table, is sorted, you can binary search the start/end row of the table and only count the rows between them which makes your code even faster
==EDIT==
For example
1: 2015-01-01-10:00:00
2: 2015-01-01-11:00:00
3: 2015-01-01-12:00:00
4: 2015-01-02-11:13:00
5: 2015-01-02-12:45:00
Your from time is 2015-01-01-10:30:00 and end time is 2015-01-02:11:30:00. So binary search these two timestamps in the table, you only need to work on row 2,3 and 4 but ignore first and last row. This shall increase the speed of your program if you only care about few rows in a super large table. However, this only works if your table is sorted so you can do binary search.
==EDIT END==
来源:https://stackoverflow.com/questions/27640016/python-count-the-frequency-of-messages-within-a-date-range-within-per-dynamic-i