可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
Imagine that there are 10 houses, each containing anywhere from one to an unlimited number of persons. Each of those persons sends a number of messages (from one to an unlimited number), and every message contains the sender's userid and the house number. I want to know the average number of messages sent per person, for each house, so that I can later plot which house has the largest average number of messages.
Now that I've explained it conceptually: the houses aren't really houses but latitude intervals, e.g. from -90 to -89 and so on, and a person can send messages from different houses.
So I've got a database with latitude and senderID. I want to plot the density of latitudes per unique senderID:
Number of rows/Number of unique userids
at each latitude over an interval
This is a sample input:
# Sample input: the latitude of each message ...
lat = [-83.76, -44.88, -38.36, -35.50, -33.99, -31.91, -27.56, -22.95, 40.72, 47.59, 54.42, 63.84, 76.77, 77.43, 78.54]
# ... and, at the same index, the id of the user who sent that message.
userid= [5, 7, 6, 6, 6, 6, 5, 2, 2, 2, 1, 5, 10, 9 ,8]
Here are the corresponding densities:
-80 to -90: 1 -40 to -50: 1 -30 to -40: 4 -20 to -30: 1 40 to 50: 2 50 to 60: 1 60 to 70: 1 70 to 80: 1
Another input:
# Second sample input: five messages at latitude 70 from five different users,
# and three messages at latitude 80 from two different users.
lat = [70,70,70,70,70,80,80,80]
userid = [1,2,3,4,5,1,1,2]
The density for latitude 70 is 1, while the density for latitude 80 is 1.5.
If I would do this through a database query/pseudocode I would do something like:
-- Total number of messages whose latitude lies strictly inside the interval.
SELECT count(latitude) FROM messages WHERE latitude < 79 AND latitude > 69
-- Number of distinct senders among the messages in the same interval.
SELECT count(distinct userid) FROM messages WHERE latitude < 79 AND latitude > 69
The density would then be count(latitude)/count(distinct userid)
- also to be interpreted as totalMessagesFromCertainLatitude/distinctUserIds. This would be repeated for every interval from -90 to 90, i.e. from -90 < latitude < -89 up to 89 < latitude < 90.
Getting help with this is probably a long shot, but I just can't organize my thoughts well enough to do this and be confident that there are no errors. I would be happy for anything. I'm sorry if I was unclear.
回答1:
I'm not 100% sure I've understood the output you want, but this will produce a stepped, cumulative histogram-like plot with the x-axis being latitudes (binned) and the y axis being the density you define above.
From your sample code, you already have numpy installed and are happy to use it. The approach I would take is to get two data sets, much like what would be returned by your SQL sample, use them to compute the densities, and then plot. Using your existing latitude / userid data format, it might look something like this:
EDIT: Removed first version of code from here and some comments which were redundant following clarification and question edits from the OP
Following comments and OP clarification - I think this is what is desired:
import numpy as np
from itertools import groupby


def _count_per_bin(pairs):
    """Count the (bin_index, userid) pairs in each bin; `pairs` must be sorted by bin."""
    return {bin_index: sum(1 for _ in group)
            for bin_index, group in groupby(pairs, key=lambda pair: pair[0])}


def draw_hist(latitudes, userids, binwidth=1, show=True):
    """Plot messages-per-distinct-sender for each latitude bin.

    For every latitude bin of width ``binwidth`` (bins start at -90) the
    density is ``number of messages / number of distinct userids``, i.e. the
    two counts produced by the SQL pseudocode in the question.

    Parameters
    ----------
    latitudes : sequence of float
        Latitude of each message, within [-90, 90].
    userids : sequence of hashable
        Sender id of each message, aligned index-by-index with ``latitudes``.
    binwidth : number, optional
        Width of a latitude bin in degrees (default 1).
    show : bool, optional
        When true (default) draw the bar chart and call ``plt.show()``.
        When false only the densities are computed, so matplotlib is not
        needed at all.

    Returns
    -------
    numpy.ndarray of shape (2, n_bins_with_data)
        Row 0 holds the left edge of each non-empty bin, row 1 its density.

    Raises
    ------
    ValueError
        If the two inputs differ in length or a latitude is outside [-90, 90].
    """
    min_lat, max_lat = -90, 90

    if len(latitudes) != len(userids):
        raise ValueError("latitudes and userids must have the same length")

    lat_values = np.asarray(latitudes, dtype=float)
    if lat_values.size == 0:
        # Nothing to bin: return an empty (2, 0) result instead of crashing.
        bin_density = np.empty((2, 0))
    else:
        # An out-of-range value would get digitize index 0 and then silently
        # wrap around to the last bin through bin_range[-1].
        if lat_values.min() < min_lat or lat_values.max() > max_lat:
            raise ValueError("latitudes must lie within [-90, 90]")

        bin_range = np.arange(min_lat, max_lat, binwidth)
        # digitize returns i such that bin_range[i-1] <= x < bin_range[i], so
        # bin_range[i-1] is the left edge of the bin (90 lands in the last bin).
        bin_indices = np.digitize(lat_values, bin_range).tolist()

        # Sort on the bin index only, so userids never need to be orderable.
        all_in_bins = sorted(zip(bin_indices, userids), key=lambda pair: pair[0])
        unique_in_bins = sorted(set(all_in_bins), key=lambda pair: pair[0])

        messages = _count_per_bin(all_in_bins)      # rows per bin
        senders = _count_per_bin(unique_in_bins)    # distinct userids per bin

        bin_density = np.array(
            [(bin_range[b - 1], messages[b] / float(senders[b]))
             for b in sorted(messages)],
            dtype=float,
        ).transpose()

    if show:
        # Imported lazily so that show=False works without matplotlib/a display.
        import matplotlib.pyplot as plt

        # The x values are LEFT bin edges, so the bars must be edge-aligned;
        # the default in current matplotlib is to centre them on x.
        plt.bar(bin_density[0], bin_density[1], width=binwidth, align='edge')
        plt.show()  # can save away plot here if desired

    return bin_density


if __name__ == "__main__":
    latitudes = [-70.5, 5.3, 70.32, 70.43, 5, 32, 80, 80, 87.3]
    userids = [1, 1, 2, 2, 4, 5, 1, 1, 2]
    draw_hist(latitudes, userids)
Sample output with different bin widths on OP dataset

回答2:
Because this packs so neatly into pandas' built-ins, it's probably fast in pandas for big datasets.
import pandas as pd

# Sample data from the question: latitude of each message and its sender id.
lat = [-83.76, -44.88, -38.36, -35.50, -33.99, -31.91, -27.56, -22.95,
       40.72, 47.59, 54.42, 63.84, 76.77, 77.43, 78.54]
userid = [5, 7, 6, 6, 6, 6, 5, 2, 2, 2, 1, 5, 10, 9, 8]

# Width of a latitude zone in degrees: 10 for ten-degree zones, 1 for
# one-degree zones. This single value drives both the binning and the plot.
zonewidth = 10

df = pd.DataFrame({'userid': userid, 'lat': lat})

# Left edge of the zone each message falls in. Floor division rounds towards
# minus infinity, so -83.76 lands in the zone starting at -90. This is
# vectorised; the original `map(...)` assigns a lazy map object on Python 3.
df['zone'] = (df.lat // zonewidth) * zonewidth

dfz = df.groupby('zone')  # a pandas GroupBy object, one group per zone

# density = messages in the zone / distinct senders in the zone
zone_userids = dfz['userid']
density = zone_userids.size() / zone_userids.nunique()

# list of (zone left edge, density), ordered by zone
p = [(float(zone), float(value)) for zone, value in density.items()]

if __name__ == "__main__":
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle
    from matplotlib.collections import PatchCollection

    # One rectangle per zone, anchored at the zone's left edge.
    R = [Rectangle((x, 0), zonewidth, y,
                   facecolor='red', edgecolor='black', fill=True)
         for x, y in p]

    fig, ax = plt.subplots()
    # match_original=True keeps the colours set on the individual rectangles.
    ax.add_collection(PatchCollection(R, match_original=True))
    plt.xlim((-90, 90))
    tall = max(r.get_height() for r in R)
    plt.ylim((0, tall + 0.5))
    plt.show()
For the first set of test data:

回答3:
I think this solves the case, although it isn't efficient at all:
# NOTE(review): this snippet relies on names it does not define itself:
# `lite` (presumably `import sqlite3 as lite` -- confirm), `plt`
# (matplotlib.pyplot), `time` and `databasepath`.
con = lite.connect(databasepath)
binwidth = 1

with con:
    cur = con.cursor()
    cur.execute('SELECT latitude, userid FROM dynamicMessage')
    # A SELECT changes nothing, so no commit is needed; fetch all rows at once
    # instead of looping over fetchone().
    rows = cur.fetchall()
print("executed")

# Per-bin tallies keyed by the left edge of the latitude bin.
message_counts = {}   # left edge -> number of messages
sender_sets = {}      # left edge -> set of distinct userids
for latitude, userid in rows:
    # Floor division rounds towards minus infinity, so negative latitudes land
    # in the correct bin (int() would truncate towards zero instead).
    left_edge = (float(latitude) // binwidth) * binwidth
    message_counts[left_edge] = message_counts.get(left_edge, 0) + 1
    sender_sets.setdefault(left_edge, set()).add(userid)

# Left edges of the bins that contain data, and the matching densities
# (messages / distinct senders). float() keeps the division exact on
# Python 2 as well, where int / int would truncate.
latitudes = sorted(message_counts)
densities = [message_counts[edge] / float(len(sender_sets[edge]))
             for edge in latitudes]

# One bar per bin, edge-aligned because `latitudes` holds left edges. This
# keeps fractional densities, which repeating values for plt.hist could not.
plt.bar(latitudes, densities, width=binwidth, align='edge')
plt.savefig('latlongstats'+'t'+str(time.strftime("%H:%M:%S")), format='png')
回答4:
What follows is not a complete solution in terms of plotting the required histogram, but I think it's nevertheless worthy of being reported
The bulk of the solution: we scan the list of tuples, select the ones in the required range, and count
- the number of selected tuples
- the unique ids, by building a set (which automatically discards duplicates) and taking its size
Finally, we return the required ratio, or zero if the count of distinct ids is zero.
def ratio(d, mn, mx):
    """Return messages per distinct user id for latitudes in [mn, mx).

    `d` is an iterable of (latitude, user id) pairs. Pairs whose latitude
    satisfies mn <= latitude < mx are counted, and that count is divided by
    the number of different user ids among them. When no pair falls in the
    range the result is 0.
    """
    selected = 0
    seen_ids = set()
    for latitude, user in d:
        if mn <= latitude < mx:
            selected += 1
            seen_ids.add(user)
    if not seen_ids:
        return 0
    return 1.0 * selected / len(seen_ids)
The data is entered and combined, via zip, into a list of tuples
# Latitude of each message ...
lat = [-83.76, -44.88, -38.36, -35.50, -33.99, -31.91, -27.56, -22.95,
       -19.00, -12.32, -6.14, -1.11, 4.40, 10.23, 19.40, 31.18, 40.72,
       47.59, 54.42, 63.84, 76.77]
# ... and, at the same index, the id of the user who sent it.
userid = [52500.0, 70100.0, 35310.0, 47776.0, 70100.0, 30991.0, 37328.0,
          25575.0, 37232.0, 6360.0, 52908.0, 52908.0, 52908.0, 77500.0,
          345.0, 6360.0, 3670.0, 36690.0, 3720.0, 2510.0, 2730.0]

# Materialise the pairs as a list: on Python 3 a bare zip() is a one-shot
# iterator, so only the first scan over `data` would see any rows.
data = list(zip(lat, userid))
preparation of the bins
# Bin boundaries every ten degrees from -90 to 90 inclusive.
extremes = list(range(-90, 91, 10))
# Consecutive (lower, upper) pairs: (-90, -80), (-80, -70), ..., (80, 90).
# Stored as a list so the bins can be indexed and iterated more than once
# (on Python 3 a bare zip() is exhausted after a single pass).
intervals = list(zip(extremes[:-1], extremes[1:]))
The actual computation; the result is a list of floats that can be passed to the relevant pyplot functions
# `ratio`, `data` and `intervals` come from the snippets above.
# Materialise the rows first: if `data` is a one-shot iterator (a bare zip()
# on Python 3), only the first bin would otherwise see any rows.
rows = list(data)
# One density (messages per distinct user id) for each latitude interval.
ratios = [ratio(rows, lower, upper) for lower, upper in intervals]
print(ratios)
# [1.0, 0, 0, 0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0]