Pandas: calculate haversine distance within each group of rows

后端 未结 4 870
自闭症患者
自闭症患者 2020-12-18 14:26

The sample CSV is like this:

 user_id  lat         lon
    1   19.111841   72.910729
    1   19.111342   72.908387
    2   19.111542   72.907387
    2   19.1         


        
相关标签:
4条回答
  • 2020-12-18 14:36

    You just need a suitable data structure: a dict of lists keyed by id, with each lat/lon pair stored as a tuple. A quick prototype could look like this:

    from haversine import haversine  # pip3 install haversine
    from collections import defaultdict
    
    csv = """
    1   19.111841   72.910729
    1   19.111342   72.908387
    2   19.111342   72.908387
    2   19.137815   72.914085
    2   19.119677   72.905081
    2   19.119677   72.905081
    3   19.119677   72.905081
    3   19.120217   72.907121
    5   19.119677   72.905081
    5   19.119677   72.905081
    5   19.119677   72.905081
    5   19.111860   72.911346
    5   19.111860   72.911346
    5   19.119677   72.905081
    6   19.119677   72.905081
    6   19.119677   72.905081
    """
    
    # Group the coordinates by id: {id: [(lat, lon), ...]}, kept in input order.
    d = defaultdict(list)

    for line in csv.splitlines():
        # Split on any run of whitespace so the parse does not depend on the
        # columns being separated by exactly three spaces.
        fields = line.split()

        if not fields:
            continue  # skip blank lines

        cId, lat, lon = fields
        d[cId].append((float(lat), float(lon)))

    for k, v in d.items():
        if len(v) < 2:
            continue  # a lone point has no second point to measure against
        # Distance between the first two points recorded for this id.
        print('Distance for id: ', k, haversine(v[0], v[1]))
    

    returns:

    Distance for id:  1 0.2522433072207346
    Distance for id:  2 3.0039140173887557
    Distance for id:  3 0.22257643412844885
    Distance for id:  5 0.0
    Distance for id:  6 0.0
    
    0 讨论(0)
  • 2020-12-18 14:45

    Assuming you want to compute haversine() between the first entry of each user_id group and every other entry in that group, this approach will work:

    # copying example data from OP
    import pandas as pd
    df = pd.read_clipboard() # alternately, df = pd.read_csv(filename)
    
    def haversine_wrapper(row):
        """Return the distance between a row's reference point and its own point.

        Reads ``first_lon``/``first_lat`` (the reference point) and
        ``lon``/``lat`` from ``row``. Returns None when both pairs are equal;
        otherwise returns whatever ``haversine`` returns for the two points.
        """
        lon_matches = row['first_lon'] == row['lon']
        lat_matches = row['first_lat'] == row['lat']
        if lon_matches & lat_matches:
            return None
        # NOTE(review): `haversine` is neither defined nor imported in this
        # snippet; presumably it accepts (lon1, lat1, lon2, lat2) -- confirm.
        reference = (row['first_lon'], row['first_lat'])
        current = (row['lon'], row['lat'])
        return haversine(*reference, *current)
    
    # 'first' lat/lon per user_id, renamed so the columns can sit next to the
    # per-row coordinates after the merge.
    first_points = df.groupby('user_id', as_index=False).agg({'lat':'first','lon':'first'})
    first_points = first_points.rename(columns={'lat':'first_lat','lon':'first_lon'})

    # Attach each group's reference point to its rows, then evaluate the
    # wrapper row by row.
    # NOTE(review): the assignment aligns on index labels, so this presumably
    # relies on df having a default index and rows ordered like the merge
    # output -- confirm.
    with_first = df.merge(first_points, on='user_id')
    df['result'] = with_first.apply(haversine_wrapper, axis='columns')

    print(df)
    

    Output:

    user_id        lat        lon     result
     0    1  19.111841  72.910729        NaN
     1    1  19.111342  72.908387   0.252243
     2    2  19.111542  72.907387        NaN
     3    2  19.137815  72.914085   3.004976
     4    2  19.119677  72.905081   0.936454
     5    2  19.129677  72.905081   2.031021
     6    3  19.319677  72.905081        NaN
     7    3  19.120217  72.907121  22.179974
     8    4  19.420217  72.807121        NaN
     9    4  19.520217  73.307121  53.584504
     10   5  19.319677  72.905081        NaN
     11   5  19.419677  72.805081  15.286775
     12   5  19.629677  72.705081  40.346128
     13   5  19.111860  72.911347  23.117560
     14   5  19.111860  72.931346  23.272178
     15   5  19.219677  72.605081  33.395165
     16   6  19.319677  72.805082        NaN
     17   6  19.419677  72.905086  15.287063
    
    0 讨论(0)
  • 2020-12-18 14:54

    Try this approach:

    import pandas as pd
    import numpy as np
    
    # parse CSV to DataFrame. You may want to specify the separator (`sep='...'`)
    df = pd.read_csv('/path/to/file.csv')
    
    # vectorized haversine function
    def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
        """
        slightly modified version: of http://stackoverflow.com/a/29546836/2901002

        Calculate the great circle distance between two points
        on the earth (specified in decimal degrees or in radians).

        Parameters:
            lat1, lon1: latitude/longitude of the first point(s).
            lat2, lon2: latitude/longitude of the second point(s).
            to_radians: when True (default) the inputs are treated as decimal
                degrees and converted to radians first; when False they are
                used as given.
            earth_radius: sphere radius; the result is in the same unit
                (the default 6371 is the mean Earth radius in kilometres).

        With ``to_radians=True`` each argument may be a scalar or an
        array-like; they are converted independently and combined with NumPy
        broadcasting, so a single point can be compared against many.

        Returns:
            The great circle distance(s). NaN inputs yield NaN.
        """
        if to_radians:
            # Convert each argument on its own instead of stacking all four
            # into one array: stacking requires equal lengths and fails when
            # scalars and arrays are mixed.
            lat1, lon1, lat2, lon2 = (
                np.radians(np.asarray(value, dtype=float))
                for value in (lat1, lon1, lat2, lon2)
            )

        a = np.sin((lat2-lat1)/2.0)**2 + \
            np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2

        # Floating-point rounding can push `a` marginally outside [0, 1]
        # (e.g. near-antipodal points), which would make arcsin return NaN.
        a = np.clip(a, 0.0, 1.0)

        return earth_radius * 2 * np.arcsin(np.sqrt(a))
    

    Now we can calculate distances between coordinates belonging to the same id (group):

    # Coordinates of the previous row within the same id. The first row of
    # every id has no predecessor, so it gets NaN and therefore a NaN distance.
    # shift() keeps the original row index, so the result lines up with df
    # even when the rows of one id are not contiguous or not sorted.
    prev = df.groupby('id')[['lat', 'lon']].shift()
    df['dist'] = haversine(df['lat'], df['lon'], prev['lat'], prev['lon'])
    

    Result:

    In [105]: df
    Out[105]:
        id        lat        lon       dist
    0    1  19.111841  72.910729        NaN
    1    1  19.111342  72.908387   0.252243
    2    2  19.111542  72.907387        NaN
    3    2  19.137815  72.914085   3.004976
    4    2  19.119677  72.905081   2.227658
    5    2  19.129677  72.905081   1.111949
    6    3  19.319677  72.905081        NaN
    7    3  19.120217  72.907121  22.179974
    8    4  19.420217  72.807121        NaN
    9    4  19.520217  73.307121  53.584504
    10   5  19.319677  72.905081        NaN
    11   5  19.419677  72.805081  15.286775
    12   5  19.629677  72.705081  25.594890
    13   5  19.111860  72.911347  61.509917
    14   5  19.111860  72.931346   2.101215
    15   5  19.219677  72.605081  36.304756
    16   6  19.319677  72.805082        NaN
    17   6  19.419677  72.905086  15.287063
    
    0 讨论(0)
  • 2020-12-18 14:59

    This should reproduce your sample input and output exactly.

    SCRIPT

    import csv
    from haversine import haversine
    
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends for files passed to csv.reader.
    with open('file.csv', newline='') as file:

        reader = csv.reader(file)
        next(reader, None)  # skip header; tolerate a completely empty file
        # (user_id, lat, lon) of the preceding row; None until one is read.
        previous_row = (None, None, None)
        for user_id, lat, lon in reader:

            # Column order follows the sample header: user_id, lat, lon.
            current_row = int(user_id), float(lat), float(lon)
            # The first row of each id has no predecessor to measure from.
            distance = float('nan')

            if current_row[0] == previous_row[0]:
                # Pass the two coordinate pairs in file column order.
                distance = haversine(previous_row[1:], current_row[1:])

            print('{} {:02.7f} {:02.7f} {:02.7f}'.format(*current_row, distance))
            previous_row = current_row
    

    OUTPUT

    1 19.1118410 72.9107290 nan
    1 19.1113420 72.9083870 0.2522433
    2 19.1115420 72.9073870 nan
    2 19.1378150 72.9140850 3.0049762
    2 19.1196770 72.9050810 2.2276576
    2 19.1296770 72.9050810 1.1119493
    3 19.3196770 72.9050810 nan
    3 19.1202170 72.9071210 22.1799743
    4 19.4202170 72.8071210 nan
    4 19.5202170 73.3071210 53.5845041
    5 19.3196770 72.9050810 nan
    5 19.4196770 72.8050810 15.2867753
    5 19.6296770 72.7050810 25.5948897
    5 19.1118600 72.9113470 61.5099175
    5 19.1118600 72.9313460 2.1012148
    5 19.2196770 72.6050810 36.3047557
    6 19.3196770 72.8050820 nan
    6 19.4196770 72.9050860 15.2870632
    
    0 讨论(0)
提交回复
热议问题