I want to merge two dataframes on three columns: email, subject and timestamp. The timestamps differ between the dataframes, so for each row I need to identify the closest matching timestamp.
You want to apply the closest-timestamp logic separately within each ('email', 'subject') group.
# Sample "left" table: one row per (timestamp, email, subject).
a = """timestamp,email,subject
2016-07-01 10:17:00,a@gmail.com,subject3
2016-07-01 02:01:02,a@gmail.com,welcome
2016-07-01 14:45:04,a@gmail.com,subject3
2016-07-01 08:14:02,a@gmail.com,subject2
2016-07-01 16:26:35,a@gmail.com,subject4
2016-07-01 10:17:00,b@gmail.com,subject3
2016-07-01 02:01:02,b@gmail.com,welcome
2016-07-01 14:45:04,b@gmail.com,subject3
2016-07-01 08:14:02,b@gmail.com,subject2
2016-07-01 16:26:35,b@gmail.com,subject4
"""
# Sample "right" table: same keys with slightly different timestamps, plus
# the payload columns `clicks` and `var1`.
b = """timestamp,email,subject,clicks,var1
2016-07-01 02:01:14,a@gmail.com,welcome,1,1
2016-07-01 08:15:48,a@gmail.com,subject2,2,2
2016-07-01 10:17:39,a@gmail.com,subject3,1,7
2016-07-01 14:46:01,a@gmail.com,subject3,1,2
2016-07-01 16:27:28,a@gmail.com,subject4,1,2
2016-07-01 10:17:05,b@gmail.com,subject3,0,0
2016-07-01 02:01:03,b@gmail.com,welcome,0,0
2016-07-01 14:45:05,b@gmail.com,subject3,0,0
2016-07-01 08:16:00,b@gmail.com,subject2,0,0
2016-07-01 17:00:00,b@gmail.com,subject4,0,0
"""
# Parse both CSV snippets, turning the `timestamp` column into datetimes.
df1 = pd.read_csv(StringIO(a), parse_dates=["timestamp"])
# df2 is keyed by (email, subject) so the rows for one pair can be looked up
# with `.loc`.
df2 = pd.read_csv(StringIO(b), parse_dates=["timestamp"]).set_index(
    ["email", "subject"]
)
def find_closest_date(timepoint, time_series, add_time_delta_column=True):
    """Return the entry of ``time_series`` that lies closest to ``timepoint``.

    Parameters
    ----------
    timepoint
        The reference moment; anything ``pd.Timestamp`` accepts.
    time_series
        Candidate dates: a ``pd.Series``, another array-like of dates, or a
        single timestamp.
    add_time_delta_column : bool, default True
        When true, the result also carries the absolute distance between
        ``timepoint`` and the chosen candidate.

    Returns
    -------
    pd.Series
        Indexed by ``"closest_date"`` and, optionally, ``"closest_delta"``.
        ``closest_delta`` is a timedelta, not a number of days. Both entries
        are ``NaT`` when there is no usable (non-``NaT``) candidate or when
        ``timepoint`` itself is ``NaT``.
    """
    # pd.Series() wraps a lone timestamp into a one-element series, so a
    # caller whose lookup yielded a scalar is handled like any other input.
    candidates = pd.to_datetime(pd.Series(time_series)).values
    # Timestamp.to_datetime64() keeps nanoseconds, whereas np.datetime64()
    # treats a Timestamp as a plain datetime and stops at microseconds.
    target = pd.Timestamp(timepoint).to_datetime64()
    deltas = np.abs(candidates - target)

    # Leave NaT distances out of the search so a missing candidate can never
    # be reported as the closest one.
    usable = np.flatnonzero(~np.isnat(deltas))
    if usable.size == 0:
        closest_date = closest_delta = pd.NaT
    else:
        idx_closest_date = usable[np.argmin(deltas[usable])]
        closest_date = candidates[idx_closest_date]
        closest_delta = deltas[idx_closest_date]

    res = {"closest_date": closest_date}
    idx = ["closest_date"]
    if add_time_delta_column:
        res["closest_delta"] = closest_delta
        idx.append("closest_delta")
    return pd.Series(res, index=idx)
# Group df1 so that every (email, subject) pair is only compared against the
# df2 rows that belong to the same pair.
grouped = df1.groupby(['email', 'subject'])

# For every df1 row, find the closest df2 timestamp within its group. The
# per-group results are collected in a list and concatenated once, instead
# of growing a DataFrame inside the loop.
closest_parts = []
for name, group in grouped:
    try:
        # A list-style key keeps the lookup result a Series even when the
        # pair occurs only once in df2.
        candidates = df2.loc[[name], 'timestamp']
    except KeyError:
        # The pair does not exist in df2: its rows get no closest_date and
        # therefore drop out of the inner merge below.
        continue
    closest_parts.append(
        group['timestamp'].apply(find_closest_date, time_series=candidates)
    )

if closest_parts:
    join_ts = pd.concat(closest_parts, axis=0)
else:
    # Nothing matched: keep the expected columns (with merge-compatible
    # dtypes) so the merge below yields an empty frame instead of failing
    # on a missing `closest_date` column.
    join_ts = pd.DataFrame({
        'closest_date': pd.Series(dtype=df2['timestamp'].dtype),
        'closest_delta': pd.Series(dtype='timedelta64[ns]'),
    })

# Join on email and subject as well as on the matched timestamp. Merging on
# the timestamp alone would cross-match rows of different (email, subject)
# pairs whenever two pairs share a timestamp in df2.
df3 = pd.merge(
    pd.concat([df1, join_ts], axis=1),
    df2.reset_index(),
    left_on=['email', 'subject', 'closest_date'],
    right_on=['email', 'subject', 'timestamp'],
)