问题
I have two dataframes DF(~100k rows)which is a raw data file and DF1(15k rows), mapping file. I'm trying to match the DF.address and DF.Name columns to DF1.Address and DF1.Name. Once the match is found DF1.ID should be populated in DF.ID(if DF1.ID is not None) else DF1.top_ID should be populated in DF.ID.
I'm able to match the address and name with the help of fuzzy logic but i'm stuck how to connect the result obtained to populate the ID.
DF1-Mapping file
DF Raw Data file
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from operator import itemgetter
df=pd.read_excel("Test1", index=False)
df1=pd.read_excel("Test2", index=False)
df=df[df['ID'].isnull()]
zip_code=df['Zip'].tolist()
Facility_city=df['City'].tolist()
Address=df['Address'].tolist()
Name_list=df['Name'].tolist()
def fuzzy_match(x, choice, scorer, cutoff):
return (process.extractOne(x,
choices=choice,
scorer=scorer,
score_cutoff=cutoff))
for pin,city,Add,Name in zip(zip_code,Facility_city,Address,Name_list):
#====Address Matching=====#
choice=df1.loc[(df1['Zip']==pin) &(df1['City']==city),'Address1']
result=fuzzy_match(Add,choice,fuzz.ratio,70)
#====Name Matching========#
if (result is not None):
if (result[3]>70):
choice_1=(df1.loc[(df1['Zip']==pin) &(df1['City']==city),'Name'])
result_1=(fuzzy_match(Name,choice_1,fuzz.ratio,95))
print(ID)
if (result_1 is not None):
if(result_1[3]>95):
#Here populating the matching ID
print("ok")
else:
continue
else:
continue
else:
continue
else:
回答1:
IIUC: Here is a solution:
from fuzzywuzzy import fuzz
import pandas as pd
#Read raw data from clipboard
raw = pd.read_clipboard()
#Read map data from clipboard
mp = pd.read_clipboard()
#Merge raw data and mp data as following
dfr = mp.merge(raw, on=['Hospital Name', 'City', 'Pincode'], how='outer')
#dfr will have many duplicate rows - eliminate duplicate
#To eliminate duplicate using toke_sort_ratio, compare address x and y
dfr['SCORE'] = dfr.apply(lambda x: fuzz.token_sort_ratio(x['Address_x'], x['Address_y']), axis=1)
#Filter only max ratio rows grouped by Address_x
dfr1 = dfr.iloc[dfr.groupby('Address_x').apply(lambda x: x['SCORE'].idxmax())]
#dfr1 shall have the desired result
This link has sample data to test the solution provided.
来源:https://stackoverflow.com/questions/49507193/fuzzy-logic-for-excel-data-pandas