I am pretty new to Pandas and trying to find out where my code breaks. Say, I am doing a type conversion:
df['x'] = df['x'].astype('int')
I hit the same problem, and because my input file is large (3 million rows), checking every row one by one would take a long time. Instead, I wrote a binary search to locate an offending row.
import pandas as pd
import sys
def binarySearch(df, l, r, func):
    """Locate a position at which ``func`` reports a failure.

    ``func(df, start, end)`` is called on half-open position ranges and must
    return ``None`` when the range is clean, or a non-``None`` description of
    the failure otherwise. The search relies on ``func`` reporting a failure
    for a range whenever some single position inside that range fails.

    Args:
        df: Object handed to ``func`` unchanged.
        l: Lowest candidate position (inclusive).
        r: Highest candidate position (inclusive).
        func: Probe callable as described above.

    Returns:
        A ``(position, failure)`` tuple for one failing position (not
        necessarily the first one when several positions fail), or
        ``(-1, None)`` when no probe reported a failure. Both outcomes are
        2-tuples so callers can always unpack the result.
    """
    while l <= r:
        mid = l + (r - l) // 2

        # Probe the single position at mid first.
        failure = func(df, mid, mid + 1)
        if failure is not None:
            # Compare against None rather than truthiness so that a failure
            # with an empty description is still recognised.
            return mid, failure

        if func(df, l, mid) is None:
            # Nothing fails in [l, mid), and mid itself is clean: go right.
            l = mid + 1
        else:
            # Something fails in [l, mid): narrow to the left part.
            r = mid - 1

    # Every probe came back clean.
    return -1, None
def check(df, start, end):
    """Try converting the ``uid`` values of rows ``start:end`` to ``int``.

    Args:
        df: Frame whose ``uid`` attribute is probed.
        start: First row position of the slice (inclusive).
        end: Last row position of the slice (exclusive).

    Returns:
        ``None`` when the conversion succeeds, otherwise the text of the
        exception that was raised.
    """
    try:
        # The conversion result is discarded; only success or failure matters.
        # The slice and attribute lookup stay inside the try so that any
        # error they raise is reported the same way.
        uid_slice = df.iloc[start:end].uid
        uid_slice.astype(int)
    except Exception as err:
        # Deliberately broad: whatever went wrong is reported to the caller
        # as a message instead of propagating.
        return str(err)
    return None
# Load the CSV file named by the first command-line argument.
csv_path = sys.argv[1]
df = pd.read_csv(csv_path)

# Search row positions 0..len(df) using check as the probe, then report
# the position and message that the search produced.
search_outcome = binarySearch(df, 0, len(df), check)
index, result = search_outcome
print("index: {}".format(index))
print(result)