I have a dataFrame in pandas and several of the columns have all null values. Is there a built in function which will let me remove those columns?
df
'''
pets location owner id
0 cat San_Diego Champ 123.0
1 dog NaN Ron NaN
2 cat NaN Brick NaN
3 monkey NaN Champ NaN
4 monkey NaN Veronica NaN
5 dog NaN John NaN
'''
def rmissingvaluecol(dff,threshold):
l = []
l = list(dff.drop(dff.loc[:,list((100*(dff.isnull().sum()/len(dff.index))>=threshold))].columns, 1).columns.values)
print("# Columns having more than %s percent missing values:"%threshold,(dff.shape[1] - len(l)))
print("Columns:\n",list(set(list((dff.columns.values))) - set(l)))
return l
rmissingvaluecol(df,1) #Here threshold is 1% which means we are going to drop columns having more than 1% of missing values
#output
'''
# Columns having more than 1 percent missing values: 2
Columns:
['id', 'location']
'''
Now create new dataframe excluding these columns
l = rmissingvaluecol(df,1)
df1 = df[l]
PS: You can change threshold as per your requirement
You can find the percentage of missing values for each column (optional)
def missing(dff):
print (round((dff.isnull().sum() * 100/ len(dff)),2).sort_values(ascending=False))
missing(df)
#output
'''
id 83.33
location 83.33
owner 0.00
pets 0.00
dtype: float64
'''