Question:
I have a large dataframe in pandas that, apart from the column used as the index, is supposed to have only numeric values:
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 'bad', 5],
                   'b': [0.1, 0.2, 0.3, 0.4, 0.5],
                   'item': ['a', 'b', 'c', 'd', 'e']})
df = df.set_index('item')
How can I find the row of the dataframe df that has a non-numeric value in it? In this example it is the fourth row, which has the string 'bad' in the a column. How can this row be found programmatically?
Answer 1:
You could use np.isreal to check the type of each element (applymap applies a function to each element in the DataFrame):
In [11]: df.applymap(np.isreal)
Out[11]:
          a     b
item
a      True  True
b      True  True
c      True  True
d     False  True
e      True  True
If all of the values in a row are True, then they are all numeric:
In [12]: df.applymap(np.isreal).all(1)
Out[12]:
item
a     True
b     True
c     True
d    False
e     True
dtype: bool
So to get the sub-DataFrame of rogues (note: the negation, ~, of the above finds the rows which have at least one rogue non-numeric value):
In [13]: df[~df.applymap(np.isreal).all(1)]
Out[13]:
        a    b
item
d     bad  0.4
You could also find the location of the first offender using argmin:
In [14]: np.argmin(df.applymap(np.isreal).all(1))
Out[14]: 'd'
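Note: in more recent pandas versions, np.argmin on a Series returns an integer position rather than the index label; the label-based equivalent is idxmin (a small sketch, not part of the original answer):

df.applymap(np.isreal).all(1).idxmin()  # returns the label 'd' (False sorts before True)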
As @CTZhu points out, it may be slightly faster to check whether it's an instance of either int or float (there is some additional overhead with np.isreal):
df.applymap(lambda x: isinstance(x, (int, float)))
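Combining that check with the same boolean filtering as in In [13], the rogue rows come out the same way (a minimal sketch):

df[~df.applymap(lambda x: isinstance(x, (int, float))).all(1)]  # selects row 'd'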
Answer 2:
Sorry about the confusion, this should be the correct approach. Do you want to capture only 'bad', not things like 'good', or just any non-numerical values?
In [15]: np.where(np.any(np.isnan(df.convert_objects(convert_numeric=True)), axis=1))
Out[15]: (array([3]),)
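Note: convert_objects was deprecated in pandas 0.17 and removed in later versions; on modern pandas the same positional lookup can be written with pd.to_numeric (a sketch, assuming the same df with 'item' as its index):

import numpy as np

# Coerce every column to numeric ('bad' becomes NaN), then find the
# positions of the rows containing at least one NaN
np.where(df.apply(pd.to_numeric, errors='coerce').isnull().any(axis=1))
# (array([3]),)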
Answer 3:
There are already some great answers to this question, but here is a nice snippet that I use regularly to drop rows that have non-numeric values in some columns:
# Eliminate invalid data from dataframe (see Example below for more context)
numdf = (df.drop(data_columns, axis=1)
           .join(df[data_columns].apply(pd.to_numeric, errors='coerce')))
numdf = numdf[numdf[data_columns].notnull().all(axis=1)]
The way this works is that we first drop all the data_columns from the df, and then use a join to put them back in after passing them through pd.to_numeric (with errors='coerce', such that all non-numeric entries are converted to NaN). The result is saved to numdf.
On the second line we use a filter that keeps only the rows where all the values in data_columns are not null.
Note that pd.to_numeric coerces to NaN everything that cannot be converted to a numeric value, so strings that represent numeric values will not be removed. For example, '1.25' will be recognized as the numeric value 1.25.
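A quick illustration of that coercion behaviour (a minimal sketch, not part of the original snippet):

pd.to_numeric(pd.Series(['1.25', 'bad']), errors='coerce')
# 0    1.25
# 1     NaN
# dtype: float64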
Disclaimer: pd.to_numeric was introduced in pandas version 0.17.0.
Example:
In [1]: import pandas as pd

In [2]: df = pd.DataFrame({"item": ["a", "b", "c", "d", "e"],
   ...:                    "a": [1, 2, 3, "bad", 5],
   ...:                    "b": [0.1, 0.2, 0.3, 0.4, 0.5]})

In [3]: df
Out[3]:
     a    b item
0    1  0.1    a
1    2  0.2    b
2    3  0.3    c
3  bad  0.4    d
4    5  0.5    e

In [4]: data_columns = ['a', 'b']

In [5]: num_df = (df
   ...:           .drop(data_columns, axis=1)
   ...:           .join(df[data_columns].apply(pd.to_numeric, errors='coerce')))

In [6]: num_df
Out[6]:
  item    a    b
0    a    1  0.1
1    b    2  0.2
2    c    3  0.3
3    d  NaN  0.4
4    e    5  0.5

In [7]: num_df[num_df[data_columns].notnull().all(axis=1)]
Out[7]:
  item    a    b
0    a    1  0.1
1    b    2  0.2
2    c    3  0.3
4    e    5  0.5
Answer 4:
In case you are working with a column of string values, you can use the very useful function Series.str.isnumeric(), like:
a = pd.Series(['hi', 'hola', '2.31', '288', '312', '1312', '0,21', '0.23'])
What I do is copy that column to a new column, apply str.replace('.', '') and str.replace(',', ''), and then select the numeric values:
a = a.str.replace('.', '', regex=False)  # literal replace: '.' is a regex metacharacter otherwise
a = a.str.replace(',', '', regex=False)
a.str.isnumeric()
Out[15]:
0    False
1    False
2     True
3     True
4     True
5     True
6     True
7     True
dtype: bool
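To then select the numeric-looking values without overwriting the original series, the same idea can be chained into a boolean mask (a minimal sketch):

s = pd.Series(['hi', 'hola', '2.31', '288', '312', '1312', '0,21', '0.23'])
mask = (s.str.replace('.', '', regex=False)
         .str.replace(',', '', regex=False)
         .str.isnumeric())
s[mask]  # keeps '2.31', '288', '312', '1312', '0,21', '0.23'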
Good luck all!