问题
For df:
id Date ITEM_ID TYPE GROUP
0 13710750 2019-07-01 SLM607 O X
1 13710760 2019-07-01 SLM607 O M
2 13710770 2019-07-03 SLM607 O I
3 13710780 2019-09-03 SLM607 O N
4 13667449 2019-08-02 887643 O I
5 13667450 2019-08-02 792184 O I
6 13728171 2019-09-17 SLM607 I I
7 13667452 2019-08-02 794580 O I
... ... ... ... ... ... ... ... ... ...
with reproducible example:
import pandas as pd

# Reproducible sample: one record per row. Rows are later compared with
# earlier rows that share the same ITEM_ID (and, in the update, GROUP).
data = {
    'id': [13710750, 13710760, 13710770, 13710780,
           13667449, 13667450, 13728171, 13667452],
    'Date': ['2019-07-01', '2019-07-01', '2019-07-03', '2019-09-03',
             '2019-08-02', '2019-08-02', '2019-09-17', '2019-08-02'],
    'ITEM_ID': ['SLM607', 'SLM607', 'SLM607', 'SLM607',
                '887643', '792184', 'SLM607', '794580'],
    'TYPE': ['O', 'O', 'O', 'O', 'O', 'O', 'I', 'O'],
    'GROUP': ['X', 'M', 'I', 'N', 'I', 'I', 'I', 'I'],
}
# Default RangeIndex (0..7), matching the row labels in the printed table.
df = pd.DataFrame(data)
df
For each row, I want to compare its ITEM_ID with all prior rows and check whether any earlier row satisfies both conditions below:
(1) has same ITEM_ID as current row; and
(2) the current row has TYPE == 'I' and the previous row found in (1) has TYPE == 'O'; if so, delete both rows.
What is an efficient way to do this?
Tried:
# Attempt: for each row x, flag whether its ITEM_ID also appears in an earlier row.
# (`test` is the working DataFrame here; assumes the default 0..n-1 integer index.)
# NOTE: .loc slices by label and includes BOTH endpoints, so
# test.loc[0:x, 'ITEM_ID'] contains row x itself; the membership test is
# therefore True for every row.
l = [test.loc[x,'ITEM_ID'] in test.loc[0:x,'ITEM_ID'].tolist() for x in np.arange(0,len(test))]
test['New']=l
test
to filter rows with the same 'ITEM_ID', but ended up with True for every single row. I am not sure what went wrong or whether this is the correct approach.
Update:
@Reza's solution
# Within each ITEM_ID group: True where a row has TYPE 'I' and the row
# immediately before it in the same group has TYPE 'O'.
df.groupby('ITEM_ID')['TYPE'].apply(lambda x: (x == 'I') & (x.shift() == 'O'))
seemed to work. I would like to add an additional condition, i.e., the same GROUP.
Tried:
# Attempt: same TYPE test as before, plus "GROUP equals the previous row's GROUP",
# used directly as a boolean mask for df.loc.
# NOTE(review): the traceback fails inside MultiIndex.reindex, so the applied
# result appears to be indexed by (ITEM_ID, original row label) rather than by
# df's own index, and .loc cannot align it as a mask.
df.loc[df.groupby('ITEM_ID').apply(lambda x: (x['TYPE'] == 'I') & (x['TYPE'].shift() == 'O') & (x['GROUP'] == x['GROUP'].shift()))]
but caught error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-177-f68c5ffc70cc> in <module>
----> 1 df.loc[df.groupby('ITEM_ID').apply(lambda x: (x['TYPE'] == 'I') & (x['TYPE'].shift() == 'O') & (x['GROUP'] == x['GROUP'].shift()))]
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1765
1766 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767 return self._getitem_axis(maybe_callable, axis=axis)
1768
1769 def _is_scalar_access(self, key: Tuple):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 return self._get_slice_axis(key, axis=axis)
1912 elif com.is_bool_indexer(key):
-> 1913 return self._getbool_axis(key, axis=axis)
1914 elif is_list_like_indexer(key):
1915
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getbool_axis(self, key, axis)
1779 # caller is responsible for ensuring non-None axis
1780 labels = self.obj._get_axis(axis)
-> 1781 key = check_bool_indexer(labels, key)
1782 inds = key.nonzero()[0]
1783 return self.obj._take_with_is_copy(inds, axis=axis)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in check_bool_indexer(index, key)
2311 result = key
2312 if isinstance(key, ABCSeries) and not key.index.equals(index):
-> 2313 result = result.reindex(index)
2314 mask = isna(result._values)
2315 if mask.any():
~\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
4028 @Appender(generic.NDFrame.reindex.__doc__)
4029 def reindex(self, index=None, **kwargs):
-> 4030 return super().reindex(index=index, **kwargs)
4031
4032 def drop(
~\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
4542 # perform the reindex on the axes
4543 return self._reindex_axes(
-> 4544 axes, level, limit, tolerance, method, fill_value, copy
4545 ).__finalize__(self)
4546
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4557 ax = self._get_axis(a)
4558 new_index, indexer = ax.reindex(
-> 4559 labels, level=level, limit=limit, tolerance=tolerance, method=method
4560 )
4561
~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in reindex(self, target, method, level, limit, tolerance)
2423 else:
2424 # hopefully?
-> 2425 target = MultiIndex.from_tuples(target)
2426
2427 if (
~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in from_tuples(cls, tuples, sortorder, names)
487 tuples = tuples._values
488
--> 489 arrays = list(lib.tuples_to_object_array(tuples).T)
490 elif isinstance(tuples, list):
491 arrays = list(lib.to_object_array_tuples(tuples).T)
pandas\_libs\lib.pyx in pandas._libs.lib.tuples_to_object_array()
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long long'
回答1:
Try the following, which returns a boolean Series:
# Within each ITEM_ID group: True where a row has TYPE 'I' and the row
# immediately before it in the same group has TYPE 'O'.
# NOTE(review): on pandas >= 2.0, groupby(...).apply may prepend the group key
# to the result index; groupby('ITEM_ID', group_keys=False) should keep the
# original row index as printed below. Confirm for your pandas version.
df.groupby('ITEM_ID')['TYPE'].apply(lambda x: (x == 'I') & (x.shift() == 'O'))
0 False
1 False
2 False
3 True
4 False
Name: TYPE, dtype: bool
For the second case, you can use:
def myfilter(x):
    """Build the per-group row mask.

    ``x`` is the sub-frame for one ITEM_ID. A row is flagged True when its
    TYPE is 'I', the row directly above it in the group has TYPE 'O', and
    both rows carry the same GROUP value.
    """
    cur_is_i = x['TYPE'] == 'I'
    prev_is_o = x['TYPE'].shift() == 'O'
    same_group = x['GROUP'] == x['GROUP'].shift()
    return cur_is_i & prev_is_o & same_group


# apply() yields a result keyed by (ITEM_ID, original row label); dropping the
# ITEM_ID level and sorting restores a mask ordered like df's rows.
(
    df.groupby('ITEM_ID')
    .apply(myfilter)
    .reset_index('ITEM_ID', drop=True)
    .sort_index()
)
来源:https://stackoverflow.com/questions/63788778/compare-current-row-with-all-previous-rows