I have two dataframes with identical column names and dtypes, similar to the following:
A object
B category
C category
>
To complement JohnE's answer, here's a function that does the job: for every column that is categorical in all of the input dataframes, it re-codes that column to the union of its categories (computed with `union_categoricals`) before concatenating:
def concatenate(dfs):
    """Concatenate dataframes while preserving categorical columns.

    Every column that has ``category`` dtype in *all* input frames is
    re-coded to the union of its categories, so each frame carries the same
    categorical dtype for that column before ``pd.concat`` is called.

    NB: the categories are changed in place on the input dataframes.

    Args:
        dfs: Iterable of ``pandas.DataFrame`` objects. Any iterable is
            accepted (list, tuple, generator); it is materialized once.

    Returns:
        The result of ``pd.concat`` on the (re-coded) input frames.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``dfs`` is empty.
    """
    from pandas.api.types import union_categoricals
    import pandas as pd

    # The frames are traversed several times below; a generator or other
    # one-shot iterable would be exhausted after the first pass.
    dfs = list(dfs)

    # With no frames there is nothing to align (and set.intersection() needs
    # at least one argument); pd.concat reports the empty input itself.
    if dfs:
        # Categorical columns common to all frames.
        shared_categoricals = set.intersection(
            *(
                set(df.select_dtypes(include='category').columns)
                for df in dfs
            )
        )
        for col in shared_categoricals:
            # Union of the categories across frames for this column.
            uc = union_categoricals([df[col] for df in dfs])
            # Re-code every frame's column against the union categories.
            for df in dfs:
                df[col] = pd.Categorical(
                    df[col].values, categories=uc.categories
                )

    return pd.concat(dfs)
Note that the categories of the dataframes in the input list are changed in place:
# Usage example for concatenate(). pandas is imported here because the
# function's own imports are local to its body and not visible at this level.
import pandas as pd

df1 = pd.DataFrame({
    'a': [1, 2],
    'x': pd.Categorical(['dog', 'cat']),
    'y': pd.Categorical(['banana', 'bread']),
})
# df2 has no 'a' column; its 'x' and 'y' use categories that df1 does not have.
df2 = pd.DataFrame({
    'x': pd.Categorical(['rat']),
    'y': pd.Categorical(['apple']),
})
# Inspect the dtypes of the concatenated result.
concatenate([df1, df2]).dtypes