I\'ve created a Pandas DataFrame
df = DataFrame(index=[\'A\',\'B\',\'C\'], columns=[\'x\',\'y\'])
and got this
x y
A NaN
In addition to the answers above, here is a benchmark comparing different ways to add rows of data to an already existing dataframe. It shows that using at or set-value is the most efficient way for large dataframes (at least for these test conditions).
For the test, an existing dataframe comprising 100,000 rows and 1,000 columns and random numpy values was used. To this dataframe, 100 new rows were added.
Code see below:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 21 16:38:46 2018
@author: gebbissimo
"""
import pandas as pd
import numpy as np
import time
NUM_ROWS = 100000
NUM_COLS = 1000
data = np.random.rand(NUM_ROWS,NUM_COLS)
df = pd.DataFrame(data)
NUM_ROWS_NEW = 100
data_tot = np.random.rand(NUM_ROWS + NUM_ROWS_NEW,NUM_COLS)
df_tot = pd.DataFrame(data_tot)
DATA_NEW = np.random.rand(1,NUM_COLS)
#%% FUNCTIONS
# create and append
def create_and_append(df):
for i in range(NUM_ROWS_NEW):
df_new = pd.DataFrame(DATA_NEW)
df = df.append(df_new)
return df
# create and concatenate
def create_and_concat(df):
for i in range(NUM_ROWS_NEW):
df_new = pd.DataFrame(DATA_NEW)
df = pd.concat((df, df_new))
return df
# store as dict and
def store_as_list(df):
lst = [[] for i in range(NUM_ROWS_NEW)]
for i in range(NUM_ROWS_NEW):
for j in range(NUM_COLS):
lst[i].append(DATA_NEW[0,j])
df_new = pd.DataFrame(lst)
df_tot = df.append(df_new)
return df_tot
# store as dict and
def store_as_dict(df):
dct = {}
for j in range(NUM_COLS):
dct[j] = []
for i in range(NUM_ROWS_NEW):
dct[j].append(DATA_NEW[0,j])
df_new = pd.DataFrame(dct)
df_tot = df.append(df_new)
return df_tot
# preallocate and fill using .at
def fill_using_at(df):
for i in range(NUM_ROWS_NEW):
for j in range(NUM_COLS):
#print("i,j={},{}".format(i,j))
df.at[NUM_ROWS+i,j] = DATA_NEW[0,j]
return df
# preallocate and fill using .at
def fill_using_set(df):
for i in range(NUM_ROWS_NEW):
for j in range(NUM_COLS):
#print("i,j={},{}".format(i,j))
df.set_value(NUM_ROWS+i,j,DATA_NEW[0,j])
return df
#%% TESTS
t0 = time.time()
create_and_append(df)
t1 = time.time()
print('Needed {} seconds'.format(t1-t0))
t0 = time.time()
create_and_concat(df)
t1 = time.time()
print('Needed {} seconds'.format(t1-t0))
t0 = time.time()
store_as_list(df)
t1 = time.time()
print('Needed {} seconds'.format(t1-t0))
t0 = time.time()
store_as_dict(df)
t1 = time.time()
print('Needed {} seconds'.format(t1-t0))
t0 = time.time()
fill_using_at(df_tot)
t1 = time.time()
print('Needed {} seconds'.format(t1-t0))
t0 = time.time()
fill_using_set(df_tot)
t1 = time.time()
print('Needed {} seconds'.format(t1-t0))