I have some rather large pandas DataFrames and I'd like to use the new bulk SQL mappings to upload them to a Microsoft SQL Server via SQLAlchemy. The pandas.to_sql method, while convenient, is slow.
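For context, a minimal sketch of the slow baseline (the connection string, table name, and sample frame below are placeholders, not from the question):

import pandas as pd
from sqlalchemy import create_engine

# Placeholder ODBC connection string; substitute your own
engine = create_engine("mssql+pyodbc:///?odbc_connect=...")
df = pd.DataFrame({"a": range(1000), "b": range(1000)})

# With the default settings this issues roughly one round trip per row,
# which is what makes it slow for large frames.
df.to_sql("my_table", engine, if_exists="append", index=False)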
As this is an I/O-heavy workload, you can also use Python's threading module via multiprocessing.dummy. This sped things up for me:
import math
from multiprocessing.dummy import Pool as ThreadPool

...

def insert_df(df, *args, **kwargs):
    nworkers = 4

    # Split the frame into equal-sized chunks plus a remainder chunk
    chunksize = math.floor(df.shape[0] / nworkers)
    chunks = [(chunksize * i, (chunksize * i) + chunksize) for i in range(nworkers)]
    chunks.append((chunksize * nworkers, df.shape[0]))

    pool = ThreadPool(nworkers)

    def worker(chunk):
        i, j = chunk
        df.iloc[i:j, :].to_sql(*args, **kwargs)

    pool.map(worker, chunks)
    pool.close()
    pool.join()

....

insert_df(df, "foo_bar", engine, if_exists='append')
Based on @ansonw's answer (note that copy_from is a psycopg2 cursor method, so this is PostgreSQL-specific):
import io

def to_sql(engine, df, table, if_exists='fail', sep='\t', encoding='utf8'):
    # Create the table from an empty slice of the frame
    df[:0].to_sql(table, engine, if_exists=if_exists)

    # Serialize the data to an in-memory CSV buffer
    output = io.StringIO()
    df.to_csv(output, sep=sep, header=False, encoding=encoding)
    output.seek(0)

    # Bulk-load the buffer with COPY (psycopg2 / PostgreSQL)
    connection = engine.raw_connection()
    cursor = connection.cursor()
    cursor.copy_from(output, table, sep=sep, null='')
    connection.commit()
    cursor.close()
With this I insert 200,000 rows in 5 seconds instead of 4 minutes.
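A minimal usage sketch of the function above, assuming a PostgreSQL engine (since copy_from does not work against SQL Server) and a hypothetical table name:

from sqlalchemy import create_engine

engine = create_engine('postgresql+psycopg2://user:password@localhost/mydb')  # placeholder credentials
to_sql(engine, df, 'my_table', if_exists='replace')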
Install the Microsoft ODBC Driver 17 for SQL Server first (instructions and download below), then connect through pyodbc:
https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server?view=sql-server-2017
https://www.microsoft.com/en-us/download/details.aspx?id=56567
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine

server = '*****'
database = '********'
username = '**********'
password = '*********'

params = urllib.parse.quote_plus(
    'DRIVER={ODBC Driver 17 for SQL Server};' +
    'SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password)

engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)

# Checking the connection (uses a private pandas helper)
connected = pd.io.sql._is_sqlalchemy_connectable(engine)
print(connected)  # True if the connection was established successfully

df.to_sql('Table_Name', con=engine, if_exists='append', index=False)
"""
if_exists: {'fail', 'replace', 'append'}, default 'fail'
fail: If table exists, do nothing.
replace: If table exists, drop it, recreate it, and insert data.
append: If table exists, insert data. Create the table if it does not exist.
"""
If there are many records to insert, cap the chunksize so each statement stays within SQL Server's parameter limit:
# limit based on sp_prepexec parameter count (~2100 parameters per statement)
tsql_chunksize = 2097 // len(df.columns)

# cap at 1000 (limit on the number of rows inserted by a table value constructor)
tsql_chunksize = 1000 if tsql_chunksize > 1000 else tsql_chunksize
print(tsql_chunksize)

df.to_sql('table_name', con=engine, if_exists='append', index=False, chunksize=tsql_chunksize)
PS: You can adjust the parameters to suit your requirements.
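As a worked example of the same arithmetic (the helper below is hypothetical, not part of the answer above): a DataFrame with 30 columns gives 2097 // 30 = 69 rows per chunk, comfortably under the 1000-row cap.

def safe_chunksize(df, param_limit=2097, row_cap=1000):
    # Each row consumes one parameter per column, so keep
    # columns * chunksize under ~2100 and rows per statement under 1000.
    return min(param_limit // len(df.columns), row_cap)

df.to_sql('table_name', con=engine, if_exists='append', index=False,
          chunksize=safe_chunksize(df))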
Pandas 0.25.1 has a parameter for multi-row inserts, so it's no longer necessary to work around this issue with SQLAlchemy: set method='multi' when calling pandas.DataFrame.to_sql.
In this example, it would be
df.to_sql(table, schema=schema, con=e, index=False, if_exists='replace', method='multi')
Answer sourced from the pandas to_sql docs.
Worth noting that I've only tested this with Redshift. Please let me know how it goes on other databases so I can update this answer.
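On SQL Server, method='multi' is still bound by the ~2100-parameter limit discussed above, so it is usually combined with a chunksize; a sketch under that assumption:

# columns * chunksize must stay under ~2100 parameters per statement
chunk = min(2097 // len(df.columns), 1000)
df.to_sql('table_name', con=engine, index=False, if_exists='append',
          method='multi', chunksize=chunk)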