Create multiple columns in Pandas Dataframe from one function

这一生的挚爱 提交于 2019-12-03 17:31:09

问题


I'm a python newbie, so I hope my two questions are clear and complete. I posted the actual code and a test data set in csv format below.

I've been able to construct the following code (mostly with the help from the StackOverflow contributors) to calculate the Implied Volatility of an option contract using Newton-Raphson method. The process calculates Vega when determining the Implied Volatility. Although I'm able to create a new DataFrame column for Implied Volatility using the Pandas DataFrame apply method, I'm unable to create a second column for Vega. Is there a way create two separate DataFrame columns when the function to returns IV & Vega together?

I tried:

  • return iv, vega from function
  • df[['myIV', 'Vega']] = df.apply(newtonRap, axis=1)
  • Got ValueError: Shape of passed values is (56, 2), indices imply (56, 13)

Also tried:

  • return iv, vega from function
  • df['myIV'], df['Vega'] = df.apply(newtonRap, axis=1)
  • Got ValueError: Shape of passed values is (56, 2), indices imply (56, 13)

Additionally, the calculation process is slow. I imported numba and implemented the @jit(nogil=True) decorator, but I only see a performance improvement of 25%. The test data set is the performance test has almost 900,000 records. The run time is 2 hours and 9 minutes without numba or with numba, but witout nogil=True. The run time when using numba and @jit(nogil=True) is 1 hour and 32 minutes. Can I do better?

from datetime import datetime
from math import sqrt, pi, log, exp, isnan
from scipy.stats import norm
from numba import jit


# dff = Daily Fed Funds (Posted rate is usually one day behind)
dff = pd.read_csv('https://research.stlouisfed.org/fred2/data/DFF.csv', parse_dates=[0], index_col='DATE')
rf = float('%.4f' % (dff['VALUE'][-1:][0] / 100))
# rf = .0015                        # Get Fed Funds Rate https://research.stlouisfed.org/fred2/data/DFF.csv
tradingMinutesDay = 450             # 7.5 hours per day * 60 minutes per hour
tradingMinutesAnnum = 113400        # trading minutes per day * 252 trading days per year
cal = USFederalHolidayCalendar()    # Load US Federal holiday calendar


@jit(nogil=True)                                # nogil=True arg improves performance by 25%
def newtonRap(row):
    """Estimate Implied Volatility (IV) using Newton-Raphson method

    :param row (dataframe):  Options contract params for function
        TimeStamp (datetime): Close date
        Expiry (datetime): Option contract expiration date
        Strike (float): Option strike
        OptType (object): 'C' for call; 'P' for put
        RootPrice (float): Underlying close price
        Bid (float): Option contact closing bid
        Ask (float): Option contact closing ask

    :return:
        float: Estimated implied volatility
    """
    if row['Bid'] == 0.0 or row['Ask'] == 0.0 or row['RootPrice'] == 0.0 or row['Strike'] == 0.0 or \
       row['TimeStamp'] == row['Expiry']:
        iv, vega = 0.0, 0.0         # Set iv and vega to zero if option contract is invalid or expired
    else:
        # dte (Days to expiration) uses pandas bdate_range method to determine the number of business days to expiration
        #   minus USFederalHolidays minus constant of 1 for the TimeStamp date
        dte = float(len(pd.bdate_range(row['TimeStamp'], row['Expiry'])) -
                    len(cal.holidays(row['TimeStamp'], row['Expiry']).to_pydatetime()) - 1)
        mark = (row['Bid'] + row['Ask']) / 2
        cp = 1 if row['OptType'] == 'C' else -1
        S = row['RootPrice']
        K = row['Strike']
        # T = the number of trading minutes to expiration divided by the number of trading minutes in year
        T = (dte * tradingMinutesDay) / tradingMinutesAnnum
        # TODO get dividend value
        d = 0.00
        iv = sqrt(2 * pi / T) * mark / S        # Closed form estimate of IV Brenner and Subrahmanyam (1988)
        vega = 0.0
        for i in range(1, 100):
            d1 = (log(S / K) + T * (rf - d + iv ** 2 / 2)) / (iv * sqrt(T))
            d2 = d1 - iv * sqrt(T)
            vega = S * norm.pdf(d1) * sqrt(T)
            model = cp * S * norm.cdf(cp * d1) - cp * K * exp(-rf * T) * norm.cdf(cp * d2)
            iv -= (model - mark) / vega
            if abs(model - mark) < 1.0e-9:
                break
        if isnan(iv) or isnan(vega):
            iv, vega = 0.0, 0.0
    # TODO Return vega with iv if add'l pandas column possible
    # return iv, vega
    return iv


if __name__ == "__main__":
    # test function from baseline data
    get_csv = True

    if get_csv:
        csvHeaderList = ['TimeStamp', 'OpraSymbol', 'RootSymbol', 'Expiry', 'Strike', 'OptType', 'RootPrice', 'Last',
                         'Bid', 'Ask', 'Volume', 'OpenInt', 'IV']
        fileName = 'C:/tmp/test-20150930-56records.csv'
        df = pd.read_csv(fileName, parse_dates=[0, 3], names=csvHeaderList)
    else:
        pass

    start = datetime.now()
    # TODO Create add'l pandas dataframe column, if possible, for vega
    # df[['myIV', 'Vega']] = df.apply(newtonRap, axis=1)
    # df['myIV'], df['Vega'] = df.apply(newtonRap, axis=1)
    df['myIV'] = df.apply(newtonRap, axis=1)
    end = datetime.now()
    print end - start

Test Data: C:/tmp/test-20150930-56records.csv

2015-09-30 16:00:00,AAPL151016C00109000,AAPL,2015-10-16 16:00:00,109,C,109.95,3.46,3.6,3.7,1565,1290,0.3497 2015-09-30 16:00:00,AAPL151016P00109000,AAPL,2015-10-16 16:00:00,109,P,109.95,2.4,2.34,2.42,3790,3087,0.3146 2015-09-30 16:00:00,AAPL151016C00110000,AAPL,2015-10-16 16:00:00,110,C,109.95,3,2.86,3,10217,28850,0.3288 2015-09-30 16:00:00,AAPL151016P00110000,AAPL,2015-10-16 16:00:00,110,P,109.95,2.81,2.74,2.8,12113,44427,0.3029 2015-09-30 16:00:00,AAPL151016C00111000,AAPL,2015-10-16 16:00:00,111,C,109.95,2.35,2.44,2.45,6674,2318,0.3187 2015-09-30 16:00:00,AAPL151016P00111000,AAPL,2015-10-16 16:00:00,111,P,109.95,3.2,3.1,3.25,2031,3773,0.2926 2015-09-30 16:00:00,AAPL151120C00110000,AAPL,2015-11-20 16:00:00,110,C,109.95,5.9,5.7,5.95,5330,17112,0.3635 2015-09-30 16:00:00,AAPL151120P00110000,AAPL,2015-11-20 16:00:00,110,P,109.95,6.15,6.1,6.3,3724,15704,0.3842


回答1:


If I understand you right, what you should be doing is returning a Series from your function. Something like:

return pandas.Series({"IV": iv, "Vega": vega})

If you want to put the result into new columns of the same input DataFrame, then just do:

df[["IV", "Vega"]] = df.apply(newtonRap, axis=1)



回答2:


As far as the performance with numba is concerned, numba doesn't know anything about pandas dataframes and cannot compile operations on them down to fast machine code. Your best bet is to profile what part of your method is slow (using line_profiler for example), and then offload that part to another method that you construct the inputs using the .values attributes of the dataframe columns, which gives you access to the underlying numpy array. Otherwise numba is just going to operate mostly in "object mode" (see the numba glossary) and won't improve performance drastically




回答3:


The trick to vectorize code is to not think in terms of rows, but instead think in terms of columns.

I almost have this working (I'll try to finish it later), but you want to do something along the lines of this:

from datetime import datetime
from math import sqrt, pi, log, exp, isnan
from numpy import inf, nan
from scipy.stats import norm
import pandas as pd
from pandas import Timestamp
from pandas.tseries.holiday import USFederalHolidayCalendar

# Initial parameters
rf = .0015                          # Get Fed Funds Rate https://research.stlouisfed.org/fred2/data/DFF.csv
tradingMinutesDay = 450             # 7.5 hours per day * 60 minutes per hour
tradingMinutesAnnum = 113400        # trading minutes per day * 252 trading days per year
cal = USFederalHolidayCalendar()    # Load US Federal holiday calendar
two_pi = 2 * pi                     # 2 * Pi (to reduce computations)
threshold = 1.0e-9                  # convergence threshold.

# Create sample data:
col_order = ['TimeStamp', 'OpraSymbol', 'RootSymbol', 'Expiry', 'Strike', 'OptType', 'RootPrice', 'Last', 'Bid', 'Ask', 'Volume', 'OpenInt', 'IV']
df = pd.DataFrame({'Ask': {0: 3.7000000000000002, 1: 2.4199999999999999, 2: 3.0, 3: 2.7999999999999998, 4: 2.4500000000000002, 5: 3.25, 6: 5.9500000000000002, 7: 6.2999999999999998},
                   'Bid': {0: 3.6000000000000001, 1: 2.3399999999999999, 2: 2.8599999999999999, 3: 2.7400000000000002, 4: 2.4399999999999999, 5: 3.1000000000000001, 6: 5.7000000000000002, 7: 6.0999999999999996},
                   'Expiry': {0: Timestamp('2015-10-16 16:00:00'), 1: Timestamp('2015-10-16 16:00:00'), 2: Timestamp('2015-10-16 16:00:00'), 3: Timestamp('2015-10-16 16:00:00'), 4: Timestamp('2015-10-16 16:00:00'), 5: Timestamp('2015-10-16 16:00:00'), 6: Timestamp('2015-11-20 16:00:00'), 7: Timestamp('2015-11-20 16:00:00')},
                   'IV': {0: 0.3497, 1: 0.3146, 2: 0.3288, 3: 0.3029, 4: 0.3187, 5: 0.2926, 6: 0.3635, 7: 0.3842},
                   'Last': {0: 3.46, 1: 2.34, 2: 3.0, 3: 2.81, 4: 2.35, 5: 3.20, 6: 5.90, 7: 6.15},
                   'OpenInt': {0: 1290.0, 1: 3087.0, 2: 28850.0, 3: 44427.0, 4: 2318.0, 5: 3773.0, 6: 17112.0, 7: 15704.0},
                   'OpraSymbol': {0: 'AAPL151016C00109000', 1: 'AAPL151016P00109000', 2: 'AAPL151016C00110000', 3: 'AAPL151016P00110000', 4: 'AAPL151016C00111000', 5: 'AAPL151016P00111000', 6: 'AAPL151120C00110000', 7: 'AAPL151120P00110000'},
                   'OptType': {0: 'C', 1: 'P', 2: 'C', 3: 'P', 4: 'C', 5: 'P', 6: 'C', 7: 'P'},
                   'RootPrice': {0: 109.95, 1: 109.95, 2: 109.95, 3: 109.95, 4: 109.95, 5: 109.95, 6: 109.95, 7: 109.95},
                   'RootSymbol': {0: 'AAPL', 1: 'AAPL', 2: 'AAPL', 3: 'AAPL', 4: 'AAPL', 5: 'AAPL', 6: 'AAPL', 7: 'AAPL'},
                   'Strike': {0: 109.0, 1: 109.0, 2: 110.0, 3: 110.0, 4: 111.0, 5: 111.0, 6: 110.0, 7: 110.0},
                   'TimeStamp': {0: Timestamp('2015-09-30 16:00:00'), 1: Timestamp('2015-09-30 16:00:00'), 2: Timestamp('2015-09-30 16:00:00'), 3: Timestamp('2015-09-30 16:00:00'), 4: Timestamp('2015-09-30 16:00:00'), 5: Timestamp('2015-09-30 16:00:00'), 6: Timestamp('2015-09-30 16:00:00'), 7: Timestamp('2015-09-30 16:00:00')},
                   'Volume': {0: 1565.0, 1: 3790.0, 2: 10217.0, 3: 12113.0, 4: 6674.0, 5: 2031.0, 6: 5330.0, 7: 3724.0}})
df = df[col_order]

# Vectorize columns
df['mark'] = (df.Bid + df.Ask) / 2
df['cp'] = df.OptType.map({'C': 1, 'P': -1})
df['Log_S_K'] = (df.RootPrice / df.Strike).apply(log)
df['divs'] = 0  # TODO: Get dividend value.
df['vega'] = 0.
df['converged'] = False

# Vectorized datetime calculations
date_pairs = set(zip(df.TimeStamp, df.Expiry))
total_days = {(t1, t2): len(pd.bdate_range(t1, t2)) 
                        for t1, t2 in date_pairs}
hols = {(t1, t2): len(cal.holidays(t1, t2).to_pydatetime()) 
                  for t1, t2 in date_pairs}
del date_pairs

df['total_days'] = [total_days.get((t1, t2))
                    for t1, t2 in zip(df.TimeStamp, df.Expiry)]
df['hols'] = [hols.get((t1, t2))
              for t1, t2 in zip(df.TimeStamp, df.Expiry)]
df['days_to_exp'] = df.total_days - df.hols - 1
df.loc[df.days_to_exp < 0, 'days_to_exp'] = 0  # Min zero.
df.drop(['total_days', 'hols'], axis='columns', inplace=True)
df['years_to_expiry'] = (df.days_to_exp * tradingMinutesDay / tradingMinutesAnnum)

# Initial implied vol 'guess'
df['implied_vol'] = (two_pi / df.years_to_expiry) ** 0.5 * df.mark / df.RootPrice  

for i in xrange(100):  # range(100) in Python 3.x
    # Create mask of options where the vol has not converged.
    mask = [not c for c in df.converged.values]
    if df.converged.all():
        break

    # Aliases.
    data = df.loc[mask, :]
    cp = data.cp
    mark = data.mark
    S = data.RootPrice
    K = data.Strike
    d = data.divs
    T = data.years_to_expiry
    log_S_K = data.Log_S_K
    iv = data.implied_vol

    # Calcs.
    d1 = (log_S_K + T * (rf - d + .5 * iv ** 2)) / (iv * T ** 0.5)
    d2 = d1 - iv * T ** 0.5
    df.loc[mask, 'vega'] = vega = S * d1.apply(norm.pdf) * T ** 0.5
    model = cp * (S * (cp * d1).apply(norm.cdf)
                  - K * (-rf * T).apply(exp) * (cp * d2).apply(norm.cdf))
    iv_delta = (model - mark) / vega
    df.loc[mask, 'implied_vol'] = iv - iv_delta

    # Clean-up and check for convergence.
    df.loc[df.implied_vol < 0, 'implied_vol'] = 0
    idx = model[(model - mark).abs() < threshold].index
    df.ix[idx, 'converged'] = True
    df.loc[:, 'implied_vol'].fillna(0, inplace=True)
    df.loc[:, 'implied_vol'].replace([inf, -inf], nan, inplace=True)
    df.loc[:, 'vega'].fillna(0, inplace=True)
    df.loc[:, 'vega'].replace([inf, -inf], nan, inplace=True)


来源:https://stackoverflow.com/questions/33044340/create-multiple-columns-in-pandas-dataframe-from-one-function

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!