Return Inverse Hessian Matrix at the end of DNN Training and Partial Derivatives wrt the Inputs

问题

Using Keras with TensorFlow as the backend, I have built a DNN that takes stellar spectra as input (7213 data points) and outputs three stellar parameters (temperature, gravity, and metallicity). The network trains well and predicts well on my test sets, but for the results to be scientifically useful, I need to be able to estimate my errors. The first step in doing this is to obtain the inverse Hessian matrix, which doesn't seem to be possible using Keras alone. Therefore I am attempting to create a workaround with scipy, using scipy.optimize.minimize with either BFGS, L-BFGS-B, or Newton-CG as the method. Any of these will return the inverse Hessian matrix.

The idea is to train the model using the Adam optimizer for 100 epochs (or until the model converges) and then run a single iteration (or a single function evaluation) of BFGS (or one of the other methods) to return the Hessian matrix of my model.

Here is my code:

from scipy.optimize import minimize

import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam


# Define vars
# Activation and weight initializer shared by all hidden Dense layers.
activation = 'relu'
init = 'he_normal'
# Adam optimizer settings (passed to Adam(...) further down).
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-08

# NOTE(review): `n` is not defined anywhere in this snippet; presumably it is
# the number of input features (7213 according to the train_X shape comment
# below) -- confirm. The leading None leaves the batch dimension unspecified.
input_shape = (None,n)
# Width of each hidden layer, in order from input side to output side.
n_hidden = [2048,1024,512,256,128,32]
# Number of regression targets produced by the final linear layer.
output_dim = 3

# Training-loop settings: epoch count, Adam learning rate, mini-batch size,
# and learning-rate decay (0.00 here).
epochs = 100
lr = 0.0008
batch_size = 64
decay = 0.00

# Design DNN Layers

# Only the first layer declares the (batch, features) input shape; the
# remaining hidden layers differ from one another only in their width.
first_hidden = Dense(n_hidden[0], batch_input_shape=input_shape,
                     init=init, activation=activation)
later_hidden = [Dense(width, init=init, activation=activation)
                for width in n_hidden[1:]]

# Linear output layer: one unit per regression target.
output_layer = Dense(output_dim, init=init, activation='linear')

model = Sequential([first_hidden] + later_hidden + [output_layer])


# Optimization function: Adam configured from the hyperparameters above.
optimizer = Adam(
    lr=lr,
    beta_1=beta_1,
    beta_2=beta_2,
    epsilon=epsilon,
    decay=decay,
)


# Compile and train network
model.compile(loss='mean_squared_error', optimizer=optimizer)

#train_X.shape = (50000,7213)
#train_Y.shape = (50000,3)
#cv_X.shape = (10000,7213)
#cv_Y.shape = (10000,3)

# Train on the training set while reporting the loss on the
# cross-validation set after each epoch; keep the returned history object.
history = model.fit(
    train_X,
    train_Y,
    batch_size=batch_size,
    nb_epoch=epochs,
    validation_data=(cv_X, cv_Y),
    verbose=2,
)


# Per-layer weights, kept in the same nested form as before:
# one [kernel, bias] list per layer.
weights = []
for layer in model.layers:
    weights.append(layer.get_weights())

# Shapes of the arrays in model.get_weights() order. loss() uses these to turn
# the flat parameter vector supplied by scipy back into Keras weight arrays.
weight_shapes = [w.shape for w in model.get_weights()]


def _flatten_weights(W):
    """Return *W* as one 1-D float vector.

    *W* may already be a numeric array (the form scipy passes to the
    objective) or an arbitrarily nested sequence of arrays/scalars such as
    the per-layer ``weights`` list. Nested input is flattened depth-first,
    which matches the order of ``model.get_weights()`` when *W* is ``weights``.
    """
    if isinstance(W, np.ndarray) and W.dtype != object:
        return W.astype(float).ravel()
    if np.isscalar(W):
        return np.array([W], dtype=float)
    parts = [_flatten_weights(w) for w in W]
    return np.concatenate(parts) if parts else np.zeros(0)


def loss(W):
    """Objective for scipy.optimize.minimize: training error at weights *W*.

    Parameters
    ----------
    W : 1-D array or nested sequence of arrays
        All model parameters. scipy supplies a flat vector; the nested
        per-layer form is accepted too and flattened first.

    Returns
    -------
    float
        Sum of squared prediction errors over ``train_X`` divided by the
        number of training rows.

    Raises
    ------
    ValueError
        If *W* does not contain exactly as many values as the model has
        parameters.

    Side effects: overwrites the model's weights with *W* and prints the
    computed error.
    """
    flat = _flatten_weights(W)
    sizes = [int(np.prod(shape)) for shape in weight_shapes]
    if flat.size != sum(sizes):
        raise ValueError('expected %d weight values, got %d'
                         % (sum(sizes), flat.size))

    # Slice the flat vector back into arrays with the shapes Keras expects.
    new_weights = []
    offset = 0
    for shape, size in zip(weight_shapes, sizes):
        new_weights.append(flat[offset:offset + size].reshape(shape))
        offset += size

    model.set_weights(new_weights)
    preds = model.predict(train_X)
    mse = np.sum(np.square(np.subtract(preds, train_Y))) / len(train_X)
    print(mse)
    return mse


# scipy.optimize.minimize works on a 1-D float vector, so the nested per-layer
# weights are flattened before being used as the starting point.
x0 = _flatten_weights(weights)
# NOTE(review): with no `jac` argument BFGS estimates the gradient by finite
# differences (one loss() call per parameter) and keeps a dense
# n_params x n_params inverse-Hessian approximation. For a network with
# millions of weights this is likely infeasible in time and memory -- confirm,
# or restrict the optimization to a small subset of the weights.
res = minimize(loss, x0, args=(), method = 'BFGS', options={'maxiter':1,'eps':1e-6,'disp':True})
#res = minimize(loss, x0, method='L-BFGS-B', options={'disp': True, 'maxls': 1, 'gtol': 1e-05, 'eps': 1e-08, 'maxiter': 1, 'ftol': 0.5, 'maxcor': 1, 'maxfun': 1})
#res = minimize(loss, x0, args=(), method='Newton-CG', jac=None, hess=None, hessp=None, tol=None, callback=None, options={'disp': False, 'xtol': 1e-05, 'eps': 1.4901161193847656e-08, 'return_all': False, 'maxiter': 1})
inv_hess = res['hess_inv']

1) The model trains extremely well, but when attempting to run the scipy minimizer for a single iteration with the previously trained weights, I run into problems.

Output when trying method=BFGS:

0.458706819754
0.457811632697
0.458706716791
...
0.350124572422
0.350186770445
0.350125320636

ValueErrorTraceback (most recent call last)
---> 19 res = minimize(loss, x0, args=(), method = 'BFGS', tol=1, options={'maxiter':1,'eps':1e-6,'disp':True})#,'gtol':0.1}, tol=5)

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    442         return _minimize_cg(fun, x0, args, jac, callback, **options)
    443     elif meth == 'bfgs':
--> 444         return _minimize_bfgs(fun, x0, args, jac, callback, **options)

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_bfgs(fun, x0, args, jac, callback, gtol, norm, eps, maxiter, disp, return_all, **unknown_options)
    963         try:  # this was handled in numeric, let it remaines for more safety
--> 964             rhok = 1.0 / (numpy.dot(yk, sk))
    965         except ZeroDivisionError:
    966             rhok = 1000.0

ValueError: operands could not be broadcast together with shapes (7213,2048) (2048,1024)

Output when trying method=L-BFGS-B:

ValueErrorTraceback (most recent call last)

---> 20 res = minimize(loss, x0, method='L-BFGS-B', options={'disp': True, 'maxls': 1, 'gtol': 1e-05, 'eps': 1e-08, 'maxiter': 1, 'ftol': 0.5, 'maxcor': 1, 'maxfun': 1})


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    448     elif meth == 'l-bfgs-b':
    449         return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 450                                 callback=callback, **options)


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/lbfgsb.pyc in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
    300         raise ValueError('maxls must be positive.')
    301 
--> 302     x = array(x0, float64)
    303     f = array(0.0, float64)
    304     g = zeros((n,), float64)

ValueError: setting an array element with a sequence.

Output when trying method=Newton-CG

ValueErrorTraceback (most recent call last)

---> 21 res = minimize(loss, x0, args=(), method='Newton-CG', jac=None, hess=None, hessp=None, tol=None, callback=None, options={'disp': False, 'xtol': 1e-05, 'eps': 1.4901161193847656e-08, 'return_all': False, 'maxiter': 1})


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    445     elif meth == 'newton-cg':
    446         return _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback,
--> 447                                   **options)
    448     elif meth == 'l-bfgs-b':
    449         return _minimize_lbfgsb(fun, x0, args, jac, bounds,

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback, xtol, eps, maxiter, disp, return_all, **unknown_options)
   1438     _check_unknown_options(unknown_options)
   1439     if jac is None:
-> 1440         raise ValueError('Jacobian is required for Newton-CG method')

ValueError: Jacobian is required for Newton-CG method

2) The next task is to obtain the derivative of the model outputs with respect to the model inputs. For instance, for one stellar parameter (one of the outputs), say Temperature, I need to find the partial derivatives with respect to each of the 7213 inputs. And then do the same for each of the 3 outputs.

So basically, my first task (1) is to find a way to return the inverse Hessian matrix of my model and next (2) I need to find a way to return the first-order partial derivatives of my outputs with respect to my inputs.

Does anyone have some insight on either of these two tasks? Thanks.

EDIT

I am trying to use theano.gradient.jacobian() to return the Jacobian matrix of my output w.r.t. my inputs. I have turned my model into a function of the model weights and used that function as the first parameter in theano.gradient.jacobian(). My problem arises when I try to run the gradient with multidimensional arrays, which is the form my model weights and input data are in.

import theano.tensor as T

# Symbolic placeholders; both are declared as 1-D double-precision vectors.
weights_in_model = T.dvector('model_weights')
x = T.dvector('x')

def pred(x,weights_in_model):
    # Builds a symbolic forward pass in seven stages. Each stage stacks two
    # consecutive entries of weights_in_model (indices 0-1, 2-3, ... 12-13),
    # pads the running value with a trailing axis, multiplies it by the
    # stacked entries, and (for the first six stages) clips to [0, 9999].
    # NOTE(review): the stacked pairs are presumably meant to be each layer's
    # kernel and bias -- confirm.
    weights = T.stack((weights_in_model[0],weights_in_model[1]), axis=0)
    x = T.shape_padright(x, n_ones=1)

    prediction=T.dot(x, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[2],weights_in_model[3]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[4],weights_in_model[5]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[6],weights_in_model[7]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[8],weights_in_model[9]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[10],weights_in_model[11]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)


    # Final stage: no clip is applied after the last product.
    weights = T.stack((weights_in_model[12],weights_in_model[13]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    # NOTE(review): the result of T.flatten is not assigned, so `prediction`
    # is returned unflattened.
    T.flatten(prediction)

    return prediction


# NOTE(review): only `theano.tensor` is imported in this snippet, so the
# top-level `theano` name must already be in scope for these two lines --
# confirm.
# f is the symbolic Jacobian of the prediction with respect to x; h compiles
# it into a callable taking (x, weights_in_model).
f=theano.gradient.jacobian(pred(x,weights_in_model),wrt=x)
h=theano.function([x,weights_in_model],f,allow_input_downcast=True)


# Rebinds x and weights_in_model from the symbolic placeholders to concrete
# values: the training inputs and the list returned by model.get_weights().
x = train_X
weights_in_model = model.get_weights()
h(x,weights_in_model)

This last line gives the error:

TypeError: ('Bad input argument to theano function with name "<ipython-input-365-a1ab256aa220>:1"  at index 0(0-based)', 'Wrong number of dimensions: expected 1, got 2 with shape (2000, 7213).')

But when I change the inputs to:

weights_in_model = T.matrix('model_weights')
x = T.matrix('x')

I get an error from the line:

f=theano.gradient.jacobian(pred(x,weights_in_model),wrt=x)

reading:

AssertionError: tensor.jacobian expects a 1 dimensional variable as `expression`. If not use flatten to make it a vector

Any ideas on how to get around this?

回答1:

ANSWER FOUND!: This code works for predicting one output value from the model. Currently I am working on modifying it to compute 3 jacobian matrices; one for each output.

import theano
import theano.tensor as T
import theano.typed_list
# Theano settings: use the 'fast_compile' optimizer mode and request
# high-verbosity exception messages.
theano.config.optimizer='fast_compile'
theano.config.exception_verbosity='high'

# Declare function input placeholders
# weights_in_model is a symbolic typed list whose element type is a
# double-precision matrix; x is a symbolic 2-D matrix.
weights_in_model = theano.typed_list.TypedListType(theano.tensor.dmatrix)()
x = T.matrix('x')

# Define model function
def pred(x, weights_in_model):
    """Build the symbolic forward pass of the seven-layer network.

    For each layer, entries ``2*k`` and ``2*k + 1`` of ``weights_in_model``
    are concatenated along axis 0 into one matrix, a column of ones is
    appended to the layer input, and the two are multiplied. The first six
    layers are followed by a clip to ``[0, 9999]``; the last layer is left
    unclipped.

    Parameters:
        x: symbolic 2-D input.
        weights_in_model: symbolic list indexed 0..13.

    Returns:
        The final layer output, flattened to one dimension.
    """
    # NOTE(review): entry 2*k is presumably the layer kernel and entry
    # 2*k + 1 its bias reshaped to a single row, so the appended ones column
    # adds the bias -- confirm against the caller.
    # NOTE(review): the 9999 ceiling is an upper bound that a plain ReLU does
    # not have; presumably activations never get that large -- confirm.
    n_layers = 7
    activations = x
    for layer_idx in range(n_layers):
        augmented_weights = T.concatenate(
            (weights_in_model[2 * layer_idx],
             weights_in_model[2 * layer_idx + 1]),
            axis=0)
        ones_column = T.ones((T.shape(activations)[0], 1))
        augmented_input = T.concatenate((activations, ones_column), axis=1)
        activations = T.dot(augmented_input, augmented_weights)

        is_output_layer = layer_idx == n_layers - 1
        if not is_output_layer:
            activations = T.clip(activations, 0, 9999.)

    return T.flatten(activations)

# Create gradient function: symbolic Jacobian of the flattened prediction
# with respect to the input matrix x.
f = theano.gradient.jacobian(pred(x, weights_in_model), wrt=x)

# Compile function
h = theano.function([x, weights_in_model], f, allow_input_downcast=True)


# Get function inputs
weights_in_model_ = model.get_weights()
x_ = train_data

# Reshape bias layers: every odd-indexed entry is turned into a single-row
# matrix so pred() can concatenate it under the preceding entry. Using -1
# takes the width from the array itself instead of hard-coding each layer
# size, so the snippet keeps working if the layer widths change.
# NOTE(review): assumes model.get_weights() alternates kernel, bias for each
# layer -- confirm.
for bias_idx in range(1, len(weights_in_model_), 2):
    weights_in_model_[bias_idx] = np.reshape(weights_in_model_[bias_idx], (1, -1))

# Compute Jacobian (returns format with a bunch of zero rows)
# NOTE(review): the result has shape (len(prediction), rows of x_, features),
# so memory grows with the square of the number of examples for a
# single-output model -- presumably x_ is a small batch; confirm.
jacs = h(x_, weights_in_model_)

# Put Jacobian matrix in proper format (ie. shape = (number_of_input_examples, number_of_input_features)
# Row i of the result is jacs[i, i, :]. Like the original per-row loop, this
# assumes one prediction per input example (a single-output model).
sample_idx = np.arange(jacs.shape[0])
jacobian_matrix = np.zeros((jacs.shape[0], jacs.shape[2]))
jacobian_matrix[:] = jacs[sample_idx, sample_idx]

Next task is to find the Hessian matrix of the outputs w.r.t. the model weights!

来源：https://stackoverflow.com/questions/42792405/return-inverse-hessian-matrix-at-the-end-of-dnn-training-and-partial-derivatives

标签

python

scipy

neural-network

keras

hessian-matrix