How to get gradients in TF 2.2 Eager?


Question


model.total_loss has been deprecated in Eager execution, so the code below no longer works in TF 2.2; how else can gradients be fetched?


Works in TF 2.1/2.0:

import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

ipt = Input((16,))
out = Dense(16)(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')

x = y = np.random.randn(32, 16)
model.train_on_batch(x, y)

grad_tensors = model.optimizer.get_gradients(model.total_loss, model.trainable_weights)

Note: alternatives should be able to set the learning_phase flag and, preferably (not required), handle sample_weight. The approach above accomplishes this via K.function(..., outputs=grad_tensors).
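
For context, a minimal sketch of that K.function usage, assuming Graph mode (e.g. after tf.compat.v1.disable_eager_execution()); it mirrors the _get_grads_graph helper in the answer below:

inputs = (model.inputs + model._feed_targets + model._feed_sample_weights
          + [K.learning_phase()])
grads_fn  = K.function(inputs, grad_tensors)
gradients = grads_fn([x, y, np.ones(len(x)), 0])  # learning_phase=0 (inference)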


Answer 1:


The Network structure changed in 2.2, making certain Model attributes and methods inaccessible. The code below works for both Graph and Eager execution and is tested to give reproducible results. The Eager case works only with trainable weights, not layer outputs; I'll soon add a more complete version covering outputs to See RNN.

The Eager method reuses the Eager train-loop code, ensuring consistency with the internal gradient computation.

Update: complete method here; all backends supported (TF 1, TF 2, Eager, Graph, keras, tf.keras), covering both weights and outputs.


Method:

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.keras.engine import data_adapter
from tensorflow.python.keras.mixed_precision.experimental import (
    loss_scale_optimizer as lso)


def _get_grads_graph(model, x, y, params, sample_weight=None, learning_phase=0):
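    # Graph mode: build a K.function mapping (inputs, targets, sample weights,
    # learning phase) to the symbolic gradient tensors, then evaluate it once.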
    sample_weight = np.ones(len(x)) if sample_weight is None else sample_weight

    outputs = model.optimizer.get_gradients(model.total_loss, params)
    inputs  = (model.inputs + model._feed_targets + model._feed_sample_weights
               + [K.learning_phase()])

    grads_fn = K.function(inputs, outputs)
    gradients = grads_fn([x, y, sample_weight, learning_phase])
    return gradients

def _get_grads_eager(model, x, y, params, sample_weight=None, learning_phase=0):
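    # Eager mode: replicate the data handling, loss computation, and gradient
    # scaling/clipping that Keras performs inside its own train step.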
    def _process_input_data(x, y, sample_weight, model):
        iterator = data_adapter.single_batch_iterator(model.distribute_strategy,
                                                      x, y, sample_weight,
                                                      class_weight=None)
        data = next(iterator)
        data = data_adapter.expand_1d(data)
        x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data)
        return x, y, sample_weight

    def _clip_scale_grads(strategy, tape, optimizer, loss, params):
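        # Mirror Keras internals: scale the loss for mixed precision, compute and
        # (if applicable) aggregate gradients, unscale them, then apply clipping.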
        with tape:
            if isinstance(optimizer, lso.LossScaleOptimizer):
                loss = optimizer.get_scaled_loss(loss)

        gradients = tape.gradient(loss, params)

        aggregate_grads_outside_optimizer = (
            optimizer._HAS_AGGREGATE_GRAD and not isinstance(
                strategy.extended,
                parameter_server_strategy.ParameterServerStrategyExtended))

        if aggregate_grads_outside_optimizer:
            gradients = optimizer._aggregate_gradients(zip(gradients, params))
        if isinstance(optimizer, lso.LossScaleOptimizer):
            gradients = optimizer.get_unscaled_gradients(gradients)

        gradients = optimizer._clip_gradients(gradients)
        return gradients

    x, y, sample_weight = _process_input_data(x, y, sample_weight, model)

    with tf.GradientTape() as tape:
        y_pred = model(x, training=bool(learning_phase))
        loss = model.compiled_loss(y, y_pred, sample_weight,
                                   regularization_losses=model.losses)

    gradients = _clip_scale_grads(model.distribute_strategy, tape,
                                  model.optimizer, loss, params)
    gradients = K.batch_get_value(gradients)
    return gradients

def get_gradients(model, x, y, params, sample_weight=None, learning_phase=0,
                  evaluate=True):
    if tf.executing_eagerly():
        return _get_grads_eager(model, x, y, params, sample_weight,
                                learning_phase)
    else:
        return _get_grads_graph(model, x, y, params, sample_weight,
                                learning_phase)

Test:

import numpy as np
np.random.seed(1)
import random
random.seed(2)
import tensorflow as tf
tf.compat.v1.set_random_seed(3)
tf.random.set_seed(4)
# tf.compat.v1.disable_eager_execution()

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import GlorotUniform


ipt = Input((4,))
out = Dense(4, kernel_initializer=GlorotUniform(seed=0))(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')

x = y = np.random.randn(32, 4)
model.train_on_batch(x, y)
print(model.get_weights())

grads = get_gradients(model, x, y, model.trainable_weights)
print(grads)
# WEIGHTS (Eager & Graph)
[array([[-0.4995359 ,  0.3558198 ,  0.518725  ,  0.4680259 ],
        [-0.19397011,  0.6424813 ,  0.5327964 , -0.52391374],
        [ 0.6039545 ,  0.07058681, -0.62931913, -0.6724267 ],
        [ 0.42698476, -0.52317786, -0.2453942 ,  0.03615759]], dtype=float32),
 array([-0.00100001,  0.00099961,  0.00100002,  0.00100001], dtype=float32)]

# GRADS (Eager & Graph)
[array([[-0.5818436 ,  0.22703086,  0.2980485 ,  0.42571294],
        [ 0.18901172, -0.20659731,  0.08305292, -0.31698108],
        [ 0.41603914, -0.01972354, -0.72125435, -0.34481353],
        [ 0.38650095, -0.31618145, -0.17637177, -0.55846536]], dtype=float32),
 array([ 0.17147431, -0.00683564, -0.31096804, -0.14086047], dtype=float32)]
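
For reference, a call with non-default arguments might look like the following; the weight vector sw is illustrative and not part of the original test:

sw = np.abs(np.random.randn(32))  # hypothetical per-sample weights
grads_weighted = get_gradients(model, x, y, model.trainable_weights,
                               sample_weight=sw, learning_phase=1)  # training-mode gradients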



Answer 2:


Is this what you are looking for?

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

ipt = Input((16,))
out = Dense(16)(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')

x = y = tf.constant(np.random.randn(32, 16))
model.train_on_batch(x, y)

with tf.GradientTape() as tape:
    # Doing the computation in the context of the gradient tape
    # For example computing loss
    pred = model(x)
    loss = tf.metrics.MSE(y, pred)

# Getting the gradient of weight w.r.t loss 
grad = tape.gradient(loss, model.trainable_weights) 
print(grad)
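
This covers plain gradients; to also honor the question's learning_phase and sample_weight requirements with a tape, one option (a hedged sketch, not from the original answer; the uniform weights are illustrative) is:

sample_weight = tf.ones(x.shape[0])  # illustrative: uniform per-sample weights
with tf.GradientTape() as tape:
    pred = model(x, training=False)              # training=True corresponds to learning_phase=1
    per_sample_loss = tf.metrics.MSE(y, pred)    # shape: (batch_size,)
    loss = tf.reduce_mean(per_sample_loss * sample_weight)
grad = tape.gradient(loss, model.trainable_weights)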


Source: https://stackoverflow.com/questions/61887944/how-to-get-gradients-in-tf-2-2-eager
