Question:
model.total_loss has been deprecated in Eager, so the code below no longer works - how else can I fetch gradients?
Works in TF 2.1/2.0:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
ipt = Input((16,))
out = Dense(16)(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')
x = y = np.random.randn(32, 16)
model.train_on_batch(x, y)
grad_tensors = model.optimizer.get_gradients(model.total_loss, model.trainable_weights)
Note: alternatives should be able to set the learning_phase flag, and (preferred, not required) handle sample_weight. The above accomplishes this via K.function(..., outputs=grad_tensors).
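For reference, a sketch of what that K.function wiring looked like in TF 2.0/2.1 graph-style execution, continuing the snippet above (model._feed_targets and model._feed_sample_weights are private Model attributes; the last feed value is the learning_phase):

inputs = (model.inputs + model._feed_targets + model._feed_sample_weights
          + [K.learning_phase()])
grads_fn = K.function(inputs, grad_tensors)
grads = grads_fn([x, y, np.ones(len(x)), 0])  # sample_weight of ones, learning_phase=0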
Answer 1:
Network structure has changed in 2.2, making certain Model attributes and methods inaccessible. The code below works for both Graph and Eager execution, and is tested to give reproducible results. The Eager case works only with trainable weights, not layer outputs; I'll soon add a more complete version covering outputs to See RNN.
The Eager method reuses the Eager train loop code, ensuring consistency with the internal gradient computation.
Update: complete method here; all backends are supported (TF 1, TF 2, Eager, Graph, keras, tf.keras), for both weights and outputs.
Method:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.keras.engine import data_adapter
from tensorflow.python.keras.mixed_precision.experimental import (
    loss_scale_optimizer as lso)


def _get_grads_graph(model, x, y, params, sample_weight=None, learning_phase=0):
    # Graph mode: build a K.function feeding inputs, targets, sample weights
    # and the learning phase, outputting the symbolic gradient tensors
    if sample_weight is None:
        sample_weight = np.ones(len(x))
    outputs = model.optimizer.get_gradients(model.total_loss, params)
    inputs = (model.inputs + model._feed_targets + model._feed_sample_weights
              + [K.learning_phase()])

    grads_fn = K.function(inputs, outputs)
    gradients = grads_fn([x, y, sample_weight, learning_phase])
    return gradients


def _get_grads_eager(model, x, y, params, sample_weight=None, learning_phase=0):
    def _process_input_data(x, y, sample_weight, model):
        # Mirror Model.train_step's input handling
        iterator = data_adapter.single_batch_iterator(model.distribute_strategy,
                                                      x, y, sample_weight,
                                                      class_weight=None)
        data = next(iterator)
        data = data_adapter.expand_1d(data)
        x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data)
        return x, y, sample_weight

    def _clip_scale_grads(strategy, tape, optimizer, loss, params):
        # Mirror the internal loss-scaling / gradient-clipping logic
        with tape:
            if isinstance(optimizer, lso.LossScaleOptimizer):
                loss = optimizer.get_scaled_loss(loss)

        gradients = tape.gradient(loss, params)

        aggregate_grads_outside_optimizer = (
            optimizer._HAS_AGGREGATE_GRAD and not isinstance(
                strategy.extended,
                parameter_server_strategy.ParameterServerStrategyExtended))

        if aggregate_grads_outside_optimizer:
            gradients = optimizer._aggregate_gradients(zip(gradients, params))
        if isinstance(optimizer, lso.LossScaleOptimizer):
            gradients = optimizer.get_unscaled_gradients(gradients)
        gradients = optimizer._clip_gradients(gradients)
        return gradients

    x, y, sample_weight = _process_input_data(x, y, sample_weight, model)

    with tf.GradientTape() as tape:
        y_pred = model(x, training=bool(learning_phase))
        loss = model.compiled_loss(y, y_pred, sample_weight,
                                   regularization_losses=model.losses)

    gradients = _clip_scale_grads(model.distribute_strategy, tape,
                                  model.optimizer, loss, params)
    gradients = K.batch_get_value(gradients)
    return gradients


def get_gradients(model, x, y, params, sample_weight=None, learning_phase=0,
                  evaluate=True):
    if tf.executing_eagerly():
        return _get_grads_eager(model, x, y, params, sample_weight,
                                learning_phase)
    else:
        return _get_grads_graph(model, x, y, params, sample_weight,
                                learning_phase)
Test:
import numpy as np
np.random.seed(1)
import random
random.seed(2)
import tensorflow as tf
tf.compat.v1.set_random_seed(3)
tf.random.set_seed(4)
# tf.compat.v1.disable_eager_execution()
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import GlorotUniform
ipt = Input((4,))
out = Dense(4, kernel_initializer=GlorotUniform(seed=0))(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')
x = y = np.random.randn(32, 4)
model.train_on_batch(x, y)
print(model.get_weights())
grads = get_gradients(model, x, y, model.trainable_weights)
print(grads)
# WEIGHTS (Eager & Graph)
[array([[-0.4995359 , 0.3558198 , 0.518725 , 0.4680259 ],
[-0.19397011, 0.6424813 , 0.5327964 , -0.52391374],
[ 0.6039545 , 0.07058681, -0.62931913, -0.6724267 ],
[ 0.42698476, -0.52317786, -0.2453942 , 0.03615759]], dtype=float32),
array([-0.00100001, 0.00099961, 0.00100002, 0.00100001], dtype=float32)]
# GRADS (Eager & Graph)
[array([[-0.5818436 , 0.22703086, 0.2980485 , 0.42571294],
[ 0.18901172, -0.20659731, 0.08305292, -0.31698108],
[ 0.41603914, -0.01972354, -0.72125435, -0.34481353],
[ 0.38650095, -0.31618145, -0.17637177, -0.55846536]], dtype=float32),
array([ 0.17147431, -0.00683564, -0.31096804, -0.14086047], dtype=float32)]
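For completeness, the same entry point takes the sample_weight and learning_phase arguments mentioned in the question; a minimal usage sketch continuing the test above (the weight values are arbitrary):

sw = np.random.uniform(0, 1, size=(32,)).astype('float32')
grads_weighted = get_gradients(model, x, y, model.trainable_weights,
                               sample_weight=sw, learning_phase=1)
print([g.shape for g in grads_weighted])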
Answer 2:
Is this what you are looking for?
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

ipt = Input((16,))
out = Dense(16)(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')
x = y = tf.constant(np.random.randn(32, 16))
model.train_on_batch(x, y)

with tf.GradientTape() as tape:
    # Doing the computation in the context of the gradient tape
    # For example computing loss
    pred = model(x)
    loss = tf.metrics.MSE(y, pred)

# Getting the gradient of weights w.r.t. loss
grad = tape.gradient(loss, model.trainable_weights)
grad
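The snippet above does not cover the learning_phase and sample_weight requirements from the question. A minimal sketch of one way to add both with the same tape approach, assuming a compiled model as in the question (sw is an arbitrary per-sample weight array):

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

ipt = Input((16,))
out = Dense(16)(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')

x = y = np.random.randn(32, 16).astype('float32')
sw = np.ones(32, dtype='float32')  # per-sample weights (arbitrary here)

with tf.GradientTape() as tape:
    pred = model(x, training=True)  # training flag replaces learning_phase
    # compiled_loss applies the compiled 'mse' loss with sample weights
    loss = model.compiled_loss(y, pred, sample_weight=sw,
                               regularization_losses=model.losses)
grad = tape.gradient(loss, model.trainable_weights)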
Source: https://stackoverflow.com/questions/61887944/how-to-get-gradients-in-tf-2-2-eager