Errors when training my saved tensorflow model

问题

I'm trying to build a custom keras model whith the subclassing API but I get some errors when I load a previous instance of my model and I try to train it :

Here is the class of my model, it has 3 inputs and 1 output :

import tensorflow as tf


spec1 = tf.TensorSpec(shape=(1,40,5,1))
spec2 = tf.TensorSpec(shape=(1,3))

class Conv_Rnn_model(tf.keras.Model):
    def __init__(self):
        # super() permet d'appeler le constructeur de la classe mère dans la classe fille
        # permet égalemet de résoudre le problème d'appel multiple de méthode dans les configuration de classe en diamant
        super().__init__() 
       
        self.loss_object = tf.keras.losses.MeanSquaredError()
        self.optimizer   = tf.keras.optimizers.Adam()
        # Convolutions :
            # input :  [batch_size, rows, cols, channels]
            # return : [batch_size, new_rows, new_cols, filters]
        self.conv1 = tf.keras.layers.Conv2D(filters = 32, kernel_size = (8,2),input_shape=(40,5,1), activation='relu', name ="conv1")
        self.conv2 = tf.keras.layers.Conv2D(filters = 64, kernel_size = (6,1),input_shape=(40,5,1), activation='relu', name ="conv2")
        self.conv3 = tf.keras.layers.Conv2D(filters =128, kernel_size = (6,1),input_shape=(40,5,1), activation='relu', name ="conv3")
       
        
        # recurrent cells : 
            #input :  [batch_size, time_steps, features]
            #return : [batch_size, time_steps, units (if return_sequence=True)]
        self.lstm1A = tf.keras.layers.LSTM(64, return_sequences=True, name = "lstm1A")
        self.lstm1B = tf.keras.layers.LSTM(64, name = "lstm1B")
        
        self.lstm2A = tf.keras.layers.LSTM(64, return_sequences=True, name = "lstm2A")
        self.lstm2B = tf.keras.layers.LSTM(64, name = "lstm2B")
         
        self.lstm3A = tf.keras.layers.LSTM(64, return_sequences=True, name = "lstm3A")
        self.lstm3B = tf.keras.layers.LSTM(64, name = "lstm3B")
        
        # Concat layer : 
        self.concat = tf.keras.layers.Concatenate(axis=1)

        # fully connected layers :
            #input  : [batch_size, ... , input_dim]
            #return : [batch_size, ... , units]
        self.dense = tf.keras.layers.Dense(32)
        self.out = tf.keras.layers.Dense(3, activation='softmax')
        
    @tf.function(input_signature=[[tf.TensorSpec(shape=(1,40,5,1),name="M15"),
                                   tf.TensorSpec(shape=(1,40,5,1),name="H1"),
                                   tf.TensorSpec(shape=(1,40,5,1),name="H4")]
                                 ])
    def call(self, data):
        """ 
        TODO: comprendre comment se calculer les outputs des conv2D 
        pour remplacer les conv_res.shape par des constantes
        """
        
        #tf.no_gradient("concatenate")
        #tf.no_gradient("reshape")
        
        conv1_res = self.conv1(data[0])
        conv2_res = self.conv2(data[1])
        conv3_res = self.conv3(data[2])
            
        shape1 = (1, conv1_res.shape[1],conv1_res.shape[2]*conv1_res.shape[3])
        shape2 = (1, conv2_res.shape[1],conv2_res.shape[2]*conv2_res.shape[3])
        shape3 = (1, conv3_res.shape[1],conv3_res.shape[2]*conv3_res.shape[3])


        f1 = self.lstm1B(self.lstm1A( tf.reshape(conv1_res, shape1) ))
        f2 = self.lstm2B(self.lstm2A( tf.reshape(conv2_res, shape2) ))
        f3 = self.lstm3B(self.lstm3A( tf.reshape(conv3_res, shape3) ))  
        
        # returns of fully connected layers
        pre_output = self.dense(self.concat([f1,f2,f3]))
        output     = self.out(pre_output)
        
        return output
    
    @tf.function(input_signature=[[spec1,spec1,spec1], spec2])
    def train_step(self,X_train, y_train):
        
        #X_train, y_train = data
        with tf.GradientTape() as tape:
            y_pred = self(X_train)  # Forward pass
            # Compute the loss value
            loss = self.loss_object(y_train, y_pred)

        # Compute gradients
        gradients = tape.gradient(loss, self.trainable_variables)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

When I train my model and then I save it all work very well :

from Conv_Rnn_model import Conv_Rnn_model
from tensorflow.keras.models import load_model
from numpy.random import rand
from functions import getDataset_3period

model_1 = Conv_Rnn_model()
model_1.compile(loss='mse', optimizer='adam')
iterations = 5
data_environment = getDataset_3period(window_size=40)# get dataframe of EURUSD 

for i in range(iterations):
    state   = state = data_environment[i] 
    # target  = tf.constant(rand(1,3),dtype=tf.float32)
    target= rand(1,3)
    X_train = [state[:1],state[1:2],state[2:3]]
    # X_train = tf.constant(X_train, dtype=tf.float32)
    model_1.train_step(X_train, target)
    print("epoch", i)

model_1.save("models/model_test1")

But when I try to reload my trained model and train it again I get errors :

model_2 = load_model("models/model_test1", compile=False)

2020-08-08 18:17:27.277841: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'while' has 11 outputs but the _output_shapes attribute specifies shapes for 20 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:28.048269: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'while' has 11 outputs but the _output_shapes attribute specifies shapes for 20 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:28.651946: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'while' has 11 outputs but the _output_shapes attribute specifies shapes for 20 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:28.946418: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'while' has 11 outputs but the _output_shapes attribute specifies shapes for 20 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.857832: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'while' has 11 outputs but the _output_shapes attribute specifies shapes for 20 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.872207: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'while' has 11 outputs but the _output_shapes attribute specifies shapes for 20 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.891483: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'lstm1A/PartitionedCall' has 5 outputs but the _output_shapes attribute specifies shapes for 22 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.892203: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'lstm1B/PartitionedCall' has 5 outputs but the _output_shapes attribute specifies shapes for 22 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.892926: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'lstm2A/PartitionedCall' has 5 outputs but the _output_shapes attribute specifies shapes for 22 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.893593: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'lstm2B/PartitionedCall' has 5 outputs but the _output_shapes attribute specifies shapes for 22 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.894289: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'lstm3A/PartitionedCall' has 5 outputs but the _output_shapes attribute specifies shapes for 22 outputs. Output shapes may be inaccurate.
2020-08-08 18:17:32.894950: W tensorflow/core/common_runtime/graph_constructor.cc:808] Node 'lstm3B/PartitionedCall' has 5 outputs but the _output_shapes attribute specifies shapes for 22 outputs. Output shapes may be inaccurate.

model_2(X_train)
Out[3]: <tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[0.2654996 , 0.40409103, 0.33040944]], dtype=float32)>

with train_step function :

model_2.train_step(X_train, target)
Traceback (most recent call last):

  File "<ipython-input-4-27db33666dda>", line 1, in <module>
    model_2.train_step(X_train, target)

TypeError: train_step() takes 2 positional arguments but 3 were given

or with fit function :

 model_2.compile(loss='mse', optimizer='adam')
model_2.fit(X_train, target)
Traceback (most recent call last):

  File "<ipython-input-6-d013b7a5a810>", line 2, in <module>
    model_2.fit(X_train, target)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 103, in _method_wrapper
    return method(self, *args, **kwargs)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1102, in fit
    tmp_logs = self.train_function(iterator)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 787, in __call__
    result = self._call(*args, **kwds)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 830, in _call
    self._initialize(args, kwds, add_initializers_to=initializers)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 702, in _initialize
    self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 2948, in _get_concrete_function_internal_garbage_collected
    graph_function, _, _ = self._maybe_define_function(args, kwargs)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 3319, in _maybe_define_function
    graph_function = self._create_graph_function(args, kwargs)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 3171, in _create_graph_function
    func_graph_module.func_graph_from_py_func(

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 987, in func_graph_from_py_func
    func_outputs = python_func(*func_args, **func_kwargs)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 613, in wrapped_fn
    return weak_wrapped_fn().__wrapped__(*args, **kwds)

  File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 974, in wrapper
    raise e.ag_error_metadata.to_exception(e)

ValueError: in user code:

    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:809 train_function  *
        return step_function(self, iterator)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:799 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1261 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2794 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3217 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:792 run_step  **
        outputs = model.train_step(data)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:750 train_step
        y_pred = self(x, training=True)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:990 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:71 return_outputs_and_add_losses
        outputs, losses = fn(inputs, *args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:167 wrap_with_training_arg
        return control_flow_util.smart_cond(
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/utils/control_flow_util.py:112 smart_cond
        return smart_module.smart_cond(
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/smart_cond.py:54 smart_cond
        return true_fn()
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:168 <lambda>
        training, lambda: replace_training_and_call(True),
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:165 replace_training_and_call
        return wrapped_call(*args, **kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py:787 __call__
        result = self._call(*args, **kwds)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py:821 _call
        results = self._stateful_fn(*args, **kwds)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py:2921 __call__
        graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3319 _maybe_define_function
        graph_function = self._create_graph_function(args, kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py:3171 _create_graph_function
        func_graph_module.func_graph_from_py_func(
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:987 func_graph_from_py_func
        func_outputs = python_func(*func_args, **func_kwargs)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py:613 wrapped_fn
        return weak_wrapped_fn().__wrapped__(*args, **kwds)
    /opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/saved_model/function_deserialization.py:251 restored_function_body
        raise ValueError(

    ValueError: Could not find matching function to call loaded from the SavedModel. Got:
      Positional arguments (2 total):
        * (<tf.Tensor 'data:0' shape=(None, 40, 5, 1) dtype=float32>, <tf.Tensor 'data_1:0' shape=(None, 40, 5, 1) dtype=float32>, <tf.Tensor 'data_2:0' shape=(None, 40, 5, 1) dtype=float32>)
        * True
      Keyword arguments: {}
    
    Expected these arguments to match one of the following 4 option(s):
    
    Option 1:
      Positional arguments (2 total):
        * [TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='data/0'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='data/1'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='data/2')]
        * False
      Keyword arguments: {}
    
    Option 2:
      Positional arguments (2 total):
        * [TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='M15'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='H1'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='H4')]
        * False
      Keyword arguments: {}
    
    Option 3:
      Positional arguments (2 total):
        * [TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='M15'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='H1'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='H4')]
        * True
      Keyword arguments: {}
    
    Option 4:
      Positional arguments (2 total):
        * [TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='data/0'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='data/1'), TensorSpec(shape=(1, 40, 5, 1), dtype=tf.float32, name='data/2')]
        * True
      Keyword arguments: {}

I'm trying to resolve this bug since 1 week and I've already read many times the tensorflow guide.

回答1:

Try to re-create model and copy weights from the saved model to the new one. There were some issues with continue training of saved model and I used this walk-around. You have to assure that names of the layers are the same for both models, or you may access layers by index. Below is code example for copying weights:

for layer in modelSaved.layers:
    print(f"name = {layer.name}, params = {layer.count_params():d}")
    if layer.count_params() > 0:
        destLayers = [x for x in model.layers if x.name == layer.name]
        if (len(destLayers) == 1) and (destLayers[0].count_params() == layer.count_params()):
            destLayers[0].set_weights(layer.get_weights())
            print(f'Weights copied for layer {destLayers[0].name}')

来源：https://stackoverflow.com/questions/63320771/errors-when-training-my-saved-tensorflow-model

标签

python

tensorflow

machine-learning

keras

custom-training