tensorflow: ValueError: No gradients provided for any variable (Keras 2.4, Tensorflow 2.3.0)

So I’m using this model to train on Google colab, it was written for Tensorflow 1.9 and Keras 2, but when I train I get the following error, has anyone seen this or how to solve it?

It was training fine before but this error started today.

Actual code:

from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential, model_from_json
from keras.utils import to_categorical
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import RMSprop
from keras.layers.convolutional import Conv2D
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM,     concatenate , Input, Reshape, Dense
from keras.preprocessing.image import array_to_img, img_to_array, load_img
import numpy as np

dir_name = '/data/train/'

# Read a file and return a string
def load_doc(filename):
file = open(filename, 'r')
text = file.read()
file.close()
return text

def load_data(data_dir):
text = []
images = []
# Load all the files and order them
all_filenames = listdir(data_dir)
all_filenames.sort()
for filename in (all_filenames):
    if filename[-3:] == "npz":
        # Load the images already prepared in arrays
        image = np.load(data_dir+filename)
        images.append(image['features'])
    else:
        # Load the boostrap tokens and rap them in a start and end tag
        syntax = '<START> ' + load_doc(data_dir+filename) + ' <END>'
        # Seperate all the words with a single space
        syntax = ' '.join(syntax.split())
        # Add a space after each comma
        syntax = syntax.replace(',', ' ,')
        text.append(syntax)
images = np.array(images, dtype=float)
return images, text

train_features, texts = load_data(dir_name)

# Initialize the function to create the vocabulary 
tokenizer = Tokenizer(filters='', split=" ", lower=False)
# Create the vocabulary 
tokenizer.fit_on_texts([load_doc('bootstrap.vocab')])

# Add one spot for the empty word in the vocabulary 
vocab_size = len(tokenizer.word_index) + 1
max_length = 48

def preprocess_data(texts, features, max_sequence):
X, y, image_data = list(), list(), list()
sequences = tokenizer.texts_to_sequences(texts)
for img_no, seq in enumerate(sequences):
    for i in range(1, len(seq)):
        # Add the sentence until the current count(i) and add the current     count to the output
        in_seq, out_seq = seq[:i], seq[i]
        # Pad all the input token sentences to max_sequence
        in_seq = pad_sequences([in_seq], maxlen=max_sequence)[0]
        # Turn the output into one-hot encoding
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # Add the corresponding image to the boostrap token file
        image_data.append(features[img_no])
        # Cap the input sentence to 48 tokens and add it
        X.append(in_seq[-48:])
        y.append(out_seq)
return np.array(image_data), np.array(X), np.array(y)


# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, features, n_step, max_sequence):
# loop until we finish training
while 1:
    # loop over photo identifiers in the dataset
    for i in range(0, len(descriptions), n_step):
        Ximages, XSeq, y = list(), list(),list()
        for j in range(i, min(len(descriptions), i+n_step)):
            image = features[j]
            # retrieve text input
            desc = descriptions[j]
            # generate input-output pairs
            in_img, in_seq, out_word = preprocess_data([desc], [image], max_sequence)
            for k in range(len(in_img)):
                Ximages.append(in_img[k])
                XSeq.append(in_seq[k])
                y.append(out_word[k])
        # yield this batch of samples to the model
        yield [[array(Ximages), array(XSeq)], array(y)]

#Create the encoder
image_model = Sequential()
image_model.add(Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(256, 256, 3,)))
image_model.add(Conv2D(16, (3,3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(32, (3,3), activation='relu', padding='same'))
image_model.add(Conv2D(32, (3,3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(64, (3,3), activation='relu', padding='same'))
image_model.add(Conv2D(64, (3,3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(128, (3,3), activation='relu', padding='same'))

image_model.add(Flatten())
image_model.add(Dense(1024, activation='relu'))
image_model.add(Dropout(0.3))
image_model.add(Dense(1024, activation='relu'))
image_model.add(Dropout(0.3))

image_model.add(RepeatVector(max_length))

visual_input = Input(shape=(256, 256, 3,))
encoded_image = image_model(visual_input)

language_input = Input(shape=(max_length,))
language_model = Embedding(vocab_size, 50, input_length=max_length,    mask_zero=True)(language_input)
language_model = LSTM(128, return_sequences=True)(language_model)
language_model = LSTM(128, return_sequences=True)(language_model)

#Create the decoder
decoder = concatenate([encoded_image, language_model])
decoder = LSTM(512, return_sequences=True)(decoder)
decoder = LSTM(512, return_sequences=False)(decoder)
decoder = Dense(vocab_size, activation='softmax')(decoder)

# Compile the model
model = Model(inputs=[visual_input, language_input], outputs=decoder)
optimizer = RMSprop(lr=0.0001, clipvalue=1.0)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

#Save the model for every 2nd epoch
filepath="org-weights-epoch-{epoch:04d}--loss-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, verbose=1, save_weights_only=True,     period=2)
callbacks_list = [checkpoint]

# test the data generator
generator = data_generator(texts, train_features, 1, 150)
model.fit_generator(generator, steps_per_epoch=50, epochs=5, callbacks=callbacks_list, verbose=1)

Error I receive when training:

ValueError                                
Traceback (most recent call last)
<ipython-input-4-6927891f43ca> in <module>()
  1 # test the data generator
  2 generator = data_generator(texts, train_features, 1, max_sequence)
----> 3 loaded_model.fit_generator(generator, steps_per_epoch=steps, epochs=5, callbacks=callbacks_list, verbose=1)
  4 loaded_model.save(mydrive + '/output/weights.hdf5')

12 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
971           except Exception as e:  # pylint:disable=broad-except
972             if hasattr(e, "ag_error_metadata"):
--> 973               raise e.ag_error_metadata.to_exception(e)
974             else:
975               raise

ValueError: in user code:

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
    return step_function(self, iterator)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
    outputs = model.distribute_strategy.run(run_step, args=(data,))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
    return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
    return self._call_for_each_replica(fn, args, kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
    return fn(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
    outputs = model.train_step(data)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:757 train_step
    self.trainable_variables)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:2737 _minimize
    trainable_variables))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:562 _aggregate_gradients
    filtered_grads_and_vars = _filter_grads(grads_and_vars)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1271 _filter_grads
    ([v.name for _, v in grads_and_vars],))

ValueError: No gradients provided for any variable: ['embedding_1/embeddings:0', 'lstm_1/lstm_cell/kernel:0', 'lstm_1/lstm_cell/recurrent_kernel:0', 'lstm_1/lstm_cell/bias:0', 'conv2d_1/kernel:0', 'conv2d_1/bias:0', 'conv2d_2/kernel:0', 'conv2d_2/bias:0', 'conv2d_3/kernel:0', 'conv2d_3/bias:0', 'conv2d_4/kernel:0', 'conv2d_4/bias:0', 'conv2d_5/kernel:0', 'conv2d_5/bias:0', 'conv2d_6/kernel:0', 'conv2d_6/bias:0', 'conv2d_7/kernel:0', 'conv2d_7/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0', 'dense_2/kernel:0', 'dense_2/bias:0', 'lstm_2/lstm_cell_1/kernel:0', 'lstm_2/lstm_cell_1/recurrent_kernel:0', 'lstm_2/lstm_cell_1/bias:0', 'lstm_3/lstm_cell_2/kernel:0', 'lstm_3/lstm_cell_2/recurrent_kernel:0', 'lstm_3/lstm_cell_2/bias:0', 'lstm_4/lstm_cell_3/kernel:0', 'lstm_4/lstm_cell_3/recurrent_kernel:0', 'lstm_4/lstm_cell_3/bias:0', 'dense_3/kernel:0', 'dense_3/bias:0'].

I’m training it on Google Colab using the TPU. If I train it on Tensorflow 1.x it trains fine but takes 8 hours per epoch with my dataset. Tensorflow 2.x was taking 1 hour per epoch but is now giving this error

EDIT: SOLUTION

I cannot train on TPU, but I am at least able to train on GPU, I can continue the project ! Solution by @silentkinght25 and @silentkinght25 solves it.

“To resolve this u must modify the data generator function as: yield ([array(Ximages), array(XSeq)], array(y)) instead of yield [[array(Ximages), array(XSeq)], array(y)]”

About this issue

Original URL
State: closed
Created 4 years ago
Comments: 20 (5 by maintainers)

Most upvoted comments

Try changing the yield part in data_generator from:

# yield this batch of samples to the model
        yield [[array(Ximages), array(XSeq)], array(y)]

to just:

# yield this batch of samples to the model
        yield [array(Ximages), array(XSeq)], array(y)

Since the documentation expects a tuple (x, y) to be returned from your generator.

+17

Kaelinator on Aug 6, 2020

I had same problem @ScruffySilky but i resolved it. The problem is in data generator function, because model.fit(generator,…) expect a list of tuple or a dict of tuple. To resolve this u must modify the data generator function as: yield ([array(Ximages), array(XSeq)], array(y)) instead of yield [[array(Ximages), array(XSeq)], array(y)]

silentkinght25 on Sep 18, 2020

Is there any planned solution for this bug ?

ScruffySilky on Aug 24, 2020

I had same problem @ScruffySilky but i resolved it. The problem is in data generator function, because model.fit(generator,…) expect a list of tuple or a dict of tuple. To resolve this u must modify the data generator function as: yield ([array(Ximages), array(XSeq)], array(y)) instead of yield [[array(Ximages), array(XSeq)], array(y)]

I hope this resolves ur issue.

silentkinght25 on Sep 18, 2020

Ok I cannot train on TPU, but I am able to train on GPU. Solution by @silentkinght25 and @silentkinght25 seems to solve it.

“To resolve this u must modify the data generator function as: yield ([array(Ximages), array(XSeq)], array(y)) instead of yield [[array(Ximages), array(XSeq)], array(y)]”

ScruffySilky on Sep 21, 2020

Try changing the yield part in data_generator from:
# yield this batch of samples to the model
        yield [[array(Ximages), array(XSeq)], array(y)]
to just:
# yield this batch of samples to the model
        yield [array(Ximages), array(XSeq)], array(y)
Since the documentation expects a tuple (x, y) to be returned from your generator.

Changing that gives this error:

`UnavailableError                          Traceback (most recent call last)
<ipython-input-5-38870834c9d8> in <module>()
      1 # test the data generator
      2 generator = data_generator(texts, train_features, 1, max_sequence)
----> 3 model.fit_generator(generator, steps_per_epoch=steps, epochs=5, callbacks=callbacks_list, verbose=1)
      4 model.save(mydrive + '/output/weights.hdf5')

16 frames
/usr/local/lib/python3.6/dist-packages/six.py in raise_from(value, from_value)

UnavailableError: {{function_node __inference_train_function_28401}} failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"@1596748432.082355881","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":3948,"referenced_errors":[{"created":"@1596748432.082354141","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":394,"grpc_status":14}]}
	 [[{{node IteratorGetNext}}]]`

ScruffySilky on Aug 6, 2020

Having the same issue. Using model.fit with just numpy arrays works (with saved data using numpy.save), but I was trying to use the new saving/loading dataset features, and I’m getting this error message in my model.fit_generator(...).

EDIT: Okay well it was because my generator wasn’t yielding a tuple with (x, y), instead just a single dictionary with all my named keys. Now I’m yielding { 'input1': ..., 'input2': ...}, label according to the documentation

Kaelinator on Aug 6, 2020