tensorflow: ModelCheckpoint callback error
Please make sure that this is a bug. As per our GitHub Policy, we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:bug_template
System information
- Have I written custom code (as opposed to using a stock example script provided in TensorFlow): No
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): windows 10
- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device:
- TensorFlow installed from (source or binary): Anaconda
- TensorFlow version (use command below): 1.13.1
- Python version: 3.5
- Bazel version (if compiling from source):
- GCC/Compiler version (if compiling from source):
- CUDA/cuDNN version: 7.3.4
- GPU model and memory: GTX 2080 TI
Describe the current behavior Using tf.keras, when I fit the model with a train dataset and a validation dataset created via tf.data, and use ModelCheckpoint with its default settings, an error shows up. The error appears to happen because the callback tries to save the model under the same name twice per epoch: once at the end of the training epoch and once at the end of the validation pass (when a validation dataset is used). This should not happen, because the source code already sets overwrite=True, yet the error still occurs.
The error information:
Traceback (most recent call last):
File "C:/Users/nones/iCloudDrive/Courses/DL/Ass8-CNN-MNIST/eager_tf_keras.py", line 180, in <module>
fit_model_and_evaluate(model)
File "C:/Users/nones/iCloudDrive/Courses/DL/Ass8-CNN-MNIST/eager_tf_keras.py", line 172, in fit_model_and_evaluate
validation_steps=40, verbose=verbose)
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\engine\training.py", line 851, in fit
initial_epoch=initial_epoch)
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\engine\training_generator.py", line 232, in model_iteration
callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\callbacks.py", line 251, in on_epoch_end
callback.on_epoch_end(epoch, logs)
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\callbacks.py", line 624, in on_epoch_end
self.model.save(filepath, overwrite=True)
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\engine\network.py", line 1334, in save
save_model(self, filepath, overwrite, include_optimizer)
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\keras\engine\saving.py", line 152, in save_model
name, val.shape, dtype=val.dtype)
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\h5py\_hl\group.py", line 119, in create_dataset
self[name] = dset
File "C:\Users\nones\Anaconda3\envs\tensorflow\lib\site-packages\h5py\_hl\group.py", line 287, in __setitem__
h5o.link(obj.id, self.id, name, lcpl=lcpl, lapl=self._lapl)
File "h5py\_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py\_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py\h5o.pyx", line 202, in h5py.h5o.link
RuntimeError: Unable to create link (name already exists)
Describe the expected behavior At a minimum, the error should not occur when saving a model under the same name twice. It would be even better if we could explicitly control when a model is saved.
Code to reproduce the issue Provide a reproducible test case that is the bare minimum necessary to generate the problem.
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
# Run TF 1.x in eager mode so tensors evaluate immediately (no Session).
tf.enable_eager_execution()
# Mnist dataset
IMAGE_ROW, IMAGE_COLS = 28, 28  # input image height/width in pixels
NUM_CLASSES = 10  # ten output classes
BATCH_SIZE = 32  # mini-batch size used by all tf.data pipelines
temp_dir = './temp'  # scratch directory for debug plots written by test_ds
def get_input_datasets(use_bfloat16=False):
    """Build the train/validation/test tf.data pipelines for MNIST.

    Args:
        use_bfloat16: Boolean; when True, images are cast to bfloat16 in the
            pipeline (otherwise float32).

    Returns:
        Tuple of (train dataset, validation dataset, test dataset,
        input shape, class names).
    """
    cast_dtype = tf.bfloat16 if use_bfloat16 else tf.float32
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
    # the data, split between train and test sets
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    # Place the single channel axis where the backend expects it.
    if tf.keras.backend.image_data_format() == 'channels_first':
        input_shape = (1, IMAGE_ROW, IMAGE_COLS)
    else:
        input_shape = (IMAGE_ROW, IMAGE_COLS, 1)
    x_train = x_train.reshape((x_train.shape[0],) + input_shape)
    x_test = x_test.reshape((x_test.shape[0],) + input_shape)
    # Scale pixel values into [0, 1] as float32.
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    # One-hot encode the integer labels.
    y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES)
    y_test = tf.keras.utils.to_categorical(y_test, NUM_CLASSES)
    # Carve 10% of the training data off as a validation split.
    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1)
    # Wrap each split in a tf.data.Dataset and apply the common pipeline.
    splits = ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
    ds_train, ds_valid, ds_test = (
        preprocess_dataset(tf.data.Dataset.from_tensor_slices(pair), cast_dtype)
        for pair in splits)
    return ds_train, ds_valid, ds_test, input_shape, class_names
def preprocess_dataset(dataset, cast_dtype):
    """Cast, shuffle, repeat, batch and prefetch an (image, label) dataset."""
    def _cast(image, label):
        # Only the image is cast; labels keep their original dtype.
        return tf.cast(image, cast_dtype), label

    return (dataset.map(_cast)
            .shuffle(buffer_size=6000)
            .repeat()
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=1000))
def plot_image(image):
    """Display a single image in a new figure with a colorbar and no grid.

    NOTE(review): the figure is created but never shown/saved here; the
    caller is expected to call plt.show() or plt.savefig().
    """
    plt.figure()
    plt.imshow(image)
    plt.colorbar()
    plt.grid(False)
def test_ds(dataset, fname):
    """Save a 2x2 grid of samples from the first batch of *dataset* as a PNG."""
    os.makedirs(temp_dir, exist_ok=True)
    out_path = os.path.join(temp_dir, "%s.png" % fname)
    plt.figure()
    # take(1) yields a single (images, labels) batch in eager mode.
    for images, labels in dataset.take(1):
        for cell in range(4):
            plt.subplot(2, 2, cell + 1)
            plt.imshow(images[cell].numpy().reshape(28, 28))
            plt.xlabel(labels[cell].numpy())
            plt.grid(False)
    plt.savefig(out_path, bbox_inches="tight")
    plt.clf()
def get_optimizer(optimizer_choice='SGD', learning_rate=0.01, momentum=0.9):
    """Return a configured Keras optimizer for the given choice.

    Args:
        optimizer_choice: 'SGD' or 'Adam'; any other value falls back to SGD.
        learning_rate: Learning rate passed to the optimizer.
        momentum: Momentum term (used by SGD only).

    Returns:
        A tf.keras.optimizers.Optimizer instance.
    """
    # Dispatch through factories so only the requested optimizer is built
    # (the original eagerly constructed both on every call).
    factories = {
        'SGD': lambda: tf.keras.optimizers.SGD(lr=learning_rate, momentum=momentum),
        'Adam': lambda: tf.keras.optimizers.Adam(lr=learning_rate),
    }
    # BUG FIX: the original `.get(optimizer_choice, 'SGD')` returned the
    # *string* 'SGD' for unknown choices, silently discarding the requested
    # learning_rate/momentum. Fall back to a real SGD instance instead.
    return factories.get(optimizer_choice, factories['SGD'])()
def create_model(input_shapes):
    """Build the baseline MNIST CNN as a Sequential model."""
    layers = tf.keras.layers
    return tf.keras.Sequential([
        layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu',
                      input_shape=input_shapes),
        layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(NUM_CLASSES, activation='softmax'),
    ])
def create_model_functional(input_shapes, kernel_size=(3, 3), dropout_rate=0, l2_regularizer=0.1):
    """Build the MNIST CNN with the functional API.

    Same architecture as create_model, but with tunable kernel size,
    dropout rate, and L2 regularization on the output layer.
    """
    inputs = tf.keras.Input(shape=input_shapes)
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=kernel_size, activation='relu')(inputs)
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=kernel_size, activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    outputs = tf.keras.layers.Dense(
        NUM_CLASSES, activation='softmax',
        kernel_regularizer=tf.keras.regularizers.l2(l=l2_regularizer))(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)
def fit_model_and_evaluate(model, optimizer_choice='SGD', learning_rate=0.01, verbose=1,
                           train_ds=None, valid_ds=None, test_ds=None):
    """Compile *model*, fit with TensorBoard + checkpointing, then evaluate.

    Args:
        model: An uncompiled tf.keras model.
        optimizer_choice: 'SGD' or 'Adam' (see get_optimizer).
        learning_rate: Learning rate passed to the optimizer.
        verbose: Verbosity level forwarded to model.fit.
        train_ds, valid_ds, test_ds: Optional tf.data datasets. When omitted,
            each falls back to the module-level globals ds_train / ds_valid /
            ds_test set up in the __main__ block (backward compatible with
            the original global-reading behavior).
    """
    # BUG FIX (fragility): the original read ds_train/ds_valid/ds_test
    # globals unconditionally and crashed with NameError when imported as a
    # module. Accept them as parameters, defaulting to the globals.
    if train_ds is None:
        train_ds = ds_train
    if valid_ds is None:
        valid_ds = ds_valid
    if test_ds is None:
        test_ds = ds_test
    optimizer = get_optimizer(optimizer_choice=optimizer_choice, learning_rate=learning_rate)
    os.makedirs('graph', exist_ok=True)
    os.makedirs('checkpoint', exist_ok=True)
    # The checkpoint filename embeds epoch and validation metrics, so a
    # validation pass must run every epoch for {val_loss}/{val_acc} to exist.
    file_path = 'checkpoint/model.{epoch:02d}-{val_loss:.4f}-{val_acc:.4f}.hdf5'
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(file_path)
    board = tf.keras.callbacks.TensorBoard(log_dir='./graph', histogram_freq=0,
                                           write_graph=True, write_images=True)
    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer=optimizer,
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    model.fit(x=train_ds, validation_data=valid_ds, epochs=20, steps_per_epoch=468,
              callbacks=[board, model_checkpoint],
              validation_steps=40, verbose=verbose)
    score = model.evaluate(test_ds, steps=10, verbose=0)
    # score order follows compile(): [loss, accuracy, precision, recall].
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
if __name__ == '__main__':
    # Build the datasets once; these names become module-level globals that
    # fit_model_and_evaluate reads directly.
    ds_train, ds_valid, ds_test, input_shapes, class_names = get_input_datasets()
    model = create_model_functional(input_shapes)
    fit_model_and_evaluate(model)
    # Optional: visually sanity-check the pipelines before training.
    # test_ds(ds_train, 'train')
    # test_ds(ds_test, 'test')
Other info / logs Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
About this issue
- Original URL
- State: closed
- Created 5 years ago
- Comments: 17 (4 by maintainers)
After setting `save_weights_only=True` the error went away, but this is not the desired behavior.
@atomextranova can you try disabling eager execution? I am facing the same issue and it only happens when eager execution is on.
same bug with tensorflow 2.0