tensorflow: KeyError when taking derivative to input of Conv2D with tf.function only
System information
- Have I written custom code (as opposed to using a stock example script provided in TensorFlow): yes, see below
- OS Platform and distribution: Fedora
- TensorFlow installed from: binary
- TensorFlow version: 2.3.1
- Python version: 3.8.6
- CUDA/cuDNN version: 10.0
- GPU model and memory: GeForce GTX 1080 Ti
Describe the current behavior
Crashes with a stack trace: KeyError: 'strides'
code
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow_datasets as tfds

data, data_info = tfds.load("mnist", split="train", as_supervised=True, with_info=True)
data = data.map(lambda x, _: tf.cast(x, tf.float32) / 255.)
data_shape = data_info.features["image"].shape
dimension = tf.reduce_prod(data_shape).numpy()

latent_distribution = tfp.distributions.MultivariateNormalDiag(
    loc=[0.] * dimension,
    scale_diag=[1.] * dimension,
)

input = tf.keras.Input(shape=data_shape)
state = input
state = tf.keras.layers.Conv2D(64, kernel_size=4, strides=2, use_bias=False)(state)
state = tf.keras.layers.Conv2D(128, kernel_size=4, strides=2, padding="same", use_bias=False)(state)
state = tf.keras.layers.Conv2D(256, kernel_size=4, strides=2, padding="same", use_bias=False)(state)
state = tf.keras.layers.Activation(tf.nn.softplus)(state)  # Essential to trigger bug
state = tf.keras.layers.Conv2D(1, kernel_size=4, strides=1, use_bias=False)(state)
state = tf.keras.layers.Flatten()(state)
f = tf.keras.Model(inputs=input, outputs=state)

optimizer = tf.keras.optimizers.Adam()

@tf.function  # Essential to trigger bug
def train_step(data):
    with tf.GradientTape() as tape:
        tape.watch(f.trainable_variables)
        with tf.GradientTape() as c_tape:
            c_tape.watch(data)
            with tf.GradientTape() as a_tape:
                a_tape.watch(data)
                b = f(data)
            a = a_tape.gradient(b, data)
            a_flat = tf.reshape(a, (-1, dimension))
        c = c_tape.batch_jacobian(a, data)
        c = tf.reshape(c, (-1, dimension, dimension))
        d = latent_distribution.log_prob(a_flat)
        _, e = tf.linalg.slogdet(c)
        ff = tf.reduce_mean(d + e)
        loss = -ff
    gradients = tape.gradient(loss, f.trainable_variables)
    optimizer.apply_gradients(zip(gradients, f.trainable_variables))

train_data = data.batch(1)
for batch in train_data:
    train_step(batch)
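Per the title and the "# Essential to trigger bug" comment on the decorator, the KeyError only appears when the step is wrapped in @tf.function; running the same step eagerly does not trigger it. A hedged stopgap sketch along those lines (assuming tf.config.run_functions_eagerly, available in recent TF 2.x, with tf.config.experimental_run_functions_eagerly as the older spelling; not verified against this exact setup):

# Stopgap sketch: execute tf.function-decorated code eagerly while the bug is open.
# Assumes tf.config.run_functions_eagerly exists in this TF version; older releases
# expose tf.config.experimental_run_functions_eagerly instead. Untested for this report.
tf.config.run_functions_eagerly(True)

train_data = data.batch(1)
for batch in train_data:
    train_step(batch)  # traces nothing now; slower, but should sidestep the KeyError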
log
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-11-aa3781dddb11> in <module>
1 for batch in train_data:
----> 2 train_step(batch)
.../python3.8/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
.../python3.8/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
821 # This is the first call of __call__, so we have to initialize.
822 initializers = []
--> 823 self._initialize(args, kwds, add_initializers_to=initializers)
824 finally:
825 # At this point we know that the initialization is complete (or less
.../python3.8/site-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
694 self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
695 self._concrete_stateful_fn = (
--> 696 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
697 *args, **kwds))
698
.../python3.8/site-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2853 args, kwargs = None, None
2854 with self._lock:
-> 2855 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2856 return graph_function
2857
.../python3.8/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
3211
3212 self._function_cache.missed.add(call_context_key)
-> 3213 graph_function = self._create_graph_function(args, kwargs)
3214 self._function_cache.primary[cache_key] = graph_function
3215 return graph_function, args, kwargs
.../python3.8/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
3063 arg_names = base_arg_names + missing_arg_names
3064 graph_function = ConcreteFunction(
-> 3065 func_graph_module.func_graph_from_py_func(
3066 self._name,
3067 self._python_function,
.../python3.8/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
984 _, original_func = tf_decorator.unwrap(python_func)
985
--> 986 func_outputs = python_func(*func_args, **func_kwargs)
987
988 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
.../python3.8/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
598 # __wrapped__ allows AutoGraph to swap in a converted function. We give
599 # the function a weak reference to itself to avoid a reference cycle.
--> 600 return weak_wrapped_fn().__wrapped__(*args, **kwds)
601 weak_wrapped_fn = weakref.ref(wrapped_fn)
602
.../python3.8/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
971 except Exception as e: # pylint:disable=broad-except
972 if hasattr(e, "ag_error_metadata"):
--> 973 raise e.ag_error_metadata.to_exception(e)
974 else:
975 raise
KeyError: in user code:
<ipython-input-9-5c54f4064666>:24 train_step *
gradients = tape.gradient(loss, f.trainable_variables)
.../python3.8/site-packages/tensorflow/python/eager/backprop.py:1067 gradient **
flat_grad = imperative_grad.imperative_grad(
.../python3.8/site-packages/tensorflow/python/eager/imperative_grad.py:71 imperative_grad
return pywrap_tfe.TFE_Py_TapeGradient(
.../python3.8/site-packages/tensorflow/python/eager/backprop.py:162 _gradient_function
return grad_fn(mock_op, *out_grads)
.../python3.8/site-packages/tensorflow/python/ops/nn_grad.py:50 _Conv2DBackpropInputGrad
strides=op.get_attr("strides"),
.../python3.8/site-packages/tensorflow/python/eager/backprop.py:121 get_attr
raise KeyError(attr)
KeyError: 'strides'
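For context, the last frames show the gradient registered for Conv2DBackpropInput asking the op for its "strides" attribute, and the lightweight op object that backprop.py hands to gradient functions raising KeyError for an attribute it does not carry. A rough paraphrase of that failure mode, inferred only from the frames above (the actual implementations live in tensorflow/python/ops/nn_grad.py and tensorflow/python/eager/backprop.py), not a copy of the TensorFlow source:

# Paraphrase of the failing lookup, inferred from the traceback above; this is an
# illustration only, not the TensorFlow implementation.
class MockOp:
    """Stand-in for the op object the tape passes to gradient functions."""

    def __init__(self, attrs):
        self._attrs = attrs

    def get_attr(self, name):
        if name not in self._attrs:
            raise KeyError(name)  # -> KeyError: 'strides'
        return self._attrs[name]

# The second-order Conv2D gradient needs "strides", which apparently was never
# recorded for the Conv2DBackpropInput op traced inside @tf.function.
try:
    MockOp(attrs={}).get_attr("strides")
except KeyError as err:
    print("KeyError:", err)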
And the log on a different setup:
- OS Platform and distribution: Arch
- TensorFlow installed from: source
- TensorFlow version: 2.5.0, 2cad9d750cadd825910b61351a731eb0e8031608
- Python version: 3.8.6-1
- CUDA/cuDNN version: 11.1.1-1 / 8.0.5.39-1
- GPU model and memory: GeForce GTX 960M
log
2020-12-01 22:58:30.236554: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2020-12-01 22:58:31.813385: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2020-12-01 22:58:31.814170: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2020-12-01 22:58:33.296576: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:33.296901: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1727] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 960M computeCapability: 5.0
coreClock: 1.0975GHz coreCount: 5 deviceMemorySize: 1.96GiB deviceMemoryBandwidth: 74.65GiB/s
2020-12-01 22:58:33.296923: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2020-12-01 22:58:33.321243: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2020-12-01 22:58:33.321345: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2020-12-01 22:58:33.335247: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2020-12-01 22:58:33.339668: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2020-12-01 22:58:33.359017: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.11
2020-12-01 22:58:33.365850: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11
2020-12-01 22:58:33.367723: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2020-12-01 22:58:33.367855: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:33.368285: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:33.368606: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1869] Adding visible gpu devices: 0
2020-12-01 22:58:33.369722: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2020-12-01 22:58:33.369860: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:33.370218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1727] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 960M computeCapability: 5.0
coreClock: 1.0975GHz coreCount: 5 deviceMemorySize: 1.96GiB deviceMemoryBandwidth: 74.65GiB/s
2020-12-01 22:58:33.370243: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2020-12-01 22:58:33.370266: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2020-12-01 22:58:33.370284: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2020-12-01 22:58:33.370301: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2020-12-01 22:58:33.370316: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2020-12-01 22:58:33.370330: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.11
2020-12-01 22:58:33.370344: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11
2020-12-01 22:58:33.370362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2020-12-01 22:58:33.370428: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:33.370793: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:33.371107: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1869] Adding visible gpu devices: 0
2020-12-01 22:58:33.371882: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2020-12-01 22:58:34.128469: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-12-01 22:58:34.128514: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1273] 0
2020-12-01 22:58:34.128520: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1286] 0: N
2020-12-01 22:58:34.129295: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:34.129675: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:34.130039: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-01 22:58:34.130359: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1413] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 1635 MB memory) -> physical GPU (device: 0, name: GeForce GTX 960M, pci bus id: 0000:01:00.0, compute capability: 5.0)
2020-12-01 22:58:34.278962: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:127] None of the MLIR optimization passes are enabled (registered 2)
2020-12-01 22:58:34.293661: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2601325000 Hz
2020-12-01 22:58:35.392106: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2020-12-01 22:58:36.093824: I tensorflow/stream_executor/cuda/cuda_dnn.cc:334] Loaded cuDNN version 8005
2020-12-01 22:58:40.021160: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2020-12-01 22:58:40.920336: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2020-12-01 22:58:42.314731: I tensorflow/core/util/cuda_solvers.cc:180] Creating CudaSolver handles for stream 0x55ae4f348220
2020-12-01 22:58:42.315311: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.11
2020-12-01 22:58:43.103484: F tensorflow/core/util/cuda_solvers.cc:115] Check failed: cusolverDnCreate(&cusolver_dn_handle) == CUSOLVER_STATUS_SUCCESS Failed to create cuSolverDN instance.
[1] 2834 abort (core dumped) python bug.py
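Note that the failure mode on this second machine is different: the process aborts while creating a cuSOLVER handle (needed by tf.linalg.slogdet) instead of raising the KeyError. One unverified guess for a 2 GiB card is that TensorFlow has already reserved most of the device memory by that point; a hedged mitigation sketch, assuming the abort is memory related:

# Assumption: the cusolverDnCreate failure is caused by TensorFlow pre-allocating
# nearly all GPU memory. Enabling memory growth is a common mitigation, but it is
# untested for this report. Must run before any op initializes the GPU.
import tensorflow as tf

for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)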
About this issue
- Original URL
- State: closed
- Created 4 years ago
- Comments: 19 (5 by maintainers)
Hi @vandenheuvel, yes sorry about that. I got this issue confused with another one 😃 Added the bug label back.