tensorflow: gRPC: terminate called after throwing an instance of 'std::bad_alloc'
System information
- OS Platform and Distribution: CentOS Linux release 7.4.1708 (Core)
- TensorFlow version: TensorFlow 1.12.0, built from source
Describe the problem: Running a model-parallel implementation results in the following error:
what(): std::bad_alloc
Aborted
The code runs fine on a single node, i.e. with a single worker, but distributing the graph across 2 nodes/2 workers results in the above error. I suspect it is related to gRPC.
Source code / logs
On the chief worker:
import tensorflow as tf
from time import time


def conv_op(inputs, kernel_, name):
    with tf.variable_scope(name):
        conv = tf.nn.conv3d(inputs, kernel_, [1, 1, 1, 1, 1],
                            dilations=[1, 1, 1, 1, 1], padding='SAME')
        return conv


def inference_withconv(inputs, kernel_):
    # Pin the first convolution to worker 0, the second to worker 1, and the
    # final add back to worker 0, so the intermediate tensors cross the node
    # boundary via gRPC.
    with tf.device('/job:worker/task:{}'.format(0)):
        conv1_woc = conv_op(inputs, kernel_, 'conv1_woc')
    with tf.device('/job:worker/task:{}'.format(1)):
        conv2_woc = conv_op(conv1_woc, kernel_, 'conv2_woc')
    with tf.device('/job:worker/task:{}'.format(0)):
        convadd_results = tf.math.add(conv1_woc, conv2_woc)
    return convadd_results


def run_benchmark():
    image_shape = (1, 1024, 1024, 1024, 1)
    kernel_1_shape = (5, 5, 5, 1, 1)
    dummy_image = tf.truncated_normal(
        image_shape,
        dtype=tf.float32,
        mean=0,
        stddev=1,
        name='3Dconv_image_1')
    dummy_kernel_1 = tf.truncated_normal(
        kernel_1_shape,
        dtype=tf.float32,
        mean=0,
        stddev=1,
        name='3Dconv_kernel_1')

    image_init = tf.placeholder(tf.float32, shape=image_shape, name='input_1')
    res_ = inference_withconv(image_init, dummy_kernel_1)

    # Define the cluster spec
    cluster_spec = tf.train.ClusterSpec(
        {'worker': [('<ip_address_1>' + ":" + '2222'),
                    ('<ip_address_2>' + ":" + '2222')]})
    task_id = 0  # Chief worker
    server_config = tf.ConfigProto(inter_op_parallelism_threads=2,
                                   intra_op_parallelism_threads=20)
    server = tf.train.Server(cluster_spec, job_name='worker',
                             task_index=task_id, config=server_config)

    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=2,
        intra_op_parallelism_threads=20)
    with tf.Session(server.target, config=session_config) as sess:
        sess.run(tf.initialize_all_variables())
        image_, kernel_1 = sess.run([dummy_image, dummy_kernel_1])
        infer_results_ = sess.run(res_, feed_dict={'input_1:0': image_,
                                                   '3Dconv_kernel_1:0': kernel_1})


if __name__ == '__main__':
    run_benchmark()
On the non-chief worker (on another node with a different IP address):
import tensorflow as tf

# Define the cluster spec (must match the chief worker's spec)
cluster_spec = tf.train.ClusterSpec(
    {'worker': [('<ip_address_1>' + ":" + '2222'),
                ('<ip_address_2>' + ":" + '2222')]})
task_id = 1  # Non-chief worker
server_config = tf.ConfigProto(inter_op_parallelism_threads=2,
                               intra_op_parallelism_threads=20)
server = tf.train.Server(cluster_spec, job_name='worker',
                         task_index=task_id, config=server_config)

# Block here and serve graph execution requests from the chief
server.join()
About this issue
- Original URL
- State: closed
- Created 5 years ago
- Comments: 15 (5 by maintainers)
#6 0x00007fffed2490c1 in operator new (sz=18446744069414584364)
The call stack shows a request to allocate a memory buffer of 18446744069414584364 bytes (roughly 1.8 × 10^19 bytes, just under 2^64). Of course it failed.
I think there is a bug in tensorflow::grpc::EncodeTensorToByteBuffer(bool, tensorflow::Tensor const&, grpc::ByteBuffer*) when it calculates the required buffer size.
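For context, a back-of-the-envelope check (my own arithmetic, not taken from the trace): the intermediate conv1_woc tensor that has to travel from task 0 to task 1 has shape 1×1024×1024×1024×1 in float32, which is exactly 4 GiB, and the failing allocation size interpreted as a signed 64-bit integer is -(4 GiB - 44). That pattern is consistent with a 32-bit overflow somewhere in the size computation.

shape = (1, 1024, 1024, 1024, 1)   # shape of conv1_woc, sent from task 0 to task 1
n_elements = 1
for dim in shape:
    n_elements *= dim               # 1_073_741_824 elements
n_bytes = n_elements * 4            # float32 -> 4_294_967_296 bytes, i.e. exactly 2**32 (4 GiB)

requested = 18446744069414584364    # allocation size from the stack trace
print(n_bytes)                      # 4294967296
print(requested - 2**64)            # -4294967252, i.e. -(n_bytes - 44)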
@asterisk37n Thanks for your comment. Unfortunately, we are hitting this issue even with batch_size 1, because the input itself is really big.
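In case it helps, here is a minimal, untested sketch of a possible graph-level workaround, assuming the failure is triggered by the single ~4 GiB tensor crossing the worker boundary: split the intermediate volume on the producing worker so that each cross-worker Send/Recv carries a much smaller piece, then reassemble it on the consuming worker. The helper name transfer_in_chunks, the split count of 8, and the reuse of conv_op, image_init and dummy_kernel_1 from the chief-worker script above are my own choices, not anything from this issue or from TensorFlow itself.

def transfer_in_chunks(tensor, num_splits=8, axis=1):
    # Split on the device that produced `tensor`; each piece becomes a
    # separate cross-worker Send/Recv edge (~512 MiB here instead of 4 GiB).
    return tf.split(tensor, num_splits, axis=axis)

with tf.device('/job:worker/task:0'):
    conv1_woc = conv_op(image_init, dummy_kernel_1, 'conv1_woc')
    chunks = transfer_in_chunks(conv1_woc)

with tf.device('/job:worker/task:1'):
    # Concatenating on task 1 pulls each chunk across individually; the second
    # convolution then sees the fully reassembled volume.
    conv1_remote = tf.concat(chunks, axis=1)
    conv2_woc = conv_op(conv1_remote, dummy_kernel_1, 'conv2_woc')

The conv2_woc result that the original graph adds back on task 0 would need the same chunked treatment on the return path.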