tensorflow: gRPC: terminate called after throwing an instance of 'std::bad_alloc'

System information
  • OS Platform and Distribution: CentOS Linux release 7.4.1708 (Core)
  • TensorFlow version: TensorFlow 1.12.0, built from source

Describe the problem: Running a model-parallel implementation results in the following error:

  what():  std::bad_alloc
  Aborted

The code runs fine on a single node (i.e. with a single worker), but distributing across 2 nodes / 2 workers results in the above error. I suspect it is related to gRPC.
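
For context, a back-of-the-envelope size check (plain Python, using the shapes from the reproduction script below) shows why the gRPC transfer is the prime suspect: the conv1_woc activation that task 0 has to ship to task 1 is a 1x1024x1024x1024x1 float32 tensor, i.e. exactly 2**32 bytes (4 GiB), more than a signed 32-bit length can represent.

# Size of the tensor that crosses the worker boundary
# (shapes taken from the script below).
shape = (1, 1024, 1024, 1024, 1)   # conv1_woc output, same shape as the input
n_bytes = 4                        # float32
for d in shape:
    n_bytes *= d
print(n_bytes)              # 4294967296 = 2**32 bytes = 4 GiB
print(n_bytes > 2**31 - 1)  # True: exceeds INT32_MAX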

Source code / logs

On the chief worker:

import tensorflow as tf
from time import time

def conv_op(inputs, kernel_, name):
    with tf.variable_scope(name) as scope:
        conv = tf.nn.conv3d(inputs, kernel_, [1, 1, 1, 1, 1], dilations=[1, 1, 1, 1, 1], padding='SAME')
    return conv

def inference_withconv(inputs, kernel_):
    with tf.device('/job:worker/task:{}'.format(0)):
        conv1_woc = conv_op(inputs, kernel_, 'conv1_woc')
    with tf.device('/job:worker/task:{}'.format(1)):
        conv2_woc = conv_op(conv1_woc, kernel_, 'conv2_woc')
    with tf.device('/job:worker/task:{}'.format(0)):
        convadd_results = tf.math.add(conv1_woc, conv2_woc)
        return convadd_results

def run_benchmark():
    image_shape = (1,1024,1024,1024,1)
    kernel_1_shape = (5,5,5,1,1)
    bias_shape = (1)

    dummy_image = tf.truncated_normal(
       image_shape,
       dtype=tf.float32,
       mean=0,
       stddev=1,
       name='3Dconv_image_1')

    dummy_kernel_1 = tf.truncated_normal(
       kernel_1_shape,
       dtype=tf.float32,
       mean=0,
       stddev=1,
       name='3Dconv_kernel_1')

    image_init = tf.placeholder(tf.float32, shape=image_shape, name='input_1')

    res_ = inference_withconv(image_init, dummy_kernel_1)

    # Define the cluster spec
    cluster_spec = tf.train.ClusterSpec({'worker' : [('<ip_address_1>' + ":" + '2222'), ('<ip_address_2>' + ":" + '2222')]})

    task_id=0 # Chief worker
    server_config = tf.ConfigProto(inter_op_parallelism_threads=2, intra_op_parallelism_threads=20)
    server = tf.train.Server(cluster_spec, job_name='worker', task_index=task_id, config=server_config)

    session_config = tf.ConfigProto(
      inter_op_parallelism_threads=2,
      intra_op_parallelism_threads=20)

    with tf.Session(server.target, config=session_config) as sess:
        sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated
        image_, kernel_1 = sess.run([dummy_image, dummy_kernel_1])
        infer_results_ = sess.run(res_, feed_dict={'input_1:0': image_, '3Dconv_kernel_1:0': kernel_1})

if __name__ == '__main__':
    run_benchmark()

On the non-chief worker (on another node with a different IP address):

import tensorflow as tf

# Define the cluster spec
cluster_spec = tf.train.ClusterSpec({'worker' : [('<ip_address_1>' + ":" + '2222'), ('<ip_address_2>' + ":" + '2222')]})

task_id=1 # Non-chief worker
server_config = tf.ConfigProto(inter_op_parallelism_threads=2, intra_op_parallelism_threads=20)
server = tf.train.Server(cluster_spec, job_name='worker', task_index=task_id, config=server_config)

server.join()


About this issue

  • Original URL
  • State: closed
  • Created 5 years ago
  • Comments: 15 (5 by maintainers)

Most upvoted comments

  #6 0x00007fffed2490c1 in operator new (sz=18446744069414584364)

The call stack shows the process requesting a memory buffer of 18446744069414584364 bytes (roughly 18 exabytes), so of course the allocation failed. The value is just below 2**64, which is consistent with a negative or overflowed size being interpreted as an unsigned 64-bit integer.

I think there is a bug in tensorflow::grpc::EncodeTensorToByteBuffer(bool, tensorflow::Tensor const&, grpc::ByteBuffer*) when it calculates the required buffer size.
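
A quick arithmetic check supports that (plain Python; the numbers come from the backtrace and the tensor shape in the reproduction script, but the "header minus payload" reading is my guess, not taken from the TensorFlow source): read as a signed 64-bit integer, the requested size is a small constant minus exactly the 4 GiB payload, which looks like a size calculation that went negative.

tensor_bytes = 4 * 1024**3            # conv1_woc payload: 2**32 bytes (4 GiB)
observed_sz  = 18446744069414584364   # operator new(sz=...) from the backtrace

signed_sz = observed_sz - 2**64       # reinterpret as a signed 64-bit value
print(signed_sz)                      # -4294967252
print(signed_sz + tensor_bytes)       # 44, i.e. the request is (~44 bytes) - (4 GiB)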

@asterisk37n Thanks for your comment. Unfortunately, we hit this issue even with batch_size 1, because the input itself is very large.
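
One way to stay under the per-message size while the underlying bug is open is to slice the volume along one spatial axis before it crosses the worker boundary, so that each tensor sent over gRPC stays well below 2 GiB. A minimal sketch, reusing conv_op from the script above; num_splits and the split axis are arbitrary choices, and it deliberately ignores the overlap a 'SAME'-padded 5x5x5 convolution would need at the chunk seams:

def inference_withconv_split(inputs, kernel_, num_splits=4):
    # Same computation as inference_withconv above, but the volume is sliced
    # along axis 1 so each tensor exchanged between task 0 and task 1 is
    # ~4 GiB / num_splits instead of a single 4 GiB gRPC message.
    # NOTE: results differ from the unsplit version near the cut planes,
    # since no halo exchange is done here.
    with tf.device('/job:worker/task:0'):
        chunks = tf.split(inputs, num_splits, axis=1)
    outputs = []
    for i, chunk in enumerate(chunks):
        with tf.device('/job:worker/task:0'):
            conv1 = conv_op(chunk, kernel_, 'conv1_woc_{}'.format(i))
        with tf.device('/job:worker/task:1'):
            conv2 = conv_op(conv1, kernel_, 'conv2_woc_{}'.format(i))
        with tf.device('/job:worker/task:0'):
            outputs.append(tf.math.add(conv1, conv2))
    with tf.device('/job:worker/task:0'):
        return tf.concat(outputs, axis=1)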