tensorflow: tf.nn.separable_conv2d is slower than conv2d on GPU
System information
- Have I written custom code (as opposed to using a stock example script provided in TensorFlow): Yes
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Ubuntu 16.04
- TensorFlow installed from (source or binary): binary
- TensorFlow version (use command below): TF 1.3
- Python version: 3.6
- Bazel version (if compiling from source):
- CUDA/cuDNN version: CUDA8.0 /cuDNN6
- GPU model and memory: GTX1080ti 11G
Describe the problem
In theory, separable_conv2d should be more efficient than conv2d, but when I test a simple model on CIFAR-10, the results show that tf.nn.separable_conv2d runs slower on GPU, while it is indeed faster on CPU.
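For context, here is a back-of-the-envelope multiply-accumulate count for the second conv layer of the model below (k = 3, 16 input and 16 output channels, on the 16x16 feature map after the first pool), which is why the separable version should win on paper. This is only a rough sketch that ignores Winograd/FFT algorithms and kernel-launch overhead:

# Rough MAC count per image, stride 1, 'SAME' padding (illustrative only)
h = w = 16          # feature map size after the first max-pool
k = 3               # spatial kernel size
c_in = c_out = 16   # channels

standard  = h * w * k * k * c_in * c_out   # tf.nn.conv2d
depthwise = h * w * k * k * c_in           # depthwise pass
pointwise = h * w * c_in * c_out           # 1x1 pointwise pass
separable = depthwise + pointwise          # tf.nn.separable_conv2d

print(standard, separable, standard / separable)  # 589824 102400 -> ~5.76x fewer MACs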
Here are my test results on GPU:
training time for normal_conv after 2000 step: 8.18395892999979 sec
time for normal_conv after one forward step: 0.003980965999289765 sec
training time for separable_conv after 2000 step: 9.158266903999902 sec
time for separable_conv after one forward step: 0.0036441169995669043 sec
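(As an aside: the single-forward-pass numbers above time one sess.run call, which includes feed_dict and kernel-launch overhead. A sketch of a slightly more robust measurement, with warm-up and averaging, reusing sess, the prediction ops, and feed_dict from the script below:)

def time_op(sess, op, feed_dict, n_warmup=10, n_runs=100):
    # Warm up first so one-off autotuning/allocation is excluded from the timing
    for _ in range(n_warmup):
        sess.run(op, feed_dict=feed_dict)
    start = timeit.default_timer()
    for _ in range(n_runs):
        sess.run(op, feed_dict=feed_dict)
    return (timeit.default_timer() - start) / n_runs

# e.g. print(time_op(sess, pred1, feed_dict), time_op(sess, pred2, feed_dict))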
Source code / logs
Below is a fully self-contained example. I first define a model with two conv2d layers, then a second model with one conv2d followed by one separable_conv2d. Both models use the same number of output channels per conv layer (kernel = 16 in the code) and identical FC layers.
import tensorflow as tf
import timeit
import numpy as np
from tensorflow.contrib.keras.python.keras.datasets.cifar10 import load_data

(x_train, y_train), (x_val, y_val) = load_data()

learning_rate = 0.001
num_steps = 1000
n_classes = 10
batch_size = 32

def reformat(labels):
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    labels = (np.arange(n_classes) == labels[:, None]).astype(np.float32)
    return labels.reshape(labels.shape[0], 10)

train_labels = reformat(y_train)

tf.reset_default_graph()
x = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.float32, [None, 10])
weights1 = {}
weights2 = {}
dtype = tf.float32

with tf.name_scope('INIT_OP'):
    conv_initializer = tf.contrib.layers.xavier_initializer_conv2d(dtype=dtype)
    fc_initializer = tf.contrib.layers.xavier_initializer(dtype=dtype)

k = 3        # spatial kernel size
kernel = 16  # output channels of every conv layer

# Define weights for the normal ConvNet
with tf.name_scope('VARIABLES_1'):
    weights1['conv1'] = tf.get_variable('conv1', [k, k, 3, kernel], initializer=conv_initializer, dtype=dtype, trainable=True)
    weights1['b1'] = tf.get_variable('b1', initializer=tf.zeros([kernel]))
    weights1['conv2'] = tf.get_variable('conv2', [k, k, kernel, kernel], initializer=conv_initializer, dtype=dtype, trainable=True)
    weights1['b2'] = tf.get_variable('b2', initializer=tf.zeros([kernel]))
    weights1['wd1'] = tf.get_variable('wd1', [8*8*kernel, 512], initializer=fc_initializer, dtype=dtype, trainable=True)
    weights1['bd1'] = tf.get_variable('bd1', initializer=tf.zeros([512]))
    weights1['wd2'] = tf.get_variable('wd2', [512, 10], initializer=fc_initializer, dtype=dtype, trainable=True)
    weights1['bd2'] = tf.get_variable('bd2', initializer=tf.zeros([10]))

# Define weights for the separable ConvNet (depthwise + pointwise filters for layer 2)
with tf.name_scope('VARIABLES_sep'):
    weights2['conv1'] = tf.get_variable('2_conv1', [k, k, 3, kernel], initializer=conv_initializer, dtype=dtype, trainable=True)
    weights2['conv_dw2'] = tf.get_variable('conv_dw2', [k, k, kernel, 1], initializer=conv_initializer, dtype=dtype, trainable=True)
    weights2['conv_pw2'] = tf.get_variable('conv_pw2', [1, 1, kernel, kernel], initializer=conv_initializer, dtype=dtype, trainable=True)
    weights2['b1'] = tf.get_variable('2_b1', initializer=tf.zeros([kernel]))
    weights2['b2'] = tf.get_variable('2_b2', initializer=tf.zeros([kernel]))
    weights2['wd1'] = tf.get_variable('2_wd1', [8*8*kernel, 512], initializer=fc_initializer, dtype=dtype, trainable=True)
    weights2['bd1'] = tf.get_variable('2_bd1', initializer=tf.zeros([512]))
    weights2['wd2'] = tf.get_variable('2_wd2', [512, 10], initializer=fc_initializer, dtype=dtype, trainable=True)
    weights2['bd2'] = tf.get_variable('2_bd2', initializer=tf.zeros([10]))

def conv_block(inp, cweight, bweight, activation=tf.nn.relu):
    # Standard convolution, stride 1, SAME padding
    no_stride = [1, 1, 1, 1]
    conv_output = tf.nn.conv2d(inp, cweight, no_stride, 'SAME') + bweight
    return activation(conv_output)

def conv_block_dw(inp, cweight_w, cweight_p, bweight, activation=tf.nn.relu):
    # Depthwise-separable convolution: depthwise filter followed by a 1x1 pointwise filter
    no_stride = [1, 1, 1, 1]
    conv_output = tf.nn.separable_conv2d(inp, cweight_w, cweight_p, no_stride, 'SAME') + bweight
    return activation(conv_output)

def maxpool2d(inp, k=2):
    return tf.nn.max_pool(inp, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME')

def forward_conv(inp, weights):
    # conv -> pool -> conv -> pool -> fc -> fc
    hidden = conv_block(inp, weights['conv1'], weights['b1'])
    hidden = maxpool2d(hidden)
    hidden = conv_block(hidden, weights['conv2'], weights['b2'])
    hidden = maxpool2d(hidden)
    hidden = tf.reshape(hidden, [-1, np.prod([int(dim) for dim in hidden.get_shape()[1:]])])
    fc1 = tf.nn.relu(tf.matmul(hidden, weights['wd1']) + weights['bd1'])
    return tf.matmul(fc1, weights['wd2']) + weights['bd2']

def forward_conv_sep(inp, weights):
    # Same topology, but the second conv layer is a separable conv
    hidden = conv_block(inp, weights['conv1'], weights['b1'])
    hidden = maxpool2d(hidden)
    hidden = conv_block_dw(hidden, weights['conv_dw2'], weights['conv_pw2'], weights['b2'])
    hidden = maxpool2d(hidden)
    hidden = tf.reshape(hidden, [-1, np.prod([int(dim) for dim in hidden.get_shape()[1:]])])
    fc1 = tf.nn.relu(tf.matmul(hidden, weights['wd1']) + weights['bd1'])
    return tf.matmul(fc1, weights['wd2']) + weights['bd2']

# Logits for the normal ConvNet
with tf.name_scope("forward_conv"):
    pred1 = forward_conv(x, weights1)

# Cost for the normal ConvNet
with tf.name_scope("cost1"):
    loss1 = tf.nn.softmax_cross_entropy_with_logits(logits=pred1, labels=y)
    cost1 = tf.reduce_mean(loss1)

# Training op for the normal ConvNet
with tf.name_scope('train_op1'):
    train_op1 = tf.train.RMSPropOptimizer(learning_rate, 0.9).minimize(cost1)

# Logits for the separable ConvNet
with tf.name_scope("forward_conv_sep"):
    pred2 = forward_conv_sep(x, weights2)

# Cost for the separable ConvNet
with tf.name_scope("cost2"):
    loss2 = tf.nn.softmax_cross_entropy_with_logits(logits=pred2, labels=y)
    cost2 = tf.reduce_mean(loss2)

# Training op for the separable ConvNet
with tf.name_scope('train_op2'):
    train_op2 = tf.train.RMSPropOptimizer(learning_rate, 0.9).minimize(cost2)

with tf.name_scope('INIT'):
    init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    # Train the normal ConvNet for num_steps steps and time it
    start = timeit.default_timer()
    for step in range(num_steps):
        r = np.random.choice(y_train.shape[0], batch_size, replace=False)
        batch_data = x_train[r]
        batch_labels = train_labels[r]
        feed_dict = {x: batch_data, y: batch_labels}
        _, l = sess.run([train_op1, cost1], feed_dict=feed_dict)
    stop = timeit.default_timer()
    print('training time for normal_conv after ' + str(num_steps) + ' steps:', stop - start)

    # Time a single forward pass of the normal ConvNet
    start = timeit.default_timer()
    feed_dict = {x: batch_data, y: batch_labels}
    predictions1 = sess.run(pred1, feed_dict=feed_dict)
    stop = timeit.default_timer()
    print('time for normal_conv after one forward step:', stop - start)

    # Train the separable ConvNet for num_steps steps and time it
    start = timeit.default_timer()
    for step in range(num_steps):
        r = np.random.choice(y_train.shape[0], batch_size, replace=False)
        batch_data = x_train[r]
        batch_labels = train_labels[r]
        feed_dict = {x: batch_data, y: batch_labels}
        _, l = sess.run([train_op2, cost2], feed_dict=feed_dict)
    stop = timeit.default_timer()
    print('training time for sep_conv after ' + str(num_steps) + ' steps:', stop - start)

    # Time a single forward pass of the separable ConvNet
    start = timeit.default_timer()
    feed_dict = {x: batch_data, y: batch_labels}
    predictions = sess.run(pred2, feed_dict=feed_dict)
    stop = timeit.default_timer()
    print('time for sep_conv after one forward step:', stop - start)
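To see which GPU kernels actually dominate, one can capture a step trace with the TF 1.x timeline API (a sketch, meant to run inside the Session block above and assuming its sess, pred2, and feed_dict; my understanding is that the depthwise stage runs as its own kernel rather than through cuDNN's heavily tuned fused convolution, which would explain the gap):

from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
sess.run(pred2, feed_dict=feed_dict, options=run_options, run_metadata=run_metadata)

# Dump a Chrome trace; open chrome://tracing and load the file to inspect per-kernel times
tl = timeline.Timeline(run_metadata.step_stats)
with open('sep_conv_timeline.json', 'w') as f:
    f.write(tl.generate_chrome_trace_format())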
About this issue
- Original URL
- State: closed
- Created 7 years ago
- Comments: 19 (7 by maintainers)
Why is this closed? I implemented a UNet with separable conv2d and it was around 80% slower than using a standard conv2d. Is there anything in the works to optimize this, like using groups?
This should be fixed by #33836. It currently requires tf-nightly but will ship in the coming 2.2 release.
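For anyone who wants to verify the fix, a minimal sketch of the same comparison with the TF 2.x Keras layers (the shapes and iteration counts here are arbitrary choices, not the benchmark from this thread):

import time
import tensorflow as tf  # tf-nightly at the time of writing, or >= 2.2

x = tf.random.normal([32, 32, 32, 16])  # NHWC batch
layers = {
    'conv2d': tf.keras.layers.Conv2D(16, 3, padding='same'),
    'separable': tf.keras.layers.SeparableConv2D(16, 3, padding='same'),
}

for name, layer in layers.items():
    layer(x)  # warm-up: builds the weights and autotunes
    t0 = time.time()
    for _ in range(100):
        y = layer(x)
    _ = y.numpy()  # GPU ops are async; sync before stopping the clock
    print(name, 'avg sec/iter:', (time.time() - t0) / 100)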
At what kernel sizes will convolutions be computed via FFT instead of directly? In any case, the speedup from a separable convolution is more noticeable for larger kernels, so for small kernels the overhead of running two convolutions might outweigh the gain, especially for what I assume is a highly optimized path for 3x3 kernels (Winograd).
Essentially, for an [m, n] kernel it would take m*n calculations for a convolution and m+n calculations for the separable convolution, if I'm not mistaken.
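(That count is for a *spatially* separable, i.e. rank-1, kernel; tf.nn.separable_conv2d is the depthwise-plus-pointwise flavor, but the factorization idea is the same. A quick numpy sanity check of the rank-1 case:)

import numpy as np

u = np.array([1., 2., 1.])   # m = 3 column filter
v = np.array([1., 0., -1.])  # n = 3 row filter
K = np.outer(u, v)           # rank-1 3x3 kernel: m*n = 9 MACs per output pixel

def corr2d_valid(x, k):
    # Plain 'valid' cross-correlation (what tf.nn.conv2d computes)
    m, n = k.shape
    out = np.zeros((x.shape[0] - m + 1, x.shape[1] - n + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(x[i:i + m, j:j + n] * k)
    return out

img = np.random.rand(8, 8)
full = corr2d_valid(img, K)
# Two 1-D passes: m + n = 6 MACs per output pixel instead of 9
sep = corr2d_valid(corr2d_valid(img, u[:, None]), v[None, :])
assert np.allclose(full, sep)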