tfjs: Mismatch in packed depthwise conv 2d results on Mali GPU


System information

  • Have I written custom code (as opposed to using a stock example script provided in TensorFlow.js): Yes, test case shared below
  • Mobile device: Pixel 6 Pro (reproduces on every Android device with a Mali GPU that I tried)
  • TensorFlow.js installed from (npm or script link): 3.19.0
  • Browser version: Chrome 103.0.5060.53

Describe the current behavior

Packed depthwise conv2d produces incorrect results on Mali GPUs when WEBGL_MAX_TEXTURE_SIZE is left at its default value (4096 on most modern Android devices). In one of our networks we end up creating a 3672x1 texture for the weights, which produces incorrect outputs (presumably some error in sampling the texture, but that is just a guess). Setting the max texture size below 3672 fixes the issue.
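For reference, the effective cap can be inspected and overridden through the flag API (a minimal sketch; tf.env() is the tfjs 3.x flag registry):

await tf.setBackend('webgl');
// Typically reports 4096 on modern Android devices.
console.log(tf.env().getNumber('WEBGL_MAX_TEXTURE_SIZE'));
// Workaround: lower the cap before the weight tensors are created.
tf.env().set('WEBGL_MAX_TEXTURE_SIZE', 2048);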

I have attached sample code below to reproduce the issue (it uses the same filter dims as the layer that caused the inaccuracy in our original network). The code does the following:

  • First, run the packed depthwise conv with WEBGL_MAX_TEXTURE_SIZE set to 4096 (the default on every browser I tried; it is hardcoded here for consistent results).
  • Next, re-run the same op with a max texture size of 2048.
  • Finally, set the backend to cpu to get the reference output.
  • With size 2048 the outputs match the reference; with the default size they do not.

Note: based on my tests, the mismatch occurs only on Android devices with Mali GPUs. iOS, Chrome on macOS, and Android devices with Adreno GPUs all produce the correct result with the default texture size of 4096.

Standalone code to reproduce the issue

// Force the packed depthwise conv path.
tf.ENV.set('WEBGL_PACK_DEPTHWISECONV', true)

// Random filter and input data, reused for every run.
let w = Array.from({length: 3 * 3 * 816}, () => Math.random())
let x = Array.from({length: 12 * 10 * 816}, () => Math.random())

let inputs = {
    filter: tf.tensor(w, [3, 3, 816, 1]),
    x: tf.tensor(x, [1, 12, 10, 816]),
    strides: 1,
    pad: [[0, 0], [1, 1], [1, 1], [0, 0]],
    dataFormat: "channelsLast",
    dilations: 1,
    activation: 'relu'
};

// Run on WebGL with the default max texture size.
tf.setBackend('webgl')
tf.ENV.set('WEBGL_MAX_TEXTURE_SIZE', 4096)
let out_4096 = tf.fused.depthwiseConv2d(inputs);

// Re-run with a smaller cap; the tensors are recreated so their textures
// are re-allocated under the new cap.
tf.ENV.set('WEBGL_MAX_TEXTURE_SIZE', 2048)
inputs.x = tf.tensor(x, [1, 12, 10, 816])
inputs.filter = tf.tensor(w, [3, 3, 816, 1])
let out_2048 = tf.fused.depthwiseConv2d(inputs);

// The cpu backend provides the reference output.
tf.setBackend('cpu')
inputs.x = tf.tensor(x, [1, 12, 10, 816])
inputs.filter = tf.tensor(w, [3, 3, 816, 1])
let out_reference = tf.fused.depthwiseConv2d(inputs);

const doTensorsDiffer = function(t0, t1) {
    // True (1) if any element differs by more than 1e-2.
    return tf.any(tf.greater(tf.abs(tf.sub(t0, t1)), tf.scalar(1e-2))).dataSync()[0];
}

console.log("Default and 2048 differ? " + doTensorsDiffer(out_4096, out_2048));
console.log("Reference and 2048 differ? " + doTensorsDiffer(out_reference, out_2048));
console.log("Reference and 4096 differ? " + doTensorsDiffer(out_reference, out_4096));


About this issue

  • State: closed
  • Created 2 years ago
  • Comments: 27

Most upvoted comments


Thanks for the fix @Linchenn!

@shanumantesc

Just merged the fix PR into our code base. You could try it by building locally, or you could wait for tfjs v3.21.0.

You could use either tf.env().set('WEBGL_MAX_SIZE_FOR_NARROW_TEXTURE', 2048); or tf.env().set('WEBGL_AUTO_SQUARIFY_NARROW_TEXTURE_SHAPE', true); before running the code on a Mali GPU.
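A minimal usage sketch (assuming tfjs >= 3.21.0, where these flags exist; inputs is the same dictionary as in the repro above):

await tf.setBackend('webgl');
// Option 1: cap the long dimension of narrow (Nx1 / 1xN) textures.
tf.env().set('WEBGL_MAX_SIZE_FOR_NARROW_TEXTURE', 2048);
// Option 2: let the backend squarify narrow texture shapes automatically.
tf.env().set('WEBGL_AUTO_SQUARIFY_NARROW_TEXTURE_SHAPE', true);
const out = tf.fused.depthwiseConv2d(inputs);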

Apologies for my misunderstanding of how the vertices were set up. Great to hear that you have a feasible workaround!

Good catch, but I got the same correct results when using the following fragment shader:

var colorFS = `#version 300 es
precision highp float;

in vec2 v_texcoord;
out vec4 fragColor;
uniform float color;

void main() {
    // Emit the uniform and its reciprocal in a single fragment.
    fragColor = vec4(color, 1.0/color, 0, 0);
}
`;

I set color to 3672, and the output is still (3672, 0.00027233114815317094, 0, 0).
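(For context, a sketch of how the uniform would be set before the draw; colorProgram and the rest of the setup follow the earlier snippet in this thread, which isn't quoted here:)

gl.useProgram(colorProgram);
gl.uniform1f(gl.getUniformLocation(colorProgram, 'color'), 3672.0);
gl.drawArrays(gl.TRIANGLES, 0, 6);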

@Linchenn really interesting analysis, thanks for digging into this! An interesting follow-up where I set width to 2 and height to 3672:

let width = 2;
let height = 3672;
let texFbPair1 = createTextureAndFramebuffer(gl, width, height);

let buf = gl.createBuffer();
gl.bindBuffer(gl.ARRAY_BUFFER, buf);
gl.bufferData(gl.ARRAY_BUFFER, new Float32Array([
  -1, -1, 0, 0,
   1, -1, 1, 0,
  -1,  1, 0, 1,
  -1,  1, 0, 1,
   1, -1, 1, 0,
   1,  1, 1, 1
]), gl.STATIC_DRAW);
  
gl.bindBuffer(gl.ARRAY_BUFFER, buf);
let colorPrgPositionLoc = gl.getAttribLocation(colorProgram, "position");
gl.enableVertexAttribArray(colorPrgPositionLoc);
gl.vertexAttribPointer(colorPrgPositionLoc, 2, gl.FLOAT, false, 16, 0);
gl.bindBuffer(gl.ARRAY_BUFFER, buf);
let colorPrgUvLoc = gl.getAttribLocation(colorProgram, "uv");
gl.enableVertexAttribArray(colorPrgUvLoc);
gl.vertexAttribPointer(colorPrgUvLoc, 2, gl.FLOAT, false, 16, 8);
gl.useProgram(colorProgram);
gl.bindFramebuffer(gl.FRAMEBUFFER, texFbPair1.fb);
gl.viewport(0, 0, width, height);
gl.drawArrays(gl.TRIANGLES, 0, 6);


// Read back the shader program's outputs.
let fb = texFbPair1.fb
gl.bindFramebuffer(gl.FRAMEBUFFER, fb)
let packedRGBA = new Float32Array(width * height * 4);
gl.readPixels(0, 0, width, height, gl.RGBA, gl.FLOAT, packedRGBA);


// G channel of the first pixel in each row (row stride is 4*width floats).
let ys = packedRGBA.filter((e, i) => i%(4*width)===1);
for (var i = 0; i < 10; i += 1) {
  // Reciprocal of the step between adjacent rows; ideally this equals the height (3672).
  console.log(ys[i+1] + ":" + ys[i] + " Diff: " + 1/(ys[i+1] - ys[i]))
}

In this case I get:

0.0005092620849609375:0.00023698806762695312 Diff: 3672.77057793345
0.0007815361022949219:0.0005092620849609375 Diff: 3672.77057793345
0.0010538101196289062:0.0007815361022949219 Diff: 3672.77057793345
0.0013260841369628906:0.0010538101196289062 Diff: 3672.77057793345
0.001598358154296875:0.0013260841369628906 Diff: 3672.77057793345
0.0018706321716308594:0.001598358154296875 Diff: 3672.77057793345
0.0021429061889648438:0.0018706321716308594 Diff: 3672.77057793345
0.0024156570434570312:0.0021429061889648438 Diff: 3666.3496503496503
0.0026879310607910156:0.0024156570434570312 Diff: 3672.77057793345
0.002960205078125:0.0026879310607910156 Diff: 3672.77057793345

So this is closer to what we would want. Although, per my understanding, we are always off by one with NN sampling, and there is also one case where the diff is still wrong (3666.3496503496503). As I increase the width beyond 2, I still see this pattern of 3672.77057793345 and 3669.557305336833 popping up, but it is better than width 1 🤔
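(For completeness: createTextureAndFramebuffer and colorProgram above come from the earlier comment in this thread and aren't quoted here. A minimal sketch of the texture/framebuffer helper, assuming a WebGL2 context with EXT_color_buffer_float enabled so the RGBA32F target is renderable and readable as floats:)

function createTextureAndFramebuffer(gl, width, height) {
  const tex = gl.createTexture();
  gl.bindTexture(gl.TEXTURE_2D, tex);
  // Float texture so gl.readPixels(..., gl.FLOAT, ...) works.
  gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGBA32F, width, height, 0, gl.RGBA, gl.FLOAT, null);
  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST);
  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST);
  const fb = gl.createFramebuffer();
  gl.bindFramebuffer(gl.FRAMEBUFFER, fb);
  gl.framebufferTexture2D(gl.FRAMEBUFFER, gl.COLOR_ATTACHMENT0, gl.TEXTURE_2D, tex, 0);
  return {tex: tex, fb: fb};
}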