llama.cpp: Subtle Vulkan shader compilation bug when running on Adreno GPUs (Samsung Galaxy S23 Ultra)
GPU info:
QUALCOMM build : 7b26bdd942, Iab69c31769
Build Date : 08/28/23
Shader Compiler Version : E031.41.03.44
Local Branch :
Remote Branch : refs/tags/AU_LINUX_ANDROID_LA.VENDOR.13.2.0.11.00.00.855.659
Remote Branch : NONE
Reconstruct Branch : NOTHING
Build Config : S P 14.1.4 AArch64
Driver Path : /vendor/lib64/hw/vulkan.adreno.so
Driver Version : 0676.42
PFP : 0x01740158
ME : 0x00000000
Application Name : ggml-vulkan
Application Version : 0x00000001
Engine Name : (null)
Engine Version : 0x00000000
Api Version : 0x00402000
In ggml_vk_generate_shaders.py:640 (dequant_q4_K_body), the following DOES NOT WORK:
const int y_idx = i * QUANT_K + 64 * il + n * ir;
const int qs_idx = 32*il + n * ir;
uint8_t sc;
uint8_t m;
if (is < 4) {
    sc = uint8_t(data_a[i].scales[is] & 63);
    m = uint8_t(data_a[i].scales[is + 4] & 63);
} else {
    sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
    m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
}
const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * m;
if (is < 4) {
    sc = uint8_t(data_a[i].scales[is + 1] & 63);
    m = uint8_t(data_a[i].scales[is + 5] & 63);
} else {
    sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
    m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
}
const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m;
[[unroll]] for (int l = 0; l < n; ++l) {
    data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
    data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
}
This crashes with the error: Shader compilation failed for shaderType: 5.
The workaround appears to be “tail-ing” the if-branches, i.e. duplicating the downstream code (including the unrolled loop) into each branch so that the branches never reconverge before the loop:
const int y_idx = i * QUANT_K + 64 * il + n * ir;
const int qs_idx = 32*il + n * ir;
uint8_t sc;
uint8_t m;
if (is < 4) {
    sc = uint8_t(data_a[i].scales[is] & 63);
    m = uint8_t(data_a[i].scales[is + 4] & 63);
    const FLOAT_TYPE d1 = dall * sc;
    const FLOAT_TYPE m1 = dmin * m;
    if (is < 4) {
        sc = uint8_t(data_a[i].scales[is + 1] & 63);
        m = uint8_t(data_a[i].scales[is + 5] & 63);
        const FLOAT_TYPE d2 = dall * sc;
        const FLOAT_TYPE m2 = dmin * m;
        [[unroll]] for (int l = 0; l < n; ++l) {
            data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
        }
    } else {
        sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
        m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
        const FLOAT_TYPE d2 = dall * sc;
        const FLOAT_TYPE m2 = dmin * m;
        [[unroll]] for (int l = 0; l < n; ++l) {
            data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
        }
    }
} else {
    sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
    m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
    const FLOAT_TYPE d1 = dall * sc;
    const FLOAT_TYPE m1 = dmin * m;
    if (is < 4) {
        sc = uint8_t(data_a[i].scales[is + 1] & 63);
        m = uint8_t(data_a[i].scales[is + 5] & 63);
        const FLOAT_TYPE d2 = dall * sc;
        const FLOAT_TYPE m2 = dmin * m;
        [[unroll]] for (int l = 0; l < n; ++l) {
            data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
        }
    } else {
        sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
        m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
        const FLOAT_TYPE d2 = dall * sc;
        const FLOAT_TYPE m2 = dmin * m;
        [[unroll]] for (int l = 0; l < n; ++l) {
            data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
        }
    }
}
This workaround has been tested and compiles successfully for the Adreno GPU in the Samsung Galaxy S23 Ultra.
This seems to indicate a subtle bug in the Adreno shader compiler. Does anyone know what’s going on?
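For anyone who wants to poke at this outside of llama.cpp, here is a minimal, hypothetical compute shader that mirrors the pattern suspected to trigger the failure: scalars written in diverging if/else branches and only read after the branches reconverge, inside an [[unroll]] loop. The buffer layout, bindings, and names are made up for illustration, and this sketch has not been verified to actually reproduce the bug on Adreno.

#version 450
#extension GL_EXT_control_flow_attributes : enable

// Hypothetical repro sketch (unverified): values are assigned in diverging
// branches and consumed after reconvergence in an [[unroll]] loop, mirroring
// the structure of dequant_q4_K_body above.
layout(local_size_x = 64) in;
layout(std430, binding = 0) readonly buffer A { uint data_a[]; };
layout(std430, binding = 1) writeonly buffer B { float data_b[]; };

void main() {
    const uint i  = gl_GlobalInvocationID.x;
    const uint is = i % 8u;

    uint sc;
    uint m;
    if (is < 4u) {
        sc = data_a[i] & 63u;
        m  = data_a[i + 4u] & 63u;
    } else {
        sc = (data_a[i + 4u] & 0xFu) | ((data_a[i] >> 6u) << 4u);
        m  = (data_a[i + 4u] >> 4u)  | ((data_a[i] >> 6u) << 4u);
    }
    // Branches reconverge here; the unrolled loop reads the branch results.
    const float d1 = float(sc);
    const float m1 = float(m);

    [[unroll]] for (uint l = 0u; l < 16u; ++l) {
        data_b[i * 16u + l] = d1 * float((data_a[i] >> l) & 0xFu) - m1;
    }
}

If a standalone shader like this fails in the same way, that would help narrow the problem down to the driver’s handling of values crossing a reconvergence point into an unrolled loop.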
About this issue
- State: open
- Created 5 months ago
- Comments: 29 (7 by maintainers)
This is very crude and probably wrong on many levels. The trick to getting a semi-decent TG (token generation) rate was to not restrict the buffer every time, but only when it matches some criteria. To get an idea of what those criteria might be, I started dumping the ops and counting their numbers and associated buffer sizes. I suspect I am counting them wrong, but by accident or luck there was some semblance of a pattern: shit seems to hit the fan when the sum of buffer sizes gets too close to the total heap size (roughly your total available RAM; check what vulkaninfo reports). Trying to do something more sophisticated killed my evening but did not produce anything more useful. This at least gets you some small models like phi, stablelm, and gemma to experiment with.
This whole thing goes in place of just “last_node = true” 😉
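For readers trying to follow along, below is a rough, untested sketch of the heuristic described above. All names here (vk_flush_heuristic, should_flush, the 0.8 threshold) are made up for illustration; the real hook point is wherever ggml-vulkan.cpp currently sets last_node unconditionally.

#include <cstddef>

// Hypothetical helper (not part of llama.cpp): accumulates the buffer bytes
// touched by the ops recorded so far and asks for a flush once the running
// total gets close to the device heap size reported by the Vulkan driver.
struct vk_flush_heuristic {
    size_t heap_size;        // total device-local heap size, e.g. from VkPhysicalDeviceMemoryProperties
    size_t pending_bytes = 0;
    double threshold = 0.8;  // guess: flush well before the heap is exhausted

    // Call once per op with the combined size of the buffers it reads/writes.
    // Returns true when the caller should treat this node as the last one.
    bool should_flush(size_t op_buffer_bytes) {
        pending_bytes += op_buffer_bytes;
        if (pending_bytes >= static_cast<size_t>(threshold * static_cast<double>(heap_size))) {
            pending_bytes = 0;  // assume the caller flushes now, so start counting again
            return true;
        }
        return false;
    }
};

// Usage sketch, in place of an unconditional "last_node = true":
//   last_node = is_actual_last_node || heuristic.should_flush(node_buffer_bytes);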