compute-runtime: Miscompilation on Tiger Lake Xe GPU with early loop exit

I’m experiencing a miscompilation with our Julia-based oneAPI stack. The reproducer is as follows (I’ve documented the code, but it should be very straightforward even without knowing Julia):

using oneAPI

# simple kernel that copies `len` values from `src` to `dst`.
# each thread processes `elem_per_thread` values
function kernel(dest, src, elem_per_thread, len)
    for j in 1:elem_per_thread
        # compute the (1-based) index to copy
        i = get_local_id(0) + (get_group_id(0) - 1) * get_local_size(0)
        i += (j - 1) * get_local_size(0) * get_num_groups(0)

        # this early-exit causes the miscompilation
        i > len && return

        val = unsafe_load(src, i)
        unsafe_store!(dest, val, i)
    end
    return
end

function main(T=Int32, n=10)
    # allocate two vectors of 100 elements
    A = oneAPI.ones(T, n*n)
    B = oneAPI.zeros(T, n*n)
    ptrA = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(A))
    ptrB = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(B))

    # invoke the kernel with 10 groups of 10 threads
    @oneapi items=n groups=n kernel(ptrB, ptrA, 1, n*n)

    # display the result as an 10x10 matrix
    reshape(Array(B), (n,n))
end

The LLVM IR we generate for this kernel looks as follows:

source_filename = "text"
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-unknown"

declare i64 @_Z12get_local_idj(i32)

declare i64 @_Z12get_group_idj(i32)

declare i64 @_Z14get_local_sizej(i32)

declare i64 @_Z14get_num_groupsj(i32)

define spir_kernel void @_Z17julia_kernel_78787LLVMPtrI5Int32Li1EES_IS0_Li1EE5Int64S1_(i8 addrspace(1)* %0, i8 addrspace(1)* %1, i64 signext %2, i64 signext %3) local_unnamed_addr #0 {
entry:
  %.inv = icmp sgt i64 %2, 0
  %4 = select i1 %.inv, i64 %2, i64 0
  br i1 %.inv, label %L13.preheader, label %L56

L13.preheader:                                    ; preds = %entry
  %5 = bitcast i8 addrspace(1)* %1 to i32 addrspace(1)*
  %6 = bitcast i8 addrspace(1)* %0 to i32 addrspace(1)*
  br label %L13

L13:                                              ; preds = %L57, %L13.preheader
  %value_phi4 = phi i64 [ %23, %L57 ], [ 1, %L13.preheader ]
  %7 = call i64 @_Z12get_local_idj(i32 0)
  %8 = add i64 %7, 1
  %9 = call i64 @_Z12get_group_idj(i32 0)
  %10 = call i64 @_Z14get_local_sizej(i32 0)
  %11 = mul i64 %10, %9
  %12 = add i64 %8, %11
  %13 = add nsw i64 %value_phi4, -1
  %14 = call i64 @_Z14get_local_sizej(i32 0)
  %15 = call i64 @_Z14get_num_groupsj(i32 0)
  %16 = mul i64 %14, %13
  %17 = mul i64 %16, %15
  %18 = add i64 %12, %17
  %.not = icmp sgt i64 %18, %3
  br i1 %.not, label %L56, label %L57

L56:                                              ; preds = %L57, %L13, %entry
  ret void

L57:                                              ; preds = %L13
  %19 = add i64 %18, -1
  %20 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %19
  %21 = load i32, i32 addrspace(1)* %20, align 1
  %22 = getelementptr inbounds i32, i32 addrspace(1)* %6, i64 %19
  store i32 %21, i32 addrspace(1)* %22, align 1
  %.not1 = icmp eq i64 %value_phi4, %4
  %23 = add nuw i64 %value_phi4, 1
  br i1 %.not1, label %L56, label %L13
}

attributes #0 = { "probe-stack"="inline-asm" }

!llvm.module.flags = !{!0, !1}
!opencl.ocl.version = !{!2}
!opencl.spirv.version = !{!3}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 2, i32 0}
!3 = !{i32 1, i32 5}

We then translate that IR to SPIR-V using the Khronos LLVM to SPIR-V translator:

; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 59
; Schema: 0
               OpCapability Addresses
               OpCapability Linkage
               OpCapability Kernel
               OpCapability Int64
               OpCapability Int8
          %1 = OpExtInstImport "OpenCL.std"
               OpMemoryModel Physical64 OpenCL
               OpEntryPoint Kernel %13 "_Z17julia_kernel_79077LLVMPtrI5Int32Li1EES_IS0_Li1EE5Int64S1_" %__spirv_BuiltInLocalInvocationId %__spirv_BuiltInWorkgroupId %__spirv_BuiltInWorkgroupSize %__spirv_BuiltInNumWorkgroups
               OpSource OpenCL_C 200000
               OpName %__spirv_BuiltInLocalInvocationId "__spirv_BuiltInLocalInvocationId"
               OpName %__spirv_BuiltInWorkgroupId "__spirv_BuiltInWorkgroupId"
               OpName %__spirv_BuiltInWorkgroupSize "__spirv_BuiltInWorkgroupSize"
               OpName %__spirv_BuiltInNumWorkgroups "__spirv_BuiltInNumWorkgroups"
               OpName %entry "entry"
               OpName %L13_preheader "L13.preheader"
               OpName %L13 "L13"
               OpName %L56 "L56"
               OpName %L57 "L57"
               OpName %_inv ".inv"
               OpName %value_phi4 "value_phi4"
               OpName %_not ".not"
               OpName %_not1 ".not1"
               OpDecorate %__spirv_BuiltInNumWorkgroups BuiltIn NumWorkgroups
               OpDecorate %__spirv_BuiltInWorkgroupSize BuiltIn WorkgroupSize
               OpDecorate %__spirv_BuiltInWorkgroupId BuiltIn WorkgroupId
               OpDecorate %__spirv_BuiltInLocalInvocationId BuiltIn LocalInvocationId
               OpDecorate %__spirv_BuiltInLocalInvocationId Constant
               OpDecorate %__spirv_BuiltInWorkgroupId Constant
               OpDecorate %__spirv_BuiltInWorkgroupSize Constant
               OpDecorate %__spirv_BuiltInNumWorkgroups Constant
               OpDecorate %16 FuncParamAttr Sext
               OpDecorate %17 FuncParamAttr Sext
               OpDecorate %__spirv_BuiltInWorkgroupId LinkageAttributes "__spirv_BuiltInWorkgroupId" Import
               OpDecorate %__spirv_BuiltInNumWorkgroups LinkageAttributes "__spirv_BuiltInNumWorkgroups" Import
               OpDecorate %__spirv_BuiltInWorkgroupSize LinkageAttributes "__spirv_BuiltInWorkgroupSize" Import
               OpDecorate %__spirv_BuiltInLocalInvocationId LinkageAttributes "__spirv_BuiltInLocalInvocationId" Import
      %ulong = OpTypeInt 64 0
      %uchar = OpTypeInt 8 0
       %uint = OpTypeInt 32 0
    %ulong_0 = OpConstant %ulong 0
    %ulong_1 = OpConstant %ulong 1
%ulong_18446744073709551615 = OpConstant %ulong 18446744073709551615
    %v3ulong = OpTypeVector %ulong 3
%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
       %void = OpTypeVoid
%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
         %12 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar %ulong %ulong
       %bool = OpTypeBool
%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
%__spirv_BuiltInLocalInvocationId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupSize = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInNumWorkgroups = OpVariable %_ptr_Input_v3ulong Input
         %13 = OpFunction %void None %12
         %14 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
         %15 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
         %16 = OpFunctionParameter %ulong
         %17 = OpFunctionParameter %ulong
      %entry = OpLabel
       %_inv = OpSGreaterThan %bool %16 %ulong_0
         %26 = OpSelect %ulong %_inv %16 %ulong_0
               OpBranchConditional %_inv %L13_preheader %L56
%L13_preheader = OpLabel
         %29 = OpBitcast %_ptr_CrossWorkgroup_uint %15
         %30 = OpBitcast %_ptr_CrossWorkgroup_uint %14
               OpBranch %L13
        %L13 = OpLabel
 %value_phi4 = OpPhi %ulong %31 %L57 %ulong_1 %L13_preheader
         %34 = OpLoad %v3ulong %__spirv_BuiltInLocalInvocationId Aligned 32
         %35 = OpCompositeExtract %ulong %34 0
         %36 = OpIAdd %ulong %35 %ulong_1
         %37 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupId Aligned 32
         %38 = OpCompositeExtract %ulong %37 0
         %39 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupSize Aligned 32
         %40 = OpCompositeExtract %ulong %39 0
         %41 = OpIMul %ulong %40 %38
         %42 = OpIAdd %ulong %36 %41
         %44 = OpIAdd %ulong %value_phi4 %ulong_18446744073709551615
         %45 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupSize Aligned 32
         %46 = OpCompositeExtract %ulong %45 0
         %47 = OpLoad %v3ulong %__spirv_BuiltInNumWorkgroups Aligned 32
         %48 = OpCompositeExtract %ulong %47 0
         %49 = OpIMul %ulong %46 %44
         %50 = OpIMul %ulong %49 %48
         %51 = OpIAdd %ulong %42 %50
       %_not = OpSGreaterThan %bool %51 %17
               OpBranchConditional %_not %L56 %L57
        %L56 = OpLabel
               OpReturn
        %L57 = OpLabel
         %53 = OpIAdd %ulong %51 %ulong_18446744073709551615
         %54 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %29 %53
         %55 = OpLoad %uint %54 Aligned 1
         %56 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %30 %53
               OpStore %56 %55 Aligned 1
      %_not1 = OpIEqual %bool %value_phi4 %26
         %31 = OpIAdd %ulong %value_phi4 %ulong_1
               OpBranchConditional %_not1 %L56 %L13
               OpFunctionEnd

Executing this simple kernel should evidently yield an output consisting of all ones, but instead I’m getting zeros at certain places:

 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  0  0  0  0  0  0  0  0  0
 1  1  1  1  1  1  1  1  1  1

The exact pattern is consistent, but depends on the amount of threads/blocks. It seems caused by the early exit from the loop, even though that code isn’t hit (in this example I’m launching exactly 10*10=100 threads for arrays of length 100). It also doesn’t reproduce on an Intel HD Graphics P630. I’m using compute-runtime 22.8.22549, which depends on IGC 1.0.10395, on a 64-bit Linux running kernel 5.10.

I hope this is enough information to help isolate the issue. Feel free to reach out to me for more information, or if needed for help to set-up a Julia environment (which is really easy).

About this issue

Original URL
State: closed
Created 2 years ago
Comments: 24

Most upvoted comments

The engineering build mailed to me did work, so I guess there other changes on IGC#master that matter for this test (assuming that build was just using the latest versions of everything). I’ll just wait until everything is released first before debugging again.

maleadt on Apr 26, 2022