compute-runtime: Miscompilation on Tiger Lake Xe GPU with early loop exit

I’m experiencing a miscompilation with our Julia-based oneAPI stack. The reproducer is as follows (I’ve documented the code, but it should be very straightforward even without knowing Julia):

using oneAPI

# simple kernel that copies `len` values from `src` to `dst`.
# each thread processes `elem_per_thread` values
function kernel(dest, src, elem_per_thread, len)
    for j in 1:elem_per_thread
        # compute the (1-based) index to copy
        i = get_local_id(0) + (get_group_id(0) - 1) * get_local_size(0)
        i += (j - 1) * get_local_size(0) * get_num_groups(0)

        # this early-exit causes the miscompilation
        i > len && return

        val = unsafe_load(src, i)
        unsafe_store!(dest, val, i)
    end
    return
end

function main(T=Int32, n=10)
    # allocate two vectors of 100 elements
    A = oneAPI.ones(T, n*n)
    B = oneAPI.zeros(T, n*n)
    ptrA = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(A))
    ptrB = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(B))

    # invoke the kernel with 10 groups of 10 threads
    @oneapi items=n groups=n kernel(ptrB, ptrA, 1, n*n)

    # display the result as an 10x10 matrix
    reshape(Array(B), (n,n))
end

The LLVM IR we generate for this kernel looks as follows:

source_filename = "text"
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-unknown"

declare i64 @_Z12get_local_idj(i32)

declare i64 @_Z12get_group_idj(i32)

declare i64 @_Z14get_local_sizej(i32)

declare i64 @_Z14get_num_groupsj(i32)

define spir_kernel void @_Z17julia_kernel_78787LLVMPtrI5Int32Li1EES_IS0_Li1EE5Int64S1_(i8 addrspace(1)* %0, i8 addrspace(1)* %1, i64 signext %2, i64 signext %3) local_unnamed_addr #0 {
entry:
  %.inv = icmp sgt i64 %2, 0
  %4 = select i1 %.inv, i64 %2, i64 0
  br i1 %.inv, label %L13.preheader, label %L56

L13.preheader:                                    ; preds = %entry
  %5 = bitcast i8 addrspace(1)* %1 to i32 addrspace(1)*
  %6 = bitcast i8 addrspace(1)* %0 to i32 addrspace(1)*
  br label %L13

L13:                                              ; preds = %L57, %L13.preheader
  %value_phi4 = phi i64 [ %23, %L57 ], [ 1, %L13.preheader ]
  %7 = call i64 @_Z12get_local_idj(i32 0)
  %8 = add i64 %7, 1
  %9 = call i64 @_Z12get_group_idj(i32 0)
  %10 = call i64 @_Z14get_local_sizej(i32 0)
  %11 = mul i64 %10, %9
  %12 = add i64 %8, %11
  %13 = add nsw i64 %value_phi4, -1
  %14 = call i64 @_Z14get_local_sizej(i32 0)
  %15 = call i64 @_Z14get_num_groupsj(i32 0)
  %16 = mul i64 %14, %13
  %17 = mul i64 %16, %15
  %18 = add i64 %12, %17
  %.not = icmp sgt i64 %18, %3
  br i1 %.not, label %L56, label %L57

L56:                                              ; preds = %L57, %L13, %entry
  ret void

L57:                                              ; preds = %L13
  %19 = add i64 %18, -1
  %20 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %19
  %21 = load i32, i32 addrspace(1)* %20, align 1
  %22 = getelementptr inbounds i32, i32 addrspace(1)* %6, i64 %19
  store i32 %21, i32 addrspace(1)* %22, align 1
  %.not1 = icmp eq i64 %value_phi4, %4
  %23 = add nuw i64 %value_phi4, 1
  br i1 %.not1, label %L56, label %L13
}

attributes #0 = { "probe-stack"="inline-asm" }

!llvm.module.flags = !{!0, !1}
!opencl.ocl.version = !{!2}
!opencl.spirv.version = !{!3}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 2, i32 0}
!3 = !{i32 1, i32 5}

We then translate that IR to SPIR-V using the Khronos LLVM to SPIR-V translator:

; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 59
; Schema: 0
               OpCapability Addresses
               OpCapability Linkage
               OpCapability Kernel
               OpCapability Int64
               OpCapability Int8
          %1 = OpExtInstImport "OpenCL.std"
               OpMemoryModel Physical64 OpenCL
               OpEntryPoint Kernel %13 "_Z17julia_kernel_79077LLVMPtrI5Int32Li1EES_IS0_Li1EE5Int64S1_" %__spirv_BuiltInLocalInvocationId %__spirv_BuiltInWorkgroupId %__spirv_BuiltInWorkgroupSize %__spirv_BuiltInNumWorkgroups
               OpSource OpenCL_C 200000
               OpName %__spirv_BuiltInLocalInvocationId "__spirv_BuiltInLocalInvocationId"
               OpName %__spirv_BuiltInWorkgroupId "__spirv_BuiltInWorkgroupId"
               OpName %__spirv_BuiltInWorkgroupSize "__spirv_BuiltInWorkgroupSize"
               OpName %__spirv_BuiltInNumWorkgroups "__spirv_BuiltInNumWorkgroups"
               OpName %entry "entry"
               OpName %L13_preheader "L13.preheader"
               OpName %L13 "L13"
               OpName %L56 "L56"
               OpName %L57 "L57"
               OpName %_inv ".inv"
               OpName %value_phi4 "value_phi4"
               OpName %_not ".not"
               OpName %_not1 ".not1"
               OpDecorate %__spirv_BuiltInNumWorkgroups BuiltIn NumWorkgroups
               OpDecorate %__spirv_BuiltInWorkgroupSize BuiltIn WorkgroupSize
               OpDecorate %__spirv_BuiltInWorkgroupId BuiltIn WorkgroupId
               OpDecorate %__spirv_BuiltInLocalInvocationId BuiltIn LocalInvocationId
               OpDecorate %__spirv_BuiltInLocalInvocationId Constant
               OpDecorate %__spirv_BuiltInWorkgroupId Constant
               OpDecorate %__spirv_BuiltInWorkgroupSize Constant
               OpDecorate %__spirv_BuiltInNumWorkgroups Constant
               OpDecorate %16 FuncParamAttr Sext
               OpDecorate %17 FuncParamAttr Sext
               OpDecorate %__spirv_BuiltInWorkgroupId LinkageAttributes "__spirv_BuiltInWorkgroupId" Import
               OpDecorate %__spirv_BuiltInNumWorkgroups LinkageAttributes "__spirv_BuiltInNumWorkgroups" Import
               OpDecorate %__spirv_BuiltInWorkgroupSize LinkageAttributes "__spirv_BuiltInWorkgroupSize" Import
               OpDecorate %__spirv_BuiltInLocalInvocationId LinkageAttributes "__spirv_BuiltInLocalInvocationId" Import
      %ulong = OpTypeInt 64 0
      %uchar = OpTypeInt 8 0
       %uint = OpTypeInt 32 0
    %ulong_0 = OpConstant %ulong 0
    %ulong_1 = OpConstant %ulong 1
%ulong_18446744073709551615 = OpConstant %ulong 18446744073709551615
    %v3ulong = OpTypeVector %ulong 3
%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
       %void = OpTypeVoid
%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
         %12 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar %ulong %ulong
       %bool = OpTypeBool
%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
%__spirv_BuiltInLocalInvocationId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupSize = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInNumWorkgroups = OpVariable %_ptr_Input_v3ulong Input
         %13 = OpFunction %void None %12
         %14 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
         %15 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
         %16 = OpFunctionParameter %ulong
         %17 = OpFunctionParameter %ulong
      %entry = OpLabel
       %_inv = OpSGreaterThan %bool %16 %ulong_0
         %26 = OpSelect %ulong %_inv %16 %ulong_0
               OpBranchConditional %_inv %L13_preheader %L56
%L13_preheader = OpLabel
         %29 = OpBitcast %_ptr_CrossWorkgroup_uint %15
         %30 = OpBitcast %_ptr_CrossWorkgroup_uint %14
               OpBranch %L13
        %L13 = OpLabel
 %value_phi4 = OpPhi %ulong %31 %L57 %ulong_1 %L13_preheader
         %34 = OpLoad %v3ulong %__spirv_BuiltInLocalInvocationId Aligned 32
         %35 = OpCompositeExtract %ulong %34 0
         %36 = OpIAdd %ulong %35 %ulong_1
         %37 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupId Aligned 32
         %38 = OpCompositeExtract %ulong %37 0
         %39 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupSize Aligned 32
         %40 = OpCompositeExtract %ulong %39 0
         %41 = OpIMul %ulong %40 %38
         %42 = OpIAdd %ulong %36 %41
         %44 = OpIAdd %ulong %value_phi4 %ulong_18446744073709551615
         %45 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupSize Aligned 32
         %46 = OpCompositeExtract %ulong %45 0
         %47 = OpLoad %v3ulong %__spirv_BuiltInNumWorkgroups Aligned 32
         %48 = OpCompositeExtract %ulong %47 0
         %49 = OpIMul %ulong %46 %44
         %50 = OpIMul %ulong %49 %48
         %51 = OpIAdd %ulong %42 %50
       %_not = OpSGreaterThan %bool %51 %17
               OpBranchConditional %_not %L56 %L57
        %L56 = OpLabel
               OpReturn
        %L57 = OpLabel
         %53 = OpIAdd %ulong %51 %ulong_18446744073709551615
         %54 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %29 %53
         %55 = OpLoad %uint %54 Aligned 1
         %56 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %30 %53
               OpStore %56 %55 Aligned 1
      %_not1 = OpIEqual %bool %value_phi4 %26
         %31 = OpIAdd %ulong %value_phi4 %ulong_1
               OpBranchConditional %_not1 %L56 %L13
               OpFunctionEnd

Executing this simple kernel should evidently yield an output consisting of all ones, but instead I’m getting zeros at certain places:

 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1  1  1
 1  0  0  0  0  0  0  0  0  0
 1  1  1  1  1  1  1  1  1  1

The exact pattern is consistent, but depends on the amount of threads/blocks. It seems caused by the early exit from the loop, even though that code isn’t hit (in this example I’m launching exactly 10*10=100 threads for arrays of length 100). It also doesn’t reproduce on an Intel HD Graphics P630. I’m using compute-runtime 22.8.22549, which depends on IGC 1.0.10395, on a 64-bit Linux running kernel 5.10.

I hope this is enough information to help isolate the issue. Feel free to reach out to me for more information, or if needed for help to set-up a Julia environment (which is really easy).

About this issue

  • Original URL
  • State: closed
  • Created 2 years ago
  • Comments: 24

Most upvoted comments

The engineering build mailed to me did work, so I guess there other changes on IGC#master that matter for this test (assuming that build was just using the latest versions of everything). I’ll just wait until everything is released first before debugging again.