compute-runtime: Miscompilation on Tiger Lake Xe GPU with early loop exit
I’m experiencing a miscompilation with our Julia-based oneAPI stack. The reproducer is as follows (I’ve documented the code, but it should be very straightforward even without knowing Julia):
using oneAPI
# simple kernel that copies `len` values from `src` to `dst`.
# each thread processes `elem_per_thread` values
function kernel(dest, src, elem_per_thread, len)
for j in 1:elem_per_thread
# compute the (1-based) index to copy
i = get_local_id(0) + (get_group_id(0) - 1) * get_local_size(0)
i += (j - 1) * get_local_size(0) * get_num_groups(0)
# this early-exit causes the miscompilation
i > len && return
val = unsafe_load(src, i)
unsafe_store!(dest, val, i)
end
return
end
function main(T=Int32, n=10)
# allocate two vectors of 100 elements
A = oneAPI.ones(T, n*n)
B = oneAPI.zeros(T, n*n)
ptrA = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(A))
ptrB = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(B))
# invoke the kernel with 10 groups of 10 threads
@oneapi items=n groups=n kernel(ptrB, ptrA, 1, n*n)
# display the result as an 10x10 matrix
reshape(Array(B), (n,n))
end
The LLVM IR we generate for this kernel looks as follows:
source_filename = "text"
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-unknown"
declare i64 @_Z12get_local_idj(i32)
declare i64 @_Z12get_group_idj(i32)
declare i64 @_Z14get_local_sizej(i32)
declare i64 @_Z14get_num_groupsj(i32)
define spir_kernel void @_Z17julia_kernel_78787LLVMPtrI5Int32Li1EES_IS0_Li1EE5Int64S1_(i8 addrspace(1)* %0, i8 addrspace(1)* %1, i64 signext %2, i64 signext %3) local_unnamed_addr #0 {
entry:
%.inv = icmp sgt i64 %2, 0
%4 = select i1 %.inv, i64 %2, i64 0
br i1 %.inv, label %L13.preheader, label %L56
L13.preheader: ; preds = %entry
%5 = bitcast i8 addrspace(1)* %1 to i32 addrspace(1)*
%6 = bitcast i8 addrspace(1)* %0 to i32 addrspace(1)*
br label %L13
L13: ; preds = %L57, %L13.preheader
%value_phi4 = phi i64 [ %23, %L57 ], [ 1, %L13.preheader ]
%7 = call i64 @_Z12get_local_idj(i32 0)
%8 = add i64 %7, 1
%9 = call i64 @_Z12get_group_idj(i32 0)
%10 = call i64 @_Z14get_local_sizej(i32 0)
%11 = mul i64 %10, %9
%12 = add i64 %8, %11
%13 = add nsw i64 %value_phi4, -1
%14 = call i64 @_Z14get_local_sizej(i32 0)
%15 = call i64 @_Z14get_num_groupsj(i32 0)
%16 = mul i64 %14, %13
%17 = mul i64 %16, %15
%18 = add i64 %12, %17
%.not = icmp sgt i64 %18, %3
br i1 %.not, label %L56, label %L57
L56: ; preds = %L57, %L13, %entry
ret void
L57: ; preds = %L13
%19 = add i64 %18, -1
%20 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %19
%21 = load i32, i32 addrspace(1)* %20, align 1
%22 = getelementptr inbounds i32, i32 addrspace(1)* %6, i64 %19
store i32 %21, i32 addrspace(1)* %22, align 1
%.not1 = icmp eq i64 %value_phi4, %4
%23 = add nuw i64 %value_phi4, 1
br i1 %.not1, label %L56, label %L13
}
attributes #0 = { "probe-stack"="inline-asm" }
!llvm.module.flags = !{!0, !1}
!opencl.ocl.version = !{!2}
!opencl.spirv.version = !{!3}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 2, i32 0}
!3 = !{i32 1, i32 5}
We then translate that IR to SPIR-V using the Khronos LLVM to SPIR-V translator:
; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 59
; Schema: 0
OpCapability Addresses
OpCapability Linkage
OpCapability Kernel
OpCapability Int64
OpCapability Int8
%1 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %13 "_Z17julia_kernel_79077LLVMPtrI5Int32Li1EES_IS0_Li1EE5Int64S1_" %__spirv_BuiltInLocalInvocationId %__spirv_BuiltInWorkgroupId %__spirv_BuiltInWorkgroupSize %__spirv_BuiltInNumWorkgroups
OpSource OpenCL_C 200000
OpName %__spirv_BuiltInLocalInvocationId "__spirv_BuiltInLocalInvocationId"
OpName %__spirv_BuiltInWorkgroupId "__spirv_BuiltInWorkgroupId"
OpName %__spirv_BuiltInWorkgroupSize "__spirv_BuiltInWorkgroupSize"
OpName %__spirv_BuiltInNumWorkgroups "__spirv_BuiltInNumWorkgroups"
OpName %entry "entry"
OpName %L13_preheader "L13.preheader"
OpName %L13 "L13"
OpName %L56 "L56"
OpName %L57 "L57"
OpName %_inv ".inv"
OpName %value_phi4 "value_phi4"
OpName %_not ".not"
OpName %_not1 ".not1"
OpDecorate %__spirv_BuiltInNumWorkgroups BuiltIn NumWorkgroups
OpDecorate %__spirv_BuiltInWorkgroupSize BuiltIn WorkgroupSize
OpDecorate %__spirv_BuiltInWorkgroupId BuiltIn WorkgroupId
OpDecorate %__spirv_BuiltInLocalInvocationId BuiltIn LocalInvocationId
OpDecorate %__spirv_BuiltInLocalInvocationId Constant
OpDecorate %__spirv_BuiltInWorkgroupId Constant
OpDecorate %__spirv_BuiltInWorkgroupSize Constant
OpDecorate %__spirv_BuiltInNumWorkgroups Constant
OpDecorate %16 FuncParamAttr Sext
OpDecorate %17 FuncParamAttr Sext
OpDecorate %__spirv_BuiltInWorkgroupId LinkageAttributes "__spirv_BuiltInWorkgroupId" Import
OpDecorate %__spirv_BuiltInNumWorkgroups LinkageAttributes "__spirv_BuiltInNumWorkgroups" Import
OpDecorate %__spirv_BuiltInWorkgroupSize LinkageAttributes "__spirv_BuiltInWorkgroupSize" Import
OpDecorate %__spirv_BuiltInLocalInvocationId LinkageAttributes "__spirv_BuiltInLocalInvocationId" Import
%ulong = OpTypeInt 64 0
%uchar = OpTypeInt 8 0
%uint = OpTypeInt 32 0
%ulong_0 = OpConstant %ulong 0
%ulong_1 = OpConstant %ulong 1
%ulong_18446744073709551615 = OpConstant %ulong 18446744073709551615
%v3ulong = OpTypeVector %ulong 3
%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
%void = OpTypeVoid
%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
%12 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_uchar %ulong %ulong
%bool = OpTypeBool
%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
%__spirv_BuiltInLocalInvocationId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupId = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInWorkgroupSize = OpVariable %_ptr_Input_v3ulong Input
%__spirv_BuiltInNumWorkgroups = OpVariable %_ptr_Input_v3ulong Input
%13 = OpFunction %void None %12
%14 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
%15 = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
%16 = OpFunctionParameter %ulong
%17 = OpFunctionParameter %ulong
%entry = OpLabel
%_inv = OpSGreaterThan %bool %16 %ulong_0
%26 = OpSelect %ulong %_inv %16 %ulong_0
OpBranchConditional %_inv %L13_preheader %L56
%L13_preheader = OpLabel
%29 = OpBitcast %_ptr_CrossWorkgroup_uint %15
%30 = OpBitcast %_ptr_CrossWorkgroup_uint %14
OpBranch %L13
%L13 = OpLabel
%value_phi4 = OpPhi %ulong %31 %L57 %ulong_1 %L13_preheader
%34 = OpLoad %v3ulong %__spirv_BuiltInLocalInvocationId Aligned 32
%35 = OpCompositeExtract %ulong %34 0
%36 = OpIAdd %ulong %35 %ulong_1
%37 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupId Aligned 32
%38 = OpCompositeExtract %ulong %37 0
%39 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupSize Aligned 32
%40 = OpCompositeExtract %ulong %39 0
%41 = OpIMul %ulong %40 %38
%42 = OpIAdd %ulong %36 %41
%44 = OpIAdd %ulong %value_phi4 %ulong_18446744073709551615
%45 = OpLoad %v3ulong %__spirv_BuiltInWorkgroupSize Aligned 32
%46 = OpCompositeExtract %ulong %45 0
%47 = OpLoad %v3ulong %__spirv_BuiltInNumWorkgroups Aligned 32
%48 = OpCompositeExtract %ulong %47 0
%49 = OpIMul %ulong %46 %44
%50 = OpIMul %ulong %49 %48
%51 = OpIAdd %ulong %42 %50
%_not = OpSGreaterThan %bool %51 %17
OpBranchConditional %_not %L56 %L57
%L56 = OpLabel
OpReturn
%L57 = OpLabel
%53 = OpIAdd %ulong %51 %ulong_18446744073709551615
%54 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %29 %53
%55 = OpLoad %uint %54 Aligned 1
%56 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %30 %53
OpStore %56 %55 Aligned 1
%_not1 = OpIEqual %bool %value_phi4 %26
%31 = OpIAdd %ulong %value_phi4 %ulong_1
OpBranchConditional %_not1 %L56 %L13
OpFunctionEnd
Executing this simple kernel should evidently yield an output consisting of all ones, but instead I’m getting zeros at certain places:
1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1
The exact pattern is consistent, but depends on the amount of threads/blocks. It seems caused by the early exit from the loop, even though that code isn’t hit (in this example I’m launching exactly 10*10=100 threads for arrays of length 100). It also doesn’t reproduce on an Intel HD Graphics P630. I’m using compute-runtime 22.8.22549, which depends on IGC 1.0.10395, on a 64-bit Linux running kernel 5.10.
I hope this is enough information to help isolate the issue. Feel free to reach out to me for more information, or if needed for help to set-up a Julia environment (which is really easy).
About this issue
- Original URL
- State: closed
- Created 2 years ago
- Comments: 24
The engineering build mailed to me did work, so I guess there other changes on IGC#master that matter for this test (assuming that build was just using the latest versions of everything). I’ll just wait until everything is released first before debugging again.