runtime: Question: Why for loop is 1.3 slower over byte[] than foreach
Hi Guys!
Usually, people used to thing foreach version of loop is always slower than for and so I, but, when I benchmarked it I was surprised how much (1.3 times) for version is actually slower when iterating over bytes array (similar for int/uint). Is that something expected and ok? Here is the code.
using System;
public class C {
private byte[] _byteData;
public int byte_sum_ByIndex()
{
int sum = 0;
for (int i = 0; i < _byteData.Length; ++i)
{
sum += _byteData[i];
}
return sum;
}
public int byte_sum_Foreach()
{
int sum = 0;
foreach (var n in _byteData)
{
sum += n;
}
return sum;
}
}
.class private auto ansi '<Module>'
{
} // end of class <Module>
.class public auto ansi beforefieldinit C
extends [System.Private.CoreLib]System.Object
{
// Fields
.field private uint8[] _byteData
// Methods
.method public hidebysig
instance int32 byte_sum_ByIndex () cil managed
{
// Method begins at RVA 0x2050
// Code size 34 (0x22)
.maxstack 3
.locals init (
[0] int32,
[1] int32
)
IL_0000: ldc.i4.0
IL_0001: stloc.0
IL_0002: ldc.i4.0
IL_0003: stloc.1
// sequence point: hidden
IL_0004: br.s IL_0015
// loop start (head: IL_0015)
IL_0006: ldloc.0
IL_0007: ldarg.0
IL_0008: ldfld uint8[] C::_byteData
IL_000d: ldloc.1
IL_000e: ldelem.u1
IL_000f: add
IL_0010: stloc.0
IL_0011: ldloc.1
IL_0012: ldc.i4.1
IL_0013: add
IL_0014: stloc.1
IL_0015: ldloc.1
IL_0016: ldarg.0
IL_0017: ldfld uint8[] C::_byteData
IL_001c: ldlen
IL_001d: conv.i4
IL_001e: blt.s IL_0006
// end loop
IL_0020: ldloc.0
IL_0021: ret
} // end of method C::byte_sum_ByIndex
.method public hidebysig
instance int32 byte_sum_Foreach () cil managed
{
// Method begins at RVA 0x2080
// Code size 33 (0x21)
.maxstack 2
.locals init (
[0] int32,
[1] uint8[],
[2] int32,
[3] uint8
)
IL_0000: ldc.i4.0
IL_0001: stloc.0
IL_0002: ldarg.0
IL_0003: ldfld uint8[] C::_byteData
IL_0008: stloc.1
IL_0009: ldc.i4.0
IL_000a: stloc.2
// sequence point: hidden
IL_000b: br.s IL_0019
// loop start (head: IL_0019)
IL_000d: ldloc.1
IL_000e: ldloc.2
IL_000f: ldelem.u1
IL_0010: stloc.3
IL_0011: ldloc.0
IL_0012: ldloc.3
IL_0013: add
IL_0014: stloc.0
// sequence point: hidden
IL_0015: ldloc.2
IL_0016: ldc.i4.1
IL_0017: add
IL_0018: stloc.2
IL_0019: ldloc.2
IL_001a: ldloc.1
IL_001b: ldlen
IL_001c: conv.i4
IL_001d: blt.s IL_000d
// end loop
IL_001f: ldloc.0
IL_0020: ret
} // end of method C::byte_sum_Foreach
.method public hidebysig specialname rtspecialname
instance void .ctor () cil managed
{
// Method begins at RVA 0x20ad
// Code size 7 (0x7)
.maxstack 8
IL_0000: ldarg.0
IL_0001: call instance void [System.Private.CoreLib]System.Object::.ctor()
IL_0006: ret
} // end of method C::.ctor
} // end of class C
; Core CLR v4.700.19.46205 (coreclr.dll) on x86.
C..ctor(Byte[])
L0000: mov eax, edx
L0002: lea edx, [ecx+0x4]
L0005: call 0x6ed991d0
L000a: ret
C.byte_sum_ByIndex()
L0000: push ebp
L0001: mov ebp, esp
L0003: push esi
L0004: push ebx
L0005: xor eax, eax
L0007: xor edx, edx
L0009: mov ecx, [ecx+0x4]
L000c: cmp dword [ecx+0x4], 0x0
L0010: jle L0026
L0012: mov esi, ecx
L0014: cmp edx, [esi+0x4]
L0017: jae L002a
L0019: movzx ebx, byte [esi+edx+0x8]
L001e: add eax, ebx
L0020: inc edx
L0021: cmp [ecx+0x4], edx
L0024: jg L0012
L0026: pop ebx
L0027: pop esi
L0028: pop ebp
L0029: ret
L002a: call 0x6eec2680
L002f: int3
C.byte_sum_Foreach()
L0000: push ebp
L0001: mov ebp, esp
L0003: push esi
L0004: push ebx
L0005: xor eax, eax
L0007: mov edx, [ecx+0x4]
L000a: xor ecx, ecx
L000c: mov esi, [edx+0x4]
L000f: test esi, esi
L0011: jle L001f
L0013: movzx ebx, byte [edx+ecx+0x8]
L0018: add eax, ebx
L001a: inc ecx
L001b: cmp esi, ecx
L001d: jg L0013
L001f: pop ebx
L0020: pop esi
L0021: pop ebp
L0022: ret
category:cq theme:range-check skill-level:intermediate cost:medium impact:medium
About this issue
- Original URL
- State: open
- Created 5 years ago
- Comments: 22 (17 by maintainers)
What is maybe a bit unfortunate is that making the field readonly doesn’t help while you would expect.
@yahorsi the answer is there in your sharplab link I guess 🙂 (unsuga
It’s better to work with a local variable in case of performance critical code in such cases, e.g. JIT doesn’t eliminate bound checks for your
for
loop (anyone can modify the field while you iterate it)This is likely a CQ regression introduced by the fix for dotnet/runtime#9486, I’m pretty sure range check elimination was working in this case before that fix.
The problem is that JIT’s value numbering correctly concluded that the 2
_byteData
accesses will produce the same value, assuming that there is no thread interference. This assumption is valid for many purposes, since thread interference in this case implies that there’s a race condition in the user code and race conditions are basically undefined behavior.However, this assumption is not valid for range check elimination because said undefined behavior does not include out of range array access, that would compromise type safety.
The fix that was done is probably more conservative than it needs to be. At least in this case, the redundant field load is eliminated by CSE so the race condition effectively disappears. Unfortunately CSE is not something that can be guaranteed so performing the optimization correctly is a bit more complicated - it has to be done only after CSE did eliminate the load or it has to somehow force CSE to happen.
It may be possible to improve this. But at least for the presented example this is completely pointless, there’s no reason not to use
foreach
in this case.And that reference points to some old object, and the field now points to a different one. Btw, here is an example how CoreCLR developers sometimes optimize range checks: https://github.com/dotnet/coreclr/pull/27340/files
@stephentoub oops, indeed 😃 Well, at least it’s safe to assume that
static readonly
fields never change once the host type is initializedreadonly
instance fields can be changed via reflection, at least for now. And anyway it’s not clear how much relying onreadonly
would actually help. While it’s nice to make fieldsreadonly
it is not always possible and I have a feeling that it’s even less possible/practical in the case of arrays. For example, think of all collection classes that use arrays under the cover.If you copy
_byteData
to a local variable in both methods - the codegen will be the same.