runtime: Question: Why for loop is 1.3 slower over byte[] than foreach

Hi Guys!

Usually, people used to thing foreach version of loop is always slower than for and so I, but, when I benchmarked it I was surprised how much (1.3 times) for version is actually slower when iterating over bytes array (similar for int/uint). Is that something expected and ok? Here is the code.

using System;

public class C {
    private byte[] _byteData;

    public int byte_sum_ByIndex()
    {
        int sum = 0;
        for (int i = 0; i < _byteData.Length; ++i)
        {
            sum += _byteData[i];
        }
        
        return sum;
    }

    public int byte_sum_Foreach()
    {
        int sum = 0;
        foreach (var n in _byteData)
        {
            sum += n;
        }
        
        return sum;
    }
}
.class private auto ansi '<Module>'
{
} // end of class <Module>

.class public auto ansi beforefieldinit C
    extends [System.Private.CoreLib]System.Object
{
    // Fields
    .field private uint8[] _byteData

    // Methods
    .method public hidebysig 
        instance int32 byte_sum_ByIndex () cil managed 
    {
        // Method begins at RVA 0x2050
        // Code size 34 (0x22)
        .maxstack 3
        .locals init (
            [0] int32,
            [1] int32
        )

        IL_0000: ldc.i4.0
        IL_0001: stloc.0
        IL_0002: ldc.i4.0
        IL_0003: stloc.1
        // sequence point: hidden
        IL_0004: br.s IL_0015
        // loop start (head: IL_0015)
            IL_0006: ldloc.0
            IL_0007: ldarg.0
            IL_0008: ldfld uint8[] C::_byteData
            IL_000d: ldloc.1
            IL_000e: ldelem.u1
            IL_000f: add
            IL_0010: stloc.0
            IL_0011: ldloc.1
            IL_0012: ldc.i4.1
            IL_0013: add
            IL_0014: stloc.1

            IL_0015: ldloc.1
            IL_0016: ldarg.0
            IL_0017: ldfld uint8[] C::_byteData
            IL_001c: ldlen
            IL_001d: conv.i4
            IL_001e: blt.s IL_0006
        // end loop

        IL_0020: ldloc.0
        IL_0021: ret
    } // end of method C::byte_sum_ByIndex

    .method public hidebysig 
        instance int32 byte_sum_Foreach () cil managed 
    {
        // Method begins at RVA 0x2080
        // Code size 33 (0x21)
        .maxstack 2
        .locals init (
            [0] int32,
            [1] uint8[],
            [2] int32,
            [3] uint8
        )

        IL_0000: ldc.i4.0
        IL_0001: stloc.0
        IL_0002: ldarg.0
        IL_0003: ldfld uint8[] C::_byteData
        IL_0008: stloc.1
        IL_0009: ldc.i4.0
        IL_000a: stloc.2
        // sequence point: hidden
        IL_000b: br.s IL_0019
        // loop start (head: IL_0019)
            IL_000d: ldloc.1
            IL_000e: ldloc.2
            IL_000f: ldelem.u1
            IL_0010: stloc.3
            IL_0011: ldloc.0
            IL_0012: ldloc.3
            IL_0013: add
            IL_0014: stloc.0
            // sequence point: hidden
            IL_0015: ldloc.2
            IL_0016: ldc.i4.1
            IL_0017: add
            IL_0018: stloc.2

            IL_0019: ldloc.2
            IL_001a: ldloc.1
            IL_001b: ldlen
            IL_001c: conv.i4
            IL_001d: blt.s IL_000d
        // end loop

        IL_001f: ldloc.0
        IL_0020: ret
    } // end of method C::byte_sum_Foreach

    .method public hidebysig specialname rtspecialname 
        instance void .ctor () cil managed 
    {
        // Method begins at RVA 0x20ad
        // Code size 7 (0x7)
        .maxstack 8

        IL_0000: ldarg.0
        IL_0001: call instance void [System.Private.CoreLib]System.Object::.ctor()
        IL_0006: ret
    } // end of method C::.ctor

} // end of class C
; Core CLR v4.700.19.46205 (coreclr.dll) on x86.

C..ctor(Byte[])
    L0000: mov eax, edx
    L0002: lea edx, [ecx+0x4]
    L0005: call 0x6ed991d0
    L000a: ret

C.byte_sum_ByIndex()
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: push esi
    L0004: push ebx
    L0005: xor eax, eax
    L0007: xor edx, edx
    L0009: mov ecx, [ecx+0x4]
    L000c: cmp dword [ecx+0x4], 0x0
    L0010: jle L0026
    L0012: mov esi, ecx
    L0014: cmp edx, [esi+0x4]
    L0017: jae L002a
    L0019: movzx ebx, byte [esi+edx+0x8]
    L001e: add eax, ebx
    L0020: inc edx
    L0021: cmp [ecx+0x4], edx
    L0024: jg L0012
    L0026: pop ebx
    L0027: pop esi
    L0028: pop ebp
    L0029: ret
    L002a: call 0x6eec2680
    L002f: int3

C.byte_sum_Foreach()
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: push esi
    L0004: push ebx
    L0005: xor eax, eax
    L0007: mov edx, [ecx+0x4]
    L000a: xor ecx, ecx
    L000c: mov esi, [edx+0x4]
    L000f: test esi, esi
    L0011: jle L001f
    L0013: movzx ebx, byte [edx+ecx+0x8]
    L0018: add eax, ebx
    L001a: inc ecx
    L001b: cmp esi, ecx
    L001d: jg L0013
    L001f: pop ebx
    L0020: pop esi
    L0021: pop ebp
    L0022: ret

https://sharplab.io/#v2:C4LghgzgtgPgAgJgIwFgBQcDMACR2DC2A3utmdgA4BOAlgG5jACm2ARgJ7MDaAutgPodmAEUZgA3OlLks2GgDtgbTk34QArlH4AhdgEl5AEyYAPABQBKaWRJpy9uYuwao2ALzYADJLsOyAMwB7KmwzBSUady9xOWwAHgEhJlFgMAA6ABkmeQBzYAALGIBqIporXz9bPz8XbCKPQRUUsC4aHh9q7ABfawde+zgAdmdNDvIetF7ZcOVmNU1+ADFgpjAAY3zLXqrqmdqPb37yIKpVjdCGEPlHRKaxcs6dzrJa+ux5Mb8JzqOyIZGoJ9uugukA==

category:cq theme:range-check skill-level:intermediate cost:medium impact:medium

About this issue

  • Original URL
  • State: open
  • Created 5 years ago
  • Comments: 22 (17 by maintainers)

Most upvoted comments

What is maybe a bit unfortunate is that making the field readonly doesn’t help while you would expect.

@yahorsi the answer is there in your sharplab link I guess 🙂 (unsuga

image

It’s better to work with a local variable in case of performance critical code in such cases, e.g. JIT doesn’t eliminate bound checks for your for loop (anyone can modify the field while you iterate it)

This is likely a CQ regression introduced by the fix for dotnet/runtime#9486, I’m pretty sure range check elimination was working in this case before that fix.

The problem is that JIT’s value numbering correctly concluded that the 2 _byteData accesses will produce the same value, assuming that there is no thread interference. This assumption is valid for many purposes, since thread interference in this case implies that there’s a race condition in the user code and race conditions are basically undefined behavior.

However, this assumption is not valid for range check elimination because said undefined behavior does not include out of range array access, that would compromise type safety.

The fix that was done is probably more conservative than it needs to be. At least in this case, the redundant field load is eliminated by CSE so the race condition effectively disappears. Unfortunately CSE is not something that can be guaranteed so performing the optimization correctly is a bit more complicated - it has to be done only after CSE did eliminate the load or it has to somehow force CSE to happen.

It may be possible to improve this. But at least for the presented example this is completely pointless, there’s no reason not to use foreach in this case.

anyone can modify the field while you iterate it

Wait 😃 We just copy the reference, so, anyone still can change it 😃

And that reference points to some old object, and the field now points to a different one. Btw, here is an example how CoreCLR developers sometimes optimize range checks: https://github.com/dotnet/coreclr/pull/27340/files

@stephentoub oops, indeed 😃 Well, at least it’s safe to assume that static readonly fields never change once the host type is initialized

What is maybe a bit unfortunate is that making the field readonly doesn’t help while you would expect.

readonly instance fields can be changed via reflection, at least for now. And anyway it’s not clear how much relying on readonly would actually help. While it’s nice to make fields readonly it is not always possible and I have a feeling that it’s even less possible/practical in the case of arrays. For example, think of all collection classes that use arrays under the cover.

If you copy _byteData to a local variable in both methods - the codegen will be the same.