runtime: Span.CopyTo is 10-20% slower than Array.CopyTo

How to run the benchmarks:

git clone https://github.com/dotnet/performance.git
# if you have .NET Core 3.0 installed
dotnet run -c Release -f netcoreapp3.0 -p .\performance\src\benchmarks\micro\MicroBenchmarks.csproj --filter *CopyTo<String>*.Array *CopyTo<String>.Span --join --affinity 2
# if you don't have .NET Core 3.0 installed
py .\performance\scripts\benchmarks_ci.py -f netcoreapp3.0 --filter *CopyTo<String>*.Array *CopyTo<String>.Span --bdn-arguments="--join true --affinity 2"
| Method | Size | Mean     |
|--------|-----:|---------:|
| Array  | 2048 | 511.1 ns |
| Span   | 2048 | 609.9 ns |

Full docs for the new benchmarking workflow: https://github.com/dotnet/performance/blob/master/docs/benchmarking-workflow-corefx.md

category:cq theme:optimization skill-level:expert cost:medium

About this issue

  • Original URL
  • State: open
  • Created 5 years ago
  • Reactions: 3
  • Comments: 21 (21 by maintainers)

Most upvoted comments

The JIT doesn’t see string here; it sees __Canon. If we inline enough, we can sometimes get all of this into unshared code and figure out the actual types, but not always…

Contrast this against the codegen for the generic CopyTo test:

; CopyTo<string>.Array

00007ff9`b6fab690 48894c2408      mov     qword ptr [rsp+8],rcx
00007ff9`b6fab695 488b4908        mov     rcx,qword ptr [rcx+8]
00007ff9`b6fab699 4c8b442408      mov     r8,qword ptr [rsp+8]
00007ff9`b6fab69e 498b5018        mov     rdx,qword ptr [r8+18h]
00007ff9`b6fab6a2 458b4020        mov     r8d,dword ptr [r8+20h]
00007ff9`b6fab6a6 e95d66d8ff      jmp     <Array.CopyTo(Array, Array, int)>

; CopyTo<string>.Span

00007ff9`b6fdbb40 57              push    rdi
00007ff9`b6fdbb41 56              push    rsi
00007ff9`b6fdbb42 4883ec68        sub     rsp,68h
00007ff9`b6fdbb46 488bf1          mov     rsi,rcx
00007ff9`b6fdbb49 488d7c2420      lea     rdi,[rsp+20h]
00007ff9`b6fdbb4e b910000000      mov     ecx,10h
00007ff9`b6fdbb53 33c0            xor     eax,eax
00007ff9`b6fdbb55 f3ab            rep stos dword ptr [rdi]
00007ff9`b6fdbb57 488bce          mov     rcx,rsi
00007ff9`b6fdbb5a 48894c2460      mov     qword ptr [rsp+60h],rcx
00007ff9`b6fdbb5f 488bf1          mov     rsi,rcx
00007ff9`b6fdbb62 488b0e          mov     rcx,qword ptr [rsi] ; classHnd
00007ff9`b6fdbb65 48ba80f407b7f97f0000 mov rdx,7FF9B707F480h  ; signature
00007ff9`b6fdbb6f e88c88825f      call    CoreCLR!JIT_GenericHandleClass (00007ffa`16804400)
00007ff9`b6fdbb74 488bf8          mov     rdi,rax
00007ff9`b6fdbb77 488bd7          mov     rdx,rdi
00007ff9`b6fdbb7a 4c8b4608        mov     r8,qword ptr [rsi+8]
00007ff9`b6fdbb7e 488d4c2440      lea     rcx,[rsp+40h]
00007ff9`b6fdbb83 e810daffff      call    <Span<__Canon>.ctor(__Canon[])>
00007ff9`b6fdbb88 488b542440      mov     rdx,qword ptr [rsp+40h]
00007ff9`b6fdbb8d 4889542450      mov     qword ptr [rsp+50h],rdx
00007ff9`b6fdbb92 8b542448        mov     edx,dword ptr [rsp+48h]
00007ff9`b6fdbb96 89542458        mov     dword ptr [rsp+58h],edx
00007ff9`b6fdbb9a 488bd7          mov     rdx,rdi
00007ff9`b6fdbb9d 4c8b4618        mov     r8,qword ptr [rsi+18h]
00007ff9`b6fdbba1 488d4c2430      lea     rcx,[rsp+30h]
00007ff9`b6fdbba6 e8edd9ffff      call    CLRStub[MethodDescPrestub]@7ff9b6fd9598 (00007ff9`b6fd9598)
00007ff9`b6fdbbab 488d4c2450      lea     rcx,[rsp+50h]
00007ff9`b6fdbbb0 488bd7          mov     rdx,rdi
00007ff9`b6fdbbb3 4c8d442420      lea     r8,[rsp+20h]
00007ff9`b6fdbbb8 488b442430      mov     rax,qword ptr [rsp+30h]
00007ff9`b6fdbbbd 498900          mov     qword ptr [r8],rax
00007ff9`b6fdbbc0 8b442438        mov     eax,dword ptr [rsp+38h]
00007ff9`b6fdbbc4 41894008        mov     dword ptr [r8+8],eax
00007ff9`b6fdbbc8 4c8d442420      lea     r8,[rsp+20h]
00007ff9`b6fdbbcd e85edaffff      call    CLRStub[MethodDescPrestub]@7ff9b6fd9630 (00007ff9`b6fd9630)
00007ff9`b6fdbbd2 90              nop
00007ff9`b6fdbbd3 4883c468        add     rsp,68h
00007ff9`b6fdbbd7 5e              pop     rsi
00007ff9`b6fdbbd8 5f              pop     rdi
00007ff9`b6fdbbd9 c3              ret

; Span<__Canon>.ctor(__Canon[])

00007ff9`b6fdba50 57              push    rdi
00007ff9`b6fdba51 56              push    rsi
00007ff9`b6fdba52 55              push    rbp
00007ff9`b6fdba53 53              push    rbx
00007ff9`b6fdba54 4883ec28        sub     rsp,28h
00007ff9`b6fdba58 c5f877          vzeroupper
00007ff9`b6fdba5b 4889542420      mov     qword ptr [rsp+20h],rdx
00007ff9`b6fdba60 488bd9          mov     rbx,rcx
00007ff9`b6fdba63 488bfa          mov     rdi,rdx
00007ff9`b6fdba66 498bf0          mov     rsi,r8
00007ff9`b6fdba69 4885f6          test    rsi,rsi
00007ff9`b6fdba6c 7511            jne     System_Private_CoreLib!System.Span`1[[System.__Canon, System.Private.CoreLib]]..ctor(System.__Canon[])+0xffffffff`a3af5ddf (00007ff9`b6fdba7f)
00007ff9`b6fdba6e c5f857c0        vxorps  xmm0,xmm0,xmm0
00007ff9`b6fdba72 c5fa7f03        vmovdqu xmmword ptr [rbx],xmm0
00007ff9`b6fdba76 4883c428        add     rsp,28h
00007ff9`b6fdba7a 5b              pop     rbx
00007ff9`b6fdba7b 5d              pop     rbp
00007ff9`b6fdba7c 5e              pop     rsi
00007ff9`b6fdba7d 5f              pop     rdi
00007ff9`b6fdba7e c3              ret
00007ff9`b6fdba7f 488bce          mov     rcx,rsi
00007ff9`b6fdba82 e889988d5f      call    CoreCLR!ObjectNative::GetClass (00007ffa`168b5310)
00007ff9`b6fdba87 488be8          mov     rbp,rax
00007ff9`b6fdba8a 488b4f30        mov     rcx,qword ptr [rdi+30h]
00007ff9`b6fdba8e 488b09          mov     rcx,qword ptr [rcx]
00007ff9`b6fdba91 488b4908        mov     rcx,qword ptr [rcx+8]
00007ff9`b6fdba95 4885c9          test    rcx,rcx
00007ff9`b6fdba98 7402            je      System_Private_CoreLib!System.Span`1[[System.__Canon, System.Private.CoreLib]]..ctor(System.__Canon[])+0xffffffff`a3af5dfc (00007ff9`b6fdba9c)
00007ff9`b6fdba9a eb15            jmp     System_Private_CoreLib!System.Span`1[[System.__Canon, System.Private.CoreLib]]..ctor(System.__Canon[])+0xffffffff`a3af5e11 (00007ff9`b6fdbab1)
00007ff9`b6fdba9c 488bcf          mov     rcx,rdi
00007ff9`b6fdba9f 48ba40060ab7f97f0000 mov rdx,7FF9B70A0640h
00007ff9`b6fdbaa9 e85289825f      call    CoreCLR!JIT_GenericHandleClass (00007ffa`16804400)
00007ff9`b6fdbaae 488bc8          mov     rcx,rax
00007ff9`b6fdbab1 e8eaed825f      call    CoreCLR!JIT_GetRuntimeType (00007ffa`1680a8a0)
00007ff9`b6fdbab6 483bc5          cmp     rax,rbp
00007ff9`b6fdbab9 7516            jne     System_Private_CoreLib!System.Span`1[[System.__Canon, System.Private.CoreLib]]..ctor(System.__Canon[])+0xffffffff`a3af5e31 (00007ff9`b6fdbad1)
00007ff9`b6fdbabb 488d4610        lea     rax,[rsi+10h]
00007ff9`b6fdbabf 488903          mov     qword ptr [rbx],rax
00007ff9`b6fdbac2 8b4608          mov     eax,dword ptr [rsi+8]
00007ff9`b6fdbac5 894308          mov     dword ptr [rbx+8],eax
00007ff9`b6fdbac8 4883c428        add     rsp,28h
00007ff9`b6fdbacc 5b              pop     rbx
00007ff9`b6fdbacd 5d              pop     rbp
00007ff9`b6fdbace 5e              pop     rsi
00007ff9`b6fdbacf 5f              pop     rdi
00007ff9`b6fdbad0 c3              ret
00007ff9`b6fdbad1 e8a2e2d9ff      call    CLRStub[MethodDescPrestub]@7ff9b6d79d78 (00007ff9`b6d79d78)
00007ff9`b6fdbad6 cc              int     3

I didn’t even dump all the relevant codegen since it’s so big, but a few things immediately stand out:

  • Methods such as Span<T>.ctor(T[]) and Buffer.Memmove(__Canon&, __Canon&, ulong) aren’t being inlined into their callers.
  • There’s hydration of a RuntimeType instance from the type handle, possibly due to the array.GetType() != typeof(T[]) check at the beginning of the ctor.
<< Partial stack trace showing memmove not inlined >>
00 0000009a`7337ce08 00007ff9`b6fdbc56 System_Private_CoreLib!System.Buffer.Memmove[[System.__Canon, System.Private.CoreLib]](System.__Canon ByRef, System.__Canon ByRef, UInt64)+0xffffffff`a3b81660 [C:\runtime\src\libraries\System.Private.CoreLib\src\System\Buffer.cs @ 338] 
01 0000009a`7337ce10 00007ff9`b6fdbbd2 System_Private_CoreLib!System.Span`1[[System.__Canon, System.Private.CoreLib]].CopyTo(System.Span`1<System.__Canon>)+0xffffffff`a3af5976 [C:\runtime\src\libraries\System.Private.CoreLib\src\System\Span.cs @ 363] 

/cc @EgorBo in case you have any suggestions.

There is no functional problem with it. If you look at what InlinedSetCardsAfterBulkCopyHelper does, it casts the Object** to BYTE* or size_t.

Maybe the argument of InlinedSetCardsAfterBulkCopyHelper should be typed void* to make this less confusing.