runtime: Regressions in System.Memory.Span

Found as part of 5.0 -> 6.0 comparison.

Run Information

Architecture x64
OS Windows 10.0.18362
Baseline a1fd4e98c87300110a3bdf55c400ec133e9202da
Compare 84bf5bcc43fe7a95f73f4b7cee9c0e23af248dd7

Regressions in System.Memory.Span<Int32>

| Benchmark | Baseline | Test | Test/Base | Baseline IR | Compare IR | IR Ratio | Baseline ETL | Compare ETL |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SequenceCompareTo | 655.53 ns | 796.30 ns | 1.21 | | | | | |

graph Historical Data in Reporting System

Repro

git clone https://github.com/dotnet/performance.git
py .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Memory.Span<Int32>*'

Payloads

Baseline Compare

Histogram

System.Memory.Span<Int32>.SequenceCompareTo(Size: 512)


Docs

  • Profiling workflow for dotnet/runtime repository
  • Benchmarking workflow for dotnet/runtime repository

Run Information
Architecture x64
OS Windows 10.0.18362
Baseline a1fd4e98c87300110a3bdf55c400ec133e9202da
Compare 84bf5bcc43fe7a95f73f4b7cee9c0e23af248dd7

Regressions in System.Collections.Sort<BigStruct>

| Benchmark | Baseline | Test | Test/Base | Baseline IR | Compare IR | IR Ratio | Baseline ETL | Compare ETL |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| List | 9.80 μs | 12.16 μs | 1.24 | | | | | |

graph Historical Data in Reporting System

Repro

git clone https://github.com/dotnet/performance.git
py .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Collections.Sort<BigStruct>*'

Payloads

Baseline Compare

Histogram

System.Collections.Sort<BigStruct>.List(Size: 512)


Docs

  • Profiling workflow for dotnet/runtime repository
  • Benchmarking workflow for dotnet/runtime repository

Run Information

Architecture x64
OS Windows 10.0.18362
Baseline a1fd4e98c87300110a3bdf55c400ec133e9202da
Compare 84bf5bcc43fe7a95f73f4b7cee9c0e23af248dd7

Regressions in System.Net.Tests.Perf_WebUtility

| Benchmark | Baseline | Test | Test/Base | Baseline IR | Compare IR | IR Ratio | Baseline ETL | Compare ETL |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Decode_NoDecodingRequired | 62.76 ns | 72.79 ns | 1.16 | | | | Trace | Trace |

graph Historical Data in Reporting System

Repro

git clone https://github.com/dotnet/performance.git
py .\performance\scripts\benchmarks_ci.py -f netcoreapp5.0 --filter 'System.Net.Tests.Perf_WebUtility*'

Payloads

Baseline Compare

Histogram

System.Net.Tests.Perf_WebUtility.Decode_NoDecodingRequired


Baseline Jit Disasm

; System.Net.Tests.Perf_WebUtility.Decode_NoDecodingRequired()
       mov       rcx,29B98006708
       mov       rcx,[rcx]
       mov       rdx,29B98001B08
       mov       rdx,[rdx]
       jmp       near ptr System.Net.WebUtility.UrlDecodeInternal(System.String, System.Text.Encoding)
; Total bytes of code 31
; System.Net.WebUtility.UrlDecodeInternal(System.String, System.Text.Encoding)
       push      r15
       push      r14
       push      rdi
       push      rsi
       push      rbp
       push      rbx
       sub       rsp,48
       vxorps    xmm4,xmm4,xmm4
       vmovdqa   xmmword ptr [rsp+20],xmm4
       vmovdqa   xmmword ptr [rsp+30],xmm4
       xor       eax,eax
       mov       [rsp+40],rax
       mov       rsi,rcx
       test      rsi,rsi
       je        short M01_L00
       cmp       dword ptr [rsi+8],0
       jne       short M01_L01
M01_L00:
       mov       rax,rsi
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r14
       pop       r15
       ret
M01_L01:
       mov       edi,[rsi+8]
       mov       [rsp+38],edi
       mov       [rsp+30],rdx
       xor       edx,edx
       mov       [rsp+20],rdx
       mov       [rsp+3C],edx
       mov       [rsp+40],edx
       mov       [rsp+28],rdx
       xor       ebx,ebx
       xor       ebp,ebp
       xor       r14d,r14d
       test      edi,edi
       jle       near ptr M01_L15
M01_L02:
       movsxd    rdx,r14d
       movzx     r15d,word ptr [rsi+rdx*2+0C]
       cmp       r15d,2B
       jne       short M01_L03
       mov       ebp,1
       mov       r15d,20
       jmp       near ptr M01_L09
M01_L03:
       cmp       r15d,25
       jne       near ptr M01_L09
       lea       edx,[rdi+0FFFE]
       cmp       r14d,edx
       jge       near ptr M01_L09
       lea       edx,[r14+1]
       movsxd    rdx,edx
       movzx     edx,word ptr [rsi+rdx*2+0C]
       cmp       edx,100
       jge       short M01_L04
       movsxd    rdx,edx
       mov       rcx,7FF83D2D0C88
       movzx     edx,byte ptr [rdx+rcx]
       jmp       short M01_L05
M01_L04:
       mov       edx,0FF
M01_L05:
       lea       ecx,[r14+2]
       movsxd    rax,ecx
       movzx     eax,word ptr [rsi+rax*2+0C]
       cmp       eax,100
       jge       short M01_L06
       movsxd    rax,eax
       mov       r8,7FF83D2D0C88
       movzx     eax,byte ptr [rax+r8]
       jmp       short M01_L07
M01_L06:
       mov       eax,0FF
M01_L07:
       mov       r8d,edx
       or        r8d,eax
       cmp       r8d,0FF
       je        short M01_L09
       shl       edx,4
       or        edx,eax
       movzx     ebx,dl
       mov       r14d,ecx
       cmp       qword ptr [rsp+28],0
       jne       short M01_L08
       mov       edx,[rsp+38]
       movsxd    rdx,edx
       mov       rcx,offset MT_System.Byte[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       [rsp+28],rax
M01_L08:
       mov       edx,[rsp+40]
       mov       rcx,[rsp+28]
       lea       eax,[rdx+1]
       mov       [rsp+40],eax
       cmp       edx,[rcx+8]
       jae       near ptr M01_L18
       movsxd    rdx,edx
       mov       [rcx+rdx+10],bl
       mov       ebx,1
       jmp       near ptr M01_L14
M01_L09:
       test      r15d,0FF80
       jne       short M01_L11
       movzx     r15d,r15b
       cmp       qword ptr [rsp+28],0
       jne       short M01_L10
       mov       edx,[rsp+38]
       movsxd    rdx,edx
       mov       rcx,offset MT_System.Byte[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       [rsp+28],rax
M01_L10:
       mov       ecx,[rsp+40]
       mov       rax,[rsp+28]
       lea       edx,[rcx+1]
       mov       [rsp+40],edx
       cmp       ecx,[rax+8]
       jae       near ptr M01_L18
       movsxd    rcx,ecx
       mov       [rax+rcx+10],r15b
       jmp       short M01_L14
M01_L11:
       cmp       dword ptr [rsp+40],0
       jle       short M01_L12
       lea       rcx,[rsp+20]
       call      System.Net.WebUtility+UrlDecoder.FlushBytes()
M01_L12:
       cmp       qword ptr [rsp+20],0
       jne       short M01_L13
       mov       edx,[rsp+38]
       movsxd    rdx,edx
       mov       rcx,offset MT_System.Char[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       [rsp+20],rax
M01_L13:
       mov       ecx,[rsp+3C]
       mov       rdx,[rsp+20]
       lea       r8d,[rcx+1]
       mov       [rsp+3C],r8d
       cmp       ecx,[rdx+8]
       jae       short M01_L18
       movsxd    rcx,ecx
       mov       [rdx+rcx*2+10],r15w
M01_L14:
       inc       r14d
       cmp       r14d,edi
       jl        near ptr M01_L02
M01_L15:
       test      ebx,ebx
       jne       short M01_L17
       test      ebp,ebp
       je        short M01_L16
       mov       rcx,rsi
       mov       edx,2B
       mov       r8d,20
       call      System.String.Replace(Char, Char)
       nop
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r14
       pop       r15
       ret
M01_L16:
       mov       rax,rsi
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r14
       pop       r15
       ret
M01_L17:
       lea       rcx,[rsp+20]
       call      System.Net.WebUtility+UrlDecoder.GetString()
       nop
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r14
       pop       r15
       ret
M01_L18:
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 622

Compare Jit Disasm

; System.Net.Tests.Perf_WebUtility.Decode_NoDecodingRequired()
       mov       rcx,1B19ECE6708
       mov       rcx,[rcx]
       mov       rdx,1B19ECE1B10
       mov       rdx,[rdx]
       jmp       near ptr System.Net.WebUtility.UrlDecodeInternal(System.String, System.Text.Encoding)
; Total bytes of code 31
; System.Net.WebUtility.UrlDecodeInternal(System.String, System.Text.Encoding)
       push      r15
       push      r14
       push      r13
       push      r12
       push      rdi
       push      rsi
       push      rbp
       push      rbx
       sub       rsp,48
       vxorps    xmm4,xmm4,xmm4
       vmovdqa   xmmword ptr [rsp+20],xmm4
       vmovdqa   xmmword ptr [rsp+30],xmm4
       xor       eax,eax
       mov       [rsp+40],rax
       mov       rsi,rcx
       test      rsi,rsi
       je        short M01_L00
       mov       edi,[rsi+8]
       test      edi,edi
       jne       short M01_L01
M01_L00:
       mov       rax,rsi
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r13
       pop       r14
       pop       r15
       ret
M01_L01:
       mov       edi,[rsi+8]
       mov       [rsp+38],edi
       mov       [rsp+30],rdx
       xor       edx,edx
       mov       [rsp+20],rdx
       mov       [rsp+3C],edx
       mov       [rsp+40],edx
       mov       [rsp+28],rdx
       xor       ebx,ebx
       xor       ebp,ebp
       xor       r14d,r14d
       test      edi,edi
       jle       near ptr M01_L13
M01_L02:
       movsxd    rdx,r14d
       movzx     r15d,word ptr [rsi+rdx*2+0C]
       cmp       r15d,2B
       jne       short M01_L03
       mov       ebp,1
       mov       r15d,20
       jmp       near ptr M01_L07
M01_L03:
       cmp       r15d,25
       jne       near ptr M01_L07
       lea       edx,[rdi+0FFFE]
       cmp       r14d,edx
       jge       near ptr M01_L07
       lea       edx,[r14+1]
       movsxd    rdx,edx
       movzx     edx,word ptr [rsi+rdx*2+0C]
       cmp       edx,100
       jge       near ptr M01_L16
       movsxd    rdx,edx
       mov       rcx,7FF828D6E728
       movzx     r12d,byte ptr [rdx+rcx]
M01_L04:
       lea       r13d,[r14+2]
       movsxd    rdx,r13d
       movzx     edx,word ptr [rsi+rdx*2+0C]
       cmp       edx,100
       jge       near ptr M01_L17
       movsxd    rdx,edx
       mov       rcx,7FF828D6E728
       movzx     edx,byte ptr [rdx+rcx]
M01_L05:
       mov       ecx,r12d
       or        ecx,edx
       cmp       ecx,0FF
       je        short M01_L07
       shl       r12d,4
       or        edx,r12d
       movzx     ebx,dl
       mov       r14d,r13d
       cmp       qword ptr [rsp+28],0
       jne       short M01_L06
       mov       edx,[rsp+38]
       movsxd    rdx,edx
       mov       rcx,offset MT_System.Byte[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       [rsp+28],rax
M01_L06:
       mov       edx,[rsp+40]
       mov       rcx,[rsp+28]
       lea       eax,[rdx+1]
       mov       [rsp+40],eax
       cmp       edx,[rcx+8]
       jae       near ptr M01_L18
       movsxd    rdx,edx
       mov       [rcx+rdx+10],bl
       mov       ebx,1
       jmp       near ptr M01_L12
M01_L07:
       test      r15d,0FF80
       jne       short M01_L09
       movzx     r15d,r15b
       cmp       qword ptr [rsp+28],0
       jne       short M01_L08
       mov       edx,[rsp+38]
       movsxd    rdx,edx
       mov       rcx,offset MT_System.Byte[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       [rsp+28],rax
M01_L08:
       mov       ecx,[rsp+40]
       mov       rax,[rsp+28]
       lea       edx,[rcx+1]
       mov       [rsp+40],edx
       cmp       ecx,[rax+8]
       jae       near ptr M01_L18
       movsxd    rcx,ecx
       mov       [rax+rcx+10],r15b
       jmp       short M01_L12
M01_L09:
       cmp       dword ptr [rsp+40],0
       jle       short M01_L10
       lea       rcx,[rsp+20]
       call      System.Net.WebUtility+UrlDecoder.FlushBytes()
M01_L10:
       cmp       qword ptr [rsp+20],0
       jne       short M01_L11
       mov       edx,[rsp+38]
       movsxd    rdx,edx
       mov       rcx,offset MT_System.Char[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       [rsp+20],rax
M01_L11:
       mov       ecx,[rsp+3C]
       mov       rdx,[rsp+20]
       lea       r8d,[rcx+1]
       mov       [rsp+3C],r8d
       cmp       ecx,[rdx+8]
       jae       near ptr M01_L18
       movsxd    rcx,ecx
       mov       [rdx+rcx*2+10],r15w
M01_L12:
       inc       r14d
       cmp       r14d,edi
       jl        near ptr M01_L02
M01_L13:
       test      ebx,ebx
       jne       short M01_L15
       test      ebp,ebp
       je        short M01_L14
       mov       rcx,rsi
       mov       edx,2B
       mov       r8d,20
       call      System.String.Replace(Char, Char)
       nop
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r13
       pop       r14
       pop       r15
       ret
M01_L14:
       mov       rax,rsi
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r13
       pop       r14
       pop       r15
       ret
M01_L15:
       lea       rcx,[rsp+20]
       call      System.Net.WebUtility+UrlDecoder.GetString()
       nop
       add       rsp,48
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r13
       pop       r14
       pop       r15
       ret
M01_L16:
       mov       r12d,0FF
       jmp       near ptr M01_L04
M01_L17:
       mov       edx,0FF
       jmp       near ptr M01_L05
M01_L18:
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 663

Docs

  • Profiling workflow for dotnet/runtime repository
  • Benchmarking workflow for dotnet/runtime repository

category:performance theme:benchmarks

About this issue

  • Original URL
  • State: open
  • Created 3 years ago
  • Comments: 19 (19 by maintainers)

Most upvoted comments

One other trick I found useful. I have before and after jits and they’re compatible. So I can run using one build variant (say before) and load up the other variant’s jit as the altjit, and experiment with fine-grained control over which jit handles which methods.

So if I allow the after jit to compile PickPivotAndPartition in the before build, I get slow runs; if I allow the before jit to compile PickPivotAndPartition in the after build, I get fast runs.

To make this work you have to copy the respective checked jits into the release test roots, because selective alt-jitting is only possible with checked jits, and you have to copy in the other build’s checked jit under a different name.

E.g. (here “2” is after, “3” is before):

;; "before"

dotnet run -c Release -f net5.0 -- --filter System.Collections.Sort^<BigStruct^>.List 
    --corerun  c:\repos\runtime3\artifacts\tests\coreclr\windows.x64.Release\Tests\Core_Root\corerun.exe

==> 7.5 us

;; "before" using "after" for PPP

dotnet run -c Release -f net5.0 -- --filter System.Collections.Sort^<BigStruct^>.List 
    --corerun  c:\repos\runtime3\artifacts\tests\coreclr\windows.x64.Release\Tests\Core_Root\corerun.exe 
    --envVars COMPlus_AltJitName:clrjit2.dll COMPlus_AltJit:PickPivotAndPartition

==> 9.4 us

;; "after"

dotnet run -c Release -f net5.0 -- --filter System.Collections.Sort^<BigStruct^>.List 
    --corerun  c:\repos\runtime2\artifacts\tests\coreclr\windows.x64.Release\Tests\Core_Root\corerun.exe

==> 9.5 us

;; "after" using "before" for PPP

dotnet run -c Release -f net5.0 -- --filter System.Collections.Sort^<BigStruct^>.List 
    --corerun  c:\repos\runtime2\artifacts\tests\coreclr\windows.x64.Release\Tests\Core_Root\corerun.exe 
    --envVars COMPlus_AltJitName:clrjit3.dll COMPlus_AltJit:PickPivotAndPartition 

==> 7.5 us

Agree – I need to drill in.

Interesting that the Byte and Char version of CompareTo were unaffected.