async-profiler: Unknown Java Frame on JDK-8180450 reproducer
MInimal (benchmark) code to reproduce it using
# JMH version: 1.35
# VM version: JDK 11.0.16.1, OpenJDK 64-Bit Server VM, 11.0.16.1+1
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
@State(Scope.Thread)
@Measurement(iterations = 10, time = 200, timeUnit = TimeUnit.MILLISECONDS)
@Warmup(iterations = 10, time = 200, timeUnit = TimeUnit.MILLISECONDS)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Fork(2)
public class RequireNonNullCheckcastScalability {
public interface InternalContext extends Context {
// Internal Framework API
boolean isDuplicated();
}
public interface Context {
// some public API
}
public static class DuplicatedContext implements InternalContext {
@Override
public boolean isDuplicated() {
return true;
}
}
public static class NonDuplicatedContext implements InternalContext {
@Override
public boolean isDuplicated() {
return false;
}
}
private Context ctx;
@Setup
public void init(Blackhole bh) {
ctx = new NonDuplicatedContext();
// let's warm it enough to get it compiled with C2 (by default)
for (int i = 0; i < 11000; i++) {
bh.consume(isDuplicated());
}
// deopt on warmup
ctx = new DuplicatedContext();
}
private static boolean isDuplicatedContext(Context message) {
Context actual = Objects.requireNonNull(message);
return ((InternalContext) actual).isDuplicated();
}
@Benchmark
@Threads(2)
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public boolean isDuplicated() {
return isDuplicatedContext(ctx);
}
}
Flamegraph is
Reporting a huge number of unknown Java frames.
perfasm
shows that:
....[Hottest Region 1]..............................................................................
c2, level 4, io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated, version 2, compile id 546
0x00007fc3dc300d44: shl $0x3,%r10
0x00007fc3dc300d48: movabs $0x800000000,%r12
0x00007fc3dc300d52: add %r12,%r10
0x00007fc3dc300d55: xor %r12,%r12
0x00007fc3dc300d58: cmp %r10,%rax
0x00007fc3dc300d5b: jne 0x00007fc3d484b080 ; {runtime_call ic_miss_stub}
0x00007fc3dc300d61: data16 xchg %ax,%ax
0x00007fc3dc300d64: nopl 0x0(%rax,%rax,1)
0x00007fc3dc300d6c: data16 data16 xchg %ax,%ax
[Verified Entry Point]
0.24% 0x00007fc3dc300d70: mov %eax,-0x14000(%rsp)
0.36% 0x00007fc3dc300d77: push %rbp
0x00007fc3dc300d78: sub $0x20,%rsp ;*synchronization entry
; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@-1 (line 63)
0.71% 0x00007fc3dc300d7c: mov 0xc(%rsi),%r11d ;*getfield ctx {reexecute=0 rethrow=0 return_oop=0}
; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@1 (line 63)
0x00007fc3dc300d80: mov 0x8(%r12,%r11,8),%r10d ; implicit exception: dispatches to 0x00007fc3dc300e58
0.44% 0x00007fc3dc300d85: movabs $0x800000000,%rsi
0x00007fc3dc300d8f: lea (%rsi,%r10,8),%rsi
0.50% 0x00007fc3dc300d93: mov 0x20(%rsi),%r10
2.87% 0x00007fc3dc300d97: movabs $0x840091818,%rax ; {metadata('io/forked/franz/benchmarks/RequireNonNullCheckcastScalability$Context')}
0x00007fc3dc300da1: cmp %rax,%r10
╭ 0x00007fc3dc300da4: jne 0x00007fc3dc300ddb ;*checkcast {reexecute=0 rethrow=0 return_oop=0}
│ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicatedContext@4 (line 55)
│ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@4 (line 63)
0.21% │ ↗ 0x00007fc3dc300da6: movabs $0x840091a18,%rax ; {metadata('io/forked/franz/benchmarks/RequireNonNullCheckcastScalability$InternalContext')}
│ │ 0x00007fc3dc300db0: cmp %rax,%r10
│╭ │ 0x00007fc3dc300db3: jne 0x00007fc3dc300e0c
0.03% ││ │ ↗ 0x00007fc3dc300db5: lea (%r12,%r11,8),%rbp ;*checkcast {reexecute=0 rethrow=0 return_oop=0}
││ │ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicatedContext@9 (line 56)
││ │ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@4 (line 63)
0.41% ││ │ │ 0x00007fc3dc300db9: mov 0x8(%rbp),%r11d
1.66% ││ │ │ 0x00007fc3dc300dbd: cmp $0x80123c8,%r11d ; {metadata('io/forked/franz/benchmarks/RequireNonNullCheckcastScalability$DuplicatedContext')}
││╭ │ │ 0x00007fc3dc300dc4: jne 0x00007fc3dc300e3c ;*invokeinterface isDuplicated {reexecute=0 rethrow=0 return_oop=0}
│││ │ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicatedContext@12 (line 56)
│││ │ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@4 (line 63)
0.62% │││ │ │ 0x00007fc3dc300dc6: mov $0x1,%eax ;*synchronization entry
│││ │ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicatedContext@-1 (line 55)
│││ │ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@4 (line 63)
│││ │ │ 0x00007fc3dc300dcb: add $0x20,%rsp
│││ │ │ 0x00007fc3dc300dcf: pop %rbp
0.77% │││ │ │ 0x00007fc3dc300dd0: mov 0x108(%r15),%r10
│││ │ │ 0x00007fc3dc300dd7: test %eax,(%r10) ; {poll_return}
2.43% │││ │ │ 0x00007fc3dc300dda: retq
0.33% ↘││ │ │ 0x00007fc3dc300ddb: push %rax
││ │ │ 0x00007fc3dc300ddc: mov %rax,%rax
││ │ │ 0x00007fc3dc300ddf: mov 0x28(%rsi),%rdi
20.88% ││ │ │ 0x00007fc3dc300de3: mov (%rdi),%ecx
1.18% ││ │ │ 0x00007fc3dc300de5: add $0x8,%rdi
││ │ │ 0x00007fc3dc300de9: test %rax,%rax
││ │ │ 0x00007fc3dc300dec: repnz scas %es:(%rdi),%rax
11.16% ││ │ │ 0x00007fc3dc300def: pop %rax
2.72% ││╭│ │ 0x00007fc3dc300df0: jne 0x00007fc3dc300dfa
││││ │ 0x00007fc3dc300df6: mov %rax,0x20(%rsi)
0.36% ││↘╰ │ 0x00007fc3dc300dfa: je 0x00007fc3dc300da6
││ │ 0x00007fc3dc300dfc: mov $0xffffffde,%esi
││ │ 0x00007fc3dc300e01: mov %r11d,%ebp
││ │ 0x00007fc3dc300e04: data16 xchg %ax,%ax
││ │ 0x00007fc3dc300e07: callq 0x00007fc3d4849e00 ; ImmutableOopMap{rbp=NarrowOop }
││ │ ;*checkcast {reexecute=0 rethrow=0 return_oop=0}
││ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicatedContext@4 (line 55)
││ │ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@4 (line 63)
││ │ ; {runtime_call UncommonTrapBlob}
0.18% ↘│ │ 0x00007fc3dc300e0c: push %rax
│ │ 0x00007fc3dc300e0d: mov %rax,%rax
│ │ 0x00007fc3dc300e10: mov 0x28(%rsi),%rdi
20.08% │ │ 0x00007fc3dc300e14: mov (%rdi),%ecx
1.36% │ │ 0x00007fc3dc300e16: add $0x8,%rdi
│ │ 0x00007fc3dc300e1a: test %rax,%rax
0.98% │ │ 0x00007fc3dc300e1d: repnz scas %es:(%rdi),%rax
10.75% │ │ 0x00007fc3dc300e20: pop %rax
3.35% │ ╭│ 0x00007fc3dc300e21: jne 0x00007fc3dc300e2b
│ ││ 0x00007fc3dc300e27: mov %rax,0x20(%rsi)
0.44% │ ↘╰ 0x00007fc3dc300e2b: je 0x00007fc3dc300db5
│ 0x00007fc3dc300e2d: mov $0xffffffde,%esi
│ 0x00007fc3dc300e32: mov %r11d,%ebp
│ 0x00007fc3dc300e35: xchg %ax,%ax
│ 0x00007fc3dc300e37: callq 0x00007fc3d4849e00 ; ImmutableOopMap{rbp=NarrowOop }
│ ;*checkcast {reexecute=0 rethrow=0 return_oop=0}
│ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicatedContext@9 (line 56)
│ ; - io.forked.franz.benchmarks.RequireNonNullCheckcastScalability::isDuplicated@4 (line 63)
│ ; {runtime_call UncommonTrapBlob}
↘ 0x00007fc3dc300e3c: cmp $0x8012383,%r11d ; {metadata('io/forked/franz/benchmarks/RequireNonNullCheckcastScalability$NonDuplicatedContext')}
0x00007fc3dc300e43: jne 0x00007fc3dc300e4c
....................................................................................................
85.02% <total for region 1>
The most of the cost is (as expected) while interacting with Klass::secondary_super_cache and Klass::secondary_supers (array’s length read), that’s the code injected by https://github.com/openjdk/jdk11/blob/37115c8ea4aff13a8148ee2b8832b20888a5d880/src/hotspot/cpu/x86/macroAssembler_x86.cpp#L5503
How to improve the accuracy of the profiler’s output? Is it AGCT to be blamed instead?
Many thanks!!
About this issue
- Original URL
- State: closed
- Created 2 years ago
- Reactions: 1
- Comments: 36 (11 by maintainers)
Commits related to this issue
- #673: Try adjusting SP when AGCT fails — committed to async-profiler/async-profiler by apangin 2 years ago
- Avoid slow type checks against promises on outbound buffer's progress Motivation: ChannelOutboundBuffer's progress perform expensive type checks that can reduce scalability or just performing wastef... — committed to franz1981/netty by franz1981 a year ago
- Avoid slow type checks against promises on outbound buffer's progress (#13225) Motivation: ChannelOutboundBuffer's progress perform expensive type checks that can reduce scalability or just perf... — committed to netty/netty by franz1981 a year ago
I verified that async-profiler 2.7 indeed recovers the problematic stack trace (due to trial and error technique mentioned above). Also, with JDK 19, both async-profiler 2.7 and 2.8.x show correct stack trace. As expected, the bottleneck is two
checkcast
instructions.The backport seems to fix the issue with async-profiler and JDK 11:
The backport applied clean and can be seen at https://github.com/openjdk/jdk11u-dev/pull/1524.
It is running over night in the SapMachine CI.
@franz1981 The linked code also changes SP and therefore breaks AGCT. As of now, there is no a good solution in HotSpot. Ideally, JVM should record a table that helps to find a proper sender SP from the current instruction address - something like Frame Description Entry in DWARF.
Async-profiler handles certain special cases on its own, e.g., arraycopy and itable stubs. But it’s hard to make a universal solution without JVM support, hence
[unknown_Java]
for less popular stubs. Async-profiler previously used to guess the correct SP by trial and error, but since AGCT is not protected well from bad input, such guessing could result in JVM crash. I therefore gave up this idea (after all,[unknown_Java]
is better than a crash). It probably makes sense to add an option (disabled by default) to bring back the SP finding heuristics.@apangin @parttimenerd I looked into the relevant code in JDK head and found that this unnecessary push of rax has been fixed in jdk19 and backported to jdk17u – see https://bugs.openjdk.org/browse/JDK-8287425.
I am going to propose the jdk17u fix for backport to jdk11u. This will not remove the need for the async stack walker to implement fuzzing – it will still be needed for deployments on other legacy jdk releases. However, it will avoid the need to actually try fuzzing when using jdk11u releases that include the backport.
Thanks everyone I now happily close it: mystery solved and everyone happy 😉
@parttimenerd I will be happy to help if you need assistance with the request.
I linked it and requested a backport, I’ll backport this small change myself if need be.
@apangin @parttimenerd Those pushes and pops are a really bad idea but I’m not sure how we easily we will be able to avoid them. There are a few places where generated code has to introduce pushes and pops in order to deal with the lack of spare registers. This is not a problem for JVM runtime code because it never chases stack frames inside these push/pop regions. However, it is clearly a problem for async walking because it invalidates the assumption that the stack can be walked by subtracting fixed (per nmethod) frame sizes. This really should be looked into further by Hotspot runtime devs.
@jpbempel
DebugNonSafepoints
doesn’t help, since the issue is not in the mapping between machine code and bci, but in the ability to compute SP of the sender frame.SP + frame_size
yields incorrect result, since SP is temporarily adjusted by the stub.perf works, since it relies on FP for stack walking rather than SP, but that requires
PreserveFramePointer
option.