runtime: Segfault when attaching profiler

Description

See initial thread: https://github.com/dotnet/diagnostics/issues/3600

While developing CLR profilers, I encountered a segfault that happens systematically when attaching for the second time to a dotnet process in Linux (Debian 11). I don’t know if this because it’s running in docker or not.

I managed to take a core dump that you can get here: segfault_coredump.zip

Using lldb + sos, I was able to load the symbols and here the stacktrace for the thread that ends in segfault:

* thread #1, name = 'dotnet', stop reason = signal SIGSEGV
  * frame #0: 0x00007f8c755eb06c libcoreclr.so`EESocketCleanupHelper(bool) [inlined] InterlockedOr(Destination=0x0000000000000008, Value=64) at pal.h:3748:1
    frame #1: 0x00007f8c755eb06c libcoreclr.so`EESocketCleanupHelper(bool) [inlined] Thread::SetThreadState(this=0x0000000000000000, ts=TS_ExecutingOnAltStack) at threads.h:1060
    frame #2: 0x00007f8c755eb06c libcoreclr.so`EESocketCleanupHelper(bool) [inlined] Thread::SetExecutingOnAltStack(this=0x0000000000000000) at threads.h:1257
    frame #3: 0x00007f8c755eb06c libcoreclr.so`EESocketCleanupHelper(isExecutingOnAltStack=<unavailable>) at ceemain.cpp:560
    frame #4: 0x00007f8c756020b8 libcoreclr.so`sigsegv_handler(int, siginfo_t*, void*) [inlined] invoke_previous_action(action=<unavailable>, code=11, siginfo=0x00007f8c73fd1bf0, context=0x00007f8c73fd1ac0, signalRestarts=true) at signal.cpp:430:5
    frame #5: 0x00007f8c7560204e libcoreclr.so`sigsegv_handler(code=11, siginfo=0x00007f8c73fd1bf0, context=0x00007f8c73fd1ac0) at signal.cpp:639
    frame #6: 0x00007f8c75cf1140 libpthread.so.0`__restore_rt
    frame #7: 0x00007f8c75d0de7a ld-linux-x86-64.so.2`___lldb_unnamed_symbol38$$ld-linux-x86-64.so.2 + 10
    frame #8: 0x00007f8c75d0e3a4 ld-linux-x86-64.so.2`___lldb_unnamed_symbol39$$ld-linux-x86-64.so.2 + 932
    frame #9: 0x00007f8c75d0ece1 ld-linux-x86-64.so.2`___lldb_unnamed_symbol40$$ld-linux-x86-64.so.2 + 289
    frame #10: 0x00007f8c7590e3a4 libc.so.6`___lldb_unnamed_symbol1115$$libc.so.6 + 116
    frame #11: 0x00007f8c75cd93b4 libdl.so.2`___lldb_unnamed_symbol6$$libdl.so.2 + 20
    frame #12: 0x00007f8c7590ea90 libc.so.6`_dl_catch_exception + 128
    frame #13: 0x00007f8c7590eb4f libc.so.6`_dl_catch_error + 47
    frame #14: 0x00007f8c75cd9a65 libdl.so.2`___lldb_unnamed_symbol11$$libdl.so.2 + 101
    frame #15: 0x00007f8c75cd941c libdl.so.2`dlsym + 92
    frame #16: 0x00007f8c7560e44c libcoreclr.so`::GetProcAddress(hModule=0x00007f8be8002790, lpProcName="") at module.cpp:333:33
    frame #17: 0x00007f8c754d56de libcoreclr.so`FakeCoCreateInstanceEx(_GUID const&, char16_t const*, _GUID const&, void**, void**) [inlined] (anonymous namespace)::FakeCoCallDllGetClassObject(rclsid=<unavailable>, wszDllPath=u"/tmp/dr-dotnet/libprofilers.so", riid=<unavailable>, ppv=0x00007f8c747d13c8, phmodDll=<unavailable>) at util.cpp:224:68
    frame #18: 0x00007f8c754d5639 libcoreclr.so`FakeCoCreateInstanceEx(rclsid=0x00007f8be800794c, wszDllPath=u"/tmp/dr-dotnet/libprofilers.so", riid=<unavailable>, ppv=0x00007f8c747d1678, phmodDll=0x00007f8c747d1658) at util.cpp:292
    frame #19: 0x00007f8c75319bea libcoreclr.so`EEToProfInterfaceImpl::CreateProfiler(_GUID const*, char const*, char16_t const*) [inlined] CoCreateProfiler(pClsid=<unavailable>, szClsid=<unavailable>, wszProfileDLL=<unavailable>, ppCallback=<unavailable>, phmodProfilerDLL=0x00007f8c747d1658) at eetoprofinterfaceimpl.cpp:293:10
    frame #20: 0x00007f8c75319b9e libcoreclr.so`EEToProfInterfaceImpl::CreateProfiler(this=0x00007f8be8007590, pClsid=<unavailable>, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfileDLL=<unavailable>) at eetoprofinterfaceimpl.cpp:667
    frame #21: 0x00007f8c75319908 libcoreclr.so`EEToProfInterfaceImpl::Init(this=0x00007f8be8007590, pProfToEE=0x00007f8be8007380, pClsid=0x00007f8be800794c, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfileDLL=u"/tmp/dr-dotnet/libprofilers.so", fLoadedViaAttach=<unavailable>, dwConcurrentGCWaitTimeoutInMs=10) at eetoprofinterfaceimpl.cpp:581:14
    frame #22: 0x00007f8c75388d21 libcoreclr.so`ProfilingAPIUtility::DoPreInitialization(pEEProf=0x00007f8be8007590, pClsid=0x00007f8be800794c, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfilerDLL=u"/tmp/dr-dotnet/libprofilers.so", loadType=kAttachLoad, dwConcurrentGCWaitTimeoutInMs=10) at profilinghelper.cpp:989:19
    frame #23: 0x00007f8c75388779 libcoreclr.so`ProfilingAPIUtility::LoadProfiler(loadType=kAttachLoad, pClsid=0x00007f8be800794c, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfilerDLL=u"/tmp/dr-dotnet/libprofilers.so", pvClientData=0x00007f8be80014aa, cbClientData=<unavailable>, dwConcurrentGCWaitTimeoutInMs=10) at profilinghelper.cpp:1133:10
    frame #24: 0x00007f8c755e0a74 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(_DiagnosticsIpcMessage*, _DiagnosticsIpcStream*) [inlined] ProfilingAPIUtility::LoadProfilerForAttach(pClsid=<unavailable>, wszProfilerDLL=u"/tmp/dr-dotnet/libprofilers.so", pvClientData=0x00007f8be80014aa, cbClientData=37, dwConcurrentGCWaitTimeoutInMs=10) at profilinghelper.inl:183:12
    frame #25: 0x00007f8c755e0a47 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(_DiagnosticsIpcMessage*, _DiagnosticsIpcStream*) at ds-rt-coreclr.h:294
    frame #26: 0x00007f8c755e0a47 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(_DiagnosticsIpcMessage*, _DiagnosticsIpcStream*) [inlined] profiler_protocol_helper_attach_profiler(message=<unavailable>, stream=0x00007f8be8007570) at ds-profiler-protocol.c:129
    frame #27: 0x00007f8c755e0938 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(message=<unavailable>, stream=0x00007f8be8007570) at ds-profiler-protocol.c:269
    frame #28: 0x00007f8c755dbfb1 libcoreclr.so`server_thread(data=<unavailable>) at ds-server.c:167:4
    frame #29: 0x00007f8c7563bfee libcoreclr.so`CorUnix::CPalThread::ThreadEntry(pvParam=0x000055ec7ccbb580) at thread.cpp:1829:16
    frame #30: 0x00007f8c75ce5ea7 libpthread.so.0`start_thread + 215
    frame #31: 0x00007f8c758d4a2f libc.so.6`__clone + 63

Reproduction Steps

Reproduction is currently a little complex, but I believe it could be reproduced by attaching twice in linux (eg: tweaking the CLR GC profiler test to attach, detach and reattach).

Expected behavior

Attach without segfault

Actual behavior

Segfault and crash the profiler application

Regression?

No response

Known Workarounds

No response

Configuration

The bug was observed under net 6.0 and net 7.0. I haven’t tested on earlier versions. Linux version is Debian 11 (the one that is shipped with the CLR on Microsoft docker repository)

Other information

No response

About this issue

  • Original URL
  • State: closed
  • Created a year ago
  • Comments: 27 (27 by maintainers)

Most upvoted comments

I’m going to go ahead and close this issue for now because it sounds like we’ve got a good understanding on what was causing the segfault. Still happy to chat if further questions arise, and if your future investigation does find something that we should be addressing in the runtime we can either re-open this issue or open a new one as needed. Thanks everyone!

Hmm… as long as you are happy with your solution don’t feel obliged to do anything further here, but in case you did want to keep going down the rabbit hole I’m guessing a tool like strace would show all the calls to dlopen/dlclose (or maybe some underlying operation like mmap).

In terms of the difference in behavior between File.Copy( overwrite:true) and File.Delete() + File.Copy(), I’d guess the delete syscall on a file mapped into memory causes the OS to unbind the file inode/file descriptor from the filesystem path, but the file content still exists on the physical disk until all the open descriptors are closed. Effectively this would create a private snapshot of the old file contents until the library is unmapped. On the other hand directly issuing the write syscall via the Copy() is telling the OS to modify the content of the existing file inode and that is visible to any open file descriptors that are reading from the file. Take all that with a big grain of salt, I have no in-depth knowledge of the Linux filesystem.

@ogxd - Glad you were able to get to the bottom of it and very appreciated for sharing your findings!

As for dlcose, perhaps profiler detach would help? @davmason - Did we get profiler detach working for .NET Core? I thought we did but maybe my memory playing tricks on me.

@ogxd you can try to set LD_DEBUG=all and then capture the output from both runs. Maybe use LD_DEBUG_OUTPUT env var to let the output go to a specific file instead of console.

Thanks for the update! Do you know if the same issue reproduces if you write dlopen() and dlsym() directly in C++ rather than using .NET wrappers? My guess is that it will reproduce in C++ as well indicating that this is an underlying Linux issue, but if it works fine in C++ then it we’d need to investigate what CLR is doing differently.

If its useful for comparison purposes, you can also build C++ profilers we use for testing from the source here: https://github.com/dotnet/runtime/tree/main/src/tests/profiler/native

If memory that backs the symbol table is being corrupted by something its possible you would be able to observe when it is happening by capturing CORE dumps before and after your profiler is loaded. Of course there are probably other things that are legitimately updating the symbol table so I don’t know how easy it would be to separate correct modifications from the incorrect ones. Another possibility is you might be able to create a simpler repro that doesn’t need coreclr at all with just a trivial app that invokes dlopen() + dlsym() on your profiler library. The little bit of logic where the runtime does that is here: https://github.com/dotnet/runtime/blob/1350579859435ee18bd530e18022a880ae418132/src/coreclr/utilcode/util.cpp#L219-L224

The Windows-looking LoadLibrary/GetProcAddr calls get forwarded to their Linux equivalents in the coreclr PAL. Sadly if it is memory corruption there is no guarantee the trivial app will reproduce the problem, but it wouldn’t require too much effort to do the experiment.

I am not familiar with how profiler attach / detach works. My wild guess is that some native library gets freed (refcount getting to zero) and it somehow screws the symbol table walks that dlsym does. I have unassigned myself for now, please feel free to pull me back if you’ll need any additional help with the primary issue investigations / reasoning.

Thanks @janvorli! Adding @davmason.

Interesting… I’ll check if there is something in my profiler implementation that could lead to this when unloading.

Thanks @ogxd! Please let us know if more help is needed. For now I will mark this issue as “needs-author-action”

Thanks a lot for this analysis @janvorli

My wild guess is that some native library gets freed (refcount getting to zero) and it somehow screws the symbol table walks that dlsym does.

Interesting… I’ll check if there is something in my profiler implementation that could lead to this when unloading.

@dotnet/dotnet-diag I have unassigned myself for now, please feel free to pull me back if you’ll need any additional help with the primary issue investigations / reasoning.

This is the stack trace with symbol packages of libc6:

(lldb) bt
* thread #1, name = 'dotnet', stop reason = signal SIGSEGV
    frame #0: 0x00007f82b4b4606c libcoreclr.so`EESocketCleanupHelper(bool) [inlined] InterlockedOr(Destination=0x0000000000000008, Value=64) at pal.h:3748:1
    frame #1: 0x00007f82b4b4606c libcoreclr.so`EESocketCleanupHelper(bool) [inlined] Thread::SetThreadState(this=0x0000000000000000, ts=TS_ExecutingOnAltStack) at threads.h:1060
    frame #2: 0x00007f82b4b4606c libcoreclr.so`EESocketCleanupHelper(bool) [inlined] Thread::SetExecutingOnAltStack(this=0x0000000000000000) at threads.h:1257
    frame #3: 0x00007f82b4b4606c libcoreclr.so`EESocketCleanupHelper(isExecutingOnAltStack=<unavailable>) at ceemain.cpp:560
    frame #4: 0x00007f82b4b5d0b8 libcoreclr.so`sigsegv_handler(int, siginfo_t*, void*) [inlined] invoke_previous_action(action=<unavailable>, code=11, siginfo=0x00007f82b352cbf0, context=0x00007f82b352cac0, signalRestarts=true) at signal.cpp:430:5
    frame #5: 0x00007f82b4b5d04e libcoreclr.so`sigsegv_handler(code=11, siginfo=0x00007f82b352cbf0, context=0x00007f82b352cac0) at signal.cpp:639
    frame #6: 0x00007f82b524c140 libpthread.so.0`__restore_rt
    frame #7: 0x00007f82b5268e7a ld-linux-x86-64.so.2`check_match(undef_name="", ref=0x0000000000000000, version=0x0000000000000000, flags=2, type_class=0, sym=0x0000000000000ab0, symidx=84, strtab="", map=0x00007f8228002100, versioned_sym=0x00007f82b3d2bf48, num_versions=0x00007f82b3d2bf44) at dl-lookup.c:78:7
  * frame #8: 0x00007f82b52693a4 ld-linux-x86-64.so.2`do_lookup_x(undef_name="", new_hash=2743789806, old_hash=0x00007f82b3d2c000, ref=0x0000000000000000, result=0x00007f82b3d2c010, scope=<unavailable>, i=<unavailable>, version=0x0000000000000000, flags=2, skip=<unavailable>, type_class=0, undef_map=0x00007f8228002100) at dl-lookup.c:436:10
    frame #9: 0x00007f82b5269ce1 ld-linux-x86-64.so.2`_dl_lookup_symbol_x(undef_name="", undef_map=0x00007f8228002100, ref=0x00007f82b3d2c0c8, symbol_scope=0x00007f8228002498, version=0x0000000000000000, type_class=0, flags=2, skip_map=0x0000000000000000) at dl-lookup.c:861:9
    frame #10: 0x00007f82b4e693a4 libc.so.6`do_sym(handle=<unavailable>, name="", who=0x00007f82b4b6944c, vers=0x0000000000000000, flags=2) at dl-sym.c:165:16
    frame #11: 0x00007f82b4e6988d libc.so.6`_dl_sym(handle=<unavailable>, name=<unavailable>, who=<unavailable>) at dl-sym.c:274:10 [artificial]
    frame #12: 0x00007f82b52343b4 libdl.so.2`dlsym_doit(a=0x00007f82b3d2c300) at dlsym.c:50:15
    frame #13: 0x00007f82b4e69a90 libc.so.6`__GI__dl_catch_exception(exception=0x00007f82b3d2c2a0, operate=(libdl.so.2`dlsym_doit at dlsym.c:47:1), args=0x00007f82b3d2c300) at dl-error-skeleton.c:208:8
    frame #14: 0x00007f82b4e69b4f libc.so.6`__GI__dl_catch_error(objname=0x00007f82280020e0, errstring=0x00007f82280020e8, mallocedp=0x00007f82280020d8, operate=(libdl.so.2`dlsym_doit at dlsym.c:47:1), args=0x00007f82b3d2c300) at dl-error-skeleton.c:227:19
    frame #15: 0x00007f82b5234a65 libdl.so.2`_dlerror_run(operate=(libdl.so.2`dlsym_doit at dlsym.c:47:1), args=0x00007f82b3d2c300) at dlerror.c:170:21
    frame #16: 0x00007f82b523441c libdl.so.2`__dlsym(handle=<unavailable>, name=<unavailable>) at dlsym.c:70:19
    frame #17: 0x00007f82b4b6944c libcoreclr.so`::GetProcAddress(hModule=0x00007f8228002890, lpProcName="") at module.cpp:333:33
    frame #18: 0x00007f82b4a306de libcoreclr.so`FakeCoCreateInstanceEx(_GUID const&, char16_t const*, _GUID const&, void**, void**) [inlined] (anonymous namespace)::FakeCoCallDllGetClassObject(rclsid=<unavailable>, wszDllPath=u"/tmp/dr-dotnet/libprofilers.so", riid=<unavailable>, ppv=0x00007f82b3d2c3c8, phmodDll=<unavailable>) at util.cpp:224:68
    frame #19: 0x00007f82b4a30639 libcoreclr.so`FakeCoCreateInstanceEx(rclsid=0x00007f8228007a4c, wszDllPath=u"/tmp/dr-dotnet/libprofilers.so", riid=<unavailable>, ppv=0x00007f82b3d2c678, phmodDll=0x00007f82b3d2c658) at util.cpp:292
    frame #20: 0x00007f82b4874bea libcoreclr.so`EEToProfInterfaceImpl::CreateProfiler(_GUID const*, char const*, char16_t const*) [inlined] CoCreateProfiler(pClsid=<unavailable>, szClsid=<unavailable>, wszProfileDLL=<unavailable>, ppCallback=<unavailable>, phmodProfilerDLL=0x00007f82b3d2c658) at eetoprofinterfaceimpl.cpp:293:10
    frame #21: 0x00007f82b4874b9e libcoreclr.so`EEToProfInterfaceImpl::CreateProfiler(this=0x00007f8228007480, pClsid=<unavailable>, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfileDLL=<unavailable>) at eetoprofinterfaceimpl.cpp:667
    frame #22: 0x00007f82b4874908 libcoreclr.so`EEToProfInterfaceImpl::Init(this=0x00007f8228007480, pProfToEE=0x00007f8228000da0, pClsid=0x00007f8228007a4c, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfileDLL=u"/tmp/dr-dotnet/libprofilers.so", fLoadedViaAttach=<unavailable>, dwConcurrentGCWaitTimeoutInMs=10) at eetoprofinterfaceimpl.cpp:581:14
    frame #23: 0x00007f82b48e3d21 libcoreclr.so`ProfilingAPIUtility::DoPreInitialization(pEEProf=0x00007f8228007480, pClsid=0x00007f8228007a4c, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfilerDLL=u"/tmp/dr-dotnet/libprofilers.so", loadType=kAttachLoad, dwConcurrentGCWaitTimeoutInMs=10) at profilinghelper.cpp:989:19
    frame #24: 0x00007f82b48e3779 libcoreclr.so`ProfilingAPIUtility::LoadProfiler(loadType=kAttachLoad, pClsid=0x00007f8228007a4c, szClsid="{805a308b-061c-47f3-9b30-f785c3186e82}", wszProfilerDLL=u"/tmp/dr-dotnet/libprofilers.so", pvClientData=0x00007f822800130a, cbClientData=<unavailable>, dwConcurrentGCWaitTimeoutInMs=10) at profilinghelper.cpp:1133:10
    frame #25: 0x00007f82b4b3ba74 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(_DiagnosticsIpcMessage*, _DiagnosticsIpcStream*) [inlined] ProfilingAPIUtility::LoadProfilerForAttach(pClsid=<unavailable>, wszProfilerDLL=u"/tmp/dr-dotnet/libprofilers.so", pvClientData=0x00007f822800130a, cbClientData=37, dwConcurrentGCWaitTimeoutInMs=10) at profilinghelper.inl:183:12
    frame #26: 0x00007f82b4b3ba47 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(_DiagnosticsIpcMessage*, _DiagnosticsIpcStream*) at ds-rt-coreclr.h:294
    frame #27: 0x00007f82b4b3ba47 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(_DiagnosticsIpcMessage*, _DiagnosticsIpcStream*) [inlined] profiler_protocol_helper_attach_profiler(message=<unavailable>, stream=0x00007f8228000c80) at ds-profiler-protocol.c:129
    frame #28: 0x00007f82b4b3b938 libcoreclr.so`ds_profiler_protocol_helper_handle_ipc_message(message=<unavailable>, stream=0x00007f8228000c80) at ds-profiler-protocol.c:269
    frame #29: 0x00007f82b4b36fb1 libcoreclr.so`server_thread(data=<unavailable>) at ds-server.c:167:4
    frame #30: 0x00007f82b4b96fee libcoreclr.so`CorUnix::CPalThread::ThreadEntry(pvParam=0x000055c09a21c650) at thread.cpp:1829:16
    frame #31: 0x00007f82b5240ea7 libpthread.so.0`start_thread(arg=<unavailable>) at pthread_create.c:477:8
    frame #32: 0x00007f82b4e2fa2f libc.so.6`__clone at clone.S:95

The check_match is crashing because here (I’ve used apt-get source libc6 to get the sources):

frame #7: 0x00007f82b5268e7a ld-linux-x86-64.so.2`check_match(undef_name="", ref=0x0000000000000000, version=0x0000000000000000, flags=2, type_class=0, sym=0x0000000000000ab0, symidx=84, strtab="", map=0x00007f8228002100, versioned_sym=0x00007f82b3d2bf48, num_versions=0x00007f82b3d2bf44) at dl-lookup.c:78:7
   75   {
   76     unsigned int stt = ELFW(ST_TYPE) (sym->st_info);
   77     assert (ELF_RTYPE_CLASS_PLT == 1);
-> 78     if (__glibc_unlikely ((sym->st_value == 0 /* No value.  */
   79                            && sym->st_shndx != SHN_ABS
   80                            && stt != STT_TLS)
   81                           || ELF_MACHINE_SYM_NO_MATCH (sym)

The sym pointer has a strange value that isn’t a valid pointer.

(lldb) frame variable
(const char *const) undef_name = 0x00007f82b4616f9e ""
(const Elf64_Sym *const) ref = 0x0000000000000000
(const r_found_version *const) version = 0x0000000000000000
(const int) flags = 2
(const int) type_class = 0
(const Elf64_Sym *const) sym = 0x0000000000000ab0
(const Elf_Symndx) symidx = 84
(const char *const) strtab = 0x0000000000000ae0 ""
(const link_map *const) map = 0x00007f8228002100
(const Elf64_Sym **const) versioned_sym = 0x00007f82b3d2bf48
(int *const) num_versions = 0x00007f82b3d2bf44
(unsigned int) stt = <variable not available>

(const Elf64_Half *) verstab = <variable not available>