bazel: Bazel hangs indefinitely when building with remote cache

We intermittently see an issue where bazel will hang indefinitely while building a target:

[5,322 / 5,621] Compiling Swift module SomeTarget; 11111s remote-cache

Seemingly while trying to fetch it from the remote cache. In these cases once we kill the bazel process no execution logs or chrome trace logs are produced, so I don’t have that info, but when I sample the process on macOS, or kill -QUIT PID it I get these logs:

sample log:

Call graph:
    2796 Thread_77950543   DispatchQueue_1: com.apple.main-thread  (serial)
    + 2796 start  (in libdyld.dylib) + 1  [0x7fff736f0cc9]
    +   2796 main  (in bazel) + 143  [0x106f197ff]
    +     2796 blaze::Main(int, char const* const*, blaze::WorkspaceLayout*, blaze::OptionProcessor*, unsigned long long)  (in bazel) + 5317  [0x106f012a5]
    +       2796 blaze::RunLauncher(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, blaze::StartupOptions const&, blaze::OptionProcessor const&, blaze::WorkspaceLayout const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, blaze::LoggingInfo*)  (in bazel) + 11470  [0x106f0477e]
    +         2796 blaze::RunClientServerMode(blaze_util::Path const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&, blaze_util::Path const&, blaze::WorkspaceLayout const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, blaze::OptionProcessor const&, blaze::StartupOptions const&, blaze::LoggingInfo*, blaze::DurationMillis, blaze::DurationMillis, blaze::BlazeServer*)  (in bazel) + 4966  [0x106f113d6]
    +           2796 blaze::BlazeServer::Communicate(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::vector<blaze::RcStartupFlag, std::__1::allocator<blaze::RcStartupFlag> > const&, blaze::LoggingInfo const&, blaze::DurationMillis, blaze::DurationMillis, blaze::DurationMillis)  (in bazel) + 1632  [0x106f09350]
    +             2796 grpc::ClientReader<command_server::RunResponse>::Read(command_server::RunResponse*)  (in bazel) + 496  [0x106f086f0]
    +               2796 cq_pluck(grpc_completion_queue*, void*, gpr_timespec, void*)  (in bazel) + 548  [0x106fda5e4]
    +                 2796 pollset_work(grpc_pollset*, grpc_pollset_worker**, long long)  (in bazel) + 1303  [0x106fb8fc7]
    +                   2796 poll  (in libsystem_kernel.dylib) + 10  [0x7fff738383d6]
    2796 Thread_77950545
    + 2796 start_wqthread  (in libsystem_pthread.dylib) + 15  [0x7fff738f0b77]
    +   2796 _pthread_wqthread  (in libsystem_pthread.dylib) + 390  [0x7fff738f1aa1]
    +     2796 __workq_kernreturn  (in libsystem_kernel.dylib) + 10  [0x7fff738334ce]
    2796 Thread_77950552: default-executor
    + 2796 thread_start  (in libsystem_pthread.dylib) + 15  [0x7fff738f0b8b]
    +   2796 _pthread_start  (in libsystem_pthread.dylib) + 148  [0x7fff738f5109]
    +     2796 grpc_core::(anonymous namespace)::ThreadInternalsPosix::ThreadInternalsPosix(char const*, void (*)(void*), void*, bool*)::'lambda'(void*)::__invoke(void*)  (in bazel) + 125  [0x106fe9edd]
    +       2796 GrpcExecutor::ThreadMain(void*)  (in bazel) + 550  [0x106fbce36]
    +         2796 gpr_cv_wait  (in bazel) + 120  [0x106fe8768]
    +           2796 _pthread_cond_wait  (in libsystem_pthread.dylib) + 698  [0x7fff738f5425]
    +             2796 __psynch_cvwait  (in libsystem_kernel.dylib) + 10  [0x7fff73834882]
    2796 Thread_77950553: resolver-executor
    + 2796 thread_start  (in libsystem_pthread.dylib) + 15  [0x7fff738f0b8b]
    +   2796 _pthread_start  (in libsystem_pthread.dylib) + 148  [0x7fff738f5109]
    +     2796 grpc_core::(anonymous namespace)::ThreadInternalsPosix::ThreadInternalsPosix(char const*, void (*)(void*), void*, bool*)::'lambda'(void*)::__invoke(void*)  (in bazel) + 125  [0x106fe9edd]
    +       2796 GrpcExecutor::ThreadMain(void*)  (in bazel) + 550  [0x106fbce36]
    +         2796 gpr_cv_wait  (in bazel) + 120  [0x106fe8768]
    +           2796 _pthread_cond_wait  (in libsystem_pthread.dylib) + 698  [0x7fff738f5425]
    +             2796 __psynch_cvwait  (in libsystem_kernel.dylib) + 10  [0x7fff73834882]
    2796 Thread_77950554: grpc_global_timer
    + 2796 thread_start  (in libsystem_pthread.dylib) + 15  [0x7fff738f0b8b]
    +   2796 _pthread_start  (in libsystem_pthread.dylib) + 148  [0x7fff738f5109]
    +     2796 grpc_core::(anonymous namespace)::ThreadInternalsPosix::ThreadInternalsPosix(char const*, void (*)(void*), void*, bool*)::'lambda'(void*)::__invoke(void*)  (in bazel) + 125  [0x106fe9edd]
    +       2796 timer_thread(void*)  (in bazel) + 895  [0x106fc9dff]
    +         2796 gpr_cv_wait  (in bazel) + 102  [0x106fe8756]
    +           2796 _pthread_cond_wait  (in libsystem_pthread.dylib) + 698  [0x7fff738f5425]
    +             2796 __psynch_cvwait  (in libsystem_kernel.dylib) + 10  [0x7fff73834882]
    2796 Thread_77950566
    + 2796 thread_start  (in libsystem_pthread.dylib) + 15  [0x7fff738f0b8b]
    +   2796 _pthread_start  (in libsystem_pthread.dylib) + 148  [0x7fff738f5109]
    +     2796 void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, void (blaze::BlazeServer::*)(), blaze::BlazeServer*> >(void*)  (in bazel) + 62  [0x106f1950e]
    +       2796 blaze::BlazeServer::CancelThread()  (in bazel) + 152  [0x106f073d8]
    +         2796 blaze_util::PosixPipe::Receive(void*, int, int*)  (in bazel) + 25  [0x1070ff1f9]
    +           2796 read  (in libsystem_kernel.dylib) + 10  [0x7fff7383281e]
    2796 Thread_77950583: grpc_global_timer
      2796 thread_start  (in libsystem_pthread.dylib) + 15  [0x7fff738f0b8b]
        2796 _pthread_start  (in libsystem_pthread.dylib) + 148  [0x7fff738f5109]
          2796 grpc_core::(anonymous namespace)::ThreadInternalsPosix::ThreadInternalsPosix(char const*, void (*)(void*), void*, bool*)::'lambda'(void*)::__invoke(void*)  (in bazel) + 125  [0x106fe9edd]
            2796 timer_thread(void*)  (in bazel) + 895  [0x106fc9dff]
              2796 gpr_cv_wait  (in bazel) + 120  [0x106fe8768]
                2796 _pthread_cond_wait  (in libsystem_pthread.dylib) + 698  [0x7fff738f5425]
                  2796 __psynch_cvwait  (in libsystem_kernel.dylib) + 10  [0x7fff73834882]

Total number in stack (recursive counted multiple, when >=5):
        5       _pthread_start  (in libsystem_pthread.dylib) + 148  [0x7fff738f5109]
        5       thread_start  (in libsystem_pthread.dylib) + 15  [0x7fff738f0b8b]

Sort by top of stack, same collapsed (when >= 5):
        __psynch_cvwait  (in libsystem_kernel.dylib)        11184
        __workq_kernreturn  (in libsystem_kernel.dylib)        2796
        poll  (in libsystem_kernel.dylib)        2796
        read  (in libsystem_kernel.dylib)        2796

jvm.out produced by sigquit https://gist.github.com/keith/3c77f7e49c108964596440a251c05929

It’s worth noting that during this time bazel is consuming 0% CPU, so it seems likely that it’s locked waiting on a response that has already timed out. Currently we’re using a gRPC remote cache, with the default remote timeout of 60, and the default value for --cpus.

What operating system are you running Bazel on?

macOS

What’s the output of bazel info release?

3.3.0 we’ve heard reports from others that this repros on 3.2.0 as well, we have since updated to 3.4.1 but we don’t see this often enough to know if it’s an issue there yet as well.

Have you found anything relevant by searching the web?

There are various related sounding issues but AFAICT they’re all closed

About this issue

  • Original URL
  • State: closed
  • Created 4 years ago
  • Reactions: 6
  • Comments: 74 (62 by maintainers)

Commits related to this issue

Most upvoted comments

  • #12422 adds check around NetworkTime to ensure we don’t break gRPC.
  • #12426 adds checks around SettableFutures in remote module to ensure they are all set.

All these places can throw an unchecked exception in theory. By inserting code there, which randomly throws RuntimeException, we can reproduce the hangs and the stack traces are same with what we get from users.

At this point, I believe the root cause of the hanging issue is that an unchecked exception was thrown during the RPC so that a Future never completes. It’s hard to reproduce locally because the conditions are hard to meet e.g.

The issue is fixed by the PRs in theory. Can you help to verify?

@ejona86 thanks for those details! Unfortunate to hear it does in fact work this way, but good to have confirmation.

@ulfjack :

AIUI, Bazel currently doesn’t set a timeout on execute calls.

Interesting! We should also get that fixed. Bazel should add a timeout to all outgoing RPCs. Retry logic here is a little tricky due to the Execute/WaitExecution dance and the need for supporting arbitrarily long actions, but I think the result should be something like:

  • Bazel sets a timeout on all Execute and WaitExecution calls. Probably just the value of the remote RPC timeout flag, like everything else, for now? But it could go reasonably low (few minutes) if we also:
  • Correct the retrier to not count DEADLINE_EXCEEDED on Execute or WaitExecution against the maximum retries. That’s an expected, normal case, meaning “please retry waiting”. Right now it does, which requires raising max retries to support really long actions 😦.

We could send app-level keep-alives from the server but I don’t see how that’s better than client-side pings.

I’d definitely recommend it. Client-side pings happen at the connection level, but different HTTP2 proxies may also apply idle timeouts to the carried streams IIUC. For Execute requests RBE intentionally sends a no-op streaming response once per minute, which has proven sufficient to keep streams alive in all networking environments our clients run in.

I’m not fully understanding the scenario you’re running into with dropped connections, but this may also suffice to avoid your issue, with server-side changes only. That should be quick and easy for you to land.

It sounds like you are saying a TCP LB is being used (not L7).

You should be aware there is a 10 minute idle timeout. https://cloud.google.com/compute/docs/troubleshooting/general-tips#communicatewithinternet . But it doesn’t seem you are hitting it. In order to avoid that you need a keepalive time less than 10 minutes (say, 9 minutes). I’d suggest the gRPC server to configure this keepalive.

You could also be hitting a default 30 second idle timeout. This value can be increased, but probably not above 10 minutes. Again, it would be best for the server to manage the keepalive here, as it is controlling the TCP proxy settings and can update them when things change.

I assume this is the stack trace of the blocking thread:

"skyframe-evaluator 12": awaiting notification on [0x0000000645000ca0]

	at jdk.internal.misc.Unsafe.park(java.base@11.0.6/Native Method)

	at java.util.concurrent.locks.LockSupport.park(java.base@11.0.6/Unknown Source)

	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:497)

	at com.google.common.util.concurrent.AbstractFuture$TrustedFuture.get(AbstractFuture.java:83)

	at com.google.devtools.build.lib.remote.util.Utils.getFromFuture(Utils.java:56)

	at com.google.devtools.build.lib.remote.RemoteCache.waitForBulkTransfer(RemoteCache.java:219)

	at com.google.devtools.build.lib.remote.RemoteCache.download(RemoteCache.java:331)

	at com.google.devtools.build.lib.remote.RemoteSpawnCache.lookup(RemoteSpawnCache.java:179)

	at com.google.devtools.build.lib.exec.AbstractSpawnStrategy.exec(AbstractSpawnStrategy.java:123)

	at com.google.devtools.build.lib.exec.AbstractSpawnStrategy.exec(AbstractSpawnStrategy.java:96)

	at com.google.devtools.build.lib.actions.SpawnStrategy.beginExecution(SpawnStrategy.java:39)

	at com.google.devtools.build.lib.exec.SpawnStrategyResolver.beginExecution(SpawnStrategyResolver.java:62)

	at com.google.devtools.build.lib.analysis.actions.SpawnAction.beginExecution(SpawnAction.java:331)

	at com.google.devtools.build.lib.actions.Action.execute(Action.java:124)

	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$4.execute(SkyframeActionExecutor.java:779)

	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$ActionRunner.continueAction(SkyframeActionExecutor.java:925)

	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$ActionRunner.run(SkyframeActionExecutor.java:896)

	at com.google.devtools.build.lib.skyframe.ActionExecutionState.runStateMachine(ActionExecutionState.java:128)

	at com.google.devtools.build.lib.skyframe.ActionExecutionState.getResultOrDependOnFuture(ActionExecutionState.java:80)

	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor.executeAction(SkyframeActionExecutor.java:419)

	at com.google.devtools.build.lib.skyframe.ActionExecutionFunction.checkCacheAndExecuteIfNeeded(ActionExecutionFunction.java:897)

	at com.google.devtools.build.lib.skyframe.ActionExecutionFunction.compute(ActionExecutionFunction.java:300)

	at com.google.devtools.build.skyframe.AbstractParallelEvaluator$Evaluate.run(AbstractParallelEvaluator.java:438)

	at com.google.devtools.build.lib.concurrent.AbstractQueueVisitor$WrappedRunnable.run(AbstractQueueVisitor.java:398)

	at java.util.concurrent.ThreadPoolExecutor.runWorker(java.base@11.0.6/Unknown Source)

	at java.util.concurrent.ThreadPoolExecutor$Worker.run(java.base@11.0.6/Unknown Source)

	at java.lang.Thread.run(java.base@11.0.6/Unknown Source)