go: net,runtime: apparent deadlock in (*net.conn).Close and runtime.netpollblock on arm64 platforms

greplogs --dashboard -md -l -e '(?m).*\[semacquire, \d+ minutes\]:\n(.+\n\t.+\n)*net\.\(\*conn\)\.Close'

2021-10-08T16:26:20-99c1b24/windows-arm64-10 2021-10-05T15:58:05-f1f626d/linux-arm64-aws 2021-09-30T18:10:18-e81a3d9-205640e/linux-arm64-packet 2021-09-29T18:36:18-e48cf0d/android-arm64-corellium 2021-09-28T18:58:50-7d9f5e0-9c43872/windows-arm64-10 2021-09-28T17:19:08-1c6e50a/android-arm64-corellium 2021-09-27T22:22:04-52b23a5/android-arm64-corellium 2021-09-27T19:42:34-4e4d966-315cec2/linux-arm64-packet 2021-09-24T18:21:14-f6b5ffb/android-arm64-corellium 2021-09-22T20:53:48-41bb744/linux-arm64-packet 2021-09-18T06:04:41-c084706-c894b44/windows-arm64-10 2021-09-17T18:33:15-70493b3/ios-arm64-corellium 2021-09-16T23:13:58-e09dcc2/android-arm64-corellium 2021-09-09T21:28:56-c163c31-c981874/linux-arm64-aws 2021-09-07T23:27:08-3fff213/windows-arm64-10 2021-09-03T01:38:54-9f69a44/ios-arm64-corellium 2021-09-02T19:36:22-a8aa6cf/windows-arm64-10 2021-09-02T19:16:19-9633195/ios-arm64-corellium 2021-09-02T19:16:19-9633195/linux-arm64-packet 2021-09-02T16:59:57-2a463a2/android-arm64-corellium 2021-09-02T14:02:42-b8420ba/ios-arm64-corellium 2021-09-01T22:28:21-0bfd6fc/android-arm64-corellium 2021-08-28T04:46:13-0108177/windows-arm64-10 2021-08-24T15:55:38-60bc85c-d70c69d/windows-arm64-10 2021-08-23T12:26:30-c7e354d/linux-arm64-packet 2021-08-21T00:55:22-e17439e/windows-arm64-10 2021-08-17T15:00:04-3001b0a/windows-arm64-10 2021-08-05T20:22:31-fd45e26/android-arm64-corellium 2021-07-31T23:59:40-c6fcb2d-b8ca6e5/windows-arm64-10 2021-07-26T22:15:24-bfbb288/ios-arm64-corellium 2021-07-20T19:54:36-d568e6e/ios-arm64-corellium 2021-07-16T02:43:48-aa4e0f5/linux-arm64-aws 2021-06-30T01:29:49-c45e800/windows-arm64-10 2021-05-04T14:38:36-5e4f9b0/linux-arm64-aws 2021-05-03T16:23:09-7b768d4/linux-arm64-aws 2021-04-29T17:08:05-e03cca6/linux-arm64-aws 2021-04-29T11:37:05-c4c68fb/linux-arm64-aws 2021-04-22T17:47:59-537cde0/linux-arm64-aws 2021-04-16T16:36:57-2f0e5bf/linux-arm64-packet 2021-04-14T03:15:00-2d4ba26/linux-arm64-packet 2021-04-13T21:13:09-e512bc2/ios-arm64-corellium 2021-04-09T18:49:05-c3faff7/linux-arm64-aws 2021-04-09T18:49:05-c3faff7/linux-arm64-packet 2021-04-09T14:33:44-a690a5d/android-arm64-corellium 2021-04-08T18:47:17-96a6745/linux-arm64-packet 2021-03-29T21:50:16-4e1bf8e/linux-arm64-packet 2021-03-15T23:47:56-661f3f1/linux-arm64-aws 2021-03-12T21:52:25-5ea612d-f39c4de/linux-arm64-aws 2021-03-05T22:14:48-a829114/linux-arm64-aws 2021-01-25T22:54:28-e6b6d10/android-arm64-corellium 2020-12-16T16:40:57-5abda26/android-arm64-corellium 2020-12-10T23:11:25-1fe891a/ios-arm64-corellium 2020-10-15T18:07:26-3c9488e/linux-arm64-aws 2020-09-10T14:44:25-8098dbb/linux-arm64-aws

About this issue

  • Original URL
  • State: closed
  • Created 3 years ago
  • Comments: 21 (14 by maintainers)

Commits related to this issue

Most upvoted comments

One theory:

Inside netpollblock and netpollunblock, gpp is read using a non-atomic read. In the normal case, writes use atomics, creating a barrier.

However, when consuming a ready event, netpollblock resets gpp using a non-atomic store, thus skipping a barrier.

In netpollunblock, we can return early if gpp == pdReady, not actually performing an unblock.

Thus I posit that an earlier netpollblock sees pdReady and resets to 0. Later, on another core we read gpp and see the stale pdReady and incorrectly return early without triggering a new unblock.

I haven’t validated this theory, but my log does show that this pollDesc has earlier calls to netpollblock, so it is possible. If this is the case, I believe the fix is simply to use a atomic store to reset gpp to 0.

I reproduced this on linux-arm64-aws quite quickly, so I’ll dig in some more:

$ GOROOT=$(pwd) gomote run $(gomote-instance) ./go/bin/go test -count=1000 crypto/tls
panic: test timed out after 10m0s

goroutine 117595 [running]:
testing.(*M).startAlarm.func1()
        /workdir/go/src/testing/testing.go:1961 +0x8c
created by time.goFunc
        /workdir/go/src/time/sleep.go:176 +0x3c

goroutine 1 [chan receive, 6 minutes]:
testing.(*T).Run(0x40003e4340, {0x2cce47, 0x75bdd8b719}, 0x2eef18)
        /workdir/go/src/testing/testing.go:1425 +0x33c
testing.runTests.func1(0x0)
        /workdir/go/src/testing/testing.go:1771 +0x74
testing.tRunner(0x40003e4340, 0x4000133b68)
        /workdir/go/src/testing/testing.go:1377 +0x110
testing.runTests(0x4000106b40, {0x4e2da0, 0x95, 0x95}, {0x46c, 0x4000014280, 0x4e7040})
        /workdir/go/src/testing/testing.go:1769 +0x3e0
testing.(*M).Run(0x4000106b40)
        /workdir/go/src/testing/testing.go:1657 +0x510
crypto/tls.runMain(0x400005c180)
        /workdir/go/src/crypto/tls/handshake_test.go:381 +0x5b4
crypto/tls.TestMain(0x4000045ec8)
        /workdir/go/src/crypto/tls/handshake_test.go:333 +0x80
main.main()
        _testmain.go:353 +0x1ec

goroutine 6 [IO wait, 6 minutes]:
internal/poll.runtime_pollWait(0xffff65e8e818, 0x72)
        /workdir/go/src/runtime/netpoll.go:229 +0x9c
internal/poll.(*pollDesc).wait(0x400013a080, 0x87ac4, 0x0)
        /workdir/go/src/internal/poll/fd_poll_runtime.go:84 +0x2c
internal/poll.(*pollDesc).waitRead(...)
        /workdir/go/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0x400013a080)
        /workdir/go/src/internal/poll/fd_unix.go:508 +0x1d8
net.(*netFD).accept(0x400013a080)
        /workdir/go/src/net/fd_unix.go:173 +0x28
net.(*TCPListener).accept(0x400000e198)
        /workdir/go/src/net/tcpsock_posix.go:140 +0x2c
net.(*TCPListener).Accept(0x400000e198)
        /workdir/go/src/net/tcpsock.go:262 +0x30
crypto/tls.localServer({0x340ac8, 0x400000e198})
        /workdir/go/src/crypto/tls/handshake_test.go:247 +0x5c
created by crypto/tls.runMain
        /workdir/go/src/crypto/tls/handshake_test.go:353 +0x1e0

goroutine 117621 [IO wait, 6 minutes]:
internal/poll.runtime_pollWait(0xffff65e8db68, 0x72)
        /workdir/go/src/runtime/netpoll.go:229 +0x9c
internal/poll.(*pollDesc).wait(0x400013a500, 0x40004e3440, 0x0)
        /workdir/go/src/internal/poll/fd_poll_runtime.go:84 +0x2c
internal/poll.(*pollDesc).waitRead(...)
        /workdir/go/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0x400013a500, {0x40004e3440, 0x205, 0x205})
        /workdir/go/src/internal/poll/fd_unix.go:167 +0x1ec
net.(*netFD).Read(0x400013a500, {0x40004e3440, 0x0, 0x1c58933d99f3})
        /workdir/go/src/net/fd_posix.go:56 +0x2c
net.(*conn).Read(0x4000558058, {0x40004e3440, 0x40003f2698, 0x160ca8})
        /workdir/go/src/net/net.go:183 +0x38
crypto/tls.(*atLeastReader).Read(0x40004e05a0, {0x40004e3440, 0x0, 0x1ac18})
        /workdir/go/src/crypto/tls/conn.go:777 +0x44
bytes.(*Buffer).ReadFrom(0x4000077af8, {0x33f328, 0x40004e05a0})
        /workdir/go/src/bytes/buffer.go:204 +0x94
crypto/tls.(*Conn).readFromUntil(0x4000077880, {0x33f588, 0x4000558058}, 0x160ca8)
        /workdir/go/src/crypto/tls/conn.go:799 +0xd8
crypto/tls.(*Conn).readRecordOrCCS(0x4000077880, 0x0)
        /workdir/go/src/crypto/tls/conn.go:606 +0xdc
crypto/tls.(*Conn).readRecord(...)
        /workdir/go/src/crypto/tls/conn.go:574
crypto/tls.(*Conn).readHandshake(0x4000077880)
        /workdir/go/src/crypto/tls/conn.go:1009 +0x4c
crypto/tls.(*Conn).clientHandshake(0x4000077880, {0x340dd0, 0x40005bc480})
        /workdir/go/src/crypto/tls/handshake_client.go:179 +0x1c0
crypto/tls.(*Conn).handshakeContext(0x4000077880, {0x340e08, 0x400001a040})
        /workdir/go/src/crypto/tls/conn.go:1445 +0x3c8
crypto/tls.(*Conn).HandshakeContext(...)
        /workdir/go/src/crypto/tls/conn.go:1395
crypto/tls.(*Conn).Handshake(...)
        /workdir/go/src/crypto/tls/conn.go:1379
crypto/tls.TestHostnameInSNI.func1({0x2cacbe, 0xb})
        /workdir/go/src/crypto/tls/handshake_client_test.go:1439 +0x154
created by crypto/tls.TestHostnameInSNI
        /workdir/go/src/crypto/tls/handshake_client_test.go:1438 +0x184

goroutine 117585 [semacquire, 6 minutes]:
internal/poll.runtime_Semacquire(0xffff65e8db68)
        /workdir/go/src/runtime/sema.go:61 +0x2c
internal/poll.(*FD).Close(0x400013a500)
        /workdir/go/src/internal/poll/fd_unix.go:116 +0x68
net.(*netFD).Close(0x400013a500)
        /workdir/go/src/net/fd_posix.go:38 +0x44
net.(*conn).Close(0x4000558058)
        /workdir/go/src/net/net.go:207 +0x34
crypto/tls.TestHostnameInSNI(0x40003e4820)
        /workdir/go/src/crypto/tls/handshake_client_test.go:1453 +0x278
testing.tRunner(0x40003e4820, 0x2eef18)
        /workdir/go/src/testing/testing.go:1377 +0x110
created by testing.(*T).Run
        /workdir/go/src/testing/testing.go:1424 +0x328
FAIL    crypto/tls      600.010s
FAIL
Error running run: exit status 1