internal/poll: deadlock in Read on arm64 when an FD is closed
It’s not obvious to me whether this deadlock is a bug in the specific test function, in the net/http package, or in *httptest.Server in particular.
2021-03-24T14:20:32-747f426/linux-arm64-aws
…
goroutine 3475 [IO wait, 2 minutes]:
internal/poll.runtime_pollWait(0xffff94117d10, 0x72, 0xffffffffffffffff)
/workdir/go/src/runtime/netpoll.go:229 +0xa4
internal/poll.(*pollDesc).wait(0x400030a898, 0x72, 0x1000, 0x1000, 0xffffffffffffffff)
/workdir/go/src/internal/poll/fd_poll_runtime.go:84 +0x38
internal/poll.(*pollDesc).waitRead(...)
/workdir/go/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0x400030a880, 0x400048b000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/workdir/go/src/internal/poll/fd_unix.go:167 +0x1ec
net.(*netFD).Read(0x400030a880, 0x400048b000, 0x1000, 0x1000, 0xffff94e94f01, 0x40003bd680, 0x4000432668)
/workdir/go/src/net/fd_posix.go:56 +0x44
net.(*conn).Read(0x400037c168, 0x400048b000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/workdir/go/src/net/net.go:183 +0x4c
net/http.(*connReader).Read(0x40000a1b60, 0x400048b000, 0x1000, 0x1000, 0xffff94275900, 0x1, 0x4000432788)
/workdir/go/src/net/http/server.go:800 +0x268
bufio.(*Reader).fill(0x40005e6300)
/workdir/go/src/bufio/bufio.go:101 +0x10c
bufio.(*Reader).ReadSlice(0x40005e6300, 0x1df0a, 0x40004327e8, 0x1000101d7b4, 0xffff94279340, 0x0, 0xf0)
/workdir/go/src/bufio/bufio.go:360 +0x38
bufio.(*Reader).ReadLine(0x40005e6300, 0x40004ea000, 0x1000, 0x4000352b40, 0x77, 0xf0, 0x4000432898)
/workdir/go/src/bufio/bufio.go:389 +0x30
net/textproto.(*Reader).readLineSlice(0x40000a1440, 0x40003a5200, 0x4000432968, 0xab164, 0x400015cc20, 0x0)
/workdir/go/src/net/textproto/reader.go:57 +0x84
net/textproto.(*Reader).ReadLine(...)
/workdir/go/src/net/textproto/reader.go:38
net/http.readRequest(0x40005e6300, 0x0, 0x40003a5200, 0x0, 0x0)
/workdir/go/src/net/http/request.go:1028 +0x74
net/http.(*conn).readRequest(0x400011f900, 0x539c98, 0x40003bd640, 0x0, 0x0, 0x0)
/workdir/go/src/net/http/server.go:986 +0x21c
net/http.(*conn).serve(0x400011f900, 0x539d40, 0x40003bd640)
/workdir/go/src/net/http/server.go:1878 +0x844
created by net/http.(*Server).Serve
/workdir/go/src/net/http/server.go:3013 +0x4b0
…
goroutine 3439 [semacquire, 2 minutes]:
internal/poll.runtime_Semacquire(0x400015ce28)
/workdir/go/src/runtime/sema.go:61 +0x38
internal/poll.(*FD).Close(0x400015ce00, 0x400015ce00, 0x0)
/workdir/go/src/internal/poll/fd_unix.go:116 +0x80
net.(*netFD).Close(0x400015ce00, 0x0, 0x40000ca1c8)
/workdir/go/src/net/fd_posix.go:38 +0x48
net.(*TCPListener).close(...)
/workdir/go/src/net/tcpsock_posix.go:157
net.(*TCPListener).Close(0x40003a1308, 0x4, 0x40004213a0)
/workdir/go/src/net/tcpsock.go:275 +0x34
net/http/httptest.(*Server).Close(0x40004f6000)
/workdir/go/src/net/http/httptest/server.go:204 +0xd0
net/http_test.TestTransportDialCancelRace(0x40004ec180)
/workdir/go/src/net/http/transport_test.go:4015 +0x308
testing.tRunner(0x40004ec180, 0x4b91d8)
/workdir/go/src/testing/testing.go:1235 +0x100
created by testing.(*T).Run
/workdir/go/src/testing/testing.go:1280 +0x350
…
goroutine 3490 [IO wait, 2 minutes]:
internal/poll.runtime_pollWait(0xffff941180b0, 0x72, 0xffffffffffffffff)
/workdir/go/src/runtime/netpoll.go:229 +0xa4
internal/poll.(*pollDesc).wait(0x400015cf18, 0x72, 0x1000, 0x1000, 0xffffffffffffffff)
/workdir/go/src/internal/poll/fd_poll_runtime.go:84 +0x38
internal/poll.(*pollDesc).waitRead(...)
/workdir/go/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0x400015cf00, 0x40004df000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/workdir/go/src/internal/poll/fd_unix.go:167 +0x1ec
net.(*netFD).Read(0x400015cf00, 0x40004df000, 0x1000, 0x1000, 0x400035cc68, 0xe53, 0x0)
/workdir/go/src/net/fd_posix.go:56 +0x44
net.(*conn).Read(0x40006a61f0, 0x40004df000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/workdir/go/src/net/net.go:183 +0x4c
net/http.(*persistConn).Read(0x40000b0900, 0x40004df000, 0x1000, 0x1000, 0x60, 0x0, 0x101801)
/workdir/go/src/net/http/transport.go:1922 +0x64
bufio.(*Reader).fill(0x40004c68a0)
/workdir/go/src/bufio/bufio.go:101 +0x10c
bufio.(*Reader).Peek(0x40004c68a0, 0x1, 0x40003fb620, 0x483939, 0x3, 0x486b25, 0xb)
/workdir/go/src/bufio/bufio.go:139 +0x74
net/http.(*persistConn).readLoop(0x40000b0900)
/workdir/go/src/net/http/transport.go:2083 +0x170
created by net/http.(*Transport).dialConn
/workdir/go/src/net/http/transport.go:1743 +0x18b0
…
goroutine 3440 [IO wait, 2 minutes]:
internal/poll.runtime_pollWait(0xffff94117c28, 0x72, 0x0)
/workdir/go/src/runtime/netpoll.go:229 +0xa4
internal/poll.(*pollDesc).wait(0x400015ce18, 0x72, 0x0, 0x0, 0x484ab9)
/workdir/go/src/internal/poll/fd_poll_runtime.go:84 +0x38
internal/poll.(*pollDesc).waitRead(...)
/workdir/go/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0x400015ce00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
/workdir/go/src/internal/poll/fd_unix.go:402 +0x204
net.(*netFD).accept(0x400015ce00, 0x4000182cb8, 0x3755834c00, 0x12b9526b)
/workdir/go/src/net/fd_unix.go:173 +0x2c
net.(*TCPListener).accept(0x40003a1308, 0x400037c168, 0x40002a1e08, 0x53a74)
/workdir/go/src/net/tcpsock_posix.go:140 +0x2c
net.(*TCPListener).Accept(0x40003a1308, 0x40002a1e70, 0x40002a1e78, 0x18, 0x40001f1680)
/workdir/go/src/net/tcpsock.go:262 +0x34
net/http.(*Server).Serve(0x400067eee0, 0x539100, 0x40003a1308, 0x0, 0x0)
/workdir/go/src/net/http/server.go:2981 +0x364
net/http/httptest.(*Server).goServe.func1(0x40004f6000)
/workdir/go/src/net/http/httptest/server.go:308 +0x68
created by net/http/httptest.(*Server).goServe
/workdir/go/src/net/http/httptest/server.go:306 +0x58
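Reading the stacks together: goroutine 3439 is parked in runtime_Semacquire inside internal/poll.(*FD).Close for the listener FD (0x400015ce00), while goroutine 3440 is parked in internal/poll.(*FD).Accept on that same FD and still holds a reference to it. Close is expected to evict the parked Accept and only then release the FD; here the wakeup apparently never lands. A minimal sketch of that shape (a hypothetical reproducer, not TestTransportDialCancelRace itself):

```go
package main

import (
	"fmt"
	"net"
	"time"
)

func main() {
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		panic(err)
	}

	accepted := make(chan error, 1)
	go func() {
		// Parks in internal/poll.(*FD).Accept, like goroutine 3440 above.
		_, err := ln.Accept()
		accepted <- err
	}()

	time.Sleep(100 * time.Millisecond) // let Accept park in the poller

	go func() {
		// Must wake the parked Accept, wait for its reference to drop,
		// and only then close the FD -- the path goroutine 3439 is stuck on.
		ln.Close()
	}()

	select {
	case err := <-accepted:
		fmt.Println("Accept returned:", err)
	case <-time.After(5 * time.Second):
		fmt.Println("deadlock: Close never woke the parked Accept")
	}
}
```

On a healthy build, Accept returns a "use of closed network connection" error almost immediately; the hang is the two-minute [IO wait]/[semacquire] pairing shown above.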
…
About this issue
- State: closed
- Created 3 years ago
- Comments: 36 (31 by maintainers)
Commits related to this issue
- [release-branch.go1.17] runtime: fix net poll races The netpoll code was written long ago, when the only multiprocessors that Go ran on were x86. It assumed that an atomic store would trigger a full ... — committed to golang/go by rsc 2 years ago
- runtime: fix net poll races The netpoll code was written long ago, when the only multiprocessors that Go ran on were x86. It assumed that an atomic store would trigger a full memory barrier and then ... — committed to jproberts/go by rsc 2 years ago
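The commit message points at a classic weak-memory-model trap: a flag published with an atomic store but read back with a plain load. On x86 the atomic store implies a full barrier, which hides the bug; on arm64 it does not. A hedged illustration of that access pattern (a hypothetical variable, not the runtime’s actual code):

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// closing stands in for a flag like pollDesc.closing (hypothetical name).
var closing uint32

func main() {
	go func() {
		// Publisher: an atomic store, as the old netpoll code did.
		atomic.StoreUint32(&closing, 1)
	}()

	time.Sleep(10 * time.Millisecond)

	// Racy reader: a plain load of a word that is stored atomically
	// elsewhere is still a data race. On x86 the store's implied full
	// barrier usually hides it; on arm64 nothing does. `go run -race`
	// reports this load.
	fmt.Println("plain load sees:", closing)

	// Correct reader: pair the atomic store with an atomic load.
	fmt.Println("atomic load sees:", atomic.LoadUint32(&closing))
}
```

Inside the runtime there is no channel or mutex edge to fall back on, which is why the fix has to make the accesses themselves well-ordered.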
Looking more at this, there are only two unlocked accesses to pd.closing, and neither seems performance-critical enough to justify a race instead of an atomic. If we make pd.closing an atomic bool then the questions all go away. I will send a CL for that.
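A sketch of what that change could look like (simplified, hypothetical names; the real pollDesc lives in the runtime, atomic.Bool is the Go 1.19+ sync/atomic type, and the eventual CL may represent the flag differently):

```go
package poll

import (
	"sync"
	"sync/atomic"
)

// pollDesc here is a simplified stand-in, not the runtime's real layout.
type pollDesc struct {
	lock    sync.Mutex
	closing atomic.Bool // was a plain bool, read twice without the lock
}

// evict publishes the flag; the store is atomic, so the two readers
// that run without holding the lock no longer race with it.
func (pd *pollDesc) evict() {
	pd.lock.Lock()
	pd.closing.Store(true)
	pd.lock.Unlock()
}

// isClosing is the shape of the formerly unlocked reads: an atomic
// load instead of a bare field access.
func (pd *pollDesc) isClosing() bool {
	return pd.closing.Load()
}
```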
By the way, I think we’re pretty clearly tracking two independent problems in this one issue.
@bradfitz Sure, I can take a look, though I don’t suppose I’ll see anything. But something must be holding the lock, somehow.