kubernetes: k8s.io/kubernetes/pkg/kubelet/cm/devicemanager test failure

Is this a BUG REPORT or FEATURE REQUEST?:

/kind bug

What happened:

I’m running into a k8s.io/kubernetes/pkg/kubelet/cm/devicemanager test failure during “make test”. To get more output and to avoid issue #59425 , I am using:

HOME=/tmp/home make GOFLAGS='-v -exec "strace -f -e trace=connect,bind,unlinkat"' test 
...
=== RUN   TestDevicePluginReRegistration
[pid 29568] unlinkat(AT_FDCWD, "/tmp/device_plugin/server.sock", 0) = 0
[pid 29568] bind(11, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/server.sock"}, 33) = 0
[pid 29568] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", 0) = -1 ENOENT (No such file or directory)
[pid 29568] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
[pid 29568] bind(12, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = 0
[pid 29568] connect(13, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = 0
2018/02/07 14:34:39 Starting to serve on /tmp/device_plugin/device-plugin.sock
[pid 29568] connect(14, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/server.sock"}, 33) = 0
[pid 29568] unlinkat(AT_FDCWD, "/tmp/device_plugin/server.sock", 0) = 0
[pid 29568] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", 0) = 0
[pid 29568] --- SIGPIPE {si_signo=SIGPIPE, si_code=SI_USER, si_pid=29568, si_uid=1000} ---
[pid 29568] connect(6, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory)
E0207 14:34:39.961321   29568 endpoint.go:117] listAndWatch ended unexpectedly for device plugin mock with error rpc error: code = Unavailable desc = transport is closing
strace: Process 29623 attached
[pid 29574] connect(3, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory)
[pid 29623] connect(3, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory)
[pid 29574] connect(3, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory)
E0207 14:34:49.961324   29568 endpoint.go:60] Can't create new endpoint with path /tmp/device_plugin/device-plugin.sock err failed to dial device plugin: context deadline exceeded
E0207 14:34:49.961390   29568 manager.go:340] Failed to dial device plugin with request &RegisterRequest{Version:v1alpha2,Endpoint:device-plugin.sock,ResourceName:fake-domain/resource,}: failed to dial device plugin: context deadline exceeded
panic: test timed out after 2m0s

goroutine 116 [running]:
testing.startAlarm.func1()
	/usr/lib/go-1.9/src/testing/testing.go:1145 +0xf9
created by time.goFunc
	/usr/lib/go-1.9/src/time/sleep.go:170 +0x44

goroutine 1 [chan receive]:
testing.(*T).Run(0xc4201c40f0, 0x137221d, 0x1e, 0x13e0118, 0x481901)
	/usr/lib/go-1.9/src/testing/testing.go:790 +0x2fc
testing.runTests.func1(0xc4201c40f0)
	/usr/lib/go-1.9/src/testing/testing.go:1004 +0x64
testing.tRunner(0xc4201c40f0, 0xc4204fbde0)
	/usr/lib/go-1.9/src/testing/testing.go:746 +0xd0
testing.runTests(0xc420408bc0, 0x1bd9700, 0xb, 0xb, 0x1181640)
	/usr/lib/go-1.9/src/testing/testing.go:1002 +0x2d8
testing.(*M).Run(0xc4205cff18, 0xc4204fbf70)
	/usr/lib/go-1.9/src/testing/testing.go:921 +0x111
main.main()
	k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/_test/_testmain.go:64 +0xdb

goroutine 20 [chan receive]:
k8s.io/kubernetes/vendor/github.com/golang/glog.(*loggingT).flushDaemon(0x1be5be0)
	/fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/vendor/github.com/golang/glog/glog.go:879 +0x9f
created by k8s.io/kubernetes/vendor/github.com/golang/glog.init.0
	/fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/vendor/github.com/golang/glog/glog.go:410 +0x203

goroutine 150 [chan receive]:
k8s.io/kubernetes/pkg/kubelet/cm/devicemanager.TestDevicePluginReRegistration(0xc4201c4870)
	/fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/manager_test.go:85 +0x351
testing.tRunner(0xc4201c4870, 0x13e0118)
	/usr/lib/go-1.9/src/testing/testing.go:746 +0xd0
created by testing.(*T).Run
	/usr/lib/go-1.9/src/testing/testing.go:789 +0x2de

goroutine 151 [IO wait]:
internal/poll.runtime_pollWait(0x7f139654cc70, 0x72, 0xffffffffffffffff)
	/usr/lib/go-1.9/src/runtime/netpoll.go:173 +0x57
internal/poll.(*pollDesc).wait(0xc4203ea898, 0x72, 0xc420089c00, 0x0, 0x0)
	/usr/lib/go-1.9/src/internal/poll/fd_poll_runtime.go:85 +0xae
internal/poll.(*pollDesc).waitRead(0xc4203ea898, 0xffffffffffffff00, 0x0, 0x0)
	/usr/lib/go-1.9/src/internal/poll/fd_poll_runtime.go:90 +0x3d
internal/poll.(*FD).Accept(0xc4203ea880, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
	/usr/lib/go-1.9/src/internal/poll/fd_unix.go:335 +0x1e2
net.(*netFD).accept(0xc4203ea880, 0xc42052c080, 0x0, 0x0)
	/usr/lib/go-1.9/src/net/fd_unix.go:238 +0x42
net.(*UnixListener).accept(0xc4203dac60, 0x7e3449, 0x45aae0, 0xc420089e98)
	/usr/lib/go-1.9/src/net/unixsock_posix.go:162 +0x32
net.(*UnixListener).Accept(0xc4203dac60, 0x13e0f28, 0xc42013c640, 0x1b8c7c0, 0xc42052c080)
	/usr/lib/go-1.9/src/net/unixsock.go:241 +0x49
k8s.io/kubernetes/vendor/google.golang.org/grpc.(*Server).Serve(0xc42013c640, 0x1b88420, 0xc4203dac60, 0x0, 0x0)
	/fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/vendor/google.golang.org/grpc/server.go:475 +0x196
created by k8s.io/kubernetes/pkg/kubelet/cm/devicemanager.(*ManagerImpl).Start
	/fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/manager.go:232 +0x4b0

goroutine 152 [IO wait]:
internal/poll.runtime_pollWait(0x7f139654cbb0, 0x72, 0xffffffffffffffff)
	/usr/lib/go-1.9/src/runtime/netpoll.go:173 +0x57
internal/poll.(*pollDesc).wait(0xc4203eaa18, 0x72, 0xc42021cc00, 0x0, 0x0)
	/usr/lib/go-1.9/src/internal/poll/fd_poll_runtime.go:85 +0xae
internal/poll.(*pollDesc).waitRead(0xc4203eaa18, 0xffffffffffffff00, 0x0, 0x0)
	/usr/lib/go-1.9/src/internal/poll/fd_poll_runtime.go:90 +0x3d
internal/poll.(*FD).Accept(0xc4203eaa00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
	/usr/lib/go-1.9/src/internal/poll/fd_unix.go:335 +0x1e2
net.(*netFD).accept(0xc4203eaa00, 0xc42052c088, 0x0, 0x0)
	/usr/lib/go-1.9/src/net/fd_unix.go:238 +0x42
net.(*UnixListener).accept(0xc4203dade0, 0x7e3449, 0x45aae0, 0xc42021ce98)
	/usr/lib/go-1.9/src/net/unixsock_posix.go:162 +0x32
net.(*UnixListener).Accept(0xc4203dade0, 0x13e0f28, 0xc42013c780, 0x1b8c7c0, 0xc42052c088)
	/usr/lib/go-1.9/src/net/unixsock.go:241 +0x49
k8s.io/kubernetes/vendor/google.golang.org/grpc.(*Server).Serve(0xc42013c780, 0x1b88420, 0xc4203dade0, 0x0, 0x0)
	/fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/vendor/google.golang.org/grpc/server.go:475 +0x196
created by k8s.io/kubernetes/pkg/kubelet/cm/devicemanager.(*Stub).Start
	/fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/device_plugin_stub.go:88 +0x151
[pid 29623] +++ exited with 2 +++
[pid 29617] +++ exited with 2 +++
[pid 29596] +++ exited with 2 +++
[pid 29594] +++ exited with 2 +++
[pid 29588] +++ exited with 2 +++
[pid 29587] +++ exited with 2 +++
[pid 29574] +++ exited with 2 +++
[pid 29573] +++ exited with 2 +++
[pid 29572] +++ exited with 2 +++
[pid 29571] +++ exited with 2 +++
+++ exited with 2 +++
FAIL	k8s.io/kubernetes/pkg/kubelet/cm/devicemanager	120.063s

What you expected to happen:

The test should pass.

How to reproduce it (as minimally and precisely as possible):

It happens for me only when running all tests, but then it happens reliably. When limiting the test run with WHAT=k8s.io/kubernetes/pkg/kubelet/cm/devicemanager, it works.

The failing connect() calls are not surprising, because /tmp/device_plugin/device-plugin.sock got unlinked after binding to it. Those unlinkedat() might come from manager.go’s RemoveContent: it removes everything in the socket dir, which happens to be /tmp/device_plugin.

For comparison, here’s a successful run:

$ HOME=/tmp/home make GOFLAGS='-v -exec "strace -f -e trace=connect,bind,unlinkat" -run TestDevicePluginReRegistration' WHAT=k8s.io/kubernetes/pkg/kubelet/cm/devicemanager  test  2>&1 | tee /tmp/log5
Running tests for APIVersion: v1,admissionregistration.k8s.io/v1alpha1,admissionregistration.k8s.io/v1beta1,admission.k8s.io/v1beta1,apps/v1beta1,apps/v1beta2,apps/v1,authentication.k8s.io/v1,authentication.k8s.io/v1beta1,authorization.k8s.io/v1,authorization.k8s.io/v1beta1,autoscaling/v1,autoscaling/v2beta1,batch/v1,batch/v1beta1,batch/v2alpha1,certificates.k8s.io/v1beta1,extensions/v1beta1,events.k8s.io/v1beta1,imagepolicy.k8s.io/v1alpha1,networking.k8s.io/v1,policy/v1beta1,rbac.authorization.k8s.io/v1,rbac.authorization.k8s.io/v1beta1,rbac.authorization.k8s.io/v1alpha1,scheduling.k8s.io/v1alpha1,settings.k8s.io/v1alpha1,storage.k8s.io/v1beta1,storage.k8s.io/v1,storage.k8s.io/v1alpha1,
+++ [0207 14:58:50] Running tests without code coverage
strace: Process 21963 attached
strace: Process 21964 attached
strace: Process 21965 attached
strace: Process 21966 attached
[pid 21962] connect(3, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
[pid 21962] connect(3, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
strace: Process 21967 attached
strace: Process 21968 attached
strace: Process 21969 attached
strace: Process 21970 attached
strace: Process 21971 attached
strace: Process 21972 attached
strace: Process 21973 attached
strace: Process 21974 attached
strace: Process 21975 attached
strace: Process 21976 attached
strace: Process 21977 attached
[pid 21962] bind(3, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/server.sock"}, 33) = 0
[pid 21962] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", 0) = -1 ENOENT (No such file or directory)
[pid 21962] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
[pid 21962] bind(5, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = 0
[pid 21962] connect(6, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = 0
2018/02/07 14:58:55 Starting to serve on /tmp/device_plugin/device-plugin.sock
[pid 21962] connect(8, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/server.sock"}, 33) = 0
[pid 21966] --- SIGPIPE {si_signo=SIGPIPE, si_code=SI_USER, si_pid=21962, si_uid=1000} ---
[pid 21966] unlinkat(AT_FDCWD, "/tmp/devicemanager.test.INFO", 0) = 0
[pid 21965] connect(7, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = 0
2018/02/07 14:58:55 ListAndWatch
[pid 21977] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.new", 0) = -1 ENOENT (No such file or directory)
[pid 21977] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.new", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
[pid 21977] bind(9, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock.new"}, 44) = 0
[pid 21977] connect(10, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock.new"}, 44) = 0
strace: Process 21978 attached
2018/02/07 14:58:55 Starting to serve on /tmp/device_plugin/device-plugin.sock.new
[pid 21965] connect(12, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/server.sock"}, 33strace: Process 21979 attached
) = 0
[pid 21979] connect(11, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock.new"}, 44) = 0
E0207 14:58:55.277382   21962 endpoint.go:117] listAndWatch ended unexpectedly for device plugin fake-domain/resource with error rpc error: code = Unavailable desc = transport is closing
[pid 21972] unlinkat(AT_FDCWD, "/tmp/devicemanager.test.ERROR", 0) = 0
[pid 21972] unlinkat(AT_FDCWD, "/tmp/devicemanager.test.WARNING", 0) = 0
2018/02/07 14:58:55 ListAndWatch
[pid 21965] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.third", 0) = -1 ENOENT (No such file or directory)
[pid 21965] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.third", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
[pid 21965] bind(10, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock.third"}, 46) = 0
[pid 21965] connect(12, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock.third"}, 46) = 0
2018/02/07 14:58:55 Starting to serve on /tmp/device_plugin/device-plugin.sock.third
[pid 21972] --- SIGPIPE {si_signo=SIGPIPE, si_code=SI_USER, si_pid=21962, si_uid=1000} ---
[pid 21965] connect(12, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/server.sock"}, 33) = 0
[pid 21979] connect(15, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock.third"}, 46) = 0
strace: Process 21980 attached
E0207 14:58:55.281826   21962 endpoint.go:117] listAndWatch ended unexpectedly for device plugin fake-domain/resource with error rpc error: code = FailedPrecondition desc = grpc: the client connection is closing
[pid 21970] --- SIGPIPE {si_signo=SIGPIPE, si_code=SI_USER, si_pid=21962, si_uid=1000} ---
strace: Process 21981 attached
2018/02/07 14:58:55 ListAndWatch
E0207 14:58:55.285335   21962 endpoint.go:160] Device Dev2 was deleted
E0207 14:58:55.285380   21962 endpoint.go:160] Device Dev1 was deleted
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.new", 0) = 0
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.new", 0) = -1 ENOENT (No such file or directory)
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.new", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.third", 0) = 0
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.third", 0) = -1 ENOENT (No such file or directory)
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock.third", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", 0) = 0
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", 0) = -1 ENOENT (No such file or directory)
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
[pid 21978] unlinkat(AT_FDCWD, "/tmp/device_plugin/server.sock", 0E0207 14:58:55.285972   21962 endpoint.go:117] listAndWatch ended unexpectedly for device plugin fake-domain/resource with error rpc error: code = FailedPrecondition desc = grpc: the client connection is closing
) = 0
[pid 21981] +++ exited with 0 +++
[pid 21980] +++ exited with 0 +++
[pid 21979] +++ exited with 0 +++
[pid 21978] +++ exited with 0 +++
[pid 21977] +++ exited with 0 +++
[pid 21976] +++ exited with 0 +++
[pid 21975] +++ exited with 0 +++
[pid 21974] +++ exited with 0 +++
[pid 21973] +++ exited with 0 +++
[pid 21972] +++ exited with 0 +++
[pid 21971] +++ exited with 0 +++
[pid 21970] +++ exited with 0 +++
[pid 21969] +++ exited with 0 +++
[pid 21968] +++ exited with 0 +++
[pid 21967] +++ exited with 0 +++
[pid 21966] +++ exited with 0 +++
[pid 21965] +++ exited with 0 +++
[pid 21964] +++ exited with 0 +++
[pid 21963] +++ exited with 0 +++
+++ exited with 0 +++
=== RUN   TestDevicePluginReRegistration
--- PASS: TestDevicePluginReRegistration (0.02s)
PASS
ok  	k8s.io/kubernetes/pkg/kubelet/cm/devicemanager	0.062s

Anything else we need to know?:

I’m wondering whether it is guaranteed that different tests in the same .go file run serialized. Does anyone know for sure?

I’m asking because all tests in manager_test.go use the same socket directory and thus would conflict with each other if they were to run in parallel. Note that the unlinkat and connect calls in the failed test are for different pids. I’ve not checked whether those are truly different processes (as in fork+exec) or worker threads (clone).

Environment:

  • Kubernetes version (use kubectl version): master branch, commit 68da45623f5b14f61d9ce51b4946c11e034eb25c
  • OS (e.g. from /etc/os-release): Debian Stretch (9.0)
  • Kernel (e.g. uname -a): 4.9.65-3+deb9u2

About this issue

  • Original URL
  • State: closed
  • Created 6 years ago
  • Comments: 17 (15 by maintainers)

Commits related to this issue

Most upvoted comments

On Sat, 2018-02-10 at 05:31 +0000, Vikas Choudhary (vikasc) wrote:

@RenaudWasTaken https://github.com/thtanaka/kubernetes/blob/master/do cs/devel/testing.md#unit-tests Concurrent unit test runs must pass.

True, but which unit tests run concurrently?

To answer that, I re-read the “testing” documentation and ran a quick experiment. That confirmed that individual tests inside the same test binary run sequentially. A test function has to explicitly call t.Parallel() before the next test starts, and that test blocks unless it also calls t.Parallel().

In other words, manager_test.go using the same /tmp/device_plugin between all test functions should have been okay. I suspect it goes wrong sometimes because one test function starts a goroutine, does not wait for its completion, and that goroutine then interferes with the next test function when using the same dir, depending on timing. Running other test binaries in parallel and thus increasing system load probably also matters, together with overall number of CPUs.

I think merging PR #59489 makes sense because a dynamic test directory is more flexible and enables running the tests in parallel (which could be a separate PR or be done as part of PR #59489), but I also think that it would be good if someone with a better understanding of the code that gets tested here could look into this race condition.

[(quite) a bit later…]

I chased down the spurious unlinkat() to UnixStream.Close for the socket, which unlinks the socket path [1]. By wrapping the listen socket such that a custom Close logs the call and a stack trace, I now have the smoking gun.

[1] https://golang.org/pkg/net/#UnixListener.SetUnlinkOnClose

With some extra logging one can see that the /tmp/device_plugin575993072/server.sock that was created for TestNewManagerImplStart gets closed and unlinked while TestDevicePluginReRegistration already runs. The goroutine which runs too long is the go m.server.Serve(s2) from manager.go/Start. manager.go/Stop was called, but didn’t wait for the goroutine to actually stop:

2018/02/13 14:56:22 ManagerImpl.Stop(): stopping for socket dir:  /tmp/device_plugin575993072/
2018/02/13 14:56:22 ManagerImpl.Stop(): stopping for socket dir done:  /tmp/device_plugin575993072/
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin575993072", 0) = -1 EISDIR (Is a directory)
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin575993072", AT_REMOVEDIR) = -1 ENOTEMPTY (Directory not empty)
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin575993072/server.sock", 0) = 0
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin575993072", 0) = -1 EISDIR (Is a directory)
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin575993072", AT_REMOVEDIR) = 0
2018/02/13 14:56:22 Removed device_manager dir:  /tmp/device_plugin575993072
--- PASS: TestNewManagerImplStart (0.00s)
=== RUN   TestDevicePluginReRegistration
2018/02/13 14:56:22 Created device_plugin dir:  /tmp/device_plugin757024143
2018/02/13 14:56:22 ManagerImpl.removeContents(): removing content in:  /tmp/device_plugin757024143/
[pid 16648] bind(11, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin757024143/server.sock"}, 42) = 0
2018/02/13 14:56:22 Stub.cleanup(): Remove socket:  /tmp/device_plugin757024143/device-plugin.sock
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin757024143/device-plugin.sock", 0) = -1 ENOENT (No such file or directory)
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin757024143/device-plugin.sock", AT_REMOVEDIR) = -1 ENOENT (No such file or directory)
2018/02/13 14:56:22 Stub.cleanup(): Remove socket done:  /tmp/device_plugin757024143/device-plugin.sock
[pid 16648] bind(12, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin757024143/device-plugin.sock"}, 49) = 0
[pid 16648] connect(13, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin757024143/device-plugin.sock"}, 49) = 0
2018/02/13 14:56:22 Starting to serve on /tmp/device_plugin757024143/device-plugin.sock
[pid 16648] connect(14, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin757024143/server.sock"}, 42) = 0
E0213 14:56:22.983435   16648 endpoint.go:117] listAndWatch ended unexpectedly for device plugin mock with error rpc error: code = Unavailable desc = transport is closing
2018/02/13 14:56:22 Closing MySocket:  /tmp/device_plugin575993072/server.sock
goroutine 147 [running]:
runtime/debug.Stack(0x55169c, 0xc4200e40f0, 0x2)
        /usr/lib/go-1.9/src/runtime/debug/stack.go:24 +0xa7
runtime/debug.PrintStack()
        /usr/lib/go-1.9/src/runtime/debug/stack.go:16 +0x22
k8s.io/kubernetes/pkg/kubelet/cm/devicemanager.MySocket.Close(0x1b7f2e0, 0xc4202a2d20, 0xc420678210, 0x27, 0xd, 0x15b4588)
        /fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/manager.go:210 +0xa9
k8s.io/kubernetes/pkg/kubelet/cm/devicemanager.(*MySocket).Close(0xc4202be3c0, 0x1354c3c, 0x7)
        <autogenerated>:1 +0x57
k8s.io/kubernetes/vendor/google.golang.org/grpc.(*Server).Serve(0xc4202d88c0, 0x1b7f660, 0xc4202be3c0, 0x0, 0x0)
        /fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/vendor/google.golang.org/grpc/server.go:458 +0x601
created by k8s.io/kubernetes/pkg/kubelet/cm/devicemanager.(*ManagerImpl).Start
        /fast/work/gopath/src/k8s.io/kubernetes/_output/local/go/src/k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/manager.go:248 +0x511
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin575993072/server.sock", 0) = -1 ENOENT (No such file or directory)
[pid 16648] unlinkat(AT_FDCWD, "/tmp/device_plugin575993072/device-plugin.sock", 0) = -1 ENOENT (No such file or directory)
[pid 16710] --- SIGPIPE {si_signo=SIGPIPE, si_code=SI_USER, si_pid=16648, si_uid=1000} ---
[pid 16648] connect(5, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin757024143/device-plugin.sock"}, 49) = 0

That now begs the question: what is the right way to stop a grpc server?

Below is the change I made to manager.go to track when it closes the socket:

+type MySocket struct {
+       net.Listener
+       socketPath string
+}
+
+func (l MySocket) Close() error {
+       log.Println("Closing MySocket: ", l.socketPath)
+       debug.PrintStack()
+       return l.Listener.Close()
+}
+
 // Start starts the Device Plugin Manager amd start initialization of
 // podDevices and allocatedDevices information from checkpoint-ed
state and
 // starts device plugin registration service.
@@ -225,11 +240,12 @@ func (m *ManagerImpl) Start(activePods
ActivePodsFunc, sourcesReady config.Sourc
                glog.Errorf(errListenSocket+" %+v", err)
                return err
        }
+       s2 := MySocket{s, socketPath}
 
        m.server = grpc.NewServer([]grpc.ServerOption{}...)
 
        pluginapi.RegisterRegistrationServer(m.server, m)
-       go m.server.Serve(s)
+       go m.server.Serve(s2)
 
        glog.V(2).Infof("Serving device plugin registration server on
%q", socketPath)