Go version

go version go1.23-dev +5a18e79687

Output of go env in your module/workspace:

$ go env
GO111MODULE=''
GOARCH='amd64'
GOEXE=''
GOFLAGS=''
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOOS='linux'
GOAMD64='v1'
CGO_ENABLED='1'
...

What did you do?

See also similar issue https://github.com/golang/go/issues/45211.

After running this program in a loop for a while, one of them will deadlock:

GO111MODULE=off go build -gcflags=all="-N -l"  
while true ; do 
  for i in {1..128} ; do 
    GOMAXPROCS=$i ./unixconn &
   done
   wait 
done

Program source:

// unixconn.go
package main

import (
    "fmt"
    "log"
    "net"
    "os"
    "sync"
    "syscall"
    "time"
)

// GetPeerCredentials retrieves the credentials of the peer of the UNIX domain
// socket connection.
func getPeerCredentials(conn *net.UnixConn) (*syscall.Ucred, error) {
    f, err := conn.File()
    if err != nil {
        return nil, err
    }
    defer f.Close()
    return syscall.GetsockoptUcred(int(f.Fd()), syscall.SOL_SOCKET, syscall.SO_PEERCRED)
}

type authListener struct {
    net.Listener
}

func (l authListener) Accept() (net.Conn, error) {
    conn, err := l.Listener.Accept()
    if err != nil {
        return conn, err
    }
    unixconn, ok := conn.(*net.UnixConn)
    if !ok {
        return nil, fmt.Errorf("wanted *net.UnixConn, got %v", conn)
    }
    ucreds, err := getPeerCredentials(unixconn)
    if err != nil {
        return nil, err
    }
    uid := syscall.Getuid()
    if int(ucreds.Uid) != uid {
        return nil, fmt.Errorf("error: only allowing user %d to connect, but got user %d", uid, ucreds.Uid)
    }
    return conn, err
}

// Socket name (abstract UNIX domain namespace).
var sock = fmt.Sprintf("@unixconn.echo.%d", os.Getpid())

func main() {
    // Server.
    {
        socket := must(net.Listen("unix", sock))
        defer socket.Close()
        // I have not been able to reproduce without wrapping the listener in
        // authListener. Perhaps syscall.GetsockoptUcred tickles the file descriptor
        // in a bad way.
        go badserver(authListener{socket})
    }

    // Client.
    conn := must(net.Dial("unix", sock))
    defer conn.Close()
    must(conn.Write([]byte("GET /GLOOP\n\n")))
    must(conn.Read(make([]byte, 4096)))
}

// A stripped down version of a net/http.Server, emulating a HTTP handler that
// immediately calls Hijack. With small probability, this "deadlocks" in the
// sense that the background read does not return, and neither does hijack.
func badserver(socket net.Listener) {
    var mu sync.Mutex
    cond := sync.NewCond(&mu)
    for {
        conn := must(socket.Accept()) // Accept an incoming connection.

        // Handle the connection in a separate goroutine.
        go func(conn net.Conn) {
            defer conn.Close()
            buf := make([]byte, 4096)
            conn.SetReadDeadline(time.Time{})
            must(conn.Read(buf)) // Read data from the connection.

            var inRead bool
            startBackgroundRead(conn, &mu, cond, &inRead)
            hijack(conn, &mu, cond, &inRead) // Mimic a HTTP "handler" that hijacks immediately.

            conn.Close() // Hijack did not deadlock, terminate program.
        }(conn)
    }
}

func startBackgroundRead(conn net.Conn, mu *sync.Mutex, cond *sync.Cond, inRead *bool) {
    mu.Lock()
    defer mu.Unlock()
    *inRead = true
    conn.SetReadDeadline(time.Time{})
    go func() {
        var buf [1]byte
        conn.Read(buf[:])
        mu.Lock()
        *inRead = false
        mu.Unlock()
        cond.Broadcast()
    }()
}

// aLongTimeAgo is a non-zero time, far in the past, used for
// immediate cancellation of dials.
var aLongTimeAgo = time.Unix(1, 0)

func hijack(conn net.Conn, mu *sync.Mutex, cond *sync.Cond, inRead *bool) {
    // inlined abortPendingRead from net/http.
    mu.Lock()
    defer mu.Unlock()
    if !*inRead {
        return
    }
    conn.SetReadDeadline(aLongTimeAgo)
    for *inRead {
        cond.Wait()
    }
    conn.SetReadDeadline(time.Time{})
}

func must[T any](v T, err error) T {
    if err != nil {
        log.Fatal(err)
    }
    return v
}

cc @neild @ianlancetaylor

What did you see happen?

The program deadlocks. I've observed this in an internal program that uses our RPC library which is bade on top of net/http (internal bug: b/179661234). I can try to create a reproducer that just uses net/http to make it more realistic. Internally, the deadlocks are far more frequent. The reproducer above needs a good while on my workstation before one deadlocks.

Below is a delve session that I've got open now. Likely the runtime experts will know what things to actually looks at.

$ dlv attach 717471
Type 'help' for list of commands.
(dlv) goroutines
  Goroutine 1 - User: /home/go/src/net/fd_posix.go:55 net.(*netFD).Read (0x51da73) [IO wait]
  Goroutine 2 - User: /home/go/src/runtime/proc.go:428 runtime.gopark (0x47417c) [force gc (idle)]
  Goroutine 3 - User: /home/go/src/runtime/proc.go:428 runtime.gopark (0x47417c) [GC sweep wait]
  Goroutine 4 - User: /home/go/src/runtime/proc.go:428 runtime.gopark (0x47417c) [GC scavenge wait]
  Goroutine 5 - User: /home/go/src/runtime/proc.go:428 runtime.gopark (0x47417c) [finalizer wait]
  Goroutine 6 - User: /home/go/src/runtime/proc.go:428 runtime.gopark (0x47417c) [chan receive]
  Goroutine 7 - User: /home/go/src/net/fd_unix.go:172 net.(*netFD).accept (0x51f529) [IO wait]
  Goroutine 8 - User: /home/go/src/runtime/sema.go:587 sync.runtime_notifyListWait (0x475512) [sync.Cond.Wait]
  Goroutine 9 - User: /home/go/src/syscall/syscall_linux.go:64 syscall.RawSyscall6 (0x4afa25) (thread 717487)
[9 goroutines]

(dlv) goroutines -t 20
...
  Goroutine 1 - User: /home/go/src/net/fd_posix.go:55 net.(*netFD).Read (0x51da73) [IO wait]
         0  0x000000000047417c in runtime.gopark
             at /home/go/src/runtime/proc.go:428
         1  0x0000000000437a7e in runtime.netpollblock
             at /home/go/src/runtime/netpoll.go:575
         2  0x00000000004735fc in internal/poll.runtime_pollWait
             at /home/go/src/runtime/netpoll.go:351
         3  0x00000000004ca17a in internal/poll.(*pollDesc).wait
             at /home/go/src/internal/poll/fd_poll_runtime.go:84
         4  0x00000000004ca211 in internal/poll.(*pollDesc).waitRead
             at /home/go/src/internal/poll/fd_poll_runtime.go:89
         5  0x00000000004cb411 in internal/poll.(*FD).Read
             at /home/go/src/internal/poll/fd_unix.go:165
         6  0x000000000051da73 in net.(*netFD).Read
             at /home/go/src/net/fd_posix.go:55
         7  0x000000000052e9e5 in net.(*conn).Read
             at /home/go/src/net/net.go:189
         8  0x00000000005453a5 in net.(*UnixConn).Read
             at <autogenerated>:1
         9  0x0000000000547937 in main.client
             at ./unixconn.go:88
        10  0x0000000000547305 in main.main
             at ./unixconn.go:71
        11  0x000000000043f2e7 in runtime.main
             at /home/go/src/runtime/proc.go:272
        12  0x000000000047af41 in runtime.goexit
             at /home/go/src/runtime/asm_amd64.s:1700

...

  Goroutine 8 - User: /home/go/src/runtime/sema.go:587 sync.runtime_notifyListWait (0x475512) [sync.Cond.Wait]
        0  0x000000000047417c in runtime.gopark
            at /home/go/src/runtime/proc.go:428
        1  0x000000000043f745 in runtime.goparkunlock
            at /home/go/src/runtime/proc.go:433
        2  0x0000000000475512 in sync.runtime_notifyListWait
            at /home/go/src/runtime/sema.go:587
        3  0x000000000048cd0f in sync.(*Cond).Wait
            at /home/go/src/sync/cond.go:71
        4  0x00000000005488b2 in main.hijack
            at ./unixconn.go:154
        5  0x00000000005482ea in main.badserver.func1
            at ./unixconn.go:118
        6  0x0000000000547f9c in main.badserver.gowrap1
            at ./unixconn.go:122
        7  0x000000000047af41 in runtime.goexit
            at /home/go/src/runtime/asm_amd64.s:1700

  Goroutine 9 - User: /home/go/src/syscall/syscall_linux.go:64 syscall.RawSyscall6 (0x4afa25) (thread 717487)
         0  0x000000000047f7ce in internal/runtime/syscall.Syscall6
             at /home/go/src/internal/runtime/syscall/asm_linux_amd64.s:36
         1  0x00000000004afa25 in syscall.RawSyscall6
             at /home/go/src/syscall/syscall_linux.go:64
         2  0x00000000004afaf5 in syscall.Syscall
             at /home/go/src/syscall/syscall_linux.go:86
         3  0x00000000004ae12e in syscall.read
             at /home/go/src/syscall/zsyscall_linux_amd64.go:736
         4  0x00000000004ac8c6 in syscall.Read
             at /home/go/src/syscall/syscall_unix.go:183
         5  0x00000000004cce6b in internal/poll.ignoringEINTRIO
             at /home/go/src/internal/poll/fd_unix.go:745
         6  0x00000000004cb2f3 in internal/poll.(*FD).Read
             at /home/go/src/internal/poll/fd_unix.go:161
         7  0x000000000051da73 in net.(*netFD).Read
             at /home/go/src/net/fd_posix.go:55
         8  0x000000000052e9e5 in net.(*conn).Read
             at /home/go/src/net/net.go:189
         9  0x00000000005453a5 in net.(*UnixConn).Read
             at <autogenerated>:1
        10  0x00000000005486cd in main.startBackgroundRead.func1
             at ./unixconn.go:133
        11  0x000000000047af41 in runtime.goexit
             at /home/go/src/runtime/asm_amd64.s:1700

(dlv) goroutine 9
(dlv) frame 10
(dlv) p conn.conn.fd
(*net.netFD)(0xc0001a4400)
*net.netFD {
        pfd: internal/poll.FD {
                fdmu: (*"internal/poll.fdMutex")(0xc0001a4400),
                Sysfd: 7,
                SysFile: (*"internal/poll.SysFile")(0xc0001a4418),
                pd: (*"internal/poll.pollDesc")(0xc0001a4420),
                csema: 0,
                isBlocking: 0,
                IsStream: true,
                ZeroReadIsEOF: true,
                isFile: false,},
        family: 1,
        sotype: 1,
        isConnected: false,
        net: "unix",
        laddr: net.Addr(*net.UnixAddr) *{
                Name: "@unixconn.echo.717471",
                Net: "unix",},
        raddr: net.Addr(*net.UnixAddr) *{Name: "@", Net: "unix"},}
(dlv) p conn.conn.fd.pfd
internal/poll.FD {
        fdmu: internal/poll.fdMutex {state: 10, rsema: 0, wsema: 0},
        Sysfd: 7,
        SysFile: internal/poll.SysFile {
                iovecs: *[]syscall.Iovec nil,},
        pd: internal/poll.pollDesc {runtimeCtx: 140156362077400},
        csema: 0,
        isBlocking: 0,
        IsStream: true,
        ZeroReadIsEOF: true,
        isFile: false,}
(dlv) p *(*runtime.pollDesc)(conn.conn.fd.pfd.pd.runtimeCtx)
runtime.pollDesc {
        _: runtime/internal/sys.NotInHeap {
                _: runtime/internal/sys.nih {},},
        link: *runtime.pollDesc {
                _: (*"runtime/internal/sys.NotInHeap")(0x7f78b22cb3d0),
                link: *(*runtime.pollDesc)(0x7f78b22cb2c8),
                fd: 8,
                fdseq: (*"internal/runtime/atomic.Uintptr")(0x7f78b22cb3e0),
                atomicInfo: (*"internal/runtime/atomic.Uint32")(0x7f78b22cb3e8),
                rg: (*"internal/runtime/atomic.Uintptr")(0x7f78b22cb3f0),
                wg: (*"internal/runtime/atomic.Uintptr")(0x7f78b22cb3f8),
                lock: (*runtime.mutex)(0x7f78b22cb400),
                closing: true,
                rrun: false,
                wrun: false,
                user: 0,
                rseq: 2,
                rt: (*runtime.timer)(0x7f78b22cb418),
                rd: 0,
                wseq: 2,
                wt: (*runtime.timer)(0x7f78b22cb478),
                wd: 0,
                self: *(*runtime.pollDesc)(0x7f78b22cb3d0),},
        fd: 7,
        fdseq: internal/runtime/atomic.Uintptr {
                noCopy: internal/runtime/atomic.noCopy {},
                value: 1,},
        atomicInfo: internal/runtime/atomic.Uint32 {
                noCopy: internal/runtime/atomic.noCopy {},
                value: 65540,},
        rg: internal/runtime/atomic.Uintptr {
                noCopy: internal/runtime/atomic.noCopy {},
                value: 0,},
        wg: internal/runtime/atomic.Uintptr {
                noCopy: internal/runtime/atomic.noCopy {},
                value: 1,},
        lock: runtime.mutex {
                lockRankStruct: runtime.lockRankStruct {},
                key: 0,},
        closing: false,
        rrun: false,
        wrun: false,
        user: 0,
        rseq: 1,
        rt: runtime.timer {
                mu: (*runtime.mutex)(0x7f78b22cb520),
                astate: (*"internal/runtime/atomic.Uint8")(0x7f78b22cb528),
                state: 0,
                isChan: false,
                blocked: 0,
                when: 0,
                period: 0,
                f: nil,
                arg: interface {} nil,
                seq: 0,
                ts: *runtime.timers nil,
                sendLock: (*runtime.mutex)(0x7f78b22cb568),},
        rd: -1719926605958302132,
        wseq: 1,
        wt: runtime.timer {
                mu: (*runtime.mutex)(0x7f78b22cb580),
                astate: (*"internal/runtime/atomic.Uint8")(0x7f78b22cb588),
                state: 0,
                isChan: false,
                blocked: 0,
                when: 0,
                period: 0,
                f: nil,
                arg: interface {} nil,
                seq: 0,
                ts: *runtime.timers nil,
                sendLock: (*runtime.mutex)(0x7f78b22cb5c8),},
        wd: 0,
        self: *runtime.pollDesc {
                _: (*"runtime/internal/sys.NotInHeap")(0x7f78b22cb4d8),
                link: *(*runtime.pollDesc)(0x7f78b22cb3d0),
                fd: 7,
                fdseq: (*"internal/runtime/atomic.Uintptr")(0x7f78b22cb4e8),
                atomicInfo: (*"internal/runtime/atomic.Uint32")(0x7f78b22cb4f0),
                rg: (*"internal/runtime/atomic.Uintptr")(0x7f78b22cb4f8),
                wg: (*"internal/runtime/atomic.Uintptr")(0x7f78b22cb500),
                lock: (*runtime.mutex)(0x7f78b22cb508),
                closing: false,
                rrun: false,
                wrun: false,
                user: 0,
                rseq: 1,
                rt: (*runtime.timer)(0x7f78b22cb520),
                rd: -1719926605958302132,
                wseq: 1,
                wt: (*runtime.timer)(0x7f78b22cb580),
                wd: 0,
                self: *(*runtime.pollDesc)(0x7f78b22cb4d8),},}

What did you expect to see?

Either never deadlocking, or always deadlocking.

Comment From: gabyhelp

Related Issues

(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in this discussion.)

Comment From: aktau

Here's a version that uses net/http.Server, and thus more closely matches what I've observed internally: https://go.dev/play/p/QhCe3BOTnLv. It takes around 5 minutes of running the shell loop to reproduce on my workstation. This example is less minimal than the one above due to involving more machinery (net/http), but I believe the underlying cause is the same.

Comment From: thanm

@golang/runtime

Comment From: ianlancetaylor

The problem here is the getPeerCredentials function. It calls net.UnixConn.File to get a *os.File, and then calls the os.File.Fd method. As the documentation for the latter method (https://pkg.go.dev/os#File.Fd) says: "On Unix systems this will cause the SetDeadline methods to stop working." Although I haven't worked through all the details, I expect that that is what is happening here.

The docs also say "As an alternative, see the f.SyscallConn method." If I rewrite getPeerCredentials as follows, I do not see a deadlock.

func getPeerCredentials(conn *net.UnixConn) (*syscall.Ucred, error) {
    rc, err := conn.SyscallConn()
    if err != nil {
        return nil, err
    }
    var (
        cred *syscall.Ucred
        cerr error
    )
    err = rc.Control(func(fd uintptr) {
        cred, cerr = syscall.GetsockoptUcred(int(fd), syscall.SOL_SOCKET, syscall.SO_PEERCRED)
    })
    if err != nil {
        return nil, err
    }
    if cerr != nil {
        return nil, cerr
    }
    return cred, nil
}

Closing optimistically since I don't think there is a bug here. Please comment if you disagree.