From 212bf2b7022fae46672bf6aa154c50737c676361 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Thalheim?=
Date: Tue, 2 Dec 2025 13:53:20 +0100
Subject: [PATCH] daemon: fix deadlock when SSH client disconnects during remote builds

When a remote SSH client disconnects during a long-running operation
like addToStore(), the nix-daemon can deadlock in a circular wait:

- Process A (SSH daemon): blocked reading from downstream store socket,
  waiting for response from local daemon
- Process B (local daemon): blocked reading from upstream socket,
  waiting for more NAR data from SSH daemon

The existing interrupt mechanism (ReceiveInterrupts + MonitorFdHup)
correctly detects the SSH disconnect and sets _isInterrupted, but the
daemon remains blocked in read() on the downstream store connection.
Even though SIGUSR1 causes read() to return EINTR, the circular
dependency prevents forward progress.

Fix this by adding a shutdownConnections() method to RemoteStore that
calls shutdown(fd, SHUT_RDWR) on all tracked connection file
descriptors. Register an interrupt callback in processConnection() that
invokes this method when the store is a RemoteStore. This causes any
blocking read() to return 0 (EOF), breaking the circular wait and
allowing both processes to exit cleanly.

The fix tracks connection FDs in a synchronized set, populated when
connections are created by the Pool factory. On interrupt, all FDs are
shut down regardless of whether they're idle or in use.
--- src/libstore/daemon.cc | 11 ++++++++++ .../include/nix/store/remote-store.hh | 16 +++++++++++++++ src/libstore/remote-store.cc | 20 +++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/src/libstore/daemon.cc b/src/libstore/daemon.cc index 4d1c9078f..d3cbc92a2 100644 --- a/src/libstore/daemon.cc +++ b/src/libstore/daemon.cc @@ -9,6 +9,7 @@ #include "nix/store/gc-store.hh" #include "nix/store/log-store.hh" #include "nix/store/indirect-root-store.hh" +#include "nix/store/remote-store.hh" #include "nix/store/path-with-outputs.hh" #include "nix/util/finally.hh" #include "nix/util/archive.hh" @@ -1026,6 +1027,16 @@ void processConnection(ref store, FdSource && from, FdSink && to, Trusted auto monitor = !recursive ? std::make_unique(from.fd) : nullptr; (void) monitor; // suppress warning ReceiveInterrupts receiveInterrupts; + + /* When interrupted (e.g., SSH client disconnects), shut down any downstream + store connections to break circular waits. This fixes deadlocks where the + daemon is waiting for a response from a downstream store while the downstream + is waiting for more data from this daemon. */ + auto shutdownStoreOnInterrupt = createInterruptCallback([&store]() { + if (auto remoteStore = dynamic_cast(&*store)) { + remoteStore->shutdownConnections(); + } + }); #endif /* Exchange the greeting. 
*/ diff --git a/src/libstore/include/nix/store/remote-store.hh b/src/libstore/include/nix/store/remote-store.hh index b152e054b..33f518fe9 100644 --- a/src/libstore/include/nix/store/remote-store.hh +++ b/src/libstore/include/nix/store/remote-store.hh @@ -2,9 +2,12 @@ ///@file #include +#include #include #include "nix/store/store-api.hh" +#include "nix/util/sync.hh" +#include "nix/util/file-descriptor.hh" #include "nix/store/gc-store.hh" #include "nix/store/log-store.hh" @@ -153,6 +156,13 @@ struct RemoteStore : public virtual Store, public virtual GcStore, public virtua void flushBadConnections(); + /** + * Shut down all connections (both idle and in use) to break any blocking I/O. + * This is called on interrupt to allow graceful termination when the client + * disconnects during a long-running operation. + */ + void shutdownConnections(); + struct Connection; ref openConnectionWrapper(); @@ -191,6 +201,12 @@ private: std::atomic_bool failed{false}; + /** + * Track all active connection file descriptors (both idle and in use). + * Used by shutdownConnections() to break blocking I/O on interrupt. 
+ */ + Sync> connectionFds; + void copyDrvsFromEvalStore(const std::vector & paths, std::shared_ptr evalStore); }; diff --git a/src/libstore/remote-store.cc b/src/libstore/remote-store.cc index 6d1204570..22af53ae8 100644 --- a/src/libstore/remote-store.cc +++ b/src/libstore/remote-store.cc @@ -19,6 +19,10 @@ #include "nix/store/filetransfer.hh" #include "nix/util/signals.hh" +#ifndef _WIN32 +# include +#endif + #include namespace nix { @@ -38,6 +42,8 @@ RemoteStore::RemoteStore(const Config & config) failed = true; throw; } + /* Track the connection FD for shutdownConnections() */ + connectionFds.lock()->insert(conn->from.fd); return conn; }, [this](const ref & r) { @@ -797,6 +803,20 @@ void RemoteStore::flushBadConnections() connections->flushBad(); } +void RemoteStore::shutdownConnections() +{ +#ifndef _WIN32 + auto fds = connectionFds.lock(); + for (auto fd : *fds) { + /* Use shutdown() instead of close() to signal EOF to any blocking + reads/writes without actually closing the FD (which would cause + issues if the connection is still in use). This breaks circular + waits when the client disconnects during long-running operations. */ + ::shutdown(fromDescriptorReadOnly(fd), SHUT_RDWR); + } +#endif +} + void RemoteStore::narFromPath(const StorePath & path, Sink & sink) { auto conn(getConnection());