1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2026-01-11 16:58:03 +01:00

daemon: fix deadlock when SSH client disconnects during remote builds

When a remote SSH client disconnects during a long-running operation
like addToStore(), the nix-daemon can deadlock in a circular wait:

  - Process A (SSH daemon): blocked reading from downstream store socket,
    waiting for response from local daemon
  - Process B (local daemon): blocked reading from upstream socket,
    waiting for more NAR data from SSH daemon

The existing interrupt mechanism (ReceiveInterrupts + MonitorFdHup)
correctly detects the SSH disconnect and sets _isInterrupted, but the
daemon remains blocked in read() on the downstream store connection.
Even though SIGUSR1 causes read() to return EINTR, the circular
dependency prevents forward progress.

Fix this by adding shutdownConnections() to RemoteStore that calls
shutdown(fd, SHUT_RDWR) on all tracked connection file descriptors.
Register an interrupt callback in processConnection() that invokes
this method when the store is a RemoteStore. This causes any blocking
read() to return 0 (EOF), breaking the circular wait and allowing
both processes to exit cleanly.

The fix tracks connection FDs in a synchronized set, populated when
connections are created by the Pool factory. On interrupt, all FDs
are shut down regardless of whether they're idle or in-use.
This commit is contained in:
Jörg Thalheim 2025-12-02 13:53:20 +01:00
parent a3bcd2543e
commit 212bf2b702
3 changed files with 47 additions and 0 deletions

View file

@ -9,6 +9,7 @@
#include "nix/store/gc-store.hh"
#include "nix/store/log-store.hh"
#include "nix/store/indirect-root-store.hh"
#include "nix/store/remote-store.hh"
#include "nix/store/path-with-outputs.hh"
#include "nix/util/finally.hh"
#include "nix/util/archive.hh"
@ -1026,6 +1027,16 @@ void processConnection(ref<Store> store, FdSource && from, FdSink && to, Trusted
auto monitor = !recursive ? std::make_unique<MonitorFdHup>(from.fd) : nullptr;
(void) monitor; // suppress warning
ReceiveInterrupts receiveInterrupts;
/* When interrupted (e.g., SSH client disconnects), shutdown any downstream
store connections to break circular waits. This fixes deadlocks where the
daemon is waiting for a response from a downstream store while the downstream
is waiting for more data from this daemon. */
auto shutdownStoreOnInterrupt = createInterruptCallback([&store]() {
if (auto remoteStore = dynamic_cast<RemoteStore *>(&*store)) {
remoteStore->shutdownConnections();
}
});
#endif
/* Exchange the greeting. */

View file

@ -2,9 +2,12 @@
///@file
#include <limits>
#include <set>
#include <string>
#include "nix/store/store-api.hh"
#include "nix/util/sync.hh"
#include "nix/util/file-descriptor.hh"
#include "nix/store/gc-store.hh"
#include "nix/store/log-store.hh"
@ -153,6 +156,13 @@ struct RemoteStore : public virtual Store, public virtual GcStore, public virtua
void flushBadConnections();
/**
 * Shutdown all connections (both idle and in-use) to break any blocking I/O.
 * This is called on interrupt to allow graceful termination when the client
 * disconnects during a long-running operation.
 *
 * Each tracked descriptor is passed to `shutdown(fd, SHUT_RDWR)`; the
 * descriptors themselves are not closed. On Windows builds the body is
 * compiled out, so the call does nothing there.
 */
void shutdownConnections();
struct Connection;
ref<Connection> openConnectionWrapper();
@ -191,6 +201,12 @@ private:
std::atomic_bool failed{false};
/**
 * Track all active connection file descriptors (both idle and in-use).
 * Used by shutdownConnections() to break blocking I/O on interrupt.
 *
 * The connection factory installed by the constructor inserts
 * `conn->from.fd` for every connection it creates.
 *
 * NOTE(review): no code that erases entries is visible next to this
 * declaration — confirm descriptors are removed when a connection is
 * destroyed, so that a recycled fd number is never shut down by mistake.
 */
Sync<std::set<Descriptor>> connectionFds;
void copyDrvsFromEvalStore(const std::vector<DerivedPath> & paths, std::shared_ptr<Store> evalStore);
};

View file

@ -19,6 +19,10 @@
#include "nix/store/filetransfer.hh"
#include "nix/util/signals.hh"
#ifndef _WIN32
# include <sys/socket.h>
#endif
#include <nlohmann/json.hpp>
namespace nix {
@ -38,6 +42,8 @@ RemoteStore::RemoteStore(const Config & config)
failed = true;
throw;
}
/* Track the connection FD for shutdownConnections() */
connectionFds.lock()->insert(conn->from.fd);
return conn;
},
[this](const ref<Connection> & r) {
@ -797,6 +803,20 @@ void RemoteStore::flushBadConnections()
connections->flushBad();
}
/* Half-close every tracked connection so that a thread parked in a blocking
   read or write on one of them wakes up. Does nothing on Windows builds. */
void RemoteStore::shutdownConnections()
{
#ifndef _WIN32
    /* The lock is held for the whole sweep; it is released when
       `trackedFds` goes out of scope at the end of the function. */
    auto trackedFds = connectionFds.lock();

    /* shutdown() rather than close(): the descriptor stays valid for
       whoever still owns the connection, but pending and future I/O on it
       sees EOF. That is what breaks the circular wait after the client has
       gone away mid-operation. The return value is intentionally ignored.
       NOTE(review): entries are only ever seen being inserted into the set;
       confirm a stale (closed and reused) descriptor cannot be hit here. */
    for (const auto descriptor : *trackedFds) {
        ::shutdown(fromDescriptorReadOnly(descriptor), SHUT_RDWR);
    }
#endif
}
void RemoteStore::narFromPath(const StorePath & path, Sink & sink)
{
auto conn(getConnection());