mirror of
https://github.com/NixOS/nix.git
synced 2026-01-11 16:58:03 +01:00
daemon: fix deadlock when SSH client disconnects during remote builds
When a remote SSH client disconnects during a long-running operation
like addToStore(), the nix-daemon can deadlock in a circular wait:
- Process A (SSH daemon): blocked reading from downstream store socket,
waiting for response from local daemon
- Process B (local daemon): blocked reading from upstream socket,
waiting for more NAR data from SSH daemon
The existing interrupt mechanism (ReceiveInterrupts + MonitorFdHup)
correctly detects the SSH disconnect and sets _isInterrupted, but the
daemon remains blocked in read() on the downstream store connection.
Even though SIGUSR1 causes read() to return EINTR, the circular
dependency prevents forward progress.
Fix this by adding shutdownConnections() to RemoteStore that calls
shutdown(fd, SHUT_RDWR) on all tracked connection file descriptors.
Register an interrupt callback in processConnection() that invokes
this method when the store is a RemoteStore. This causes any blocking
read() to return 0 (EOF), breaking the circular wait and allowing
both processes to exit cleanly.
The fix tracks connection FDs in a synchronized set, populated when
connections are created by the Pool factory. On interrupt, all FDs
are shut down regardless of whether they're idle or in-use.
This commit is contained in:
parent
a3bcd2543e
commit
212bf2b702
3 changed files with 47 additions and 0 deletions
|
|
@ -9,6 +9,7 @@
|
|||
#include "nix/store/gc-store.hh"
|
||||
#include "nix/store/log-store.hh"
|
||||
#include "nix/store/indirect-root-store.hh"
|
||||
#include "nix/store/remote-store.hh"
|
||||
#include "nix/store/path-with-outputs.hh"
|
||||
#include "nix/util/finally.hh"
|
||||
#include "nix/util/archive.hh"
|
||||
|
|
@ -1026,6 +1027,16 @@ void processConnection(ref<Store> store, FdSource && from, FdSink && to, Trusted
|
|||
auto monitor = !recursive ? std::make_unique<MonitorFdHup>(from.fd) : nullptr;
|
||||
(void) monitor; // suppress warning
|
||||
ReceiveInterrupts receiveInterrupts;
|
||||
|
||||
/* When interrupted (e.g., SSH client disconnects), shut down any downstream
|
||||
store connections to break circular waits. This fixes deadlocks where the
|
||||
daemon is waiting for a response from a downstream store while the downstream
|
||||
is waiting for more data from this daemon. */
|
||||
auto shutdownStoreOnInterrupt = createInterruptCallback([&store]() {
|
||||
if (auto remoteStore = dynamic_cast<RemoteStore *>(&*store)) {
|
||||
remoteStore->shutdownConnections();
|
||||
}
|
||||
});
|
||||
#endif
|
||||
|
||||
/* Exchange the greeting. */
|
||||
|
|
|
|||
|
|
@ -2,9 +2,12 @@
|
|||
///@file
|
||||
|
||||
#include <limits>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include "nix/store/store-api.hh"
|
||||
#include "nix/util/sync.hh"
|
||||
#include "nix/util/file-descriptor.hh"
|
||||
#include "nix/store/gc-store.hh"
|
||||
#include "nix/store/log-store.hh"
|
||||
|
||||
|
|
@ -153,6 +156,13 @@ struct RemoteStore : public virtual Store, public virtual GcStore, public virtua
|
|||
|
||||
void flushBadConnections();
|
||||
|
||||
/**
|
||||
* Shut down all connections (both idle and in-use) to break any blocking I/O.
|
||||
* This is called on interrupt to allow graceful termination when the client
|
||||
* disconnects during a long-running operation.
|
||||
*/
|
||||
void shutdownConnections();
|
||||
|
||||
struct Connection;
|
||||
|
||||
ref<Connection> openConnectionWrapper();
|
||||
|
|
@ -191,6 +201,12 @@ private:
|
|||
|
||||
std::atomic_bool failed{false};
|
||||
|
||||
/**
|
||||
* Track all active connection file descriptors (both idle and in-use).
|
||||
* Used by shutdownConnections() to break blocking I/O on interrupt.
|
||||
*/
|
||||
Sync<std::set<Descriptor>> connectionFds;
|
||||
|
||||
void copyDrvsFromEvalStore(const std::vector<DerivedPath> & paths, std::shared_ptr<Store> evalStore);
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -19,6 +19,10 @@
|
|||
#include "nix/store/filetransfer.hh"
|
||||
#include "nix/util/signals.hh"
|
||||
|
||||
#ifndef _WIN32
|
||||
# include <sys/socket.h>
|
||||
#endif
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
namespace nix {
|
||||
|
|
@ -38,6 +42,8 @@ RemoteStore::RemoteStore(const Config & config)
|
|||
failed = true;
|
||||
throw;
|
||||
}
|
||||
/* Track the connection FD for shutdownConnections() */
|
||||
connectionFds.lock()->insert(conn->from.fd);
|
||||
return conn;
|
||||
},
|
||||
[this](const ref<Connection> & r) {
|
||||
|
|
@ -797,6 +803,20 @@ void RemoteStore::flushBadConnections()
|
|||
connections->flushBad();
|
||||
}
|
||||
|
||||
void RemoteStore::shutdownConnections()
|
||||
{
|
||||
#ifndef _WIN32
|
||||
auto fds = connectionFds.lock();
|
||||
for (auto fd : *fds) {
|
||||
/* Use shutdown() instead of close() to signal EOF to any blocking
|
||||
reads/writes without actually closing the FD (which would cause
|
||||
issues if the connection is still in use). This breaks circular
|
||||
waits when the client disconnects during long-running operations. */
|
||||
::shutdown(fromDescriptorReadOnly(fd), SHUT_RDWR);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void RemoteStore::narFromPath(const StorePath & path, Sink & sink)
|
||||
{
|
||||
auto conn(getConnection());
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue