diff --git a/doc/manual/source/advanced-topics/distributed-builds.md b/doc/manual/source/advanced-topics/distributed-builds.md index 08a980643..059012ac5 100644 --- a/doc/manual/source/advanced-topics/distributed-builds.md +++ b/doc/manual/source/advanced-topics/distributed-builds.md @@ -107,4 +107,18 @@ file included in `builders` via the syntax `@/path/to/file`. For example, causes the list of machines in `/etc/nix/machines` to be included. (This is the default.) -[Nix instance]: @docroot@/glossary.md#gloss-nix-instance \ No newline at end of file +[Nix instance]: @docroot@/glossary.md#gloss-nix-instance + +## Resource Management + +Adding `resource-management` to the `experimental-features` setting in `nix.conf` enables a basic resource management scheme for system features. This is akin to what can be accomplished with job schedulers like Slurm, where a remote machine can have a limited quantity of a resource that can be temporarily "consumed" by a job. This can be used with memory-heavy builds, or derivations that require exclusive access to particular hardware resources. + +Resource management is supported in both the supported features and mandatory features of a remote machine configuration, by appending a colon `:` to a feature name followed by the quantity that this machine has. This is tracked on a per-store basis, so different users on a multi-user installation share the same pool of resources for their remote build machines. A derivation specifies that it consumes a resource with the same notation in the `requiredSystemFeatures` attribute. + +For example, this builder can provide exclusive access to two GPUs and 128G of memory for remote builds: + + builders = ssh://gpu-node x86_64-linux - 32 1 gpu:2,mem:128 + +A derivation that might use this machine may set its `requiredSystemFeatures` to `["gpu:1" "mem:4"]` to indicate that it requires a GPU and consumes 4G of system memory. 
A particularly memory-heavy derivation that doesn't need a GPU may still use the machine with a value of `["mem:64"]`. This helps ensure that limited system resources are not over-consumed by remote builds. Note that Nix does not do any actual delegation or enforcement of GPU, memory, or other resource usage; that is up to the derivations to manage. + +When configuring the `system-features` setting in the remote machine's `nix.conf`, only include the name of the consumable feature, not the quantity available. Resource limits are tracked on the dispatching end within the local store. diff --git a/src/libstore/derivation-options.cc b/src/libstore/derivation-options.cc index 75313841c..58223d206 100644 --- a/src/libstore/derivation-options.cc +++ b/src/libstore/derivation-options.cc @@ -6,6 +6,7 @@ #include "nix/util/types.hh" #include "nix/util/util.hh" #include "nix/store/globals.hh" +#include "nix/store/machines.hh" #include #include @@ -327,9 +328,16 @@ bool DerivationOptions::canBuildLocally(Store & localStore, const BasicDerivatio if (settings.maxBuildJobs.get() == 0 && !drv.isBuiltin()) return false; - for (auto & feature : getRequiredSystemFeatures(drv)) - if (!localStore.config.systemFeatures.get().count(feature)) - return false; + auto features = getRequiredSystemFeatures(drv); + if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) { + auto featureCount = Machine::countFeatures(features); + for (auto & feature : featureCount) + if (!localStore.config.systemFeatures.get().count(feature.first)) + return false; + } else + for (auto & feature : features) + if (!localStore.config.systemFeatures.get().count(feature)) + return false; return true; } diff --git a/src/libstore/include/nix/store/machines.hh b/src/libstore/include/nix/store/machines.hh index 1f7bb669a..63e082d0f 100644 --- a/src/libstore/include/nix/store/machines.hh +++ b/src/libstore/include/nix/store/machines.hh @@ -12,6 +12,8 @@ struct Machine; typedef std::vector Machines; +typedef
std::map FeatureCount; + struct Machine { @@ -21,7 +23,9 @@ struct Machine const unsigned int maxJobs; const float speedFactor; const StringSet supportedFeatures; + const FeatureCount supportedFeaturesCount; const StringSet mandatoryFeatures; + const FeatureCount mandatoryFeaturesCount; const std::string sshPublicHostKey; bool enabled = true; @@ -77,6 +81,11 @@ struct Machine * the same format. */ static Machines parseConfig(const StringSet & defaultSystems, const std::string & config); + + /** + * Count the number of each feature specified in a feature string. + */ + static FeatureCount countFeatures(const StringSet & features); }; /** diff --git a/src/libstore/machines.cc b/src/libstore/machines.cc index d61467666..b000359ea 100644 --- a/src/libstore/machines.cc +++ b/src/libstore/machines.cc @@ -31,7 +31,9 @@ Machine::Machine( , maxJobs(maxJobs) , speedFactor(speedFactor == 0.0f ? 1.0f : speedFactor) , supportedFeatures(supportedFeatures) + , supportedFeaturesCount(countFeatures(supportedFeatures)) , mandatoryFeatures(mandatoryFeatures) + , mandatoryFeaturesCount(countFeatures(mandatoryFeatures)) , sshPublicHostKey(sshPublicHostKey) { if (speedFactor < 0.0) @@ -45,16 +47,48 @@ bool Machine::systemSupported(const std::string & system) const bool Machine::allSupported(const StringSet & features) const { - return std::all_of(features.begin(), features.end(), [&](const std::string & feature) { - return supportedFeatures.count(feature) || mandatoryFeatures.count(feature); - }); + if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) { + auto featuresCount = countFeatures(features); + return std::all_of(featuresCount.begin(), featuresCount.end(), [&](const auto & f) { + return ( + supportedFeaturesCount.count(f.first) > 0 && ( // feature is supported, and + supportedFeaturesCount.at(f.first) >= f.second || // we have the quantity of it needed or + supportedFeaturesCount.at(f.first) == 0 // we have a limitless supply of it + ) + ) || ( + 
mandatoryFeaturesCount.count(f.first) > 0 && ( + mandatoryFeaturesCount.at(f.first) >= f.second || + mandatoryFeaturesCount.at(f.first) == 0 + ) + ); + }); + } else { + return std::all_of(features.begin(), features.end(), [&](const std::string & feature) { + return supportedFeatures.count(feature) || mandatoryFeatures.count(feature); + }); + } } bool Machine::mandatoryMet(const StringSet & features) const { - return std::all_of(mandatoryFeatures.begin(), mandatoryFeatures.end(), [&](const std::string & feature) { - return features.count(feature); - }); + if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) { + auto featureCount = countFeatures(features); + return std::all_of(mandatoryFeaturesCount.begin(), mandatoryFeaturesCount.end(), [&](const auto & feature) { + return featureCount.count(feature.first); + }); + } else { + return std::all_of(mandatoryFeatures.begin(), mandatoryFeatures.end(), [&](const std::string & feature) { + return features.count(feature); + }); + } +} + +static std::string escapeUri(std::string uri) // percent-encode every ':' in a feature string +{ + for (auto pos = uri.find(':'); pos != std::string::npos; pos = uri.find(':', pos + 3)) { + uri.replace(pos, 1, "%3A"); // replace only the colon itself so the characters after it are kept + } + return uri; } StoreReference Machine::completeStoreReference() const @@ -81,7 +115,7 @@ StoreReference Machine::completeStoreReference() const for (auto & f : feats) { if (fs.size() > 0) fs += ' '; - fs += f; + fs += escapeUri(f); } }; append(supportedFeatures); @@ -207,6 +241,34 @@ Machines Machine::parseConfig(const StringSet & defaultSystems, const std::strin return parseBuilderLines(defaultSystems, builderLines); } +FeatureCount Machine::countFeatures(const StringSet & features) +{ + FeatureCount fc; + for (auto & f : features) { + std::istringstream fss(f); + std::string name; + std::string quantity; + unsigned long ulquantity = 0; + std::getline(fss, name, ':'); + if (std::getline(fss, quantity)) { + /* Accept plain decimal digits only: std::stoul would otherwise accept signs, whitespace and trailing junk, or throw std::invalid_argument / std::out_of_range. */ + bool valid = quantity.find_first_not_of("0123456789") == std::string::npos; + try { + if (valid) ulquantity = std::stoul(quantity); + } catch (const std::exception &) { + valid = false; /* empty, or does not fit in unsigned long */ + } + if (!valid) + throw UsageError("invalid quantity '%s' for feature %s", quantity, name); + if (ulquantity == 0) { + throw UsageError("quantity for feature %s must be > 0", name); + } + } +
fc.emplace(name, ulquantity); + }; + return fc; +} + Machines getMachines() { return Machine::parseConfig({settings.thisSystem}, settings.builders); diff --git a/src/libutil/experimental-features.cc b/src/libutil/experimental-features.cc index 69ba62b56..60eb5386e 100644 --- a/src/libutil/experimental-features.cc +++ b/src/libutil/experimental-features.cc @@ -25,7 +25,7 @@ struct ExperimentalFeatureDetails * feature, we either have no issue at all if few features are not added * at the end of the list, or a proper merge conflict if they are. */ -constexpr size_t numXpFeatures = 1 + static_cast(Xp::BLAKE3Hashes); +constexpr size_t numXpFeatures = 1 + static_cast(Xp::ResourceManagement); constexpr std::array xpFeatureDetails = {{ { @@ -321,6 +321,14 @@ constexpr std::array xpFeatureDetails )", .trackingUrl = "", }, + { + .tag = Xp::ResourceManagement, + .name = "resource-management", + .description = R"( + Enables support for resource management in remote build system features. + )", + .trackingUrl = "", + } }}; static_assert( diff --git a/src/libutil/include/nix/util/experimental-features.hh b/src/libutil/include/nix/util/experimental-features.hh index aca14bfbb..f1a0c5656 100644 --- a/src/libutil/include/nix/util/experimental-features.hh +++ b/src/libutil/include/nix/util/experimental-features.hh @@ -39,6 +39,7 @@ enum struct ExperimentalFeature { PipeOperators, ExternalBuilders, BLAKE3Hashes, + ResourceManagement, }; /** diff --git a/src/nix/build-remote/build-remote.cc b/src/nix/build-remote/build-remote.cc index ffb77ddf1..20e4fe063 100644 --- a/src/nix/build-remote/build-remote.cc +++ b/src/nix/build-remote/build-remote.cc @@ -37,9 +37,14 @@ std::string escapeUri(std::string uri) static std::string currentLoad; -static AutoCloseFD openSlotLock(const Machine & m, uint64_t slot) +static AutoCloseFD openSlotLock(const std::string storeUri, uint64_t slot) { - return openLockFile(fmt("%s/%s-%d", currentLoad, escapeUri(m.storeUri.render()), slot), true); + return 
openLockFile(fmt("%s/%s-%d", currentLoad, escapeUri(storeUri), slot), true); +} + +static AutoCloseFD openFeatureSlotLock(const std::string storeUri, const std::string feature, unsigned int slot) +{ + return openLockFile(fmt("%s/%s-%s-%d", currentLoad, escapeUri(storeUri), feature, slot), true); } static bool allSupportedLocally(Store & store, const StringSet & requiredFeatures) @@ -50,6 +55,47 @@ static bool allSupportedLocally(Store & store, const StringSet & requiredFeature return true; } +using FeatureSlotLocks = std::map>; + +static bool tryReserveFeatures( + const Machine & m, + const FeatureCount requiredFeatures, + FeatureSlotLocks & featureSlotLocks +) { + /* Try to lock f.second free slot files per required feature on machine m; on failure featureSlotLocks is cleared and false is returned. */ + for (auto & f : requiredFeatures) { + auto sup = m.supportedFeaturesCount.find(f.first); + auto man = m.mandatoryFeaturesCount.find(f.first); + bool hasSup = sup != m.supportedFeaturesCount.end(); + bool hasMan = man != m.mandatoryFeaturesCount.end(); + /* Nothing to reserve when no quantity is requested, or when the machine lists the feature with quantity 0, which Machine::allSupported treats as a limitless supply. */ + if (!f.second || (hasSup && !sup->second) || (hasMan && !man->second)) { + continue; + } + /* Number of slots to probe: the larger of the supported and mandatory quantities. */ + auto supQty = hasSup ? sup->second : 0; + auto manQty = hasMan ? man->second : 0; + auto capacity = supQty > manQty ? supQty : manQty; + std::vector locks(f.second); + unsigned int numLocked = 0; + for (unsigned int s = 0; numLocked < f.second && s < capacity; ++s) { + auto lock = openFeatureSlotLock(m.storeUri.render(), f.first, s); + if (lockFile(lock.get(), ltWrite, false)) { + locks[numLocked] = std::move(lock); + ++numLocked; + } + } + if (numLocked < f.second) { + /* Not enough free slots for this feature: discard every lock collected so far. */ + featureSlotLocks.clear(); + return false; + } + auto & fslDest = featureSlotLocks[f.first]; + fslDest.insert(fslDest.end(), std::make_move_iterator(locks.begin()), std::make_move_iterator(locks.end())); + } + return true; +} + static int main_build_remote(int argc, char ** argv) { { @@ -93,6 +139,7 @@ static int main_build_remote(int argc, char ** argv) std::shared_ptr sshStore; AutoCloseFD bestSlotLock; + FeatureSlotLocks bestFeatureSlotLocks; auto machines = getMachines(); debug("got %d remote builders", machines.size()); @@ -119,6 +166,12 @@ static int main_build_remote(int
argc, char ** argv) auto neededSystem = readString(source); drvPath = store->parseStorePath(readString(source)); auto requiredFeatures = readStrings(source); + auto requiredFeaturesCount = Machine::countFeatures(requiredFeatures); + bool needsResourceManagement = 0 < std::accumulate( + requiredFeaturesCount.begin(), requiredFeaturesCount.end(), 0, + [](auto total, auto feature) { + return std::move(total) + feature.second; + }); /* It would be possible to build locally after some builds clear out, so don't show the warning now: */ @@ -150,7 +203,7 @@ static int main_build_remote(int argc, char ** argv) AutoCloseFD free; uint64_t load = 0; for (uint64_t slot = 0; slot < m.maxJobs; ++slot) { - auto slotLock = openSlotLock(m, slot); + auto slotLock = openSlotLock(m.storeUri.render(), slot); if (lockFile(slotLock.get(), ltWrite, false)) { if (!free) { free = std::move(slotLock); @@ -162,6 +215,13 @@ static int main_build_remote(int argc, char ** argv) if (!free) { continue; } + FeatureSlotLocks featureSlotLocks; + if (needsResourceManagement && + experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) { + if (!tryReserveFeatures(m, requiredFeaturesCount, featureSlotLocks)) { + continue; + } + } bool best = false; if (!bestSlotLock) { best = true; @@ -179,6 +239,7 @@ static int main_build_remote(int argc, char ** argv) if (best) { bestLoad = load; bestSlotLock = std::move(free); + bestFeatureSlotLocks = std::move(featureSlotLocks); bestMachine = &m; } } diff --git a/tests/functional/build-remote-resource-management-should-fail.sh b/tests/functional/build-remote-resource-management-should-fail.sh new file mode 100644 index 000000000..4de1099e4 --- /dev/null +++ b/tests/functional/build-remote-resource-management-should-fail.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +source common.sh + +enableFeatures "resource-management" + +requireSandboxSupport +[[ $busybox =~ busybox ]] || skipTest "no busybox" + +here=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")") +export 
NIX_USER_CONF_FILES=$here/config/nix-with-resource-management.conf + +expectStderr 1 nix build -Lvf resource-management.nix \ + --arg busybox "$busybox" \ + --out-link "$TEST_ROOT/result-from-remote" \ + --store "$TEST_ROOT/local" \ + --builders "ssh-ng://localhost?system-features=testf - - 4 1 testf:1" \ +| grepQuiet "Failed to find a machine for remote build!" diff --git a/tests/functional/build-remote-resource-management.sh b/tests/functional/build-remote-resource-management.sh new file mode 100644 index 000000000..f1779e57e --- /dev/null +++ b/tests/functional/build-remote-resource-management.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +source common.sh + +enableFeatures "resource-management" + +requireSandboxSupport +[[ $busybox =~ busybox ]] || skipTest "no busybox" + +here=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")") +export NIX_USER_CONF_FILES=$here/config/nix-with-resource-management.conf + +nix build -Lvf resource-management.nix \ + --arg busybox "$busybox" \ + --out-link "$TEST_ROOT/result-from-remote" \ + --store "$TEST_ROOT/local" \ + --builders "ssh-ng://localhost?system-features=test - - 4 1 test:4" + +grepQuiet 'Hello World!' 
< "$TEST_ROOT/result-from-remote/hello" diff --git a/tests/functional/config/nix-with-resource-management.conf b/tests/functional/config/nix-with-resource-management.conf new file mode 100644 index 000000000..51d0abdc8 --- /dev/null +++ b/tests/functional/config/nix-with-resource-management.conf @@ -0,0 +1,2 @@ +experimental-features = resource-management nix-command +system-features = test diff --git a/tests/functional/meson.build b/tests/functional/meson.build index 6f649c836..9b4511a33 100644 --- a/tests/functional/meson.build +++ b/tests/functional/meson.build @@ -109,6 +109,8 @@ suites = [ 'build-remote-trustless-should-pass-3.sh', 'build-remote-trustless-should-fail-0.sh', 'build-remote-with-mounted-ssh-ng.sh', + 'build-remote-resource-management-should-fail.sh', + 'build-remote-resource-management.sh', 'nar-access.sh', 'impure-eval.sh', 'pure-eval.sh', diff --git a/tests/functional/resource-management.nix b/tests/functional/resource-management.nix new file mode 100644 index 000000000..ad5f5fb5d --- /dev/null +++ b/tests/functional/resource-management.nix @@ -0,0 +1,50 @@ +{ busybox }: + +with import ./config.nix; + +let + drv1 = mkDerivation { + name = "resource-management-1"; + shell = busybox; + builder = ./simple.builder.sh; + PATH = ""; + goodPath = path; + requiredSystemFeatures = ["test:2"]; + meta.position = "${__curPos.file}:${toString __curPos.line}"; + }; + drv2 = mkDerivation { + name = "resource-management-2"; + shell = busybox; + builder = ./simple.builder.sh; + PATH = ""; + goodPath = path; + requiredSystemFeatures = ["test:2"]; + meta.position = "${__curPos.file}:${toString __curPos.line}"; + }; + drv3 = mkDerivation { + name = "resource-management-3"; + shell = busybox; + builder = ./simple.builder.sh; + PATH = ""; + goodPath = path; + requiredSystemFeatures = ["test:2"]; + meta.position = "${__curPos.file}:${toString __curPos.line}"; + }; + drv4 = mkDerivation { + name = "resource-management-4"; + shell = busybox; + builder = 
./simple.builder.sh; + PATH = ""; + goodPath = path; + requiredSystemFeatures = ["test:2"]; + meta.position = "${__curPos.file}:${toString __curPos.line}"; + }; +in mkDerivation { + name = "resource-management"; + shell = busybox; + builder = ./simple.builder.sh; + PATH = ""; + goodPath = path; + DRVS = "${drv1}${drv2}${drv3}${drv4}"; + meta.position = "${__curPos.file}:${toString __curPos.line}"; +}