1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-11-09 03:56:01 +01:00
This commit is contained in:
Lisanna Dettwyler 2025-11-07 17:21:49 -05:00 committed by GitHub
commit 1e73d56a5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 261 additions and 15 deletions

View file

@ -108,3 +108,17 @@ causes the list of machines in `/etc/nix/machines` to be included.
(This is the default.) (This is the default.)
[Nix instance]: @docroot@/glossary.md#gloss-nix-instance [Nix instance]: @docroot@/glossary.md#gloss-nix-instance
## Resource Management
Adding `resource-management` to the `experimental-features` setting in `nix.conf` enables a basic resource management scheme for system features. This is akin to what can be accomplished with job schedulers like Slurm, where a remote machine can have a limited quantity of a resource that can be temporarily "consumed" by a job. This can be used with memory-heavy builds, or derivations that require exclusive access to particular hardware resources.
Resource management is supported in both the supported features and mandatory features of a remote machine configuration, by appending a colon `:` to a feature name followed by the quantity that this machine has. This is tracked on a per-store basis, so different users on a multi-user installation share the same pool of resources for their remote build machines. A derivation specifies that it consumes a resource with the same notation in the `requiredSystemFeatures` attribute.
For example, this builder can provide exclusive access to two GPUs and 128G of memory for remote builds:
builders = ssh://gpu-node x86_64-linux - 32 1 gpu:2,mem:128
A derivation that might use this machine may set its `requiredSystemFeatures` to `["gpu:1" "mem:4"]` to indicate that it requires a GPU and consumes 4G of system memory. A particularly memory-heavy derivation that doesn't need a GPU may still use the machine with a value of `["mem:64"]`. This helps ensure that limited system resources are not over-consumed by remote builds. Note that Nix does not do any actual delegation or enforcement of GPU, memory, or other resource usage; that is up to the derivations to manage.
When configuring the `system-features` setting in the remote machine's `nix.conf`, only include the name of the consumable feature, not the quantity available. Resource limits are tracked on the dispatching end within the local store.

View file

@ -6,6 +6,7 @@
#include "nix/util/types.hh" #include "nix/util/types.hh"
#include "nix/util/util.hh" #include "nix/util/util.hh"
#include "nix/store/globals.hh" #include "nix/store/globals.hh"
#include "nix/store/machines.hh"
#include <optional> #include <optional>
#include <string> #include <string>
@ -327,7 +328,14 @@ bool DerivationOptions::canBuildLocally(Store & localStore, const BasicDerivatio
if (settings.maxBuildJobs.get() == 0 && !drv.isBuiltin()) if (settings.maxBuildJobs.get() == 0 && !drv.isBuiltin())
return false; return false;
for (auto & feature : getRequiredSystemFeatures(drv)) auto features = getRequiredSystemFeatures(drv);
if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
auto featureCount = Machine::countFeatures(features);
for (auto & feature : featureCount)
if (!localStore.config.systemFeatures.get().count(feature.first))
return false;
} else
for (auto & feature : features)
if (!localStore.config.systemFeatures.get().count(feature)) if (!localStore.config.systemFeatures.get().count(feature))
return false; return false;

View file

@ -12,6 +12,8 @@ struct Machine;
typedef std::vector<Machine> Machines; typedef std::vector<Machine> Machines;
typedef std::map<std::string, unsigned long> FeatureCount;
struct Machine struct Machine
{ {
@ -21,7 +23,9 @@ struct Machine
const unsigned int maxJobs; const unsigned int maxJobs;
const float speedFactor; const float speedFactor;
const StringSet supportedFeatures; const StringSet supportedFeatures;
const FeatureCount supportedFeaturesCount;
const StringSet mandatoryFeatures; const StringSet mandatoryFeatures;
const FeatureCount mandatoryFeaturesCount;
const std::string sshPublicHostKey; const std::string sshPublicHostKey;
bool enabled = true; bool enabled = true;
@ -77,6 +81,11 @@ struct Machine
* the same format. * the same format.
*/ */
static Machines parseConfig(const StringSet & defaultSystems, const std::string & config); static Machines parseConfig(const StringSet & defaultSystems, const std::string & config);
/**
* Count the number of each feature specified in a feature string.
*/
static FeatureCount countFeatures(const StringSet & features);
}; };
/** /**

View file

@ -31,7 +31,9 @@ Machine::Machine(
, maxJobs(maxJobs) , maxJobs(maxJobs)
, speedFactor(speedFactor == 0.0f ? 1.0f : speedFactor) , speedFactor(speedFactor == 0.0f ? 1.0f : speedFactor)
, supportedFeatures(supportedFeatures) , supportedFeatures(supportedFeatures)
, supportedFeaturesCount(countFeatures(supportedFeatures))
, mandatoryFeatures(mandatoryFeatures) , mandatoryFeatures(mandatoryFeatures)
, mandatoryFeaturesCount(countFeatures(mandatoryFeatures))
, sshPublicHostKey(sshPublicHostKey) , sshPublicHostKey(sshPublicHostKey)
{ {
if (speedFactor < 0.0) if (speedFactor < 0.0)
@ -45,16 +47,48 @@ bool Machine::systemSupported(const std::string & system) const
/**
 * Check whether this machine offers every feature in `features`.
 *
 * With the `resource-management` experimental feature enabled, the
 * feature strings are first split into (name, quantity) pairs by
 * countFeatures(). A requested feature is then accepted when its name
 * is present in supportedFeaturesCount or mandatoryFeaturesCount and
 * the machine's count is either at least the requested quantity or 0.
 * A machine count of 0 is treated as an unlimited supply.
 *
 * Without the experimental feature, the feature strings are compared
 * verbatim against supportedFeatures and mandatoryFeatures.
 */
bool Machine::allSupported(const StringSet & features) const bool Machine::allSupported(const StringSet & features) const
{ {
if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
auto featuresCount = countFeatures(features);
return std::all_of(featuresCount.begin(), featuresCount.end(), [&](const auto & f) {
return (
supportedFeaturesCount.count(f.first) > 0 && ( // feature is supported, and
supportedFeaturesCount.at(f.first) >= f.second || // we have the quantity of it needed or
supportedFeaturesCount.at(f.first) == 0 // we have a limitless supply of it
)
) || (
// Same test as above, applied to the mandatory feature table.
mandatoryFeaturesCount.count(f.first) > 0 && (
mandatoryFeaturesCount.at(f.first) >= f.second ||
mandatoryFeaturesCount.at(f.first) == 0
)
);
});
} else {
return std::all_of(features.begin(), features.end(), [&](const std::string & feature) { return std::all_of(features.begin(), features.end(), [&](const std::string & feature) {
return supportedFeatures.count(feature) || mandatoryFeatures.count(feature); return supportedFeatures.count(feature) || mandatoryFeatures.count(feature);
}); });
}
} }
/**
 * Check whether `features` names every mandatory feature of this
 * machine.
 *
 * With the `resource-management` experimental feature enabled, only
 * feature names are compared: `features` is split by countFeatures()
 * and each name in mandatoryFeaturesCount must appear in the result.
 * Quantities are not compared here.
 *
 * Without the experimental feature, every string in mandatoryFeatures
 * must appear verbatim in `features`.
 */
bool Machine::mandatoryMet(const StringSet & features) const bool Machine::mandatoryMet(const StringSet & features) const
{ {
if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
auto featureCount = countFeatures(features);
return std::all_of(mandatoryFeaturesCount.begin(), mandatoryFeaturesCount.end(), [&](const auto & feature) {
// count() yields 0 or 1 for a std::map; it is used here as a boolean.
return featureCount.count(feature.first);
});
} else {
return std::all_of(mandatoryFeatures.begin(), mandatoryFeatures.end(), [&](const std::string & feature) { return std::all_of(mandatoryFeatures.begin(), mandatoryFeatures.end(), [&](const std::string & feature) {
return features.count(feature); return features.count(feature);
}); });
}
}
/**
 * Percent-encode every ':' in `uri` as "%3A".
 *
 * All other characters are returned unchanged. A string without a
 * colon (including the empty string) is returned as-is.
 *
 * @param uri The text to escape; taken by value and modified in place.
 * @return The escaped text.
 */
std::string escapeUri(std::string uri)
{
    std::string::size_type pos = 0;
    while ((pos = uri.find(':', pos)) != std::string::npos) {
        /* Replace exactly one character (the colon). Replacing more
           would swallow the characters that follow it, e.g. turning
           "mem:128" into "mem%3A8". */
        uri.replace(pos, 1, "%3A");
        /* Skip past the three characters just inserted. */
        pos += 3;
    }
    return uri;
}
StoreReference Machine::completeStoreReference() const StoreReference Machine::completeStoreReference() const
@ -81,7 +115,7 @@ StoreReference Machine::completeStoreReference() const
for (auto & f : feats) { for (auto & f : feats) {
if (fs.size() > 0) if (fs.size() > 0)
fs += ' '; fs += ' ';
fs += f; fs += escapeUri(f);
} }
}; };
append(supportedFeatures); append(supportedFeatures);
@ -207,6 +241,26 @@ Machines Machine::parseConfig(const StringSet & defaultSystems, const std::strin
return parseBuilderLines(defaultSystems, builderLines); return parseBuilderLines(defaultSystems, builderLines);
} }
/**
 * Split each feature string of the form `name` or `name:quantity`
 * into a (name, quantity) entry.
 *
 * A feature without a quantity (`name`, or `name:` with nothing after
 * the colon) is recorded with quantity 0. If the same name occurs more
 * than once, the first entry encountered while iterating `features`
 * is kept.
 *
 * @param features Feature strings to parse.
 * @return Map from feature name to its quantity.
 * @throws UsageError if a quantity is not a plain decimal number, does
 *         not fit in an `unsigned long`, or is zero.
 */
FeatureCount Machine::countFeatures(const StringSet & features)
{
    /* Largest representable quantity, used for the overflow check. */
    constexpr unsigned long maxQuantity = ~0UL;

    FeatureCount fc;
    for (auto & f : features) {
        const auto colon = f.find(':');
        std::string name = f.substr(0, colon);
        unsigned long ulquantity = 0;

        if (colon != std::string::npos && colon + 1 < f.size()) {
            const std::string quantity = f.substr(colon + 1);

            /* Accept decimal digits only, so that signs, whitespace and
               trailing junk are reported instead of being silently
               ignored or wrapped around. */
            for (const char c : quantity) {
                if (c < '0' || c > '9')
                    throw UsageError("invalid quantity '%s' for feature %s", quantity, name);
                const unsigned long digit = static_cast<unsigned long>(c - '0');
                if (ulquantity > (maxQuantity - digit) / 10)
                    throw UsageError("quantity '%s' for feature %s is too large", quantity, name);
                ulquantity = ulquantity * 10 + digit;
            }

            if (ulquantity == 0) {
                throw UsageError("quantity for feature %s must be > 0", name);
            }
        }

        fc.emplace(std::move(name), ulquantity);
    }
    return fc;
}
Machines getMachines() Machines getMachines()
{ {
return Machine::parseConfig({settings.thisSystem}, settings.builders); return Machine::parseConfig({settings.thisSystem}, settings.builders);

View file

@ -25,7 +25,7 @@ struct ExperimentalFeatureDetails
* feature, we either have no issue at all if few features are not added * feature, we either have no issue at all if few features are not added
* at the end of the list, or a proper merge conflict if they are. * at the end of the list, or a proper merge conflict if they are.
*/ */
constexpr size_t numXpFeatures = 1 + static_cast<size_t>(Xp::BLAKE3Hashes); constexpr size_t numXpFeatures = 1 + static_cast<size_t>(Xp::ResourceManagement);
constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails = {{ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails = {{
{ {
@ -321,6 +321,14 @@ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails
)", )",
.trackingUrl = "", .trackingUrl = "",
}, },
{
.tag = Xp::ResourceManagement,
.name = "resource-management",
.description = R"(
Enables support for resource management in remote build system features.
)",
.trackingUrl = "",
}
}}; }};
static_assert( static_assert(

View file

@ -39,6 +39,7 @@ enum struct ExperimentalFeature {
PipeOperators, PipeOperators,
ExternalBuilders, ExternalBuilders,
BLAKE3Hashes, BLAKE3Hashes,
ResourceManagement,
}; };
/** /**

View file

@ -37,9 +37,14 @@ std::string escapeUri(std::string uri)
static std::string currentLoad; static std::string currentLoad;
/**
 * Open the lock file for job slot `slot` of the store `storeUri`.
 *
 * The file lives in the `currentLoad` directory and is named
 * `<escaped store URI>-<slot>`.
 *
 * NOTE(review): the `true` passed to openLockFile presumably asks for
 * the file to be created when missing — confirm against its declaration.
 */
static AutoCloseFD openSlotLock(const std::string & storeUri, uint64_t slot)
{
    return openLockFile(fmt("%s/%s-%d", currentLoad, escapeUri(storeUri), slot), true);
}
/**
 * Open the lock file for unit `slot` of the consumable feature
 * `feature` on the store `storeUri`.
 *
 * The file lives in the `currentLoad` directory and is named
 * `<escaped store URI>-<feature>-<slot>`. The feature name is inserted
 * as given; only the store URI goes through escapeUri().
 *
 * NOTE(review): the `true` passed to openLockFile presumably asks for
 * the file to be created when missing — confirm against its declaration.
 */
static AutoCloseFD openFeatureSlotLock(const std::string & storeUri, const std::string & feature, unsigned int slot)
{
    return openLockFile(fmt("%s/%s-%s-%d", currentLoad, escapeUri(storeUri), feature, slot), true);
}
static bool allSupportedLocally(Store & store, const StringSet & requiredFeatures) static bool allSupportedLocally(Store & store, const StringSet & requiredFeatures)
@ -50,6 +55,47 @@ static bool allSupportedLocally(Store & store, const StringSet & requiredFeature
return true; return true;
} }
using FeatureSlotLocks = std::map<std::string, std::vector<AutoCloseFD>>;
static bool tryReserveFeatures(
const Machine & m,
const FeatureCount requiredFeatures,
FeatureSlotLocks & featureSlotLocks
) {
bool allSatisfied = true;
for (auto & f : requiredFeatures) {
if (!f.second) {
continue;
}
std::vector<AutoCloseFD> locks(f.second);
unsigned int numLocked = 0;
for (unsigned int s = 0;
numLocked < f.second && (
(m.supportedFeaturesCount.find(f.first) != m.supportedFeaturesCount.end() &&
s < m.supportedFeaturesCount.at(f.first)) ||
(m.mandatoryFeaturesCount.find(f.first) != m.mandatoryFeaturesCount.end() &&
s < m.mandatoryFeaturesCount.at(f.first))
); ++s) {
auto lock = openFeatureSlotLock(m.storeUri.render(), f.first, s);
if (lockFile(lock.get(), ltWrite, false)) {
locks[numLocked] = std::move(lock);
++numLocked;
}
}
if (numLocked < f.second) {
allSatisfied = false;
break;
}
auto & fslDest = featureSlotLocks[f.first];
fslDest.insert(fslDest.end(), std::make_move_iterator(locks.begin()),
std::make_move_iterator(locks.end()));
}
if (!allSatisfied) {
featureSlotLocks.clear();
}
return allSatisfied;
}
static int main_build_remote(int argc, char ** argv) static int main_build_remote(int argc, char ** argv)
{ {
{ {
@ -93,6 +139,7 @@ static int main_build_remote(int argc, char ** argv)
std::shared_ptr<Store> sshStore; std::shared_ptr<Store> sshStore;
AutoCloseFD bestSlotLock; AutoCloseFD bestSlotLock;
FeatureSlotLocks bestFeatureSlotLocks;
auto machines = getMachines(); auto machines = getMachines();
debug("got %d remote builders", machines.size()); debug("got %d remote builders", machines.size());
@ -119,6 +166,12 @@ static int main_build_remote(int argc, char ** argv)
auto neededSystem = readString(source); auto neededSystem = readString(source);
drvPath = store->parseStorePath(readString(source)); drvPath = store->parseStorePath(readString(source));
auto requiredFeatures = readStrings<StringSet>(source); auto requiredFeatures = readStrings<StringSet>(source);
auto requiredFeaturesCount = Machine::countFeatures(requiredFeatures);
bool needsResourceManagement = 0 < std::accumulate(
requiredFeaturesCount.begin(), requiredFeaturesCount.end(), 0,
[](auto total, auto feature) {
return std::move(total) + feature.second;
});
/* It would be possible to build locally after some builds clear out, /* It would be possible to build locally after some builds clear out,
so don't show the warning now: */ so don't show the warning now: */
@ -150,7 +203,7 @@ static int main_build_remote(int argc, char ** argv)
AutoCloseFD free; AutoCloseFD free;
uint64_t load = 0; uint64_t load = 0;
for (uint64_t slot = 0; slot < m.maxJobs; ++slot) { for (uint64_t slot = 0; slot < m.maxJobs; ++slot) {
auto slotLock = openSlotLock(m, slot); auto slotLock = openSlotLock(m.storeUri.render(), slot);
if (lockFile(slotLock.get(), ltWrite, false)) { if (lockFile(slotLock.get(), ltWrite, false)) {
if (!free) { if (!free) {
free = std::move(slotLock); free = std::move(slotLock);
@ -162,6 +215,13 @@ static int main_build_remote(int argc, char ** argv)
if (!free) { if (!free) {
continue; continue;
} }
FeatureSlotLocks featureSlotLocks;
if (needsResourceManagement &&
experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
if (!tryReserveFeatures(m, requiredFeaturesCount, featureSlotLocks)) {
continue;
}
}
bool best = false; bool best = false;
if (!bestSlotLock) { if (!bestSlotLock) {
best = true; best = true;
@ -179,6 +239,7 @@ static int main_build_remote(int argc, char ** argv)
if (best) { if (best) {
bestLoad = load; bestLoad = load;
bestSlotLock = std::move(free); bestSlotLock = std::move(free);
bestFeatureSlotLocks = std::move(featureSlotLocks);
bestMachine = &m; bestMachine = &m;
} }
} }

View file

@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Negative test for the resource-management experimental feature:
# the only configured builder advertises `testf:1`, while the derivations
# in resource-management.nix require `test:2`, so the build is expected
# to fail with exit status 1 and report that no machine was found.
source common.sh
enableFeatures "resource-management"
requireSandboxSupport
# Skip unless $busybox looks like a busybox path.
[[ $busybox =~ busybox ]] || skipTest "no busybox"
# Directory containing this script, used to locate the test nix.conf.
here=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")")
export NIX_USER_CONF_FILES=$here/config/nix-with-resource-management.conf
# Builder line fields: URI, system (-), key (-), max jobs 4, speed 1, features.
expectStderr 1 nix build -Lvf resource-management.nix \
--arg busybox "$busybox" \
--out-link "$TEST_ROOT/result-from-remote" \
--store "$TEST_ROOT/local" \
--builders "ssh-ng://localhost?system-features=testf - - 4 1 testf:1" \
| grepQuiet "Failed to find a machine for remote build!"

View file

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Positive test for the resource-management experimental feature:
# the builder advertises `test:4` and the derivations in
# resource-management.nix each require `test:2`; the build must succeed
# and produce the expected output file.
source common.sh
enableFeatures "resource-management"
requireSandboxSupport
# Skip unless $busybox looks like a busybox path.
[[ $busybox =~ busybox ]] || skipTest "no busybox"
# Directory containing this script, used to locate the test nix.conf.
here=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")")
export NIX_USER_CONF_FILES=$here/config/nix-with-resource-management.conf
# Builder line fields: URI, system (-), key (-), max jobs 4, speed 1, features.
nix build -Lvf resource-management.nix \
--arg busybox "$busybox" \
--out-link "$TEST_ROOT/result-from-remote" \
--store "$TEST_ROOT/local" \
--builders "ssh-ng://localhost?system-features=test - - 4 1 test:4"
# The build result must contain the expected greeting.
grepQuiet 'Hello World!' < "$TEST_ROOT/result-from-remote/hello"

View file

@ -0,0 +1,2 @@
experimental-features = resource-management nix-command
system-features = test

View file

@ -109,6 +109,8 @@ suites = [
'build-remote-trustless-should-pass-3.sh', 'build-remote-trustless-should-pass-3.sh',
'build-remote-trustless-should-fail-0.sh', 'build-remote-trustless-should-fail-0.sh',
'build-remote-with-mounted-ssh-ng.sh', 'build-remote-with-mounted-ssh-ng.sh',
'build-remote-resource-management-should-fail.sh',
'build-remote-resource-management.sh',
'nar-access.sh', 'nar-access.sh',
'impure-eval.sh', 'impure-eval.sh',
'pure-eval.sh', 'pure-eval.sh',

View file

@ -0,0 +1,50 @@
# Test expression for the resource-management experimental feature.
#
# Defines four dependency derivations that each require two units of the
# consumable `test` system feature, and a top-level derivation that
# references all four of them.
{ busybox }:
with import ./config.nix;
let
# drv1..drv4 are identical except for their names; each requests `test:2`.
drv1 = mkDerivation {
name = "resource-management-1";
shell = busybox;
builder = ./simple.builder.sh;
PATH = "";
goodPath = path;
requiredSystemFeatures = ["test:2"];
meta.position = "${__curPos.file}:${toString __curPos.line}";
};
drv2 = mkDerivation {
name = "resource-management-2";
shell = busybox;
builder = ./simple.builder.sh;
PATH = "";
goodPath = path;
requiredSystemFeatures = ["test:2"];
meta.position = "${__curPos.file}:${toString __curPos.line}";
};
drv3 = mkDerivation {
name = "resource-management-3";
shell = busybox;
builder = ./simple.builder.sh;
PATH = "";
goodPath = path;
requiredSystemFeatures = ["test:2"];
meta.position = "${__curPos.file}:${toString __curPos.line}";
};
drv4 = mkDerivation {
name = "resource-management-4";
shell = busybox;
builder = ./simple.builder.sh;
PATH = "";
goodPath = path;
requiredSystemFeatures = ["test:2"];
meta.position = "${__curPos.file}:${toString __curPos.line}";
};
# Top-level derivation: it sets no requiredSystemFeatures itself, but
# interpolating the four derivations into DRVS makes them its inputs.
in mkDerivation {
name = "resource-management";
shell = busybox;
builder = ./simple.builder.sh;
PATH = "";
goodPath = path;
DRVS = "${drv1}${drv2}${drv3}${drv4}";
meta.position = "${__curPos.file}:${toString __curPos.line}";
}