mirror of
https://github.com/NixOS/nix.git
synced 2025-11-09 03:56:01 +01:00
Merge 1b69645641 into 479b6b73a9
This commit is contained in:
commit
1e73d56a5e
12 changed files with 261 additions and 15 deletions
|
|
@ -108,3 +108,17 @@ causes the list of machines in `/etc/nix/machines` to be included.
|
|||
(This is the default.)
|
||||
|
||||
[Nix instance]: @docroot@/glossary.md#gloss-nix-instance
|
||||
|
||||
## Resource Management
|
||||
|
||||
Adding `resource-management` to the `experimental-features` setting in `nix.conf` enables a basic resource management scheme for system features. This is akin to what can be accomplished with job schedulers like Slurm, where a remote machine can have a limited quantity of a resource that can be temporarily "consumed" by a job. This can be used with memory-heavy builds, or derivations that require exclusive access to particular hardware resources.
|
||||
|
||||
Resource management is supported in both the supported features and mandatory features of a remote machine configuration, by appending a colon `:` to a feature name followed by the quantity that this machine has. This is tracked on a per-store basis, so different users on a multi-user installation share the same pool of resources for their remote build machines. A derivation specifies that it consumes a resource with the same notation in the `requiredSystemFeatures` attribute.
|
||||
|
||||
For example, this builder can provide exclusive access to two GPUs and 128G of memory for remote builds:
|
||||
|
||||
builders = ssh://gpu-node x86_64-linux - 32 1 gpu:2,mem:128
|
||||
|
||||
A derivation that might use this machine may set its `requiredSystemFeatures` to `["gpu:1" "mem:4"]` to indicate that it requires a GPU and consumes 4G of system memory. A particularly memory-heavy derivation that doesn't need a GPU may still use the machine with a value of `["mem:64"]`. This helps ensure that limited system resources are not over-consumed by remote builds. Note that Nix does not do any actual delegation or enforcement of GPU, memory, or other resource usage; that is up to the derivations to manage.
|
||||
|
||||
When configuring the `system-features` setting in the remote machine's `nix.conf`, only include the name of the consumable feature, not the quantity available. Resource limits are tracked on the dispatching end within the local store.
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
#include "nix/util/types.hh"
|
||||
#include "nix/util/util.hh"
|
||||
#include "nix/store/globals.hh"
|
||||
#include "nix/store/machines.hh"
|
||||
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
|
@ -327,7 +328,14 @@ bool DerivationOptions::canBuildLocally(Store & localStore, const BasicDerivatio
|
|||
if (settings.maxBuildJobs.get() == 0 && !drv.isBuiltin())
|
||||
return false;
|
||||
|
||||
for (auto & feature : getRequiredSystemFeatures(drv))
|
||||
auto features = getRequiredSystemFeatures(drv);
|
||||
if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
|
||||
auto featureCount = Machine::countFeatures(features);
|
||||
for (auto & feature : featureCount)
|
||||
if (!localStore.config.systemFeatures.get().count(feature.first))
|
||||
return false;
|
||||
} else
|
||||
for (auto & feature : features)
|
||||
if (!localStore.config.systemFeatures.get().count(feature))
|
||||
return false;
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ struct Machine;
|
|||
|
||||
typedef std::vector<Machine> Machines;
|
||||
|
||||
typedef std::map<std::string, unsigned long> FeatureCount;
|
||||
|
||||
struct Machine
|
||||
{
|
||||
|
||||
|
|
@ -21,7 +23,9 @@ struct Machine
|
|||
const unsigned int maxJobs;
|
||||
const float speedFactor;
|
||||
const StringSet supportedFeatures;
|
||||
const FeatureCount supportedFeaturesCount;
|
||||
const StringSet mandatoryFeatures;
|
||||
const FeatureCount mandatoryFeaturesCount;
|
||||
const std::string sshPublicHostKey;
|
||||
bool enabled = true;
|
||||
|
||||
|
|
@ -77,6 +81,11 @@ struct Machine
|
|||
* the same format.
|
||||
*/
|
||||
static Machines parseConfig(const StringSet & defaultSystems, const std::string & config);
|
||||
|
||||
/**
|
||||
* Count the number of each feature specified in a feature string.
|
||||
*/
|
||||
static FeatureCount countFeatures(const StringSet & features);
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -31,7 +31,9 @@ Machine::Machine(
|
|||
, maxJobs(maxJobs)
|
||||
, speedFactor(speedFactor == 0.0f ? 1.0f : speedFactor)
|
||||
, supportedFeatures(supportedFeatures)
|
||||
, supportedFeaturesCount(countFeatures(supportedFeatures))
|
||||
, mandatoryFeatures(mandatoryFeatures)
|
||||
, mandatoryFeaturesCount(countFeatures(mandatoryFeatures))
|
||||
, sshPublicHostKey(sshPublicHostKey)
|
||||
{
|
||||
if (speedFactor < 0.0)
|
||||
|
|
@ -45,16 +47,48 @@ bool Machine::systemSupported(const std::string & system) const
|
|||
|
||||
bool Machine::allSupported(const StringSet & features) const
|
||||
{
|
||||
if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
|
||||
auto featuresCount = countFeatures(features);
|
||||
return std::all_of(featuresCount.begin(), featuresCount.end(), [&](const auto & f) {
|
||||
return (
|
||||
supportedFeaturesCount.count(f.first) > 0 && ( // feature is supported, and
|
||||
supportedFeaturesCount.at(f.first) >= f.second || // we have the quantity of it needed or
|
||||
supportedFeaturesCount.at(f.first) == 0 // we have a limitless supply of it
|
||||
)
|
||||
) || (
|
||||
mandatoryFeaturesCount.count(f.first) > 0 && (
|
||||
mandatoryFeaturesCount.at(f.first) >= f.second ||
|
||||
mandatoryFeaturesCount.at(f.first) == 0
|
||||
)
|
||||
);
|
||||
});
|
||||
} else {
|
||||
return std::all_of(features.begin(), features.end(), [&](const std::string & feature) {
|
||||
return supportedFeatures.count(feature) || mandatoryFeatures.count(feature);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
bool Machine::mandatoryMet(const StringSet & features) const
|
||||
{
|
||||
if (experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
|
||||
auto featureCount = countFeatures(features);
|
||||
return std::all_of(mandatoryFeaturesCount.begin(), mandatoryFeaturesCount.end(), [&](const auto & feature) {
|
||||
return featureCount.count(feature.first);
|
||||
});
|
||||
} else {
|
||||
return std::all_of(mandatoryFeatures.begin(), mandatoryFeatures.end(), [&](const std::string & feature) {
|
||||
return features.count(feature);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Percent-encode every `:` in `uri` as `%3A`.
 *
 * Each colon is replaced on its own, so the characters following it (for
 * example the quantity in `gpu:2`) are preserved. Strings without a colon
 * are returned unchanged.
 *
 * @param uri the text to escape (taken by value and modified in place).
 * @return the escaped text.
 */
std::string escapeUri(std::string uri)
{
    static const std::string encodedColon = "%3A";

    /* Resume the search after the inserted escape sequence so the text is
       scanned exactly once. */
    for (auto pos = uri.find(':'); pos != std::string::npos;
         pos = uri.find(':', pos + encodedColon.size()))
        uri.replace(pos, 1, encodedColon);

    return uri;
}
|
||||
|
||||
StoreReference Machine::completeStoreReference() const
|
||||
|
|
@ -81,7 +115,7 @@ StoreReference Machine::completeStoreReference() const
|
|||
for (auto & f : feats) {
|
||||
if (fs.size() > 0)
|
||||
fs += ' ';
|
||||
fs += f;
|
||||
fs += escapeUri(f);
|
||||
}
|
||||
};
|
||||
append(supportedFeatures);
|
||||
|
|
@ -207,6 +241,26 @@ Machines Machine::parseConfig(const StringSet & defaultSystems, const std::strin
|
|||
return parseBuilderLines(defaultSystems, builderLines);
|
||||
}
|
||||
|
||||
/**
 * Split each entry of `features` at its first `:` into a feature name and a
 * quantity, and collect the result in a name -> quantity map.
 *
 * Entries without a colon, or with nothing after the colon, get a quantity
 * of 0. If the same name occurs more than once, the first entry (in set
 * order) wins.
 *
 * @throws UsageError if a quantity is not a plain decimal number, does not
 *         fit in `unsigned long`, or is 0.
 */
FeatureCount Machine::countFeatures(const StringSet & features)
{
    constexpr unsigned long maxQuantity = ~0UL;

    FeatureCount fc;
    for (const auto & f : features) {
        const auto colon = f.find(':');
        const std::string name = f.substr(0, colon);
        unsigned long ulquantity = 0;

        if (colon != std::string::npos && colon + 1 < f.size()) {
            const std::string quantity = f.substr(colon + 1);

            /* Parse by hand rather than with std::stoul, which lets
               std::invalid_argument / std::out_of_range escape, silently
               accepts a leading '-' and ignores trailing garbage. */
            for (const char c : quantity) {
                if (c < '0' || c > '9')
                    throw UsageError(
                        "quantity for feature %s must be a positive integer, but got '%s'", name, quantity);
                const unsigned long digit = static_cast<unsigned long>(c - '0');
                if (ulquantity > (maxQuantity - digit) / 10)
                    throw UsageError("quantity for feature %s is too large: '%s'", name, quantity);
                ulquantity = ulquantity * 10 + digit;
            }

            if (ulquantity == 0) {
                throw UsageError("quantity for feature %s must be > 0", name);
            }
        }

        fc.emplace(name, ulquantity);
    }
    return fc;
}
|
||||
|
||||
Machines getMachines()
|
||||
{
|
||||
return Machine::parseConfig({settings.thisSystem}, settings.builders);
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ struct ExperimentalFeatureDetails
|
|||
* feature, we either have no issue at all if few features are not added
|
||||
* at the end of the list, or a proper merge conflict if they are.
|
||||
*/
|
||||
constexpr size_t numXpFeatures = 1 + static_cast<size_t>(Xp::BLAKE3Hashes);
|
||||
constexpr size_t numXpFeatures = 1 + static_cast<size_t>(Xp::ResourceManagement);
|
||||
|
||||
constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails = {{
|
||||
{
|
||||
|
|
@ -321,6 +321,14 @@ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails
|
|||
)",
|
||||
.trackingUrl = "",
|
||||
},
|
||||
{
|
||||
.tag = Xp::ResourceManagement,
|
||||
.name = "resource-management",
|
||||
.description = R"(
|
||||
Enables support for resource management in remote build system features.
|
||||
)",
|
||||
.trackingUrl = "",
|
||||
}
|
||||
}};
|
||||
|
||||
static_assert(
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ enum struct ExperimentalFeature {
|
|||
PipeOperators,
|
||||
ExternalBuilders,
|
||||
BLAKE3Hashes,
|
||||
ResourceManagement,
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -37,9 +37,14 @@ std::string escapeUri(std::string uri)
|
|||
|
||||
static std::string currentLoad;
|
||||
|
||||
static AutoCloseFD openSlotLock(const Machine & m, uint64_t slot)
|
||||
static AutoCloseFD openSlotLock(const std::string storeUri, uint64_t slot)
|
||||
{
|
||||
return openLockFile(fmt("%s/%s-%d", currentLoad, escapeUri(m.storeUri.render()), slot), true);
|
||||
return openLockFile(fmt("%s/%s-%d", currentLoad, escapeUri(storeUri), slot), true);
|
||||
}
|
||||
|
||||
/**
 * Open the lock file that stands for slot number `slot` of `feature` on the
 * builder identified by `storeUri`.
 *
 * The file lives under `currentLoad` and is named
 * `<escaped store URI>-<feature>-<slot>`. The second argument to
 * `openLockFile` is passed as `true` (presumably "create if missing";
 * confirm against its declaration).
 */
static AutoCloseFD openFeatureSlotLock(const std::string storeUri, const std::string feature, unsigned int slot)
{
    const auto lockPath = fmt("%s/%s-%s-%d", currentLoad, escapeUri(storeUri), feature, slot);
    return openLockFile(lockPath, true);
}
|
||||
|
||||
static bool allSupportedLocally(Store & store, const StringSet & requiredFeatures)
|
||||
|
|
@ -50,6 +55,47 @@ static bool allSupportedLocally(Store & store, const StringSet & requiredFeature
|
|||
return true;
|
||||
}
|
||||
|
||||
using FeatureSlotLocks = std::map<std::string, std::vector<AutoCloseFD>>;
|
||||
|
||||
static bool tryReserveFeatures(
|
||||
const Machine & m,
|
||||
const FeatureCount requiredFeatures,
|
||||
FeatureSlotLocks & featureSlotLocks
|
||||
) {
|
||||
bool allSatisfied = true;
|
||||
for (auto & f : requiredFeatures) {
|
||||
if (!f.second) {
|
||||
continue;
|
||||
}
|
||||
std::vector<AutoCloseFD> locks(f.second);
|
||||
unsigned int numLocked = 0;
|
||||
for (unsigned int s = 0;
|
||||
numLocked < f.second && (
|
||||
(m.supportedFeaturesCount.find(f.first) != m.supportedFeaturesCount.end() &&
|
||||
s < m.supportedFeaturesCount.at(f.first)) ||
|
||||
(m.mandatoryFeaturesCount.find(f.first) != m.mandatoryFeaturesCount.end() &&
|
||||
s < m.mandatoryFeaturesCount.at(f.first))
|
||||
); ++s) {
|
||||
auto lock = openFeatureSlotLock(m.storeUri.render(), f.first, s);
|
||||
if (lockFile(lock.get(), ltWrite, false)) {
|
||||
locks[numLocked] = std::move(lock);
|
||||
++numLocked;
|
||||
}
|
||||
}
|
||||
if (numLocked < f.second) {
|
||||
allSatisfied = false;
|
||||
break;
|
||||
}
|
||||
auto & fslDest = featureSlotLocks[f.first];
|
||||
fslDest.insert(fslDest.end(), std::make_move_iterator(locks.begin()),
|
||||
std::make_move_iterator(locks.end()));
|
||||
}
|
||||
if (!allSatisfied) {
|
||||
featureSlotLocks.clear();
|
||||
}
|
||||
return allSatisfied;
|
||||
}
|
||||
|
||||
static int main_build_remote(int argc, char ** argv)
|
||||
{
|
||||
{
|
||||
|
|
@ -93,6 +139,7 @@ static int main_build_remote(int argc, char ** argv)
|
|||
|
||||
std::shared_ptr<Store> sshStore;
|
||||
AutoCloseFD bestSlotLock;
|
||||
FeatureSlotLocks bestFeatureSlotLocks;
|
||||
|
||||
auto machines = getMachines();
|
||||
debug("got %d remote builders", machines.size());
|
||||
|
|
@ -119,6 +166,12 @@ static int main_build_remote(int argc, char ** argv)
|
|||
auto neededSystem = readString(source);
|
||||
drvPath = store->parseStorePath(readString(source));
|
||||
auto requiredFeatures = readStrings<StringSet>(source);
|
||||
auto requiredFeaturesCount = Machine::countFeatures(requiredFeatures);
|
||||
bool needsResourceManagement = 0 < std::accumulate(
|
||||
requiredFeaturesCount.begin(), requiredFeaturesCount.end(), 0,
|
||||
[](auto total, auto feature) {
|
||||
return std::move(total) + feature.second;
|
||||
});
|
||||
|
||||
/* It would be possible to build locally after some builds clear out,
|
||||
so don't show the warning now: */
|
||||
|
|
@ -150,7 +203,7 @@ static int main_build_remote(int argc, char ** argv)
|
|||
AutoCloseFD free;
|
||||
uint64_t load = 0;
|
||||
for (uint64_t slot = 0; slot < m.maxJobs; ++slot) {
|
||||
auto slotLock = openSlotLock(m, slot);
|
||||
auto slotLock = openSlotLock(m.storeUri.render(), slot);
|
||||
if (lockFile(slotLock.get(), ltWrite, false)) {
|
||||
if (!free) {
|
||||
free = std::move(slotLock);
|
||||
|
|
@ -162,6 +215,13 @@ static int main_build_remote(int argc, char ** argv)
|
|||
if (!free) {
|
||||
continue;
|
||||
}
|
||||
FeatureSlotLocks featureSlotLocks;
|
||||
if (needsResourceManagement &&
|
||||
experimentalFeatureSettings.isEnabled(Xp::ResourceManagement)) {
|
||||
if (!tryReserveFeatures(m, requiredFeaturesCount, featureSlotLocks)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
bool best = false;
|
||||
if (!bestSlotLock) {
|
||||
best = true;
|
||||
|
|
@ -179,6 +239,7 @@ static int main_build_remote(int argc, char ** argv)
|
|||
if (best) {
|
||||
bestLoad = load;
|
||||
bestSlotLock = std::move(free);
|
||||
bestFeatureSlotLocks = std::move(featureSlotLocks);
|
||||
bestMachine = &m;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,18 @@
|
|||
#!/usr/bin/env bash

# Negative test for the `resource-management` experimental feature: the only
# builder offers `testf:1`, whereas the derivations in
# resource-management.nix require `test:2`, so the build is expected to exit
# with status 1 and report that no machine could be found.

source common.sh

enableFeatures "resource-management"

requireSandboxSupport
if [[ ! $busybox =~ busybox ]]; then
    skipTest "no busybox"
fi

here=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")")
export NIX_USER_CONF_FILES=$here/config/nix-with-resource-management.conf

buildArgs=(
    -Lvf resource-management.nix
    --arg busybox "$busybox"
    --out-link "$TEST_ROOT/result-from-remote"
    --store "$TEST_ROOT/local"
    --builders "ssh-ng://localhost?system-features=testf - - 4 1 testf:1"
)

expectStderr 1 nix build "${buildArgs[@]}" \
    | grepQuiet "Failed to find a machine for remote build!"
|
||||
19
tests/functional/build-remote-resource-management.sh
Normal file
19
tests/functional/build-remote-resource-management.sh
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env bash

# Positive test for the `resource-management` experimental feature: the
# builder offers four units of `test` (`test:4`) and each dependency in
# resource-management.nix requires two (`test:2`). The build must succeed
# and produce the expected output file.

source common.sh

enableFeatures "resource-management"

requireSandboxSupport
if [[ ! $busybox =~ busybox ]]; then
    skipTest "no busybox"
fi

here=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")")
export NIX_USER_CONF_FILES=$here/config/nix-with-resource-management.conf

buildArgs=(
    -Lvf resource-management.nix
    --arg busybox "$busybox"
    --out-link "$TEST_ROOT/result-from-remote"
    --store "$TEST_ROOT/local"
    --builders "ssh-ng://localhost?system-features=test - - 4 1 test:4"
)

nix build "${buildArgs[@]}"

grepQuiet 'Hello World!' < "$TEST_ROOT/result-from-remote/hello"
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
experimental-features = resource-management nix-command
|
||||
system-features = test
|
||||
|
|
@ -109,6 +109,8 @@ suites = [
|
|||
'build-remote-trustless-should-pass-3.sh',
|
||||
'build-remote-trustless-should-fail-0.sh',
|
||||
'build-remote-with-mounted-ssh-ng.sh',
|
||||
'build-remote-resource-management-should-fail.sh',
|
||||
'build-remote-resource-management.sh',
|
||||
'nar-access.sh',
|
||||
'impure-eval.sh',
|
||||
'pure-eval.sh',
|
||||
|
|
|
|||
50
tests/functional/resource-management.nix
Normal file
50
tests/functional/resource-management.nix
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
{ busybox }:

with import ./config.nix;

let
  # Attributes shared by every derivation in this test.
  common = {
    shell = busybox;
    builder = ./simple.builder.sh;
    PATH = "";
    goodPath = path;
  };

  # A dependency that consumes two units of the `test` resource while it
  # builds. The four instances below differ only by name.
  # NOTE(review): `meta.position` now points at this helper for all four
  # derivations instead of at four separate lines; presumably it is purely
  # informational, but confirm that config.nix's `mkDerivation` does not feed
  # `meta` into the derivation.
  mkConsumer = n: mkDerivation (common // {
    name = "resource-management-${toString n}";
    requiredSystemFeatures = ["test:2"];
    meta.position = "${__curPos.file}:${toString __curPos.line}";
  });

  drv1 = mkConsumer 1;
  drv2 = mkConsumer 2;
  drv3 = mkConsumer 3;
  drv4 = mkConsumer 4;

# Top-level derivation: requires no resource itself, but depends on all four
# consumers through the DRVS environment variable.
in mkDerivation (common // {
  name = "resource-management";
  DRVS = "${drv1}${drv2}${drv3}${drv4}";
  meta.position = "${__curPos.file}:${toString __curPos.line}";
})
|
||||
Loading…
Add table
Add a link
Reference in a new issue