1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-12-05 16:41:01 +01:00

libfetchers/git-utils: Avoid using git_writestream for small files

It turns out that libgit2 is incredibly naive and each git_writestream creates
a new temporary file like .cache/nix/tarball-cache/objects/streamed_git2_6a82bb68dc0a3918
that it reads from afterwards. It doesn't do any internal buffering.

Doing (with a fresh fetcher cache) a simple:

strace -c nix flake metadata "https://releases.nixos.org/nixos/25.05/nixos-25.05.813095.1c8ba8d3f763/nixexprs.tar.xz" --store "dummy://?read-only=false"

(Before)

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ------------------
 31.05    2.372728           9    259790     81917 openat
 19.21    1.467784          30     48157           unlink
 10.43    0.796793           4    162898           getdents64
  7.75    0.592637           4    145969           read
  7.67    0.585976           3    177877           close
  7.11    0.543032           4    129970       190 newfstatat
  6.98    0.533211          10     48488           write
  4.09    0.312585           3     81443     81443 utimensat
  3.22    0.246158           3     81552           fstat

(After)

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ------------------
 29.61    0.639393           3    162898           getdents64
 26.26    0.567119           3    163523     81934 openat
 12.50    0.269835           3     81848       207 newfstatat
 11.60    0.250429           3     81443     81443 utimensat
  9.82    0.212053           2     81593           close
  9.33    0.201390           2     81544           fstat
  0.18    0.003814           9       406        17 futex
This commit is contained in:
Sergei Zimmerman 2025-12-02 04:48:43 +03:00
parent 0ec93e7ae7
commit 2f6550b7a7
No known key found for this signature in database

View file

@ -1031,6 +1031,11 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
std::vector<PendingDir> pendingDirs; std::vector<PendingDir> pendingDirs;
/**
* Temporary buffer used by createRegularFile for storing small file contents.
*/
std::string regularFileContentsBuffer;
void pushBuilder(std::string name) void pushBuilder(std::string name)
{ {
const git_tree_entry * entry; const git_tree_entry * entry;
@ -1112,41 +1117,83 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
if (!prepareDirs(pathComponents, false)) if (!prepareDirs(pathComponents, false))
return; return;
git_writestream * stream = nullptr; using WriteStream = std::unique_ptr<::git_writestream, decltype([](::git_writestream * stream) {
if (git_blob_create_from_stream(&stream, *repo, nullptr)) if (stream)
throw Error("creating a blob stream object: %s", git_error_last()->message); stream->free(stream);
})>;
/* Maximum file size that gets buffered in memory before flushing to a WriteStream,
that's backed by a temporary objects/streamed_git2_* file. We should avoid that
for common cases, since creating (and deleting) a temporary file for each blob
is insanely expensive. */
static constexpr std::size_t maxBufferSize = 1024 * 1024; /* 1 MiB */
struct CRF : CreateRegularFileSink struct CRF : CreateRegularFileSink
{ {
const CanonPath & path; const CanonPath & path;
GitFileSystemObjectSinkImpl & back; GitFileSystemObjectSinkImpl & back;
git_writestream * stream; WriteStream stream;
std::string & contents;
bool executable = false; bool executable = false;
CRF(const CanonPath & path, GitFileSystemObjectSinkImpl & back, git_writestream * stream) CRF(const CanonPath & path, GitFileSystemObjectSinkImpl & back, std::string & regularFileContentsBuffer)
: path(path) : path(path)
, back(back) , back(back)
, stream(stream) , stream(nullptr)
, contents(regularFileContentsBuffer)
{ {
contents.clear();
}
void writeToStream(std::string_view data)
{
/* Lazily create the stream. */
if (!stream) {
::git_writestream * stream2 = nullptr;
if (git_blob_create_from_stream(&stream2, *back.repo, nullptr))
throw Error("creating a blob stream object: %s", git_error_last()->message);
stream = WriteStream{stream2};
assert(stream);
}
if (stream->write(stream.get(), data.data(), data.size()))
throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message);
} }
void operator()(std::string_view data) override void operator()(std::string_view data) override
{ {
if (stream->write(stream, data.data(), data.size())) /* Already in slow path. Just write to the slow stream. */
throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); if (stream) {
writeToStream(data);
return;
}
contents += data;
if (contents.size() > maxBufferSize) {
writeToStream(contents); /* Will initialize stream. */
contents.clear();
}
} }
void isExecutable() override void isExecutable() override
{ {
executable = true; executable = true;
} }
} crf{path, *this, stream}; } crf{path, *this, regularFileContentsBuffer};
func(crf); func(crf);
git_oid oid; git_oid oid;
if (git_blob_create_from_stream_commit(&oid, stream)) if (crf.stream) {
throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); /* Call .release(), since git_blob_create_from_stream_commit
acquires ownership and frees the stream. */
if (git_blob_create_from_stream_commit(&oid, crf.stream.release()))
throw Error("creating a blob object for '%s': %s", path, git_error_last()->message);
} else {
if (git_blob_create_from_buffer(&oid, *repo, crf.contents.data(), crf.contents.size()))
throw Error(
"creating a blob object for '%s' from in-memory buffer: %s", path, git_error_last()->message);
}
addToTree(*pathComponents.rbegin(), oid, crf.executable ? GIT_FILEMODE_BLOB_EXECUTABLE : GIT_FILEMODE_BLOB); addToTree(*pathComponents.rbegin(), oid, crf.executable ? GIT_FILEMODE_BLOB_EXECUTABLE : GIT_FILEMODE_BLOB);
} }