From 2f6550b7a7a582c7726d9657e8e3794f9b2f1f2c Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Tue, 2 Dec 2025 04:48:43 +0300 Subject: [PATCH] libfetchers/git-utils: Avoid using git_writestream for small files It turns out that libgit2 is incredibly naive and each git_writestream creates a new temporary file like .cache/nix/tarball-cache/objects/streamed_git2_6a82bb68dc0a3918 that it reads from afterwards. It doesn't do any internal buffering. Doing (with a fresh fetcher cache) a simple: strace -c nix flake metadata "https://releases.nixos.org/nixos/25.05/nixos-25.05.813095.1c8ba8d3f763/nixexprs.tar.xz" --store "dummy://?read-only=false" (Before) % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ------------------ 31.05 2.372728 9 259790 81917 openat 19.21 1.467784 30 48157 unlink 10.43 0.796793 4 162898 getdents64 7.75 0.592637 4 145969 read 7.67 0.585976 3 177877 close 7.11 0.543032 4 129970 190 newfstatat 6.98 0.533211 10 48488 write 4.09 0.312585 3 81443 81443 utimensat 3.22 0.246158 3 81552 fstat (After) % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ------------------ 29.61 0.639393 3 162898 getdents64 26.26 0.567119 3 163523 81934 openat 12.50 0.269835 3 81848 207 newfstatat 11.60 0.250429 3 81443 81443 utimensat 9.82 0.212053 2 81593 close 9.33 0.201390 2 81544 fstat 0.18 0.003814 9 406 17 futex --- src/libfetchers/git-utils.cc | 69 ++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 1fc4e49bb..90c2e1a4f 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -1031,6 +1031,11 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink std::vector pendingDirs; + /** + * Temporary buffer used by createRegularFile for storing small file contents. + */ + std::string regularFileContentsBuffer; + void pushBuilder(std::string name) { const git_tree_entry * entry; @@ -1112,41 +1117,83 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink if (!prepareDirs(pathComponents, false)) return; - git_writestream * stream = nullptr; - if (git_blob_create_from_stream(&stream, *repo, nullptr)) - throw Error("creating a blob stream object: %s", git_error_last()->message); + using WriteStream = std::unique_ptr<::git_writestream, decltype([](::git_writestream * stream) { + if (stream) + stream->free(stream); + })>; + + /* Maximum file size that gets buffered in memory before flushing to a WriteStream, + that's backed by a temporary objects/streamed_git2_* file. We should avoid that + for common cases, since creating (and deleting) a temporary file for each blob + is insanely expensive. */ + static constexpr std::size_t maxBufferSize = 1024 * 1024; /* 1 MiB */ struct CRF : CreateRegularFileSink { const CanonPath & path; GitFileSystemObjectSinkImpl & back; - git_writestream * stream; + WriteStream stream; + std::string & contents; bool executable = false; - CRF(const CanonPath & path, GitFileSystemObjectSinkImpl & back, git_writestream * stream) + CRF(const CanonPath & path, GitFileSystemObjectSinkImpl & back, std::string & regularFileContentsBuffer) : path(path) , back(back) - , stream(stream) + , stream(nullptr) + , contents(regularFileContentsBuffer) { + contents.clear(); + } + + void writeToStream(std::string_view data) + { + /* Lazily create the stream. */ + if (!stream) { + ::git_writestream * stream2 = nullptr; + if (git_blob_create_from_stream(&stream2, *back.repo, nullptr)) + throw Error("creating a blob stream object: %s", git_error_last()->message); + stream = WriteStream{stream2}; + assert(stream); + } + + if (stream->write(stream.get(), data.data(), data.size())) + throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); } void operator()(std::string_view data) override { - if (stream->write(stream, data.data(), data.size())) - throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); + /* Already in slow path. Just write to the slow stream. */ + if (stream) { + writeToStream(data); + return; + } + + contents += data; + if (contents.size() > maxBufferSize) { + writeToStream(contents); /* Will initialize stream. */ + contents.clear(); + } } void isExecutable() override { executable = true; } - } crf{path, *this, stream}; + } crf{path, *this, regularFileContentsBuffer}; func(crf); git_oid oid; - if (git_blob_create_from_stream_commit(&oid, stream)) - throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); + if (crf.stream) { + /* Call .release(), since git_blob_create_from_stream_commit + acquires ownership and frees the stream. */ + if (git_blob_create_from_stream_commit(&oid, crf.stream.release())) + throw Error("creating a blob object for '%s': %s", path, git_error_last()->message); + } else { + if (git_blob_create_from_buffer(&oid, *repo, crf.contents.data(), crf.contents.size())) + throw Error( + "creating a blob object for '%s' from in-memory buffer: %s", path, git_error_last()->message); + } addToTree(*pathComponents.rbegin(), oid, crf.executable ? GIT_FILEMODE_BLOB_EXECUTABLE : GIT_FILEMODE_BLOB); }