1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-12-06 09:01:01 +01:00

Merge pull request #14689 from NixOS/tarball-cache-faster

libfetchers/git-utils: Avoid using git_writestream for small files
This commit is contained in:
John Ericson 2025-12-02 03:53:54 +00:00 committed by GitHub
commit e67c97b5f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 108 additions and 49 deletions

View file

@ -24,6 +24,7 @@
#include <git2/indexer.h>
#include <git2/object.h>
#include <git2/odb.h>
#include <git2/odb_backend.h>
#include <git2/refs.h>
#include <git2/remote.h>
#include <git2/repository.h>
@ -31,6 +32,7 @@
#include <git2/status.h>
#include <git2/submodule.h>
#include <git2/sys/odb_backend.h>
#include <git2/sys/repository.h>
#include <git2/sys/mempack.h>
#include <git2/tag.h>
#include <git2/tree.h>
@ -89,7 +91,7 @@ typedef std::unique_ptr<git_odb, Deleter<git_odb_free>> ObjectDb;
typedef std::unique_ptr<git_packbuilder, Deleter<git_packbuilder_free>> PackBuilder;
typedef std::unique_ptr<git_indexer, Deleter<git_indexer_free>> Indexer;
Hash toHash(const git_oid & oid)
static Hash toHash(const git_oid & oid)
{
#ifdef GIT_EXPERIMENTAL_SHA256
assert(oid.type == GIT_OID_SHA1);
@ -108,7 +110,7 @@ static void initLibGit2()
});
}
git_oid hashToOID(const Hash & hash)
static git_oid hashToOID(const Hash & hash)
{
git_oid oid;
if (git_oid_fromstr(&oid, hash.gitRev().c_str()))
@ -116,7 +118,7 @@ git_oid hashToOID(const Hash & hash)
return oid;
}
Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t type = GIT_OBJECT_ANY)
static Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t type = GIT_OBJECT_ANY)
{
Object obj;
if (git_object_lookup(Setter(obj), repo, &oid, type)) {
@ -127,7 +129,7 @@ Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t typ
}
template<typename T>
T peelObject(git_object * obj, git_object_t type)
static T peelObject(git_object * obj, git_object_t type)
{
T obj2;
if (git_object_peel((git_object **) (typename T::pointer *) Setter(obj2), obj, type)) {
@ -138,7 +140,7 @@ T peelObject(git_object * obj, git_object_t type)
}
template<typename T>
T dupObject(typename T::pointer obj)
static T dupObject(typename T::pointer obj)
{
T obj2;
if (git_object_dup((git_object **) (typename T::pointer *) Setter(obj2), (git_object *) obj))
@ -245,9 +247,15 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
* In-memory object store for efficient batched writing to packfiles.
* Owned by `repo`.
*/
git_odb_backend * mempack_backend;
git_odb_backend * mempackBackend = nullptr;
GitRepoImpl(std::filesystem::path _path, bool create, bool bare)
/**
* On-disk packfile object store.
* Owned by `repo`.
*/
git_odb_backend * packBackend = nullptr;
GitRepoImpl(std::filesystem::path _path, bool create, bool bare, bool packfilesOnly = false)
: path(std::move(_path))
, bare(bare)
{
@ -258,15 +266,39 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
throw Error("opening Git repository %s: %s", path, git_error_last()->message);
ObjectDb odb;
if (git_repository_odb(Setter(odb), repo.get()))
throw Error("getting Git object database: %s", git_error_last()->message);
if (packfilesOnly) {
/* Create a fresh object database because by default the repo also
loose object backends. We are not using any of those for the
tarball cache, but libgit2 still does a bunch of unnecessary
syscalls that always fail with ENOENT. NOTE: We are only creating
a libgit2 object here and not modifying the repo. Think of this as
enabling the specific backend.
*/
if (git_odb_new(Setter(odb)))
throw Error("creating Git object database: %s", git_error_last()->message);
if (git_odb_backend_pack(&packBackend, (path / "objects").string().c_str()))
throw Error("creating pack backend: %s", git_error_last()->message);
if (git_odb_add_backend(odb.get(), packBackend, 1))
throw Error("adding pack backend to Git object database: %s", git_error_last()->message);
} else {
if (git_repository_odb(Setter(odb), repo.get()))
throw Error("getting Git object database: %s", git_error_last()->message);
}
// mempack_backend will be owned by the repository, so we are not expected to free it ourselves.
if (git_mempack_new(&mempack_backend))
if (git_mempack_new(&mempackBackend))
throw Error("creating mempack backend: %s", git_error_last()->message);
if (git_odb_add_backend(odb.get(), mempack_backend, 999))
if (git_odb_add_backend(odb.get(), mempackBackend, 999))
throw Error("adding mempack backend to Git object database: %s", git_error_last()->message);
if (packfilesOnly) {
if (git_repository_set_odb(repo.get(), odb.get()))
throw Error("setting Git object database: %s", git_error_last()->message);
}
}
operator git_repository *()
@ -287,7 +319,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
git_packbuilder_set_threads(packBuilder.get(), 0 /* autodetect */);
packBuilderContext.handleException(
"preparing packfile", git_mempack_write_thin_pack(mempack_backend, packBuilder.get()));
"preparing packfile", git_mempack_write_thin_pack(mempackBackend, packBuilder.get()));
checkInterrupt();
packBuilderContext.handleException("writing packfile", git_packbuilder_write_buf(&buf, packBuilder.get()));
checkInterrupt();
@ -320,7 +352,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
if (git_indexer_commit(indexer.get(), &stats))
throw Error("committing git packfile index: %s", git_error_last()->message);
if (git_mempack_reset(mempack_backend))
if (git_mempack_reset(mempackBackend))
throw Error("resetting git mempack backend: %s", git_error_last()->message);
checkInterrupt();
@ -553,27 +585,6 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
ref<GitFileSystemObjectSink> getFileSystemObjectSink() override;
static int sidebandProgressCallback(const char * str, int len, void * payload)
{
auto act = (Activity *) payload;
act->result(resFetchStatus, trim(std::string_view(str, len)));
return getInterrupted() ? -1 : 0;
}
static int transferProgressCallback(const git_indexer_progress * stats, void * payload)
{
auto act = (Activity *) payload;
act->result(
resFetchStatus,
fmt("%d/%d objects received, %d/%d deltas indexed, %s",
stats->received_objects,
stats->total_objects,
stats->indexed_deltas,
stats->total_deltas,
renderSize(stats->received_bytes)));
return getInterrupted() ? -1 : 0;
}
void fetch(const std::string & url, const std::string & refspec, bool shallow) override
{
Activity act(*logger, lvlTalkative, actFetchTree, fmt("fetching Git repository '%s'", url));
@ -701,9 +712,9 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
}
};
ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create, bool bare)
ref<GitRepo> GitRepo::openRepo(const std::filesystem::path & path, bool create, bool bare, bool packfilesOnly)
{
return make_ref<GitRepoImpl>(path, create, bare);
return make_ref<GitRepoImpl>(path, create, bare, packfilesOnly);
}
/**
@ -1052,6 +1063,11 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
std::vector<PendingDir> pendingDirs;
/**
* Temporary buffer used by createRegularFile for storing small file contents.
*/
std::string regularFileContentsBuffer;
void pushBuilder(std::string name)
{
const git_tree_entry * entry;
@ -1133,41 +1149,83 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink
if (!prepareDirs(pathComponents, false))
return;
git_writestream * stream = nullptr;
if (git_blob_create_from_stream(&stream, *repo, nullptr))
throw Error("creating a blob stream object: %s", git_error_last()->message);
using WriteStream = std::unique_ptr<::git_writestream, decltype([](::git_writestream * stream) {
if (stream)
stream->free(stream);
})>;
/* Maximum file size that gets buffered in memory before flushing to a WriteStream,
that's backed by a temporary objects/streamed_git2_* file. We should avoid that
for common cases, since creating (and deleting) a temporary file for each blob
is insanely expensive. */
static constexpr std::size_t maxBufferSize = 1024 * 1024; /* 1 MiB */
struct CRF : CreateRegularFileSink
{
const CanonPath & path;
GitFileSystemObjectSinkImpl & back;
git_writestream * stream;
WriteStream stream;
std::string & contents;
bool executable = false;
CRF(const CanonPath & path, GitFileSystemObjectSinkImpl & back, git_writestream * stream)
CRF(const CanonPath & path, GitFileSystemObjectSinkImpl & back, std::string & regularFileContentsBuffer)
: path(path)
, back(back)
, stream(stream)
, stream(nullptr)
, contents(regularFileContentsBuffer)
{
contents.clear();
}
void writeToStream(std::string_view data)
{
/* Lazily create the stream. */
if (!stream) {
::git_writestream * stream2 = nullptr;
if (git_blob_create_from_stream(&stream2, *back.repo, nullptr))
throw Error("creating a blob stream object: %s", git_error_last()->message);
stream = WriteStream{stream2};
assert(stream);
}
if (stream->write(stream.get(), data.data(), data.size()))
throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message);
}
void operator()(std::string_view data) override
{
if (stream->write(stream, data.data(), data.size()))
throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message);
/* Already in slow path. Just write to the slow stream. */
if (stream) {
writeToStream(data);
return;
}
contents += data;
if (contents.size() > maxBufferSize) {
writeToStream(contents); /* Will initialize stream. */
contents.clear();
}
}
void isExecutable() override
{
executable = true;
}
} crf{path, *this, stream};
} crf{path, *this, regularFileContentsBuffer};
func(crf);
git_oid oid;
if (git_blob_create_from_stream_commit(&oid, stream))
throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message);
if (crf.stream) {
/* Call .release(), since git_blob_create_from_stream_commit
acquires ownership and frees the stream. */
if (git_blob_create_from_stream_commit(&oid, crf.stream.release()))
throw Error("creating a blob object for '%s': %s", path, git_error_last()->message);
} else {
if (git_blob_create_from_buffer(&oid, *repo, crf.contents.data(), crf.contents.size()))
throw Error(
"creating a blob object for '%s' from in-memory buffer: %s", path, git_error_last()->message);
}
addToTree(*pathComponents.rbegin(), oid, crf.executable ? GIT_FILEMODE_BLOB_EXECUTABLE : GIT_FILEMODE_BLOB);
}
@ -1335,7 +1393,7 @@ namespace fetchers {
ref<GitRepo> Settings::getTarballCache() const
{
static auto repoDir = std::filesystem::path(getCacheDir()) / "tarball-cache";
return GitRepo::openRepo(repoDir, true, true);
return GitRepo::openRepo(repoDir, /*create=*/true, /*bare=*/true, /*packfilesOnly=*/true);
}
} // namespace fetchers

View file

@ -32,7 +32,8 @@ struct GitRepo
{
virtual ~GitRepo() {}
static ref<GitRepo> openRepo(const std::filesystem::path & path, bool create = false, bool bare = false);
static ref<GitRepo>
openRepo(const std::filesystem::path & path, bool create = false, bool bare = false, bool packfilesOnly = false);
virtual uint64_t getRevCount(const Hash & rev) = 0;