diff --git a/doc/manual/rl-next/git-lfs-support.md b/doc/manual/rl-next/git-lfs-support.md new file mode 100644 index 000000000..2990fc76c --- /dev/null +++ b/doc/manual/rl-next/git-lfs-support.md @@ -0,0 +1,11 @@ +--- +synopsis: "Git LFS support" +prs: [10153] +--- + +The Git fetcher now supports Large File Storage (LFS). This can be enabled by passing the attribute `lfs = true` to the fetcher, e.g. +```console +nix flake prefetch 'git+ssh://git@github.com/Apress/repo-with-large-file-storage.git?lfs=1' +``` + +Author: [**@b-camacho**](https://github.com/b-camacho), [**@kip93**](https://github.com/kip93) diff --git a/src/libexpr/primops/fetchTree.cc b/src/libexpr/primops/fetchTree.cc index c4b8b2999..bd013eab2 100644 --- a/src/libexpr/primops/fetchTree.cc +++ b/src/libexpr/primops/fetchTree.cc @@ -367,6 +367,12 @@ static RegisterPrimOp primop_fetchTree({ Default: `false` + - `lfs` (Bool, optional) + + Fetch any [Git LFS](https://git-lfs.com/) files. + + Default: `false` + - `allRefs` (Bool, optional) By default, this has no effect. This becomes relevant only once `shallow` cloning is disabled. @@ -691,6 +697,13 @@ static RegisterPrimOp primop_fetchGit({ Make a shallow clone when fetching the Git tree. When this is enabled, the options `ref` and `allRefs` have no effect anymore. + + - `lfs` (default: `false`) + + A boolean that when `true` specifies that [Git LFS] files should be fetched. + + [Git LFS]: https://git-lfs.com/ + - `allRefs` Whether to fetch all references (eg. branches and tags) of the repository. 
diff --git a/src/libfetchers-tests/git-utils.cc b/src/libfetchers-tests/git-utils.cc index 0bf3076dc..10e98141f 100644 --- a/src/libfetchers-tests/git-utils.cc +++ b/src/libfetchers-tests/git-utils.cc @@ -7,6 +7,7 @@ #include #include "fs-sink.hh" #include "serialise.hh" +#include "git-lfs-fetch.hh" namespace nix { @@ -109,4 +110,131 @@ TEST_F(GitUtilsTest, sink_hardlink) } }; +namespace lfs { + +TEST_F(GitUtilsTest, parseGitRemoteUrl) +{ + { + GitUrl result = parseGitUrl("git@example.com:path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, "git"); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("example.com:/path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "/path/repo.git"); + } + + { + GitUrl result = parseGitUrl("example.com:path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("https://example.com/path/repo.git"); + EXPECT_EQ(result.protocol, "https"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("ssh://git@example.com/path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, "git"); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("ssh://example/path/repo.git"); + EXPECT_EQ(result.protocol, "ssh"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example"); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result 
= parseGitUrl("http://example.com:8080/path/repo.git"); + EXPECT_EQ(result.protocol, "http"); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, "example.com"); + EXPECT_EQ(result.port, "8080"); + EXPECT_EQ(result.path, "path/repo.git"); + } + + { + GitUrl result = parseGitUrl("invalid-url"); + EXPECT_EQ(result.protocol, ""); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, ""); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, ""); + } + + { + GitUrl result = parseGitUrl(""); + EXPECT_EQ(result.protocol, ""); + EXPECT_EQ(result.user, ""); + EXPECT_EQ(result.host, ""); + EXPECT_EQ(result.port, ""); + EXPECT_EQ(result.path, ""); + } +} +TEST_F(GitUtilsTest, gitUrlToHttp) +{ + { + const GitUrl url = parseGitUrl("git@github.com:user/repo.git"); + EXPECT_EQ(url.toHttp(), "https://github.com/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("https://github.com/user/repo.git"); + EXPECT_EQ(url.toHttp(), "https://github.com/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("http://github.com/user/repo.git"); + EXPECT_EQ(url.toHttp(), "http://github.com/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("ssh://git@github.com:22/user/repo.git"); + EXPECT_EQ(url.toHttp(), "https://github.com:22/user/repo.git"); + } + { + const GitUrl url = parseGitUrl("invalid-url"); + EXPECT_EQ(url.toHttp(), ""); + } +} + +TEST_F(GitUtilsTest, gitUrlToSsh) +{ + { + const GitUrl url = parseGitUrl("https://example.com/user/repo.git"); + const auto [host, path] = url.toSsh(); + EXPECT_EQ(host, "example.com"); + EXPECT_EQ(path, "user/repo.git"); + } + { + const GitUrl url = parseGitUrl("git@example.com:user/repo.git"); + const auto [host, path] = url.toSsh(); + EXPECT_EQ(host, "git@example.com"); + EXPECT_EQ(path, "user/repo.git"); + } +} + +} // namespace lfs + } // namespace nix diff --git a/src/libfetchers/git-lfs-fetch.cc b/src/libfetchers/git-lfs-fetch.cc new file mode 100644 index 000000000..bd6c01435 --- /dev/null +++ 
b/src/libfetchers/git-lfs-fetch.cc
@@ -0,0 +1,367 @@
+#include "git-lfs-fetch.hh"
+#include "git-utils.hh"
+#include "filetransfer.hh"
+#include "processes.hh"
+#include "url.hh"
+#include "users.hh"
+#include "hash.hh"
+
+#include <git2/attr.h>
+#include <git2/config.h>
+#include <git2/errors.h>
+#include <git2/remote.h>
+
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <cctype>
+#include <stdexcept>
+
+namespace nix::lfs {
+
+/**
+ * Download `url` into `sink` and verify what was received.
+ *
+ * If `authHeader` is "", no `Authorization` header is sent.
+ * Throws `Error` if the length of `sink.s` differs from `sizeExpected` or
+ * its base-16 SHA-256 differs from `sha256Expected`.
+ */
+static void downloadToSink(
+    const std::string & url,
+    const std::string & authHeader,
+    // FIXME: passing a StringSink is superfluous, we may as well
+    // return a string. Or use an abstract Sink for streaming.
+    StringSink & sink,
+    std::string sha256Expected,
+    size_t sizeExpected)
+{
+    FileTransferRequest request(url);
+    Headers headers;
+    if (!authHeader.empty())
+        headers.push_back({"Authorization", authHeader});
+    request.headers = headers;
+    getFileTransfer()->download(std::move(request), sink);
+
+    auto sizeActual = sink.s.length();
+    if (sizeExpected != sizeActual)
+        throw Error("size mismatch while fetching %s: expected %d but got %d", url, sizeExpected, sizeActual);
+
+    auto sha256Actual = hashString(HashAlgorithm::SHA256, sink.s).to_string(HashFormat::Base16, false);
+    if (sha256Actual != sha256Expected)
+        throw Error(
+            "hash mismatch while fetching %s: expected sha256:%s but got sha256:%s", url, sha256Expected, sha256Actual);
+}
+
+/**
+ * Run `ssh <authority> git-lfs-authenticate <path> download` and return the
+ * `header.Authorization` string from its JSON output.
+ *
+ * Throws `Error` if the command prints nothing or the field is missing.
+ */
+static std::string getLfsApiToken(const ParsedURL & url)
+{
+    auto [status, output] = runProgram(RunOptions{
+        .program = "ssh",
+        .args = {*url.authority, "git-lfs-authenticate", url.path, "download"},
+    });
+
+    if (output.empty())
+        throw Error(
+            "git-lfs-authenticate: no output (cmd: ssh %s git-lfs-authenticate %s download)",
+            url.authority.value_or(""),
+            url.path);
+
+    auto queryResp = nlohmann::json::parse(output);
+    if (!queryResp.contains("header"))
+        throw Error("no header in git-lfs-authenticate response");
+    if (!queryResp["header"].contains("Authorization"))
+        throw Error("no Authorization in git-lfs-authenticate response");
+
+    return queryResp["header"]["Authorization"].get<std::string>();
+}
+
+typedef std::unique_ptr<git_config, Deleter<git_config_free>> GitConfig;
+typedef std::unique_ptr<git_config_entry, Deleter<git_config_entry_free>> GitConfigEntry;
+typedef std::unique_ptr<git_remote, Deleter<git_remote_free>> GitRemote;
+
+/**
+ * Return the URL from which the LFS batch endpoint is derived: the
+ * `lfs.url` config value if it is set and non-empty, otherwise the URL of
+ * the `origin` remote, otherwise "".
+ */
+static std::string getLfsEndpointUrl(git_repository * repo)
+{
+    GitConfig config;
+    // libgit2 returns 0 on success, so only read the config when the call did not fail.
+    if (!git_repository_config(Setter(config), repo)) {
+        GitConfigEntry entry;
+        if (!git_config_get_entry(Setter(entry), config.get(), "lfs.url")) {
+            auto value = std::string(entry->value);
+            // Fetch::fetchUrls appends "/info/lfs/objects/batch", so return the
+            // part of the configured endpoint that precedes "/info/lfs".
+            // NOTE(review): an lfs.url not ending in "/info/lfs" cannot be
+            // represented this way - confirm whether that case matters.
+            while (value.ends_with('/'))
+                value.pop_back();
+            if (value.ends_with("/info/lfs"))
+                value.erase(value.size() - std::string_view("/info/lfs").size());
+            if (!value.empty()) {
+                debug("Found explicit lfs.url value: %s", value);
+                return value;
+            }
+        }
+    }
+
+    // Held in a smart pointer so the remote is freed on every return path.
+    GitRemote remote;
+    if (git_remote_lookup(Setter(remote), repo, "origin"))
+        return "";
+
+    const char * url_c_str = git_remote_url(remote.get());
+    if (!url_c_str)
+        return "";
+
+    return std::string(url_c_str);
+}
+
+/**
+ * Parse the text of a git-lfs pointer file.
+ *
+ * Returns `std::nullopt` unless `content` starts with the v1 `version`
+ * line and carries a 64-hex-digit `oid sha256:` and an all-digit `size`
+ * that `std::stoul` can represent. `filename` is not used.
+ *
+ * @see https://github.com/git-lfs/git-lfs/blob/2ef4108/docs/spec.md
+ */
+static std::optional<Pointer> parseLfsPointer(std::string_view content, std::string_view filename)
+{
+    // example git-lfs pointer file:
+    // version https://git-lfs.github.com/spec/v1
+    // oid sha256:f5e02aa71e67f41d79023a128ca35bad86cf7b6656967bfe0884b3a3c4325eaf
+    // size 10000000
+    // (ending \n)
+
+    if (!content.starts_with("version ")) {
+        // Invalid pointer file
+        return std::nullopt;
+    }
+
+    if (!content.starts_with("version https://git-lfs.github.com/spec/v1")) {
+        // In case there's new spec versions in the future, but for now only v1 exists
+        debug("Invalid version found on potential lfs pointer file, skipping");
+        return std::nullopt;
+    }
+
+    std::string oid;
+    std::string size;
+
+    for (auto & line : tokenizeString<Strings>(content, "\n")) {
+        if (line.starts_with("version ")) {
+            continue;
+        }
+        if (line.starts_with("oid sha256:")) {
+            oid = line.substr(11); // skip "oid sha256:"
+            continue;
+        }
+        if (line.starts_with("size ")) {
+            size = line.substr(5); // skip "size "
+            continue;
+        }
+
+        debug("Custom extension '%s' found, ignoring", line);
+    }
+
+    // <cctype> classifiers require a value representable as unsigned char;
+    // passing a plain (possibly negative) char is undefined behaviour.
+    const auto isHexDigit = [](unsigned char c) { return std::isxdigit(c) != 0; };
+    const auto isDecDigit = [](unsigned char c) { return std::isdigit(c) != 0; };
+
+    if (oid.length() != 64 || !std::all_of(oid.begin(), oid.end(), isHexDigit)) {
+        debug("Invalid sha256 %s, skipping", oid);
+        return std::nullopt;
+    }
+
+    if (size.length() == 0 || !std::all_of(size.begin(), size.end(), isDecDigit)) {
+        debug("Invalid size %s, skipping", size);
+        return std::nullopt;
+    }
+
+    try {
+        return std::make_optional(Pointer{oid, std::stoul(size)});
+    } catch (const std::out_of_range &) {
+        // Only digits, but too many of them for std::stoul.
+        debug("Invalid size %s, skipping", size);
+        return std::nullopt;
+    }
+}
+
+/**
+ * Store `repo` and `rev` and derive `url` from getLfsEndpointUrl(repo).
+ */
+Fetch::Fetch(git_repository * repo, git_oid rev)
+{
+    this->repo = repo;
+    this->rev = rev;
+
+    const auto remoteUrl = lfs::getLfsEndpointUrl(repo);
+
+    this->url = nix::parseURL(nix::fixGitURL(remoteUrl)).canonicalise();
+}
+
+/**
+ * Whether the `filter` attribute of `path`, looked up at commit `rev`, is
+ * `lfs`. Throws `Error` if libgit2 cannot evaluate the attribute.
+ */
+bool Fetch::shouldFetch(const CanonPath & path) const
+{
+    const char * attr = nullptr;
+    git_attr_options opts = GIT_ATTR_OPTIONS_INIT;
+    opts.attr_commit_id = this->rev;
+    opts.flags = GIT_ATTR_CHECK_INCLUDE_COMMIT | GIT_ATTR_CHECK_NO_SYSTEM;
+    // The member is declared pointer-to-const, but libgit2 takes a mutable pointer.
+    if (git_attr_get_ext(&attr, const_cast<git_repository *>(this->repo), &opts, path.rel_c_str(), "filter"))
+        throw Error("cannot get git-lfs attribute: %s", git_error_last()->message);
+    debug("Git filter for '%s' is '%s'", path, attr ? attr : "null");
+    return attr != nullptr && std::string_view(attr) == "lfs";
+}
+
+/** Build the `objects` array of a batch request: one `{oid, size}` per pointer. */
+static nlohmann::json pointerToPayload(const std::vector<Pointer> & items)
+{
+    nlohmann::json jArray = nlohmann::json::array();
+    for (const auto & pointer : items)
+        jArray.push_back({{"oid", pointer.oid}, {"size", pointer.size}});
+    return jArray;
+}
+
+/**
+ * POST a `download` batch request for `pointers` to
+ * `<url>/info/lfs/objects/batch` (with scheme `https` when `url` is `ssh`)
+ * and return the elements of the response's `objects` array.
+ *
+ * Throws `Error` if the response is not valid JSON or has no `objects`.
+ */
+std::vector<nlohmann::json> Fetch::fetchUrls(const std::vector<Pointer> & pointers) const
+{
+    ParsedURL httpUrl(url);
+    httpUrl.scheme = url.scheme == "ssh" ? "https" : url.scheme;
+    FileTransferRequest request(httpUrl.to_string() + "/info/lfs/objects/batch");
+    request.post = true;
+    Headers headers;
+    if (this->url.scheme == "ssh")
+        headers.push_back({"Authorization", lfs::getLfsApiToken(this->url)});
+    headers.push_back({"Content-Type", "application/vnd.git-lfs+json"});
+    headers.push_back({"Accept", "application/vnd.git-lfs+json"});
+    request.headers = headers;
+    nlohmann::json oidList = pointerToPayload(pointers);
+    nlohmann::json data = {{"operation", "download"}};
+    data["objects"] = oidList;
+    request.data = data.dump();
+
+    FileTransferResult result = getFileTransfer()->upload(request);
+    auto responseString = result.data;
+
+    std::vector<nlohmann::json> objects;
+    // example resp here (credentials elided):
+    // {"objects":[{"oid":"f5e02aa71e67f41d79023a128ca35bad86cf7b6656967bfe0884b3a3c4325eaf","size":10000000,
+    //   "actions":{"download":{"href":"https://gitlab.com/b-camacho/test-lfs.git/gitlab-lfs/objects/f5e0...",
+    //   "header":{"Authorization":"Basic <base64 credentials>"}}},"authenticated":true}]}
+
+    try {
+        auto resp = nlohmann::json::parse(responseString);
+        if (resp.contains("objects"))
+            objects.insert(objects.end(), resp["objects"].begin(), resp["objects"].end());
+        else
+            throw Error("response does not contain 'objects'");
+
+        return objects;
+    } catch (const nlohmann::json::parse_error & e) {
+        printMsg(lvlTalkative, "Full response: '%1%'", responseString);
+        throw Error("response did not parse as json: %s", e.what());
+    }
+}
+
+/**
+ * Write the object that the git-lfs pointer `content` (stored at
+ * `pointerFilePath`) refers to into `sink`.
+ *
+ * Content of 1024 bytes or more, or content that does not parse as a
+ * pointer, is passed through unchanged after a warning. Otherwise the object
+ * is read from `getCacheDir()/git-lfs` if cached, else downloaded, checked
+ * and cached. `sizeCallback` receives the byte count before data is written,
+ * except on a cache hit.
+ *
+ * Throws `Error` if the server returns no object, a different oid, or an
+ * object description that lacks the expected fields.
+ */
+void Fetch::fetch(
+    const std::string & content,
+    const CanonPath & pointerFilePath,
+    StringSink & sink,
+    std::function<void(uint64_t)> sizeCallback) const
+{
+    debug("trying to fetch '%s' using git-lfs", pointerFilePath);
+
+    if (content.length() >= 1024) {
+        warn("encountered file '%s' that should have been a git-lfs pointer, but is too large", pointerFilePath);
+        sizeCallback(content.length());
+        sink(content);
+        return;
+    }
+
+    const auto pointer = parseLfsPointer(content, pointerFilePath.rel());
+    if (pointer == std::nullopt) {
+        warn("encountered file '%s' that should have been a git-lfs pointer, but is invalid", pointerFilePath);
+        sizeCallback(content.length());
+        sink(content);
+        return;
+    }
+
+    Path cacheDir = getCacheDir() + "/git-lfs";
+    std::string key = hashString(HashAlgorithm::SHA256, pointerFilePath.rel()).to_string(HashFormat::Base16, false)
+                      + "/" + pointer->oid;
+    Path cachePath = cacheDir + "/" + key;
+    if (pathExists(cachePath)) {
+        debug("using cache entry %s -> %s", key, cachePath);
+        sink(readFile(cachePath));
+        return;
+    }
+    debug("did not find cache entry for %s", key);
+
+    std::vector<Pointer> pointers;
+    pointers.push_back(pointer.value());
+    const auto objUrls = fetchUrls(pointers);
+
+    // An empty `objects` array would make objUrls[0] undefined behaviour.
+    if (objUrls.empty())
+        throw Error("git-lfs server returned no object for '%s' (oid %s)", pointerFilePath, pointer->oid);
+
+    const auto & obj = objUrls[0];
+    try {
+        std::string sha256 = obj.at("oid"); // oid is also the sha256
+        // Check against the oid recorded in the pointer, so that the hash
+        // verified below is the one the repository asked for.
+        if (sha256 != pointer->oid)
+            throw Error("git-lfs server returned oid %s but %s was requested", sha256, pointer->oid);
+        std::string ourl = obj.at("actions").at("download").at("href");
+        std::string authHeader = "";
+        if (obj.at("actions").at("download").contains("header")
+            && obj.at("actions").at("download").at("header").contains("Authorization")) {
+            authHeader = obj["actions"]["download"]["header"]["Authorization"];
+        }
+        const uint64_t size = obj.at("size");
+        sizeCallback(size);
+        downloadToSink(ourl, authHeader, sink, sha256, size);
+
+        debug("creating cache entry %s -> %s", key, cachePath);
+        if (!pathExists(dirOf(cachePath)))
+            createDirs(dirOf(cachePath));
+        writeFile(cachePath, sink.s);
+
+        debug("%s fetched with git-lfs", pointerFilePath);
+    } catch (const nlohmann::json::exception & e) {
+        throw Error("bad json from /info/lfs/objects/batch: %s %s", obj, e.what());
+    }
+}
+
+} // namespace 
nix::lfs
diff --git a/src/libfetchers/git-lfs-fetch.hh b/src/libfetchers/git-lfs-fetch.hh
new file mode 100644
index 000000000..36df91962
--- /dev/null
+++ b/src/libfetchers/git-lfs-fetch.hh
@@ -0,0 +1,76 @@
+#pragma once
+
+#include "canon-path.hh"
+#include "serialise.hh"
+#include "url.hh"
+
+#include <git2/repository.h>
+
+#include <nlohmann/json_fwd.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <string>
+#include <vector>
+
+namespace nix::lfs {
+
+/**
+ * git-lfs pointer
+ * @see https://github.com/git-lfs/git-lfs/blob/2ef4108/docs/spec.md
+ */
+struct Pointer
+{
+    std::string oid; // git-lfs managed object id. you give this to the lfs server
+                     // for downloads
+    size_t size; // in bytes
+};
+
+/**
+ * Resolves git-lfs pointer files of one repository at one commit.
+ */
+struct Fetch
+{
+    // Reference to the repository
+    const git_repository * repo;
+
+    // Git commit being fetched
+    git_oid rev;
+
+    // derived from git remote url
+    nix::ParsedURL url;
+
+    /**
+     * Store `repo` and `rev`, and derive `url` from the repository.
+     */
+    Fetch(git_repository * repo, git_oid rev);
+
+    /**
+     * Whether the `filter` git attribute of `path`, evaluated at `rev`,
+     * is `lfs`.
+     */
+    bool shouldFetch(const CanonPath & path) const;
+
+    /**
+     * Write the object that the pointer file `content` (found at
+     * `pointerFilePath`) refers to into `sink`. Content that is too large
+     * to be a pointer, or does not parse as one, is written unchanged.
+     * `sizeCallback` is given the number of bytes about to be written,
+     * except when the object is served from the local cache.
+     */
+    void fetch(
+        const std::string & content,
+        const CanonPath & pointerFilePath,
+        StringSink & sink,
+        std::function<void(uint64_t)> sizeCallback) const;
+
+    /**
+     * Ask the LFS batch API about `pointers` and return the elements of
+     * the `objects` array of its response.
+     */
+    std::vector<nlohmann::json> fetchUrls(const std::vector<Pointer> & pointers) const;
+};
+
+} // namespace nix::lfs
diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index a6b13fb31..a2761a543 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -1,4 +1,5 @@ #include "git-utils.hh" +#include "git-lfs-fetch.hh" #include "cache.hh" #include "finally.hh" #include "processes.hh" @@ -60,14 +61,6 @@ namespace nix { struct GitSourceAccessor; -// Some wrapper types that ensure that the git_*_free functions get called. -template -struct Deleter -{ - template - void operator()(T * p) const { del(p); }; -}; - typedef std::unique_ptr> Repository; typedef std::unique_ptr> TreeEntry; typedef std::unique_ptr> Tree; @@ -85,20 +78,6 @@ typedef std::unique_ptr> ObjectDb; typedef std::unique_ptr> PackBuilder; typedef std::unique_ptr> Indexer; -// A helper to ensure that we don't leak objects returned by libgit2. 
-template -struct Setter -{ - T & t; - typename T::pointer p = nullptr; - - Setter(T & t) : t(t) { } - - ~Setter() { if (p) t = T(p); } - - operator typename T::pointer * () { return &p; } -}; - Hash toHash(const git_oid & oid) { #ifdef GIT_EXPERIMENTAL_SHA256 @@ -506,12 +485,15 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this /** * A 'GitSourceAccessor' with no regard for export-ignore or any other transformations. */ - ref getRawAccessor(const Hash & rev); + ref getRawAccessor( + const Hash & rev, + bool smudgeLfs = false); ref getAccessor( const Hash & rev, bool exportIgnore, - std::string displayPrefix) override; + std::string displayPrefix, + bool smudgeLfs = false) override; ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError e) override; @@ -670,24 +652,40 @@ ref GitRepo::openRepo(const std::filesystem::path & path, bool create, /** * Raw git tree input accessor. */ + struct GitSourceAccessor : SourceAccessor { ref repo; Object root; + std::optional lfsFetch = std::nullopt; - GitSourceAccessor(ref repo_, const Hash & rev) + GitSourceAccessor(ref repo_, const Hash & rev, bool smudgeLfs) : repo(repo_) , root(peelToTreeOrBlob(lookupObject(*repo, hashToOID(rev)).get())) { + if (smudgeLfs) + lfsFetch = std::make_optional(lfs::Fetch(*repo, hashToOID(rev))); } std::string readBlob(const CanonPath & path, bool symlink) { - auto blob = getBlob(path, symlink); + const auto blob = getBlob(path, symlink); - auto data = std::string_view((const char *) git_blob_rawcontent(blob.get()), git_blob_rawsize(blob.get())); + if (lfsFetch) { + if (lfsFetch->shouldFetch(path)) { + StringSink s; + try { + auto contents = std::string((const char *) git_blob_rawcontent(blob.get()), git_blob_rawsize(blob.get())); + lfsFetch->fetch(contents, path, s, [&s](uint64_t size){ s.s.reserve(size); }); + } catch (Error & e) { + e.addTrace({}, "while smudging git-lfs file '%s'", path); + throw; + } + return s.s; + } + } - return std::string(data); + return 
std::string((const char *) git_blob_rawcontent(blob.get()), git_blob_rawsize(blob.get())); } std::string readFile(const CanonPath & path) override @@ -1191,19 +1189,22 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink } }; -ref GitRepoImpl::getRawAccessor(const Hash & rev) +ref GitRepoImpl::getRawAccessor( + const Hash & rev, + bool smudgeLfs) { auto self = ref(shared_from_this()); - return make_ref(self, rev); + return make_ref(self, rev, smudgeLfs); } ref GitRepoImpl::getAccessor( const Hash & rev, bool exportIgnore, - std::string displayPrefix) + std::string displayPrefix, + bool smudgeLfs) { auto self = ref(shared_from_this()); - ref rawGitAccessor = getRawAccessor(rev); + ref rawGitAccessor = getRawAccessor(rev, smudgeLfs); rawGitAccessor->setPathDisplay(std::move(displayPrefix)); if (exportIgnore) return make_ref(self, rawGitAccessor, rev); diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index 9677f5079..c683bd058 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -89,7 +89,8 @@ struct GitRepo virtual ref getAccessor( const Hash & rev, bool exportIgnore, - std::string displayPrefix) = 0; + std::string displayPrefix, + bool smudgeLfs = false) = 0; virtual ref getAccessor(const WorkdirInfo & wd, bool exportIgnore, MakeNotAllowedError makeNotAllowedError) = 0; @@ -126,4 +127,26 @@ struct GitRepo ref getTarballCache(); +// A helper to ensure that the `git_*_free` functions get called. +template +struct Deleter +{ + template + void operator()(T * p) const { del(p); }; +}; + +// A helper to ensure that we don't leak objects returned by libgit2. 
+template +struct Setter +{ + T & t; + typename T::pointer p = nullptr; + + Setter(T & t) : t(t) { } + + ~Setter() { if (p) t = T(p); } + + operator typename T::pointer * () { return &p; } +}; + } diff --git a/src/libfetchers/git.cc b/src/libfetchers/git.cc index 0d423a7a3..f7c4e6d5b 100644 --- a/src/libfetchers/git.cc +++ b/src/libfetchers/git.cc @@ -185,7 +185,7 @@ struct GitInputScheme : InputScheme for (auto & [name, value] : url.query) { if (name == "rev" || name == "ref" || name == "keytype" || name == "publicKey" || name == "publicKeys") attrs.emplace(name, value); - else if (name == "shallow" || name == "submodules" || name == "exportIgnore" || name == "allRefs" || name == "verifyCommit") + else if (name == "shallow" || name == "submodules" || name == "lfs" || name == "exportIgnore" || name == "allRefs" || name == "verifyCommit") attrs.emplace(name, Explicit { value == "1" }); else url2.query.emplace(name, value); @@ -210,6 +210,7 @@ struct GitInputScheme : InputScheme "rev", "shallow", "submodules", + "lfs", "exportIgnore", "lastModified", "revCount", @@ -262,6 +263,8 @@ struct GitInputScheme : InputScheme if (auto ref = input.getRef()) url.query.insert_or_assign("ref", *ref); if (getShallowAttr(input)) url.query.insert_or_assign("shallow", "1"); + if (getLfsAttr(input)) + url.query.insert_or_assign("lfs", "1"); if (getSubmodulesAttr(input)) url.query.insert_or_assign("submodules", "1"); if (maybeGetBoolAttr(input.attrs, "exportIgnore").value_or(false)) @@ -411,6 +414,11 @@ struct GitInputScheme : InputScheme return maybeGetBoolAttr(input.attrs, "submodules").value_or(false); } + bool getLfsAttr(const Input & input) const + { + return maybeGetBoolAttr(input.attrs, "lfs").value_or(false); + } + bool getExportIgnoreAttr(const Input & input) const { return maybeGetBoolAttr(input.attrs, "exportIgnore").value_or(false); @@ -678,7 +686,8 @@ struct GitInputScheme : InputScheme verifyCommit(input, repo); bool exportIgnore = getExportIgnoreAttr(input); - auto 
accessor = repo->getAccessor(rev, exportIgnore, "«" + input.to_string() + "»"); + bool smudgeLfs = getLfsAttr(input); + auto accessor = repo->getAccessor(rev, exportIgnore, "«" + input.to_string() + "»", smudgeLfs); /* If the repo has submodules, fetch them and return a mounted input accessor consisting of the accessor for the top-level @@ -698,6 +707,7 @@ struct GitInputScheme : InputScheme attrs.insert_or_assign("rev", submoduleRev.gitRev()); attrs.insert_or_assign("exportIgnore", Explicit{ exportIgnore }); attrs.insert_or_assign("submodules", Explicit{ true }); + attrs.insert_or_assign("lfs", Explicit{ smudgeLfs }); attrs.insert_or_assign("allRefs", Explicit{ true }); auto submoduleInput = fetchers::Input::fromAttrs(*input.settings, std::move(attrs)); auto [submoduleAccessor, submoduleInput2] = @@ -838,7 +848,7 @@ struct GitInputScheme : InputScheme { auto makeFingerprint = [&](const Hash & rev) { - return rev.gitRev() + (getSubmodulesAttr(input) ? ";s" : "") + (getExportIgnoreAttr(input) ? ";e" : ""); + return rev.gitRev() + (getSubmodulesAttr(input) ? ";s" : "") + (getExportIgnoreAttr(input) ? ";e" : "") + (getLfsAttr(input) ? 
";l" : ""); }; if (auto rev = input.getRev()) diff --git a/src/libfetchers/meson.build b/src/libfetchers/meson.build index 58afbb7d0..ac69ab8dc 100644 --- a/src/libfetchers/meson.build +++ b/src/libfetchers/meson.build @@ -48,6 +48,7 @@ sources = files( 'fetch-to-store.cc', 'fetchers.cc', 'filtering-source-accessor.cc', + 'git-lfs-fetch.cc', 'git-utils.cc', 'git.cc', 'github.cc', @@ -69,6 +70,7 @@ headers = files( 'fetch-to-store.hh', 'fetchers.hh', 'filtering-source-accessor.hh', + 'git-lfs-fetch.hh', 'git-utils.hh', 'mounted-source-accessor.hh', 'registry.hh', diff --git a/src/libstore/filetransfer.cc b/src/libstore/filetransfer.cc index 932e1d756..f2430631d 100644 --- a/src/libstore/filetransfer.cc +++ b/src/libstore/filetransfer.cc @@ -94,7 +94,7 @@ struct curlFileTransfer : public FileTransfer : fileTransfer(fileTransfer) , request(request) , act(*logger, lvlTalkative, actFileTransfer, - fmt(request.data ? "uploading '%s'" : "downloading '%s'", request.uri), + request.post ? "" : fmt(request.data ? 
"uploading '%s'" : "downloading '%s'", request.uri), {request.uri}, request.parentAct) , callback(std::move(callback)) , finalSink([this](std::string_view data) { @@ -271,11 +271,21 @@ struct curlFileTransfer : public FileTransfer return getInterrupted(); } + int silentProgressCallback(double dltotal, double dlnow) + { + return getInterrupted(); + } + static int progressCallbackWrapper(void * userp, double dltotal, double dlnow, double ultotal, double ulnow) { return ((TransferItem *) userp)->progressCallback(dltotal, dlnow); } + static int silentProgressCallbackWrapper(void * userp, double dltotal, double dlnow, double ultotal, double ulnow) + { + return ((TransferItem *) userp)->silentProgressCallback(dltotal, dlnow); + } + static int debugCallback(CURL * handle, curl_infotype type, char * data, size_t size, void * userptr) { if (type == CURLINFO_TEXT) @@ -340,8 +350,11 @@ struct curlFileTransfer : public FileTransfer curl_easy_setopt(req, CURLOPT_HEADERFUNCTION, TransferItem::headerCallbackWrapper); curl_easy_setopt(req, CURLOPT_HEADERDATA, this); - curl_easy_setopt(req, CURLOPT_PROGRESSFUNCTION, progressCallbackWrapper); - curl_easy_setopt(req, CURLOPT_PROGRESSDATA, this); + if (request.post) + curl_easy_setopt(req, CURLOPT_XFERINFOFUNCTION, silentProgressCallbackWrapper); + else + curl_easy_setopt(req, CURLOPT_XFERINFOFUNCTION, progressCallbackWrapper); + curl_easy_setopt(req, CURLOPT_XFERINFODATA, this); curl_easy_setopt(req, CURLOPT_NOPROGRESS, 0); curl_easy_setopt(req, CURLOPT_HTTPHEADER, requestHeaders); @@ -353,7 +366,10 @@ struct curlFileTransfer : public FileTransfer curl_easy_setopt(req, CURLOPT_NOBODY, 1); if (request.data) { - curl_easy_setopt(req, CURLOPT_UPLOAD, 1L); + if (request.post) + curl_easy_setopt(req, CURLOPT_POST, 1L); + else + curl_easy_setopt(req, CURLOPT_UPLOAD, 1L); curl_easy_setopt(req, CURLOPT_READFUNCTION, readCallbackWrapper); curl_easy_setopt(req, CURLOPT_READDATA, this); curl_easy_setopt(req, CURLOPT_INFILESIZE_LARGE, 
(curl_off_t) request.data->length()); @@ -430,7 +446,8 @@ struct curlFileTransfer : public FileTransfer if (httpStatus == 304 && result.etag == "") result.etag = request.expectedETag; - act.progress(result.bodySize, result.bodySize); + if (!request.post) + act.progress(result.bodySize, result.bodySize); done = true; callback(std::move(result)); } diff --git a/src/libstore/filetransfer.hh b/src/libstore/filetransfer.hh index 43a384d71..0ecc7f376 100644 --- a/src/libstore/filetransfer.hh +++ b/src/libstore/filetransfer.hh @@ -65,6 +65,7 @@ struct FileTransferRequest std::string expectedETag; bool verifyTLS = true; bool head = false; + bool post = false; size_t tries = fileTransferSettings.tries; unsigned int baseRetryTimeMs = 250; ActivityId parentAct; diff --git a/tests/nixos/fetch-git/test-cases/lfs/default.nix b/tests/nixos/fetch-git/test-cases/lfs/default.nix new file mode 100644 index 000000000..a6b4fc77a --- /dev/null +++ b/tests/nixos/fetch-git/test-cases/lfs/default.nix @@ -0,0 +1,197 @@ +{ + # mostly copied from https://github.com/NixOS/nix/blob/358c26fd13a902d9a4032a00e6683571be07a384/tests/nixos/fetch-git/test-cases/fetchTree-shallow/default.nix#L1 + # ty @DavHau + description = "fetchGit smudges LFS pointers if lfs=true"; + script = '' + from tempfile import TemporaryDirectory + + expected_max_size_lfs_pointer = 1024 # 1 KiB (values >= than this cannot be pointers, and test files are 1 MiB) + + # purge nix git cache to make sure we start with a clean slate + client.succeed("rm -rf ~/.cache/nix") + + + with subtest("Request lfs fetch without any .gitattributes file"): + client.succeed(f"dd if=/dev/urandom of={repo.path}/regular bs=1M count=1 >&2") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'no .gitattributes' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + no_gitattributes_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # fetch with lfs=true, and check that the 
lack of .gitattributes does not break anything + fetchGit_no_gitattributes_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{no_gitattributes_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetched_no_gitattributes = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_no_gitattributes_expr}).outPath' + """) + client.succeed(f"cmp {repo.path}/regular {fetched_no_gitattributes}/regular >&2") + + + with subtest("Add a file that should be tracked by lfs, but isn't"): + # (git lfs cli only throws a warning "Encountered 1 file that should have + # been a pointer, but wasn't") + + client.succeed(f"dd if=/dev/urandom of={repo.path}/black_sheep bs=1M count=1 >&2") + client.succeed(f"echo 'black_sheep filter=lfs -text' >>{repo.path}/.gitattributes") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'add misleading file' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + bad_lfs_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # test assumption that it can be cloned with regular git first + # (here we see the warning as stated above) + with TemporaryDirectory() as tempdir: + client.succeed(f"git clone -n {repo.remote} {tempdir} >&2") + client.succeed(f"git -C {tempdir} lfs install >&2") + client.succeed(f"git -C {tempdir} checkout {bad_lfs_rev} >&2") + + # check that the file is not a pointer, as expected + file_size_git = client.succeed(f"stat -c %s {tempdir}/black_sheep").strip() + assert int(file_size_git) == 1024 * 1024, \ + f"non lfs file is {file_size_git}b (!= 1MiB), probably a test implementation error" + + lfs_files = client.succeed(f"git -C {tempdir} lfs ls-files").strip() + assert lfs_files == "", "non lfs file is tracked by lfs, probably a test implementation error" + + client.succeed(f"cmp {repo.path}/black_sheep {tempdir}/black_sheep >&2") + + # now fetch without lfs, check that the file is not a pointer + 
fetchGit_bad_lfs_without_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{bad_lfs_rev}"; + ref = "main"; + lfs = false; + }} + """ + fetched_bad_lfs_without_lfs = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_bad_lfs_without_lfs_expr}).outPath' + """) + + # check that file was not somehow turned into a pointer + file_size_bad_lfs_without_lfs = client.succeed(f"stat -c %s {fetched_bad_lfs_without_lfs}/black_sheep").strip() + + assert int(file_size_bad_lfs_without_lfs) == 1024 * 1024, \ + f"non lfs-enrolled file is {file_size_bad_lfs_without_lfs}b (!= 1MiB), probably a test implementation error" + client.succeed(f"cmp {repo.path}/black_sheep {fetched_bad_lfs_without_lfs}/black_sheep >&2") + + # finally fetch with lfs=true, and check that the bad file does not break anything + fetchGit_bad_lfs_with_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{bad_lfs_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetchGit_bad_lfs_with_lfs = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_bad_lfs_with_lfs_expr}).outPath' + """) + + client.succeed(f"cmp {repo.path}/black_sheep {fetchGit_bad_lfs_with_lfs}/black_sheep >&2") + + + with subtest("Add an lfs-enrolled file to the repo"): + client.succeed(f"dd if=/dev/urandom of={repo.path}/beeg bs=1M count=1 >&2") + client.succeed(f"{repo.git} lfs install >&2") + client.succeed(f"{repo.git} lfs track --filename beeg >&2") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'add lfs file' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + lfs_file_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # first fetch without lfs, check that we did not smudge the file + fetchGit_nolfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{lfs_file_rev}"; + ref = "main"; + lfs = false; + }} + """ + fetched_nolfs = client.succeed(f""" + nix eval --debug 
--impure --raw --expr '({fetchGit_nolfs_expr}).outPath' + """) + + # check that file was not smudged + file_size_nolfs = client.succeed(f"stat -c %s {fetched_nolfs}/beeg").strip() + + assert int(file_size_nolfs) < expected_max_size_lfs_pointer, \ + f"did not set lfs=true, yet lfs-enrolled file is {file_size_nolfs}b (>= 1KiB), probably smudged when we should not have" + + # now fetch with lfs=true and check that the file was smudged + fetchGit_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{lfs_file_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetched_lfs = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_lfs_expr}).outPath' + """) + + assert fetched_lfs != fetched_nolfs, \ + f"fetching with and without lfs yielded the same store path {fetched_lfs}, fingerprinting error?" + + # check that file was smudged + file_size_lfs = client.succeed(f"stat -c %s {fetched_lfs}/beeg").strip() + assert int(file_size_lfs) == 1024 * 1024, \ + f"set lfs=true, yet lfs-enrolled file is {file_size_lfs}b (!= 1MiB), probably did not smudge when we should have" + + + with subtest("Check that default is lfs=false"): + fetchGit_default_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{lfs_file_rev}"; + ref = "main"; + }} + """ + fetched_default = client.succeed(f""" + nix eval --debug --impure --raw --expr '({fetchGit_default_expr}).outPath' + """) + + # check that file was not smudged + file_size_default = client.succeed(f"stat -c %s {fetched_default}/beeg").strip() + + assert int(file_size_default) < expected_max_size_lfs_pointer, \ + f"did not set lfs, yet lfs-enrolled file is {file_size_default}b (>= 1KiB), probably bad default value" + + with subtest("Use as flake input"): + # May seem redundant, but this has minor differences compared to raw + # fetchGit which caused failures before + with TemporaryDirectory() as tempdir: + client.succeed(f"mkdir -p {tempdir}") + client.succeed(f""" + printf '{{ + inputs = {{ + 
foo = {{ + url = "git+{repo.remote}?ref=main&rev={lfs_file_rev}&lfs=1"; + flake = false; + }}; + }}; + outputs = {{ foo, self }}: {{ inherit (foo) outPath; }}; + }}' >{tempdir}/flake.nix + """) + fetched_flake = client.succeed(f""" + nix eval --debug --raw {tempdir}#.outPath + """) + + assert fetched_lfs == fetched_flake, \ + f"fetching as flake input (store path {fetched_flake}) yielded a different result than using fetchGit (store path {fetched_lfs})" + ''; +} diff --git a/tests/nixos/fetch-git/testsupport/gitea.nix b/tests/nixos/fetch-git/testsupport/gitea.nix index 9409acff7..e63182639 100644 --- a/tests/nixos/fetch-git/testsupport/gitea.nix +++ b/tests/nixos/fetch-git/testsupport/gitea.nix @@ -29,9 +29,16 @@ in { pkgs, ... }: { services.gitea.enable = true; - services.gitea.settings.service.DISABLE_REGISTRATION = true; - services.gitea.settings.log.LEVEL = "Info"; - services.gitea.settings.database.LOG_SQL = false; + services.gitea.lfs.enable = true; + services.gitea.settings = { + service.DISABLE_REGISTRATION = true; + server = { + DOMAIN = "gitea"; + HTTP_PORT = 3000; + }; + log.LEVEL = "Info"; + database.LOG_SQL = false; + }; services.openssh.enable = true; networking.firewall.allowedTCPPorts = [ 3000 ]; environment.systemPackages = [ @@ -54,7 +61,10 @@ in client = { pkgs, ... }: { - environment.systemPackages = [ pkgs.git ]; + environment.systemPackages = [ + pkgs.git + pkgs.git-lfs + ]; }; }; defaults =