libfetchers: Restore path separator ignoring behavior for indirect and git-archive flakerefs

Old versions of nix happily accepted a lot of weird flake references, which we didn't have tests for, so this was accidentally broken in c436b7a32a. This patch restores previous behavior and adds a plethora of tests to ensure we don't break this in the future. These test cases are aligned with how 2.18/2.28 parsed flake references.
2025-11-11 13:06:01 +01:00 · 2025-08-30 14:40:56 +03:00 · 2025-08-30 14:40:56 +03:00 · a38ebdd511
commit a38ebdd511
parent 511d885d60
5 changed files with 184 additions and 2 deletions
--- a/src/libfetchers/github.cc
+++ b/src/libfetchers/github.cc
@ -38,7 +38,8 @@ struct GitArchiveInputScheme : InputScheme
        if (url.scheme != schemeName())
            return {};

-        const auto & path = url.path;
+        /* This ignores empty path segments for back-compat. Older versions used a tokenizeString here. */
+        auto path = url.pathSegments(/*skipEmpty=*/true) | std::ranges::to<std::vector<std::string>>();

        std::optional<Hash> rev;
        std::optional<std::string> ref;
--- a/src/libfetchers/indirect.cc
+++ b/src/libfetchers/indirect.cc
@ -14,7 +14,8 @@ struct IndirectInputScheme : InputScheme
        if (url.scheme != "flake")
            return {};

-        const auto & path = url.path;
+        /* This ignores empty path segments for back-compat. Older versions used a tokenizeString here. */
+        auto path = url.pathSegments(/*skipEmpty=*/true) | std::ranges::to<std::vector<std::string>>();

        std::optional<Hash> rev;
        std::optional<std::string> ref;
--- a/src/libflake-tests/flakeref.cc
+++ b/src/libflake-tests/flakeref.cc
@ -172,6 +172,74 @@ INSTANTIATE_TEST_SUITE_P(
                },
            .description = "flake_id_ref_branch_trailing_slash",
            .expectedUrl = "flake:nixpkgs/branch/2aae6c35c94fcfb415dbe95f408b9ce91ee846ed",
+        },
+        // The following tests are for back-compat with lax parsers in older versions
+        // that used `tokenizeString` for splitting path segments, which ignores empty
+        // strings.
+        InputFromURLTestCase{
+            .url = "nixpkgs/branch////",
+            .attrs =
+                {
+                    {"id", Attr("nixpkgs")},
+                    {"type", Attr("indirect")},
+                    {"ref", Attr("branch")},
+                },
+            .description = "flake_id_ref_branch_ignore_empty_trailing_segments",
+            .expectedUrl = "flake:nixpkgs/branch",
+        },
+        InputFromURLTestCase{
+            .url = "nixpkgs/branch///2aae6c35c94fcfb415dbe95f408b9ce91ee846ed///",
+            .attrs =
+                {
+                    {"id", Attr("nixpkgs")},
+                    {"type", Attr("indirect")},
+                    {"ref", Attr("branch")},
+                    {"rev", Attr("2aae6c35c94fcfb415dbe95f408b9ce91ee846ed")},
+                },
+            .description = "flake_id_ref_branch_ignore_empty_segments_ref_rev",
+            .expectedUrl = "flake:nixpkgs/branch/2aae6c35c94fcfb415dbe95f408b9ce91ee846ed",
+        },
+        InputFromURLTestCase{
+            // Note that this is different from above because the "flake id" shorthand
+            // doesn't allow this.
+            .url = "flake:/nixpkgs///branch////",
+            .attrs =
+                {
+                    {"id", Attr("nixpkgs")},
+                    {"type", Attr("indirect")},
+                    {"ref", Attr("branch")},
+                },
+            .description = "indirect_branch_empty_segments_everywhere",
+            .expectedUrl = "flake:nixpkgs/branch",
+        },
+        InputFromURLTestCase{
+            // TODO: Technically this has an empty authority, but it's ignored
+            // for now. Yes, this is what all versions going back to at least
+            // 2.18 did and yes, this should not be allowed.
+            .url = "github://////owner%42/////repo%41///branch%43////",
+            .attrs =
+                {
+                    {"type", Attr("github")},
+                    {"owner", Attr("ownerB")},
+                    {"repo", Attr("repoA")},
+                    {"ref", Attr("branchC")},
+                },
+            .description = "github_ref_slashes_in_path_everywhere",
+            .expectedUrl = "github:ownerB/repoA/branchC",
+        },
+        InputFromURLTestCase{
+            // FIXME: Subgroups in gitlab URLs are busted. This double-encoding
+            // behavior exists since 2.18. See issue #9161 and PR #8845.
+            .url = "gitlab:/owner%252Fsubgroup/////repo%41///branch%43////",
+            .attrs =
+                {
+                    {"type", Attr("gitlab")},
+                    {"owner", Attr("owner%2Fsubgroup")},
+                    {"repo", Attr("repoA")},
+                    {"ref", Attr("branchC")},
+                },
+            .description = "gitlab_ref_slashes_in_path_everywhere_with_pct_encoding",
+            .expectedUrl = "gitlab:owner%252Fsubgroup/repoA/branchC",
        }),
    [](const ::testing::TestParamInfo<InputFromURLTestCase> & info) { return info.param.description; });

--- a/src/libutil-tests/url.cc
+++ b/src/libutil-tests/url.cc
@ -3,6 +3,8 @@
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>

+#include <ranges>
+
 namespace nix {

 /* ----------- tests for url.hh --------------------------------------------------*/
@ -686,7 +688,102 @@ TEST(parseURL, gitlabNamespacedProjectUrls)
    ASSERT_EQ(s, parsed.to_string());
 }

+/* ----------------------------------------------------------------------------
+ * pathSegments
+ * --------------------------------------------------------------------------*/
+
+struct ParsedURLPathSegmentsTestCase
+{
+    std::string url;
+    std::vector<std::string> segments;
+    std::string path;
+    bool skipEmpty;
+    std::string description;
+};
+
+class ParsedURLPathSegmentsTest : public ::testing::TestWithParam<ParsedURLPathSegmentsTestCase>
+{};
+
+TEST_P(ParsedURLPathSegmentsTest, segmentsAreCorrect)
+{
+    const auto & testCase = GetParam();
+    auto segments = parseURL(testCase.url).pathSegments(/*skipEmpty=*/testCase.skipEmpty)
+                    | std::ranges::to<decltype(testCase.segments)>();
+    EXPECT_EQ(segments, testCase.segments);
+    EXPECT_EQ(encodeUrlPath(segments), testCase.path);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ParsedURL,
+    ParsedURLPathSegmentsTest,
+    ::testing::Values(
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme:",
+            .segments = {""},
+            .path = "",
+            .skipEmpty = false,
+            .description = "no_authority_empty_path",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme://",
+            .segments = {""},
+            .path = "",
+            .skipEmpty = false,
+            .description = "empty_authority_empty_path",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme:///",
+            .segments = {"", ""},
+            .path = "/",
+            .skipEmpty = false,
+            .description = "empty_authority_empty_path_trailing",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme://example.com/",
+            .segments = {"", ""},
+            .path = "/",
+            .skipEmpty = false,
+            .description = "non_empty_authority_empty_path",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme://example.com//",
+            .segments = {"", "", ""},
+            .path = "//",
+            .skipEmpty = false,
+            .description = "non_empty_authority_non_empty_path",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme://example.com///path///with//strange/empty///segments////",
+            .segments = {"path", "with", "strange", "empty", "segments"},
+            .path = "path/with/strange/empty/segments",
+            .skipEmpty = true,
+            .description = "skip_all_empty_segments_with_authority",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme://example.com///lots///empty///",
+            .segments = {"", "", "", "lots", "", "", "empty", "", "", ""},
+            .path = "///lots///empty///",
+            .skipEmpty = false,
+            .description = "empty_segments_with_authority",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme:/path///with//strange/empty///segments////",
+            .segments = {"path", "with", "strange", "empty", "segments"},
+            .path = "path/with/strange/empty/segments",
+            .skipEmpty = true,
+            .description = "skip_all_empty_segments_no_authority_starts_with_slash",
+        },
+        ParsedURLPathSegmentsTestCase{
+            .url = "scheme:path///with//strange/empty///segments////",
+            .segments = {"path", "with", "strange", "empty", "segments"},
+            .path = "path/with/strange/empty/segments",
+            .skipEmpty = true,
+            .description = "skip_all_empty_segments_no_authority_doesnt_start_with_slash",
+        }),
+    [](const auto & info) { return info.param.description; });
+
 TEST(nix, isValidSchemeName)
+
 {
    ASSERT_TRUE(isValidSchemeName("http"));
    ASSERT_TRUE(isValidSchemeName("https"));
--- a/src/libutil/include/nix/util/url.hh
+++ b/src/libutil/include/nix/util/url.hh
@ -1,6 +1,7 @@
 #pragma once
 ///@file

+#include <ranges>
 #include <span>

 #include "nix/util/error.hh"
@ -230,6 +231,20 @@ struct ParsedURL
     * Remove `.` and `..` path segments.
     */
    ParsedURL canonicalise();
+
+    /**
+     * Get a range of path segments (the substrings separated by '/' characters).
+     *
+     * @param skipEmpty Skip all empty path segments
+     */
+    auto pathSegments(bool skipEmpty) const &
+    {
+        return std::views::filter(path, [skipEmpty](std::string_view segment) {
+            if (skipEmpty)
+                return !segment.empty();
+            return true;
+        });
+    }
 };

 std::ostream & operator<<(std::ostream & os, const ParsedURL & url);