Merge pull request #13445 from xokdvium/simplify-util-url

libutil: Use Boost.URL for URI parsing
2025-11-09 12:06:01 +01:00 · 2025-08-04 19:46:58 +02:00 · 2025-08-04 19:46:58 +02:00 · c7af923865
commit c7af923865
parent 7abfc9b92a a54284cbc7
12 changed files with 192 additions and 81 deletions
--- a/doc/manual/rl-next/rfc4007-zone-id-in-uri-rfc6874.md
+++ b/doc/manual/rl-next/rfc4007-zone-id-in-uri-rfc6874.md
@ -0,0 +1,6 @@
+---
+synopsis: "Represent IPv6 RFC4007 ZoneId literals in conformance with RFC6874"
+prs: [13445]
+---
+
+Prior versions of Nix since [#4646](https://github.com/NixOS/nix/pull/4646) accepted [IPv6 scoped addresses](https://datatracker.ietf.org/doc/html/rfc4007) in URIs like [store references](@docroot@/store/types/index.md#store-url-format) in the textual representation with a literal percent character: `[fe80::1%18]`. This was ambiguous, because the percent literal `%` is reserved by [RFC3986](https://datatracker.ietf.org/doc/html/rfc3986), since it's used to indicate percent encoding. Nix now requires that the percent `%` symbol is percent-encoded as `%25`. This implements [RFC6874](https://datatracker.ietf.org/doc/html/rfc6874), which defines the representation of zone identifiers in URIs. The example from above now has to be specified as `[fe80::1%2518]`.
--- a/packaging/dependencies.nix
+++ b/packaging/dependencies.nix
@ -62,6 +62,7 @@ scope: {
        "--with-context"
        "--with-coroutine"
        "--with-iostreams"
+        "--with-url"
      ];
      enableIcu = false;
    }).overrideAttrs
--- a/src/libflake-tests/flakeref.cc
+++ b/src/libflake-tests/flakeref.cc
@ -48,6 +48,13 @@ TEST(parseFlakeRef, path)
        ASSERT_EQ(flakeref.to_string(), "path:/foo/bar?revCount=123");
        ASSERT_EQ(fragment, "bla");
    }
+
+    {
+        auto s = "/foo bar/baz?dir=bla space";
+        auto flakeref = parseFlakeRef(fetchSettings, s);
+        ASSERT_EQ(flakeref.to_string(), "path:/foo%20bar/baz?dir=bla%20space");
+        ASSERT_EQ(flakeref.toAttrs().at("dir"), fetchers::Attr("bla space"));
+    }
 }

 TEST(to_string, doesntReencodeUrl)
--- a/src/libstore-tests/data/store-reference/local_3.txt
+++ b/src/libstore-tests/data/store-reference/local_3.txt
@ -0,0 +1 @@
+local://?root=/foo bar/baz
--- a/src/libstore-tests/store-reference.cc
+++ b/src/libstore-tests/store-reference.cc
@ -85,10 +85,24 @@ static StoreReference localExample_2{
        },
 };

+static StoreReference localExample_3{
+    .variant =
+        StoreReference::Specified{
+            .scheme = "local",
+        },
+    .params =
+        {
+            {"root", "/foo bar/baz"}, // value contains a literal (unencoded) space
+        },
+};
+
 URI_TEST(local_1, localExample_1)

 URI_TEST(local_2, localExample_2)

+/* Test path with spaces */
+URI_TEST(local_3, localExample_3)
+
 URI_TEST_READ(local_shorthand_1, localExample_1)

 URI_TEST_READ(local_shorthand_2, localExample_2)
--- a/src/libutil-test-support/include/nix/util/tests/gmock-matchers.hh
+++ b/src/libutil-test-support/include/nix/util/tests/gmock-matchers.hh
@ -0,0 +1,56 @@
+#pragma once
+///@file
+
+#include "nix/util/terminal.hh"
+#include <gmock/gmock.h>
+
+namespace nix::testing {
+
+namespace internal {
+
+/**
+ * GMock matcher that checks for a substring after stripping all ANSI escapes from the matchee.
+ * Useful for checking exception messages in unit tests; a null `const char *` never matches.
+ */
+class HasSubstrIgnoreANSIMatcher
+{
+public:
+    explicit HasSubstrIgnoreANSIMatcher(std::string substring)
+        : substring(std::move(substring))
+    {
+    }
+
+    bool MatchAndExplain(const char * s, ::testing::MatchResultListener * listener) const
+    {
+        return s != nullptr && MatchAndExplain(std::string(s), listener);
+    }
+
+    template<typename MatcheeStringType>
+    bool MatchAndExplain(const MatcheeStringType & s, [[maybe_unused]] ::testing::MatchResultListener * listener) const
+    {
+        return filterANSIEscapes(s, /*filterAll=*/true).find(substring) != substring.npos;
+    }
+
+    void DescribeTo(::std::ostream * os) const
+    {
+        *os << "has substring " << substring;
+    }
+
+    void DescribeNegationTo(::std::ostream * os) const
+    {
+        *os << "has no substring " << substring;
+    }
+
+private:
+    std::string substring;
+};
+
+} // namespace internal
+
+inline ::testing::PolymorphicMatcher<internal::HasSubstrIgnoreANSIMatcher>
+HasSubstrIgnoreANSIMatcher(const std::string & substring)
+{
+    return ::testing::MakePolymorphicMatcher(internal::HasSubstrIgnoreANSIMatcher(substring));
+}
+
+} // namespace nix::testing
--- a/src/libutil-test-support/include/nix/util/tests/meson.build
+++ b/src/libutil-test-support/include/nix/util/tests/meson.build
@ -4,6 +4,7 @@ include_dirs = [include_directories('../../..')]

 headers = files(
  'characterization.hh',
+  'gmock-matchers.hh',
  'gtest-with-params.hh',
  'hash.hh',
  'nix_api_util.hh',
--- a/src/libutil-tests/url.cc
+++ b/src/libutil-tests/url.cc
@ -1,5 +1,7 @@
 #include "nix/util/url.hh"
+#include "nix/util/tests/gmock-matchers.hh"
 #include <gtest/gtest.h>
+#include <gmock/gmock.h>

 namespace nix {

@ -122,9 +124,9 @@ TEST(parseURL, parseIPv4Address)
    ASSERT_EQ(parsed, expected);
 }

-TEST(parseURL, parseScopedRFC4007IPv6Address)
+TEST(parseURL, parseScopedRFC6874IPv6Address)
 {
-    auto s = "http://[fe80::818c:da4d:8975:415c\%enp0s25]:8080";
+    auto s = "http://[fe80::818c:da4d:8975:415c\%25enp0s25]:8080";
    auto parsed = parseURL(s);

    ParsedURL expected{
@ -289,6 +291,14 @@ TEST(percentDecode, trailingPercent)
    ASSERT_EQ(d, s);
 }

+TEST(percentDecode, incompleteEncoding)
+{
+    ASSERT_THAT(
+        []() { percentDecode("%1"); }, // '%' followed by only one hex digit
+        ::testing::ThrowsMessage<BadURL>(
+            testing::HasSubstrIgnoreANSIMatcher("error: invalid URI parameter '%1': incomplete pct-encoding")));
+}
+
 /* ----------------------------------------------------------------------------
 * percentEncode
 * --------------------------------------------------------------------------*/
--- a/src/libutil/include/nix/util/url-parts.hh
+++ b/src/libutil/include/nix/util/url-parts.hh
@ -8,21 +8,10 @@ namespace nix {

 // URI stuff.
 const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])";
-const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
-const static std::string ipv6AddressSegmentRegex = "[0-9a-fA-F:]+(?:%\\w+)?";
-const static std::string ipv6AddressRegex = "(?:\\[" + ipv6AddressSegmentRegex + "\\]|" + ipv6AddressSegmentRegex + ")";
 const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])";
 const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])";
-const static std::string hostnameRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + ")*)";
-const static std::string hostRegex = "(?:" + ipv6AddressRegex + "|" + hostnameRegex + ")";
-const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|:)*)";
-const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?";
 const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])";
-const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*";
 const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
-const static std::string segmentRegex = "(?:" + pcharRegex + "*)";
-const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)";
-const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)";

 /// A Git ref (i.e. branch or tag name).
 /// \todo check that this is correct.
--- a/src/libutil/include/nix/util/url.hh
+++ b/src/libutil/include/nix/util/url.hh
@ -15,7 +15,7 @@ struct ParsedURL

    std::string to_string() const;

-    bool operator==(const ParsedURL & other) const noexcept;
+    bool operator==(const ParsedURL & other) const noexcept = default;

    /**
     * Remove `.` and `..` path elements.
@ -34,6 +34,17 @@ StringMap decodeQuery(const std::string & query);

 std::string encodeQuery(const StringMap & query);

+/**
+ * Parse a Nix URL into a ParsedURL.
+ *
+ * Nix URIs are mostly compliant with RFC3986, but with some deviations:
+ * - Literal spaces are allowed and don't have to be percent-encoded.
+ *   This is mostly done for backward compatibility.
+ *
+ * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
+ *
+ * @throws BadURL if the URL has no scheme, cannot be parsed, or uses the `file` transport with a non-empty authority.
+ */
 ParsedURL parseURL(const std::string & url);

 /**
--- a/src/libutil/meson.build
+++ b/src/libutil/meson.build
@ -57,7 +57,7 @@ deps_private += blake3

 boost = dependency(
  'boost',
-  modules : ['context', 'coroutine', 'iostreams'],
+  modules : ['context', 'coroutine', 'iostreams', 'url'],
  include_type: 'system',
  version: '>=1.82.0'
 )
--- a/src/libutil/url.cc
+++ b/src/libutil/url.cc
@ -4,100 +4,120 @@
 #include "nix/util/split.hh"
 #include "nix/util/canon-path.hh"

+#include <boost/url.hpp>
+
 namespace nix {

 std::regex refRegex(refRegexS, std::regex::ECMAScript);
 std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript);
 std::regex revRegex(revRegexS, std::regex::ECMAScript);

-ParsedURL parseURL(const std::string & url)
+/**
+ * Drop the trailing chevron (`^`) suffix used by the output installable syntax.
+ *
+ * FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
+ * get used. That code should actually use ExtendedOutputsSpec::parseOpt.
+ */
+static std::string_view dropShevronSuffix(std::string_view url)
 {
-    static std::regex uriRegex(
-        "((" + schemeNameRegex + "):" + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex
-            + ")))" + "(?:\\?(" + queryRegex + "))?" + "(?:#(" + fragmentRegex + "))?",
-        std::regex::ECMAScript);
+    auto shevron = url.rfind("^"); // position of the last '^' in the input, if any
+    if (shevron == std::string_view::npos)
+        return url;
+    return url.substr(0, shevron); // NOTE(review): cuts at the last '^' wherever it is, query and fragment included
+}

-    std::smatch match;
+/**
+ * Replace every literal space in the URL with its percent-encoded form.
+ */
+static std::string percentEncodeSpaces(std::string_view url)
+{
+    return replaceStrings(std::string(url), " ", percentEncode(" "));
+}

-    if (std::regex_match(url, match, uriRegex)) {
-        std::string scheme = match[2];
-        auto authority = match[3].matched ? std::optional<std::string>(match[3]) : std::nullopt;
-        std::string path = match[4].matched ? match[4] : match[5];
-        auto & query = match[6];
-        auto & fragment = match[7];
+ParsedURL parseURL(const std::string & url)
+try {
+    /* Drop the `^...` suffix used for the flakerefs. A literal `^` is not a valid RFC3986 URI
+       character and shouldn't appear in normal URIs. */
+    auto unparsedView = dropShevronSuffix(url);
+    /* For back-compat literal spaces are allowed. */
+    auto withFixedSpaces = percentEncodeSpaces(unparsedView);
+    auto urlView = boost::urls::url_view(withFixedSpaces);

-        auto transportIsFile = parseUrlScheme(scheme).transport == "file";
+    if (!urlView.has_scheme())
+        throw BadURL("'%s' doesn't have a scheme", url);

-        if (authority && *authority != "" && transportIsFile)
-            throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
+    auto scheme = urlView.scheme();
+    auto authority = [&]() -> std::optional<std::string> {
+        if (urlView.has_authority())
+            return percentDecode(urlView.authority().buffer());
+        return std::nullopt;
+    }();

-        if (transportIsFile && path.empty())
-            path = "/";
+    auto transportIsFile = parseUrlScheme(scheme).transport == "file";
+    if (authority && *authority != "" && transportIsFile)
+        throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);

-        return ParsedURL{
-            .scheme = scheme,
-            .authority = authority,
-            .path = percentDecode(path),
-            .query = decodeQuery(query),
-            .fragment = percentDecode(std::string(fragment))};
-    }
+    auto path = urlView.path();         /* Does pct-decoding */
+    auto fragment = urlView.fragment(); /* Does pct-decoding */

-    else
-        throw BadURL("'%s' is not a valid URL", url);
+    if (transportIsFile && path.empty())
+        path = "/";
+
+    /* Get the raw query. Store URIs support smuggling doubly nested queries, where
+       the inner &/? are pct-encoded. */
+    auto query = std::string_view(urlView.encoded_query());
+
+    return ParsedURL{
+        .scheme = scheme,
+        .authority = authority,
+        .path = path,
+        .query = decodeQuery(std::string(query)),
+        .fragment = fragment,
+    };
+} catch (boost::system::system_error & e) { // Boost.URL parse failures are rethrown as BadURL
+    throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
 }

 std::string percentDecode(std::string_view in)
 {
-    std::string decoded;
-    for (size_t i = 0; i < in.size();) {
-        if (in[i] == '%') {
-            if (i + 2 >= in.size())
-                throw BadURL("invalid URI parameter '%s'", in);
-            try {
-                decoded += std::stoul(std::string(in, i + 1, 2), 0, 16);
-                i += 3;
-            } catch (...) {
-                throw BadURL("invalid URI parameter '%s'", in);
-            }
-        } else
-            decoded += in[i++];
-    }
-    return decoded;
+    auto pctView = boost::urls::make_pct_string_view(in); // holds either a validated view or an error
+    if (pctView.has_value())
+        return pctView->decode();
+    auto error = pctView.error();
+    throw BadURL("invalid URI parameter '%s': %s", in, error.message());
+}
+
+std::string percentEncode(std::string_view s, std::string_view keep)
+{
+    return boost::urls::encode( // unreserved characters and those listed in `keep` are left as-is
+        s, [keep](char c) { return boost::urls::unreserved_chars(c) || keep.find(c) != keep.npos; });
 }

 StringMap decodeQuery(const std::string & query)
-{
+try {
+    /* For backward compatibility, literal (unencoded) spaces are accepted. */
+    auto withFixedSpaces = percentEncodeSpaces(query);
+
    StringMap result;

-    for (const auto & s : tokenizeString<Strings>(query, "&")) {
-        auto e = s.find('=');
-        if (e == std::string::npos) {
-            warn("dubious URI query '%s' is missing equal sign '%s', ignoring", s, "=");
+    auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
+    for (auto && [key, value, value_specified] : encodedQuery) {
+        if (!value_specified) { // parameter without '=': warn and skip it
+            warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
            continue;
        }

-        result.emplace(s.substr(0, e), percentDecode(std::string_view(s).substr(e + 1)));
+        result.emplace(key.decode(), value.decode());
    }

    return result;
+} catch (boost::system::system_error & e) { // Boost.URL parse failures are rethrown as BadURL
+    throw BadURL("invalid URI query '%s': %s", query, e.code().message());
 }

 const static std::string allowedInQuery = ":@/?";
 const static std::string allowedInPath = ":@/";

-std::string percentEncode(std::string_view s, std::string_view keep)
-{
-    std::string res;
-    for (auto & c : s)
-        // unreserved + keep
-        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || strchr("-._~", c)
-            || keep.find(c) != std::string::npos)
-            res += c;
-        else
-            res += fmt("%%%02X", c & 0xFF);
-    return res;
-}
-
 std::string encodeQuery(const StringMap & ss)
 {
    std::string res;
@ -125,12 +145,6 @@ std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
    return os;
 }

-bool ParsedURL::operator==(const ParsedURL & other) const noexcept
-{
-    return scheme == other.scheme && authority == other.authority && path == other.path && query == other.query
-           && fragment == other.fragment;
-}
-
 ParsedURL ParsedURL::canonicalise()
 {
    ParsedURL res(*this);
@ -171,6 +185,7 @@ std::string fixGitURL(const std::string & url)
 // https://www.rfc-editor.org/rfc/rfc3986#section-3.1
 bool isValidSchemeName(std::string_view s)
 {
+    const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
    static std::regex regex(schemeNameRegex, std::regex::ECMAScript);

    return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default);