From bd1d2d1041a321284efcf22e11beb86ede08648d Mon Sep 17 00:00:00 2001
From: Sergei Zimmerman <sergei@zimmerman.foo>
Date: Fri, 18 Jul 2025 21:20:59 +0300
Subject: [PATCH] libutil: Use Boost.URL in parseURL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Boost.URL is a significantly more RFC-compliant parser
than what libutil currently has, which is a bundle of
incomprehensible regexes.

One aspect of this change is that RFC4007 ZoneId IPv6 literals
are represented in URIs according to RFC6874 [1].

Previously they were represented naively like so: [fe80::818c:da4d:8975:415c\%enp0s25].
This is not entirely correct, because the percent itself has to be pct-encoded:

> "%" is always treated as
> an escape character in a URI, so, according to the established URI
> syntax [RFC3986] any occurrences of literal "%" symbols in a URI MUST
> be percent-encoded and represented in the form "%25".  Thus, the
> scoped address fe80::a%en1 would appear in a URI as
> http://[fe80::a%25en1].

[1]: https://datatracker.ietf.org/doc/html/rfc6874

Co-authored-by: Jörg Thalheim <joerg@thalheim.io>
---
 src/libutil-tests/url.cc                  |   4 +-
 src/libutil/include/nix/util/url-parts.hh |  11 ---
 src/libutil/include/nix/util/url.hh       |  11 +++
 src/libutil/url.cc                        | 100 +++++++++++++++-------
 4 files changed, 81 insertions(+), 45 deletions(-)

diff --git a/src/libutil-tests/url.cc b/src/libutil-tests/url.cc
index 8f2033ded..5e9b81f46 100644
--- a/src/libutil-tests/url.cc
+++ b/src/libutil-tests/url.cc
@@ -124,9 +124,9 @@ TEST(parseURL, parseIPv4Address)
     ASSERT_EQ(parsed, expected);
 }
 
-TEST(parseURL, parseScopedRFC4007IPv6Address)
+TEST(parseURL, parseScopedRFC6874IPv6Address)
 {
-    auto s = "http://[fe80::818c:da4d:8975:415c\%enp0s25]:8080";
+    auto s = "http://[fe80::818c:da4d:8975:415c\%25enp0s25]:8080";
     auto parsed = parseURL(s);
 
     ParsedURL expected{
diff --git a/src/libutil/include/nix/util/url-parts.hh b/src/libutil/include/nix/util/url-parts.hh
index bf1215b6d..72c901b5d 100644
--- a/src/libutil/include/nix/util/url-parts.hh
+++ b/src/libutil/include/nix/util/url-parts.hh
@@ -8,21 +8,10 @@ namespace nix {
 
 // URI stuff.
 const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])";
-const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
-const static std::string ipv6AddressSegmentRegex = "[0-9a-fA-F:]+(?:%\\w+)?";
-const static std::string ipv6AddressRegex = "(?:\\[" + ipv6AddressSegmentRegex + "\\]|" + ipv6AddressSegmentRegex + ")";
 const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])";
 const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])";
-const static std::string hostnameRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + ")*)";
-const static std::string hostRegex = "(?:" + ipv6AddressRegex + "|" + hostnameRegex + ")";
-const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|:)*)";
-const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?";
 const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])";
-const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*";
 const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
-const static std::string segmentRegex = "(?:" + pcharRegex + "*)";
-const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)";
-const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)";
 
 /// A Git ref (i.e. branch or tag name).
 /// \todo check that this is correct.
diff --git a/src/libutil/include/nix/util/url.hh b/src/libutil/include/nix/util/url.hh
index e29226720..1c51ab797 100644
--- a/src/libutil/include/nix/util/url.hh
+++ b/src/libutil/include/nix/util/url.hh
@@ -34,6 +34,17 @@ StringMap decodeQuery(const std::string & query);
 
 std::string encodeQuery(const StringMap & query);
 
+/**
+ * Parse a Nix URL into a ParsedURL.
+ *
+ * Nix URIs are mostly compliant with RFC3986, but with some deviations:
+ * - Literal spaces are allowed and don't have to be percent-encoded.
+ *   This is mostly done for backward compatibility.
+ *
+ * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
+ *
+ * @throws BadURL
+ */
 ParsedURL parseURL(const std::string & url);
 
 /**
diff --git a/src/libutil/url.cc b/src/libutil/url.cc
index 7f31d0f1c..2f9c7736a 100644
--- a/src/libutil/url.cc
+++ b/src/libutil/url.cc
@@ -12,40 +12,70 @@ std::regex refRegex(refRegexS, std::regex::ECMAScript);
 std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript);
 std::regex revRegex(revRegexS, std::regex::ECMAScript);
 
-ParsedURL parseURL(const std::string & url)
+/**
+ * Drop the chevron suffix (the last '^' and everything after it) used by the output installable syntax.
+ *
+ * FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
+ * get used. That code should actually use ExtendedOutputsSpec::parseOpt.
+ */
+static std::string_view dropShevronSuffix(std::string_view url)
 {
-    static std::regex uriRegex(
-        "((" + schemeNameRegex + "):" + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex
-            + ")))" + "(?:\\?(" + queryRegex + "))?" + "(?:#(" + fragmentRegex + "))?",
-        std::regex::ECMAScript);
+    auto shevron = url.rfind("^");
+    if (shevron == std::string_view::npos)
+        return url;
+    return url.substr(0, shevron);
+}
 
-    std::smatch match;
+/**
+ * Percent encode spaces in the url.
+ */
+static std::string percentEncodeSpaces(std::string_view url)
+{
+    return replaceStrings(std::string(url), " ", percentEncode(" "));
+}
 
-    if (std::regex_match(url, match, uriRegex)) {
-        std::string scheme = match[2];
-        auto authority = match[3].matched ? std::optional<std::string>(match[3]) : std::nullopt;
-        std::string path = match[4].matched ? match[4] : match[5];
-        auto & query = match[6];
-        auto & fragment = match[7];
+ParsedURL parseURL(const std::string & url)
+try {
+    /* Drop the chevron suffix used for the flakerefs. The chevron character is reserved and
+       shouldn't appear in normal URIs. */
+    auto unparsedView = dropShevronSuffix(url);
+    /* For back-compat literal spaces are allowed. */
+    auto withFixedSpaces = percentEncodeSpaces(unparsedView);
+    auto urlView = boost::urls::url_view(withFixedSpaces);
 
-        auto transportIsFile = parseUrlScheme(scheme).transport == "file";
+    if (!urlView.has_scheme())
+        throw BadURL("'%s' doesn't have a scheme", url);
 
-        if (authority && *authority != "" && transportIsFile)
-            throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
+    auto scheme = urlView.scheme();
+    auto authority = [&]() -> std::optional<std::string> {
+        if (urlView.has_authority())
+            return percentDecode(urlView.authority().buffer());
+        return std::nullopt;
+    }();
 
-        if (transportIsFile && path.empty())
-            path = "/";
+    auto transportIsFile = parseUrlScheme(scheme).transport == "file";
+    if (authority && *authority != "" && transportIsFile)
+        throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
 
-        return ParsedURL{
-            .scheme = scheme,
-            .authority = authority,
-            .path = percentDecode(path),
-            .query = decodeQuery(query),
-            .fragment = percentDecode(std::string(fragment))};
-    }
+    auto path = urlView.path();         /* Does pct-decoding */
+    auto fragment = urlView.fragment(); /* Does pct-decoding */
 
-    else
-        throw BadURL("'%s' is not a valid URL", url);
+    if (transportIsFile && path.empty())
+        path = "/";
+
+    /* Get the raw query. Store URI supports smuggling doubly nested queries, where
+       the inner &/? are pct-encoded. */
+    auto query = std::string_view(urlView.encoded_query());
+
+    return ParsedURL{
+        .scheme = scheme,
+        .authority = authority,
+        .path = path,
+        .query = decodeQuery(std::string(query)),
+        .fragment = fragment,
+    };
+} catch (boost::system::system_error & e) {
+    throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
 }
 
 std::string percentDecode(std::string_view in)
@@ -64,20 +94,25 @@ std::string percentEncode(std::string_view s, std::string_view keep)
 }
 
 StringMap decodeQuery(const std::string & query)
-{
+try {
+    /* For back-compat literal spaces are allowed. */
+    auto withFixedSpaces = percentEncodeSpaces(query);
+
     StringMap result;
 
-    for (const auto & s : tokenizeString<Strings>(query, "&")) {
-        auto e = s.find('=');
-        if (e == std::string::npos) {
-            warn("dubious URI query '%s' is missing equal sign '%s', ignoring", s, "=");
+    auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
+    for (auto && [key, value, value_specified] : encodedQuery) {
+        if (!value_specified) {
+            warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
             continue;
         }
 
-        result.emplace(s.substr(0, e), percentDecode(std::string_view(s).substr(e + 1)));
+        result.emplace(key.decode(), value.decode());
     }
 
     return result;
+} catch (boost::system::system_error & e) {
+    throw BadURL("invalid URI query '%s': %s", query, e.code().message());
 }
 
 const static std::string allowedInQuery = ":@/?";
@@ -150,6 +185,7 @@ std::string fixGitURL(const std::string & url)
 // https://www.rfc-editor.org/rfc/rfc3986#section-3.1
 bool isValidSchemeName(std::string_view s)
 {
+    const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
     static std::regex regex(schemeNameRegex, std::regex::ECMAScript);
 
     return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default);