From dc1b2012afb9dbe2a2832da87ac9a2465e38ecbf Mon Sep 17 00:00:00 2001
From: Sergei Zimmerman <sergei@zimmerman.foo>
Date: Sat, 16 Aug 2025 23:00:31 +0300
Subject: [PATCH] libutil: Fix handling of unescaped spaces, quotes and
 shevrons in queries and fragments

Turns out we didn't have tests for some of the important behavior introduced
for flake reference fragments and url queries [1]. This is rather important
and is relied upon by existing tooling. This fixes up these exact cases before
handing off the URL to the Boost.URL parser.

To the best of my knowledge this implements the same behavior as prior regex-based
parser did [2]:

> fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
> queryRegex = "(?:" + pcharRegex + "|[/? \"])*";

[1]: 9c0a09f09fbb930483b26f60f8552fbe5236b777
[2]: https://github.com/NixOS/nix/blob/2.30.2/src/libutil/include/nix/util/url-parts.hh
---
 src/libflake-tests/flakeref.cc      | 14 +++++
 src/libutil-tests/url.cc            | 20 +++++++
 src/libutil/include/nix/util/url.hh |  4 +-
 src/libutil/url.cc                  | 91 +++++++++++++++++++----------
 4 files changed, 96 insertions(+), 33 deletions(-)

diff --git a/src/libflake-tests/flakeref.cc b/src/libflake-tests/flakeref.cc
index 2f8deb123..404d7590a 100644
--- a/src/libflake-tests/flakeref.cc
+++ b/src/libflake-tests/flakeref.cc
@@ -74,6 +74,20 @@ TEST(parseFlakeRef, GitArchiveInput)
         auto flakeref = parseFlakeRef(fetchSettings, s);
         ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch%23");
     }
+
+    {
+        auto s = "github:foo/bar?ref=branch#\"name.with.dot\""; // unescaped quotes `"`
+        auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s);
+        ASSERT_EQ(fragment, "\"name.with.dot\"");
+        ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch");
+    }
+
+    {
+        auto s = "github:foo/bar#\"name.with.dot\""; // unescaped quotes `"`
+        auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s);
+        ASSERT_EQ(fragment, "\"name.with.dot\"");
+        ASSERT_EQ(flakeref.to_string(), "github:foo/bar");
+    }
 }
 
 TEST(to_string, doesntReencodeUrl)
diff --git a/src/libutil-tests/url.cc b/src/libutil-tests/url.cc
index fb27689de..0dfb5f463 100644
--- a/src/libutil-tests/url.cc
+++ b/src/libutil-tests/url.cc
@@ -212,6 +212,26 @@ TEST(parseURL, parsedUrlsIsEqualToItself)
     ASSERT_TRUE(url == url);
 }
 
+TEST(parseURL, parsedUrlsWithUnescapedChars)
+{
+    /* Test for back-compat. Behavior is rather questionable, but
+     * is ingrained pretty deep into how URL parsing is shared between
+     * flakes and libstore.
+     * 1. Unescaped spaces, quotes and shevron (^) in fragment.
+     * 2. Unescaped spaces and quotes in query.
+     */
+    auto s = "http://www.example.org/file.tar.gz?query \"= 123\"#shevron^quote\"space ";
+    auto url = parseURL(s);
+
+    ASSERT_EQ(url.fragment, "shevron^quote\"space ");
+
+    auto query = StringMap{
+        {"query \"", " 123\""},
+    };
+
+    ASSERT_EQ(url.query, query);
+}
+
 TEST(parseURL, parseFTPUrl)
 {
     auto s = "ftp://ftp.nixos.org/downloads/nixos.iso";
diff --git a/src/libutil/include/nix/util/url.hh b/src/libutil/include/nix/util/url.hh
index 0a6194b19..0aa1eac9f 100644
--- a/src/libutil/include/nix/util/url.hh
+++ b/src/libutil/include/nix/util/url.hh
@@ -104,8 +104,8 @@ std::string encodeQuery(const StringMap & query);
  * Parse a Nix URL into a ParsedURL.
  *
  * Nix URI is mostly compliant with RFC3986, but with some deviations:
- * - Literal spaces are allowed and don't have to be percent encoded.
- *   This is mostly done for backward compatibility.
+ * - Fragments can contain unescaped (not URL encoded) '^', '"' or space literals.
+ * - Queries may contain unescaped '"' or spaces.
  *
  * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
  *
diff --git a/src/libutil/url.cc b/src/libutil/url.cc
index 8f902552f..75f62d445 100644
--- a/src/libutil/url.cc
+++ b/src/libutil/url.cc
@@ -11,28 +11,6 @@ namespace nix {
 std::regex refRegex(refRegexS, std::regex::ECMAScript);
 std::regex revRegex(revRegexS, std::regex::ECMAScript);
 
-/**
- * Drop trailing shevron for output installable syntax.
- *
- * FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
- * get used. That code should actually use ExtendedOutputsSpec::parseOpt.
- */
-static std::string_view dropShevronSuffix(std::string_view url)
-{
-    auto shevron = url.rfind("^");
-    if (shevron == std::string_view::npos)
-        return url;
-    return url.substr(0, shevron);
-}
-
-/**
- * Percent encode spaces in the url.
- */
-static std::string percentEncodeSpaces(std::string_view url)
-{
-    return replaceStrings(std::string(url), " ", percentEncode(" "));
-}
-
 ParsedURL::Authority ParsedURL::Authority::parse(std::string_view encodedAuthority)
 {
     auto parsed = boost::urls::parse_authority(encodedAuthority);
@@ -108,14 +86,65 @@ std::string ParsedURL::Authority::to_string() const
     return std::move(oss).str();
 }
 
+/**
+ * Additional characters that don't need URL encoding in the fragment.
+ */
+static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInFragment = " \"^";
+
+/**
+ * Additional characters that don't need URL encoding in the query.
+ */
+static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInQuery = " \"";
+
+static std::string percentEncodeCharSet(std::string_view s, auto charSet)
+{
+    std::string res;
+    for (auto c : s) {
+        if (charSet(c))
+            res += percentEncode(std::string_view{&c, &c + 1});
+        else
+            res += c;
+    }
+    return res;
+}
+
 ParsedURL parseURL(const std::string & url)
 try {
-    /* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and
-       shouldn't appear in normal URIs. */
-    auto unparsedView = dropShevronSuffix(url);
-    /* For back-compat literal spaces are allowed. */
-    auto withFixedSpaces = percentEncodeSpaces(unparsedView);
-    auto urlView = boost::urls::url_view(withFixedSpaces);
+    auto unparsedView = url;
+
+    /* Account for several non-standard properties of nix urls (for back-compat):
+     *  - Allow unescaped spaces ' ' and '"' characters in queries.
+     *  - Allow '"', ' ' and '^' characters in the fragment component.
+     * We could write our own grammar for this, but fixing it up here seems
+     * more concise, since the deviation is rather minor.
+     */
+    std::string fixedEncodedUrl = [&]() {
+        std::string fixed;
+        std::string_view view = url;
+
+        if (auto beforeQuery = splitPrefixTo(view, '?')) {
+            fixed += *beforeQuery;
+            fixed += '?';
+            auto fragmentStart = view.find('#');
+            auto queryView = view.substr(0, fragmentStart);
+            auto fixedQuery = percentEncodeCharSet(queryView, extraAllowedCharsInQuery);
+            fixed += fixedQuery;
+            view.remove_prefix(std::min(fragmentStart, view.size()));
+        }
+
+        if (auto beforeFragment = splitPrefixTo(view, '#')) {
+            fixed += *beforeFragment;
+            fixed += '#';
+            auto fixedFragment = percentEncodeCharSet(view, extraAllowedCharsInFragment);
+            fixed += fixedFragment;
+            return fixed;
+        }
+
+        fixed += view;
+        return fixed;
+    }();
+
+    auto urlView = boost::urls::url_view(fixedEncodedUrl);
 
     if (!urlView.has_scheme())
         throw BadURL("'%s' doesn't have a scheme", url);
@@ -176,12 +205,12 @@ std::string percentEncode(std::string_view s, std::string_view keep)
 
 StringMap decodeQuery(const std::string & query)
 try {
-    /* For back-compat literal spaces are allowed. */
-    auto withFixedSpaces = percentEncodeSpaces(query);
+    /* For back-compat unescaped characters are allowed. */
+    auto fixedEncodedQuery = percentEncodeCharSet(query, extraAllowedCharsInQuery);
 
     StringMap result;
 
-    auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
+    auto encodedQuery = boost::urls::params_encoded_view(fixedEncodedQuery);
     for (auto && [key, value, value_specified] : encodedQuery) {
         if (!value_specified) {
             warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");