libutil: Fix handling of unescaped spaces, quotes and shevrons in queries and fragments

Turns out we didn't have tests for some of the important behavior introduced for flake reference fragments and url queries [1]. This is rather important and is relied upon by existing tooling. This fixes up these exact cases before handing off the URL to the Boost.URL parser. To the best of my knowledge this implements the same behavior as prior regex-based parser did [2]: > fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*"; > queryRegex = "(?:" + pcharRegex + "|[/? \"])*"; [1]: 9c0a09f09f [2]: https://github.com/NixOS/nix/blob/2.30.2/src/libutil/include/nix/util/url-parts.hh
2025-11-09 20:16:03 +01:00 · 2025-08-16 23:00:31 +03:00 · 2025-08-16 23:00:31 +03:00 · dc1b2012af
commit dc1b2012af
parent 0fd9ef0cf3
4 changed files with 96 additions and 33 deletions
--- a/src/libflake-tests/flakeref.cc
+++ b/src/libflake-tests/flakeref.cc
@ -74,6 +74,20 @@ TEST(parseFlakeRef, GitArchiveInput)
        auto flakeref = parseFlakeRef(fetchSettings, s);
        ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch%23");
    }
    {
        auto s = "github:foo/bar?ref=branch#\"name.with.dot\""; // unescaped quotes `"`
        auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s);
        ASSERT_EQ(fragment, "\"name.with.dot\"");
        ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch");
    }
    {
        auto s = "github:foo/bar#\"name.with.dot\""; // unescaped quotes `"`
        auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s);
        ASSERT_EQ(fragment, "\"name.with.dot\"");
        ASSERT_EQ(flakeref.to_string(), "github:foo/bar");
    }
 }
 TEST(to_string, doesntReencodeUrl)
--- a/src/libutil-tests/url.cc
+++ b/src/libutil-tests/url.cc
@ -212,6 +212,26 @@ TEST(parseURL, parsedUrlsIsEqualToItself)
    ASSERT_TRUE(url == url);
 }
 TEST(parseURL, parsedUrlsWithUnescapedChars)
 {
    /* Test for back-compat. Behavior is rather questionable, but
     * is ingrained pretty deep into how URL parsing is shared between
     * flakes and libstore.
     * 1. Unescaped spaces, quotes and shevron (^) in fragment.
     * 2. Unescaped spaces and quotes in query.
     */
    auto s = "http://www.example.org/file.tar.gz?query \"= 123\"#shevron^quote\"space ";
    auto url = parseURL(s);
    ASSERT_EQ(url.fragment, "shevron^quote\"space ");
    auto query = StringMap{
        {"query \"", " 123\""},
    };
    ASSERT_EQ(url.query, query);
 }
 TEST(parseURL, parseFTPUrl)
 {
    auto s = "ftp://ftp.nixos.org/downloads/nixos.iso";
--- a/src/libutil/include/nix/util/url.hh
+++ b/src/libutil/include/nix/util/url.hh
@ -104,8 +104,8 @@ std::string encodeQuery(const StringMap & query);
 * Parse a Nix URL into a ParsedURL.
 *
 * Nix URI is mostly compliant with RFC3986, but with some deviations:
- * - Literal spaces are allowed and don't have to be percent encoded.
+ * - Fragments can contain unescaped (not URL encoded) '^', '"' or space literals.
- *   This is mostly done for backward compatibility.
+ * - Queries may contain unescaped '"' or spaces.
 *
 * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
 *
--- a/src/libutil/url.cc
+++ b/src/libutil/url.cc
@ -11,28 +11,6 @@ namespace nix {
 std::regex refRegex(refRegexS, std::regex::ECMAScript);
 std::regex revRegex(revRegexS, std::regex::ECMAScript);
 /**
 * Drop trailing shevron for output installable syntax.
 *
 * FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
 * get used. That code should actually use ExtendedOutputsSpec::parseOpt.
 */
 static std::string_view dropShevronSuffix(std::string_view url)
 {
    auto shevron = url.rfind("^");
    if (shevron == std::string_view::npos)
        return url;
    return url.substr(0, shevron);
 }
 /**
 * Percent encode spaces in the url.
 */
 static std::string percentEncodeSpaces(std::string_view url)
 {
    return replaceStrings(std::string(url), " ", percentEncode(" "));
 }
 ParsedURL::Authority ParsedURL::Authority::parse(std::string_view encodedAuthority)
 {
    auto parsed = boost::urls::parse_authority(encodedAuthority);
@ -108,14 +86,65 @@ std::string ParsedURL::Authority::to_string() const
    return std::move(oss).str();
 }
 /**
 * Additional characters that don't need URL encoding in the fragment.
 */
 static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInFragment = " \"^";
 /**
 * Additional characters that don't need URL encoding in the query.
 */
 static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInQuery = " \"";
 static std::string percentEncodeCharSet(std::string_view s, auto charSet)
 {
    std::string res;
    for (auto c : s) {
        if (charSet(c))
            res += percentEncode(std::string_view{&c, &c + 1});
        else
            res += c;
    }
    return res;
 }
 ParsedURL parseURL(const std::string & url)
 try {
-    /* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and
+    auto unparsedView = url;
-       shouldn't appear in normal URIs. */
+
-    auto unparsedView = dropShevronSuffix(url);
+    /* Account for several non-standard properties of nix urls (for back-compat):
-    /* For back-compat literal spaces are allowed. */
+     *  - Allow unescaped spaces ' ' and '"' characters in queries.
-    auto withFixedSpaces = percentEncodeSpaces(unparsedView);
+     *  - Allow '"', ' ' and '^' characters in the fragment component.
-    auto urlView = boost::urls::url_view(withFixedSpaces);
+     * We could write our own grammar for this, but fixing it up here seems
     * more concise, since the deviation is rather minor.
     */
    std::string fixedEncodedUrl = [&]() {
        std::string fixed;
        std::string_view view = url;
        if (auto beforeQuery = splitPrefixTo(view, '?')) {
            fixed += *beforeQuery;
            fixed += '?';
            auto fragmentStart = view.find('#');
            auto queryView = view.substr(0, fragmentStart);
            auto fixedQuery = percentEncodeCharSet(queryView, extraAllowedCharsInQuery);
            fixed += fixedQuery;
            view.remove_prefix(std::min(fragmentStart, view.size()));
        }
        if (auto beforeFragment = splitPrefixTo(view, '#')) {
            fixed += *beforeFragment;
            fixed += '#';
            auto fixedFragment = percentEncodeCharSet(view, extraAllowedCharsInFragment);
            fixed += fixedFragment;
            return fixed;
        }
        fixed += view;
        return fixed;
    }();
    auto urlView = boost::urls::url_view(fixedEncodedUrl);
    if (!urlView.has_scheme())
        throw BadURL("'%s' doesn't have a scheme", url);
@ -176,12 +205,12 @@ std::string percentEncode(std::string_view s, std::string_view keep)
 StringMap decodeQuery(const std::string & query)
 try {
-    /* For back-compat literal spaces are allowed. */
+    /* For back-compat unescaped characters are allowed. */
-    auto withFixedSpaces = percentEncodeSpaces(query);
+    auto fixedEncodedQuery = percentEncodeCharSet(query, extraAllowedCharsInQuery);
    StringMap result;
-    auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
+    auto encodedQuery = boost::urls::params_encoded_view(fixedEncodedQuery);
    for (auto && [key, value, value_specified] : encodedQuery) {
        if (!value_specified) {
            warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");