From ec6ba866d1a75f04a43d3ac4061a3b0277be842b Mon Sep 17 00:00:00 2001 From: John Ericson Date: Fri, 22 Aug 2025 12:26:11 -0400 Subject: [PATCH] Limit to lenient parsing of non-standard URLs only where needed This allows us to put `parseURL` in more spots without furthering technical debt. (cherry picked from commit 72a548ed6aa4677d66602c97f26a0b13d6729298) --- src/libexpr/primops/fetchClosure.cc | 2 +- src/libflake-tests/url-name.cc | 7 +-- src/libflake/flakeref.cc | 6 +-- src/libstore/store-reference.cc | 4 +- src/libutil-tests/url.cc | 11 +++-- src/libutil/include/nix/util/url.hh | 12 +++-- src/libutil/url.cc | 68 +++++++++++++++++------------ src/nix/profile.cc | 17 +++++--- 8 files changed, 76 insertions(+), 51 deletions(-) diff --git a/src/libexpr/primops/fetchClosure.cc b/src/libexpr/primops/fetchClosure.cc index 469459818..63da53aa9 100644 --- a/src/libexpr/primops/fetchClosure.cc +++ b/src/libexpr/primops/fetchClosure.cc @@ -185,7 +185,7 @@ static void prim_fetchClosure(EvalState & state, const PosIdx pos, Value ** args {.msg = HintFmt("attribute '%s' is missing in call to 'fetchClosure'", "fromStore"), .pos = state.positions[pos]}); - auto parsedURL = parseURL(*fromStoreUrl); + auto parsedURL = parseURL(*fromStoreUrl, /*lenient=*/true); if (parsedURL.scheme != "http" && parsedURL.scheme != "https" && !(getEnv("_NIX_IN_TEST").has_value() && parsedURL.scheme == "file")) diff --git a/src/libflake-tests/url-name.cc b/src/libflake-tests/url-name.cc index 78de34458..81ba516c8 100644 --- a/src/libflake-tests/url-name.cc +++ b/src/libflake-tests/url-name.cc @@ -13,8 +13,9 @@ TEST(getNameFromURL, getNameFromURL) ASSERT_EQ(getNameFromURL(parseURL("path:~/repos/nixpkgs#packages.x86_64-linux.Hello")), "Hello"); ASSERT_EQ(getNameFromURL(parseURL("path:.#nonStandardAttr.mylaptop")), "mylaptop"); ASSERT_EQ(getNameFromURL(parseURL("path:./repos/myflake#nonStandardAttr.mylaptop")), "mylaptop"); - ASSERT_EQ(getNameFromURL(parseURL("path:./nixpkgs#packages.x86_64-linux.complex^bin,man")), "complex"); - ASSERT_EQ(getNameFromURL(parseURL("path:./myproj#packages.x86_64-linux.default^*")), "myproj"); + ASSERT_EQ( + getNameFromURL(parseURL("path:./nixpkgs#packages.x86_64-linux.complex^bin,man", /*lenient=*/true)), "complex"); + ASSERT_EQ(getNameFromURL(parseURL("path:./myproj#packages.x86_64-linux.default^*", /*lenient=*/true)), "myproj"); ASSERT_EQ(getNameFromURL(parseURL("path:./myproj#defaultPackage.x86_64-linux")), "myproj"); ASSERT_EQ(getNameFromURL(parseURL("github:NixOS/nixpkgs#packages.x86_64-linux.hello")), "hello"); @@ -80,6 +81,6 @@ TEST(getNameFromURL, getNameFromURL) ASSERT_EQ(getNameFromURL(parseURL("path:.")), std::nullopt); ASSERT_EQ(getNameFromURL(parseURL("file:.#")), std::nullopt); ASSERT_EQ(getNameFromURL(parseURL("path:.#packages.x86_64-linux.default")), std::nullopt); - ASSERT_EQ(getNameFromURL(parseURL("path:.#packages.x86_64-linux.default^*")), std::nullopt); + ASSERT_EQ(getNameFromURL(parseURL("path:.#packages.x86_64-linux.default^*", /*lenient=*/true)), std::nullopt); } } // namespace nix diff --git a/src/libflake/flakeref.cc b/src/libflake/flakeref.cc index a562e29d2..070f4e483 100644 --- a/src/libflake/flakeref.cc +++ b/src/libflake/flakeref.cc @@ -82,7 +82,7 @@ std::pair parsePathFlakeRefWithFragment( auto succeeds = std::regex_match(url, match, pathFlakeRegex); assert(succeeds); auto path = match[1].str(); - auto query = decodeQuery(match[3].str()); + auto query = decodeQuery(match[3].str(), /*lenient=*/true); auto fragment = percentDecode(match[5].str()); if (baseDir) { @@ -210,7 +210,7 @@ std::optional> parseURLFlakeRef( bool isFlake) { try { - auto parsed = parseURL(url); + auto parsed = parseURL(url, /*lenient=*/true); if (baseDir && (parsed.scheme == "path" || parsed.scheme == "git+file") && !isAbsolute(parsed.path)) parsed.path = absPath(parsed.path, *baseDir); return fromParsedURL(fetchSettings, std::move(parsed), isFlake); @@ -289,7 +289,7 @@ FlakeRef FlakeRef::canonicalize() const filtering the `dir` query parameter from the URL. */ if (auto url = fetchers::maybeGetStrAttr(flakeRef.input.attrs, "url")) { try { - auto parsed = parseURL(*url); + auto parsed = parseURL(*url, /*lenient=*/true); if (auto dir2 = get(parsed.query, "dir")) { if (flakeRef.subdir != "" && flakeRef.subdir == *dir2) parsed.query.erase("dir"); diff --git a/src/libstore/store-reference.cc b/src/libstore/store-reference.cc index 2b8305072..adc60b391 100644 --- a/src/libstore/store-reference.cc +++ b/src/libstore/store-reference.cc @@ -45,7 +45,7 @@ StoreReference StoreReference::parse(const std::string & uri, const StoreReferen { auto params = extraParams; try { - auto parsedUri = parseURL(uri); + auto parsedUri = parseURL(uri, /*lenient=*/true); params.insert(parsedUri.query.begin(), parsedUri.query.end()); auto baseURI = parsedUri.authority.value_or(ParsedURL::Authority{}).to_string() + parsedUri.path; @@ -107,7 +107,7 @@ std::pair splitUriAndParams(const std::stri StoreReference::Params params; auto q = uri.find('?'); if (q != std::string::npos) { - params = decodeQuery(uri.substr(q + 1)); + params = decodeQuery(uri.substr(q + 1), /*lenient=*/true); uri = uri_.substr(0, q); } return {uri, params}; diff --git a/src/libutil-tests/url.cc b/src/libutil-tests/url.cc index 0dfb5f463..b248421b3 100644 --- a/src/libutil-tests/url.cc +++ b/src/libutil-tests/url.cc @@ -221,15 +221,20 @@ TEST(parseURL, parsedUrlsWithUnescapedChars) * 2. Unescaped spaces and quotes in query. */ auto s = "http://www.example.org/file.tar.gz?query \"= 123\"#shevron^quote\"space "; - auto url = parseURL(s); - ASSERT_EQ(url.fragment, "shevron^quote\"space "); + /* Without leniency for back compat, this should throw. */ + EXPECT_THROW(parseURL(s), Error); + + /* With leniency for back compat, this should parse. */ + auto url = parseURL(s, /*lenient=*/true); + + EXPECT_EQ(url.fragment, "shevron^quote\"space "); auto query = StringMap{ {"query \"", " 123\""}, }; - ASSERT_EQ(url.query, query); + EXPECT_EQ(url.query, query); } TEST(parseURL, parseFTPUrl) diff --git a/src/libutil/include/nix/util/url.hh b/src/libutil/include/nix/util/url.hh index cd20a08c6..3262b44b7 100644 --- a/src/libutil/include/nix/util/url.hh +++ b/src/libutil/include/nix/util/url.hh @@ -96,14 +96,18 @@ MakeError(BadURL, Error); std::string percentDecode(std::string_view in); std::string percentEncode(std::string_view s, std::string_view keep = ""); -StringMap decodeQuery(const std::string & query); +/** + * @param lenient @see parseURL + */ +StringMap decodeQuery(std::string_view query, bool lenient = false); std::string encodeQuery(const StringMap & query); /** - * Parse a Nix URL into a ParsedURL. + * Parse a URL into a ParsedURL. * - * Nix URI is mostly compliant with RFC3986, but with some deviations: + * @parm lenient Also allow some long-supported Nix URIs that are not quite compliant with RFC3986. + * Here are the deviations: * - Fragments can contain unescaped (not URL encoded) '^', '"' or space literals. * - Queries may contain unescaped '"' or spaces. * @@ -111,7 +115,7 @@ std::string encodeQuery(const StringMap & query); * * @throws BadURL */ -ParsedURL parseURL(std::string_view url); +ParsedURL parseURL(std::string_view url, bool lenient = false); /** * Although that’s not really standardized anywhere, an number of tools diff --git a/src/libutil/url.cc b/src/libutil/url.cc index 07f4b29ea..b7f1eff30 100644 --- a/src/libutil/url.cc +++ b/src/libutil/url.cc @@ -108,41 +108,48 @@ static std::string percentEncodeCharSet(std::string_view s, auto charSet) return res; } -ParsedURL parseURL(std::string_view url) +ParsedURL parseURL(std::string_view url, bool lenient) try { /* Account for several non-standard properties of nix urls (for back-compat): * - Allow unescaped spaces ' ' and '"' characters in queries. * - Allow '"', ' ' and '^' characters in the fragment component. * We could write our own grammar for this, but fixing it up here seems * more concise, since the deviation is rather minor. + * + * If `!lenient` don't bother initializing, because we can just + * parse `url` directly`. */ - std::string fixedEncodedUrl = [&]() { - std::string fixed; - std::string_view view = url; + std::string fixedEncodedUrl; - if (auto beforeQuery = splitPrefixTo(view, '?')) { - fixed += *beforeQuery; - fixed += '?'; - auto fragmentStart = view.find('#'); - auto queryView = view.substr(0, fragmentStart); - auto fixedQuery = percentEncodeCharSet(queryView, extraAllowedCharsInQuery); - fixed += fixedQuery; - view.remove_prefix(std::min(fragmentStart, view.size())); - } + if (lenient) { + fixedEncodedUrl = [&] { + std::string fixed; + std::string_view view = url; - if (auto beforeFragment = splitPrefixTo(view, '#')) { - fixed += *beforeFragment; - fixed += '#'; - auto fixedFragment = percentEncodeCharSet(view, extraAllowedCharsInFragment); - fixed += fixedFragment; + if (auto beforeQuery = splitPrefixTo(view, '?')) { + fixed += *beforeQuery; + fixed += '?'; + auto fragmentStart = view.find('#'); + auto queryView = view.substr(0, fragmentStart); + auto fixedQuery = percentEncodeCharSet(queryView, extraAllowedCharsInQuery); + fixed += fixedQuery; + view.remove_prefix(std::min(fragmentStart, view.size())); + } + + if (auto beforeFragment = splitPrefixTo(view, '#')) { + fixed += *beforeFragment; + fixed += '#'; + auto fixedFragment = percentEncodeCharSet(view, extraAllowedCharsInFragment); + fixed += fixedFragment; + return fixed; + } + + fixed += view; return fixed; - } + }(); + } - fixed += view; - return fixed; - }(); - - auto urlView = boost::urls::url_view(fixedEncodedUrl); + auto urlView = boost::urls::url_view(lenient ? fixedEncodedUrl : url); if (!urlView.has_scheme()) throw BadURL("'%s' doesn't have a scheme", url); @@ -179,7 +186,7 @@ try { .scheme = scheme, .authority = authority, .path = path, - .query = decodeQuery(query), + .query = decodeQuery(query, lenient), .fragment = fragment, }; } catch (boost::system::system_error & e) { @@ -201,14 +208,17 @@ std::string percentEncode(std::string_view s, std::string_view keep) s, [keep](char c) { return boost::urls::unreserved_chars(c) || keep.find(c) != keep.npos; }); } -StringMap decodeQuery(std::string_view query) +StringMap decodeQuery(std::string_view query, bool lenient) try { - /* For back-compat unescaped characters are allowed. */ - auto fixedEncodedQuery = percentEncodeCharSet(query, extraAllowedCharsInQuery); + /* When `lenient = true`, for back-compat unescaped characters are allowed. */ + std::string fixedEncodedQuery; + if (lenient) { + fixedEncodedQuery = percentEncodeCharSet(query, extraAllowedCharsInQuery); + } StringMap result; - auto encodedQuery = boost::urls::params_encoded_view(fixedEncodedQuery); + auto encodedQuery = boost::urls::params_encoded_view(lenient ? fixedEncodedQuery : query); for (auto && [key, value, value_specified] : encodedQuery) { if (!value_specified) { warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "="); diff --git a/src/nix/profile.cc b/src/nix/profile.cc index df92d888e..0ed1face5 100644 --- a/src/nix/profile.cc +++ b/src/nix/profile.cc @@ -105,7 +105,8 @@ std::string getNameFromElement(const ProfileElement & element) { std::optional result = std::nullopt; if (element.source) { - result = getNameFromURL(parseURL(element.source->to_string())); + // Seems to be for Flake URLs + result = getNameFromURL(parseURL(element.source->to_string(), /*lenient=*/true)); } return result.value_or(element.identifier()); } @@ -160,11 +161,15 @@ struct ProfileManifest e["outputs"].get()}; } - std::string name = - elems.is_object() ? elem.key() - : element.source - ? getNameFromURL(parseURL(element.source->to_string())).value_or(element.identifier()) - : element.identifier(); + std::string name = [&] { + if (elems.is_object()) + return elem.key(); + if (element.source) { + if (auto optName = getNameFromURL(parseURL(element.source->to_string(), /*lenient=*/true))) + return *optName; + } + return element.identifier(); + }(); addElement(name, std::move(element)); }