diff --git a/src/libflake-tests/flakeref.cc b/src/libflake-tests/flakeref.cc index 2f8deb123..404d7590a 100644 --- a/src/libflake-tests/flakeref.cc +++ b/src/libflake-tests/flakeref.cc @@ -74,6 +74,20 @@ TEST(parseFlakeRef, GitArchiveInput) auto flakeref = parseFlakeRef(fetchSettings, s); ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch%23"); } + + { + auto s = "github:foo/bar?ref=branch#\"name.with.dot\""; // unescaped quotes `"` + auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s); + ASSERT_EQ(fragment, "\"name.with.dot\""); + ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch"); + } + + { + auto s = "github:foo/bar#\"name.with.dot\""; // unescaped quotes `"` + auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s); + ASSERT_EQ(fragment, "\"name.with.dot\""); + ASSERT_EQ(flakeref.to_string(), "github:foo/bar"); + } } TEST(to_string, doesntReencodeUrl) diff --git a/src/libutil-tests/url.cc b/src/libutil-tests/url.cc index fb27689de..0dfb5f463 100644 --- a/src/libutil-tests/url.cc +++ b/src/libutil-tests/url.cc @@ -212,6 +212,26 @@ TEST(parseURL, parsedUrlsIsEqualToItself) ASSERT_TRUE(url == url); } +TEST(parseURL, parsedUrlsWithUnescapedChars) +{ + /* Test for back-compat. Behavior is rather questionable, but + * is ingrained pretty deep into how URL parsing is shared between + * flakes and libstore. + * 1. Unescaped spaces, quotes and shevron (^) in fragment. + * 2. Unescaped spaces and quotes in query. + */ + auto s = "http://www.example.org/file.tar.gz?query \"= 123\"#shevron^quote\"space "; + auto url = parseURL(s); + + ASSERT_EQ(url.fragment, "shevron^quote\"space "); + + auto query = StringMap{ + {"query \"", " 123\""}, + }; + + ASSERT_EQ(url.query, query); +} + TEST(parseURL, parseFTPUrl) { auto s = "ftp://ftp.nixos.org/downloads/nixos.iso"; diff --git a/src/libutil/include/nix/util/url.hh b/src/libutil/include/nix/util/url.hh index 0a6194b19..0aa1eac9f 100644 --- a/src/libutil/include/nix/util/url.hh +++ b/src/libutil/include/nix/util/url.hh @@ -104,8 +104,8 @@ std::string encodeQuery(const StringMap & query); * Parse a Nix URL into a ParsedURL. * * Nix URI is mostly compliant with RFC3986, but with some deviations: - * - Literal spaces are allowed and don't have to be percent encoded. - * This is mostly done for backward compatibility. + * - Fragments can contain unescaped (not URL encoded) '^', '"' or space literals. + * - Queries may contain unescaped '"' or spaces. * * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874. * diff --git a/src/libutil/url.cc b/src/libutil/url.cc index 8f902552f..75f62d445 100644 --- a/src/libutil/url.cc +++ b/src/libutil/url.cc @@ -11,28 +11,6 @@ namespace nix { std::regex refRegex(refRegexS, std::regex::ECMAScript); std::regex revRegex(revRegexS, std::regex::ECMAScript); -/** - * Drop trailing shevron for output installable syntax. - * - * FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does - * get used. That code should actually use ExtendedOutputsSpec::parseOpt. - */ -static std::string_view dropShevronSuffix(std::string_view url) -{ - auto shevron = url.rfind("^"); - if (shevron == std::string_view::npos) - return url; - return url.substr(0, shevron); -} - -/** - * Percent encode spaces in the url. - */ -static std::string percentEncodeSpaces(std::string_view url) -{ - return replaceStrings(std::string(url), " ", percentEncode(" ")); -} - ParsedURL::Authority ParsedURL::Authority::parse(std::string_view encodedAuthority) { auto parsed = boost::urls::parse_authority(encodedAuthority); @@ -108,14 +86,65 @@ std::string ParsedURL::Authority::to_string() const return std::move(oss).str(); } +/** + * Additional characters that don't need URL encoding in the fragment. + */ +static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInFragment = " \"^"; + +/** + * Additional characters that don't need URL encoding in the query. + */ +static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInQuery = " \""; + +static std::string percentEncodeCharSet(std::string_view s, auto charSet) +{ + std::string res; + for (auto c : s) { + if (charSet(c)) + res += percentEncode(std::string_view{&c, &c + 1}); + else + res += c; + } + return res; +} + ParsedURL parseURL(const std::string & url) try { - /* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and - shouldn't appear in normal URIs. */ - auto unparsedView = dropShevronSuffix(url); - /* For back-compat literal spaces are allowed. */ - auto withFixedSpaces = percentEncodeSpaces(unparsedView); - auto urlView = boost::urls::url_view(withFixedSpaces); + auto unparsedView = url; + + /* Account for several non-standard properties of nix urls (for back-compat): + * - Allow unescaped spaces ' ' and '"' characters in queries. + * - Allow '"', ' ' and '^' characters in the fragment component. + * We could write our own grammar for this, but fixing it up here seems + * more concise, since the deviation is rather minor. + */ + std::string fixedEncodedUrl = [&]() { + std::string fixed; + std::string_view view = url; + + if (auto beforeQuery = splitPrefixTo(view, '?')) { + fixed += *beforeQuery; + fixed += '?'; + auto fragmentStart = view.find('#'); + auto queryView = view.substr(0, fragmentStart); + auto fixedQuery = percentEncodeCharSet(queryView, extraAllowedCharsInQuery); + fixed += fixedQuery; + view.remove_prefix(std::min(fragmentStart, view.size())); + } + + if (auto beforeFragment = splitPrefixTo(view, '#')) { + fixed += *beforeFragment; + fixed += '#'; + auto fixedFragment = percentEncodeCharSet(view, extraAllowedCharsInFragment); + fixed += fixedFragment; + return fixed; + } + + fixed += view; + return fixed; + }(); + + auto urlView = boost::urls::url_view(fixedEncodedUrl); if (!urlView.has_scheme()) throw BadURL("'%s' doesn't have a scheme", url); @@ -176,12 +205,12 @@ std::string percentEncode(std::string_view s, std::string_view keep) StringMap decodeQuery(const std::string & query) try { - /* For back-compat literal spaces are allowed. */ - auto withFixedSpaces = percentEncodeSpaces(query); + /* For back-compat unescaped characters are allowed. */ + auto fixedEncodedQuery = percentEncodeCharSet(query, extraAllowedCharsInQuery); StringMap result; - auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces); + auto encodedQuery = boost::urls::params_encoded_view(fixedEncodedQuery); for (auto && [key, value, value_specified] : encodedQuery) { if (!value_specified) { warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");