mirror of
https://github.com/NixOS/nix.git
synced 2025-11-09 20:16:03 +01:00
libutil: Fix handling of unescaped spaces, quotes and shevrons in queries and fragments
Turns out we didn't have tests for some of the important behavior introduced
for flake reference fragments and url queries [1]. This is rather important
and is relied upon by existing tooling. This fixes up these exact cases before
handing off the URL to the Boost.URL parser.
To the best of my knowledge this implements the same behavior as prior regex-based
parser did [2]:
> fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
> queryRegex = "(?:" + pcharRegex + "|[/? \"])*";
[1]: 9c0a09f09f
[2]: https://github.com/NixOS/nix/blob/2.30.2/src/libutil/include/nix/util/url-parts.hh
This commit is contained in:
parent
0fd9ef0cf3
commit
dc1b2012af
4 changed files with 96 additions and 33 deletions
|
|
@ -74,6 +74,20 @@ TEST(parseFlakeRef, GitArchiveInput)
|
||||||
auto flakeref = parseFlakeRef(fetchSettings, s);
|
auto flakeref = parseFlakeRef(fetchSettings, s);
|
||||||
ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch%23");
|
ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch%23");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
auto s = "github:foo/bar?ref=branch#\"name.with.dot\""; // unescaped quotes `"`
|
||||||
|
auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s);
|
||||||
|
ASSERT_EQ(fragment, "\"name.with.dot\"");
|
||||||
|
ASSERT_EQ(flakeref.to_string(), "github:foo/bar/branch");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
auto s = "github:foo/bar#\"name.with.dot\""; // unescaped quotes `"`
|
||||||
|
auto [flakeref, fragment] = parseFlakeRefWithFragment(fetchSettings, s);
|
||||||
|
ASSERT_EQ(fragment, "\"name.with.dot\"");
|
||||||
|
ASSERT_EQ(flakeref.to_string(), "github:foo/bar");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(to_string, doesntReencodeUrl)
|
TEST(to_string, doesntReencodeUrl)
|
||||||
|
|
|
||||||
|
|
@ -212,6 +212,26 @@ TEST(parseURL, parsedUrlsIsEqualToItself)
|
||||||
ASSERT_TRUE(url == url);
|
ASSERT_TRUE(url == url);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(parseURL, parsedUrlsWithUnescapedChars)
|
||||||
|
{
|
||||||
|
/* Test for back-compat. Behavior is rather questionable, but
|
||||||
|
* is ingrained pretty deep into how URL parsing is shared between
|
||||||
|
* flakes and libstore.
|
||||||
|
* 1. Unescaped spaces, quotes and shevron (^) in fragment.
|
||||||
|
* 2. Unescaped spaces and quotes in query.
|
||||||
|
*/
|
||||||
|
auto s = "http://www.example.org/file.tar.gz?query \"= 123\"#shevron^quote\"space ";
|
||||||
|
auto url = parseURL(s);
|
||||||
|
|
||||||
|
ASSERT_EQ(url.fragment, "shevron^quote\"space ");
|
||||||
|
|
||||||
|
auto query = StringMap{
|
||||||
|
{"query \"", " 123\""},
|
||||||
|
};
|
||||||
|
|
||||||
|
ASSERT_EQ(url.query, query);
|
||||||
|
}
|
||||||
|
|
||||||
TEST(parseURL, parseFTPUrl)
|
TEST(parseURL, parseFTPUrl)
|
||||||
{
|
{
|
||||||
auto s = "ftp://ftp.nixos.org/downloads/nixos.iso";
|
auto s = "ftp://ftp.nixos.org/downloads/nixos.iso";
|
||||||
|
|
|
||||||
|
|
@ -104,8 +104,8 @@ std::string encodeQuery(const StringMap & query);
|
||||||
* Parse a Nix URL into a ParsedURL.
|
* Parse a Nix URL into a ParsedURL.
|
||||||
*
|
*
|
||||||
* Nix URI is mostly compliant with RFC3986, but with some deviations:
|
* Nix URI is mostly compliant with RFC3986, but with some deviations:
|
||||||
* - Literal spaces are allowed and don't have to be percent encoded.
|
* - Fragments can contain unescaped (not URL encoded) '^', '"' or space literals.
|
||||||
* This is mostly done for backward compatibility.
|
* - Queries may contain unescaped '"' or spaces.
|
||||||
*
|
*
|
||||||
* @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
|
* @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -11,28 +11,6 @@ namespace nix {
|
||||||
std::regex refRegex(refRegexS, std::regex::ECMAScript);
|
std::regex refRegex(refRegexS, std::regex::ECMAScript);
|
||||||
std::regex revRegex(revRegexS, std::regex::ECMAScript);
|
std::regex revRegex(revRegexS, std::regex::ECMAScript);
|
||||||
|
|
||||||
/**
|
|
||||||
* Drop trailing shevron for output installable syntax.
|
|
||||||
*
|
|
||||||
* FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
|
|
||||||
* get used. That code should actually use ExtendedOutputsSpec::parseOpt.
|
|
||||||
*/
|
|
||||||
static std::string_view dropShevronSuffix(std::string_view url)
|
|
||||||
{
|
|
||||||
auto shevron = url.rfind("^");
|
|
||||||
if (shevron == std::string_view::npos)
|
|
||||||
return url;
|
|
||||||
return url.substr(0, shevron);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Percent encode spaces in the url.
|
|
||||||
*/
|
|
||||||
static std::string percentEncodeSpaces(std::string_view url)
|
|
||||||
{
|
|
||||||
return replaceStrings(std::string(url), " ", percentEncode(" "));
|
|
||||||
}
|
|
||||||
|
|
||||||
ParsedURL::Authority ParsedURL::Authority::parse(std::string_view encodedAuthority)
|
ParsedURL::Authority ParsedURL::Authority::parse(std::string_view encodedAuthority)
|
||||||
{
|
{
|
||||||
auto parsed = boost::urls::parse_authority(encodedAuthority);
|
auto parsed = boost::urls::parse_authority(encodedAuthority);
|
||||||
|
|
@ -108,14 +86,65 @@ std::string ParsedURL::Authority::to_string() const
|
||||||
return std::move(oss).str();
|
return std::move(oss).str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Additional characters that don't need URL encoding in the fragment.
|
||||||
|
*/
|
||||||
|
static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInFragment = " \"^";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Additional characters that don't need URL encoding in the query.
|
||||||
|
*/
|
||||||
|
static constexpr boost::urls::grammar::lut_chars extraAllowedCharsInQuery = " \"";
|
||||||
|
|
||||||
|
static std::string percentEncodeCharSet(std::string_view s, auto charSet)
|
||||||
|
{
|
||||||
|
std::string res;
|
||||||
|
for (auto c : s) {
|
||||||
|
if (charSet(c))
|
||||||
|
res += percentEncode(std::string_view{&c, &c + 1});
|
||||||
|
else
|
||||||
|
res += c;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
ParsedURL parseURL(const std::string & url)
|
ParsedURL parseURL(const std::string & url)
|
||||||
try {
|
try {
|
||||||
/* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and
|
auto unparsedView = url;
|
||||||
shouldn't appear in normal URIs. */
|
|
||||||
auto unparsedView = dropShevronSuffix(url);
|
/* Account for several non-standard properties of nix urls (for back-compat):
|
||||||
/* For back-compat literal spaces are allowed. */
|
* - Allow unescaped spaces ' ' and '"' characters in queries.
|
||||||
auto withFixedSpaces = percentEncodeSpaces(unparsedView);
|
* - Allow '"', ' ' and '^' characters in the fragment component.
|
||||||
auto urlView = boost::urls::url_view(withFixedSpaces);
|
* We could write our own grammar for this, but fixing it up here seems
|
||||||
|
* more concise, since the deviation is rather minor.
|
||||||
|
*/
|
||||||
|
std::string fixedEncodedUrl = [&]() {
|
||||||
|
std::string fixed;
|
||||||
|
std::string_view view = url;
|
||||||
|
|
||||||
|
if (auto beforeQuery = splitPrefixTo(view, '?')) {
|
||||||
|
fixed += *beforeQuery;
|
||||||
|
fixed += '?';
|
||||||
|
auto fragmentStart = view.find('#');
|
||||||
|
auto queryView = view.substr(0, fragmentStart);
|
||||||
|
auto fixedQuery = percentEncodeCharSet(queryView, extraAllowedCharsInQuery);
|
||||||
|
fixed += fixedQuery;
|
||||||
|
view.remove_prefix(std::min(fragmentStart, view.size()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (auto beforeFragment = splitPrefixTo(view, '#')) {
|
||||||
|
fixed += *beforeFragment;
|
||||||
|
fixed += '#';
|
||||||
|
auto fixedFragment = percentEncodeCharSet(view, extraAllowedCharsInFragment);
|
||||||
|
fixed += fixedFragment;
|
||||||
|
return fixed;
|
||||||
|
}
|
||||||
|
|
||||||
|
fixed += view;
|
||||||
|
return fixed;
|
||||||
|
}();
|
||||||
|
|
||||||
|
auto urlView = boost::urls::url_view(fixedEncodedUrl);
|
||||||
|
|
||||||
if (!urlView.has_scheme())
|
if (!urlView.has_scheme())
|
||||||
throw BadURL("'%s' doesn't have a scheme", url);
|
throw BadURL("'%s' doesn't have a scheme", url);
|
||||||
|
|
@ -176,12 +205,12 @@ std::string percentEncode(std::string_view s, std::string_view keep)
|
||||||
|
|
||||||
StringMap decodeQuery(const std::string & query)
|
StringMap decodeQuery(const std::string & query)
|
||||||
try {
|
try {
|
||||||
/* For back-compat literal spaces are allowed. */
|
/* For back-compat unescaped characters are allowed. */
|
||||||
auto withFixedSpaces = percentEncodeSpaces(query);
|
auto fixedEncodedQuery = percentEncodeCharSet(query, extraAllowedCharsInQuery);
|
||||||
|
|
||||||
StringMap result;
|
StringMap result;
|
||||||
|
|
||||||
auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
|
auto encodedQuery = boost::urls::params_encoded_view(fixedEncodedQuery);
|
||||||
for (auto && [key, value, value_specified] : encodedQuery) {
|
for (auto && [key, value, value_specified] : encodedQuery) {
|
||||||
if (!value_specified) {
|
if (!value_specified) {
|
||||||
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
|
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue