1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-11-09 12:06:01 +01:00

libutil: Use Boost.URL in parseURL

Boost.URL is a significantly more RFC-compliant parser
than what libutil currently has, which is a bundle of
incomprehensible regexes.

One aspect of this change is that RFC4007 ZoneId IPv6 literals
are represented in URIs according to RFC6874 [1].

Previously they were represented naively like so: [fe80::818c:da4d:8975:415c\%enp0s25].
This is not entirely correct, because the percent itself has to be pct-encoded:

> "%" is always treated as
   an escape character in a URI, so, according to the established URI
   syntax [RFC3986] any occurrences of literal "%" symbols in a URI MUST
   be percent-encoded and represented in the form "%25".  Thus, the
   scoped address fe80::a%en1 would appear in a URI as
   http://[fe80::a%25en1].

[1]: https://datatracker.ietf.org/doc/html/rfc6874

Co-authored-by: Jörg Thalheim <joerg@thalheim.io>
This commit is contained in:
Sergei Zimmerman 2025-07-18 21:20:59 +03:00
parent d020f21a2a
commit bd1d2d1041
No known key found for this signature in database
4 changed files with 81 additions and 45 deletions

View file

@ -124,9 +124,9 @@ TEST(parseURL, parseIPv4Address)
ASSERT_EQ(parsed, expected); ASSERT_EQ(parsed, expected);
} }
TEST(parseURL, parseScopedRFC4007IPv6Address) TEST(parseURL, parseScopedRFC6874IPv6Address)
{ {
auto s = "http://[fe80::818c:da4d:8975:415c\%enp0s25]:8080"; auto s = "http://[fe80::818c:da4d:8975:415c\%25enp0s25]:8080";
auto parsed = parseURL(s); auto parsed = parseURL(s);
ParsedURL expected{ ParsedURL expected{

View file

@ -8,21 +8,10 @@ namespace nix {
// URI stuff. // URI stuff.
const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])"; const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])";
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
const static std::string ipv6AddressSegmentRegex = "[0-9a-fA-F:]+(?:%\\w+)?";
const static std::string ipv6AddressRegex = "(?:\\[" + ipv6AddressSegmentRegex + "\\]|" + ipv6AddressSegmentRegex + ")";
const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])"; const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])";
const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])"; const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])";
const static std::string hostnameRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + ")*)";
const static std::string hostRegex = "(?:" + ipv6AddressRegex + "|" + hostnameRegex + ")";
const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|:)*)";
const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?";
const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])"; const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])";
const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*";
const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*"; const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*";
const static std::string segmentRegex = "(?:" + pcharRegex + "*)";
const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)";
const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)";
/// A Git ref (i.e. branch or tag name). /// A Git ref (i.e. branch or tag name).
/// \todo check that this is correct. /// \todo check that this is correct.

View file

@ -34,6 +34,17 @@ StringMap decodeQuery(const std::string & query);
std::string encodeQuery(const StringMap & query); std::string encodeQuery(const StringMap & query);
/**
* Parse a Nix URL into a ParsedURL.
*
* Nix URI is mostly compliant with RFC3986, but with some deviations:
* - Literal spaces are allowed and don't have to be percent encoded.
* This is mostly done for backward compatibility.
*
* @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
*
* @throws BadURL
*/
ParsedURL parseURL(const std::string & url); ParsedURL parseURL(const std::string & url);
/** /**

View file

@ -12,40 +12,70 @@ std::regex refRegex(refRegexS, std::regex::ECMAScript);
std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript); std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript);
std::regex revRegex(revRegexS, std::regex::ECMAScript); std::regex revRegex(revRegexS, std::regex::ECMAScript);
ParsedURL parseURL(const std::string & url) /**
* Drop trailing shevron for output installable syntax.
*
* FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
* get used. That code should actually use ExtendedOutputsSpec::parseOpt.
*/
static std::string_view dropShevronSuffix(std::string_view url)
{ {
static std::regex uriRegex( auto shevron = url.rfind("^");
"((" + schemeNameRegex + "):" + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex if (shevron == std::string_view::npos)
+ ")))" + "(?:\\?(" + queryRegex + "))?" + "(?:#(" + fragmentRegex + "))?", return url;
std::regex::ECMAScript); return url.substr(0, shevron);
}
std::smatch match; /**
* Percent encode spaces in the url.
*/
static std::string percentEncodeSpaces(std::string_view url)
{
return replaceStrings(std::string(url), " ", percentEncode(" "));
}
if (std::regex_match(url, match, uriRegex)) { ParsedURL parseURL(const std::string & url)
std::string scheme = match[2]; try {
auto authority = match[3].matched ? std::optional<std::string>(match[3]) : std::nullopt; /* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and
std::string path = match[4].matched ? match[4] : match[5]; shouldn't appear in normal URIs. */
auto & query = match[6]; auto unparsedView = dropShevronSuffix(url);
auto & fragment = match[7]; /* For back-compat literal spaces are allowed. */
auto withFixedSpaces = percentEncodeSpaces(unparsedView);
auto urlView = boost::urls::url_view(withFixedSpaces);
auto transportIsFile = parseUrlScheme(scheme).transport == "file"; if (!urlView.has_scheme())
throw BadURL("'%s' doesn't have a scheme", url);
if (authority && *authority != "" && transportIsFile) auto scheme = urlView.scheme();
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority); auto authority = [&]() -> std::optional<std::string> {
if (urlView.has_authority())
return percentDecode(urlView.authority().buffer());
return std::nullopt;
}();
if (transportIsFile && path.empty()) auto transportIsFile = parseUrlScheme(scheme).transport == "file";
path = "/"; if (authority && *authority != "" && transportIsFile)
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
return ParsedURL{ auto path = urlView.path(); /* Does pct-decoding */
.scheme = scheme, auto fragment = urlView.fragment(); /* Does pct-decoding */
.authority = authority,
.path = percentDecode(path),
.query = decodeQuery(query),
.fragment = percentDecode(std::string(fragment))};
}
else if (transportIsFile && path.empty())
throw BadURL("'%s' is not a valid URL", url); path = "/";
/* Get the raw query. Store URI supports smuggling doubly nested queries, where
the inner &/? are pct-encoded. */
auto query = std::string_view(urlView.encoded_query());
return ParsedURL{
.scheme = scheme,
.authority = authority,
.path = path,
.query = decodeQuery(std::string(query)),
.fragment = fragment,
};
} catch (boost::system::system_error & e) {
throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
} }
std::string percentDecode(std::string_view in) std::string percentDecode(std::string_view in)
@ -64,20 +94,25 @@ std::string percentEncode(std::string_view s, std::string_view keep)
} }
StringMap decodeQuery(const std::string & query) StringMap decodeQuery(const std::string & query)
{ try {
/* For back-compat literal spaces are allowed. */
auto withFixedSpaces = percentEncodeSpaces(query);
StringMap result; StringMap result;
for (const auto & s : tokenizeString<Strings>(query, "&")) { auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
auto e = s.find('='); for (auto && [key, value, value_specified] : encodedQuery) {
if (e == std::string::npos) { if (!value_specified) {
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", s, "="); warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
continue; continue;
} }
result.emplace(s.substr(0, e), percentDecode(std::string_view(s).substr(e + 1))); result.emplace(key.decode(), value.decode());
} }
return result; return result;
} catch (boost::system::system_error & e) {
throw BadURL("invalid URI query '%s': %s", query, e.code().message());
} }
const static std::string allowedInQuery = ":@/?"; const static std::string allowedInQuery = ":@/?";
@ -150,6 +185,7 @@ std::string fixGitURL(const std::string & url)
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1 // https://www.rfc-editor.org/rfc/rfc3986#section-3.1
bool isValidSchemeName(std::string_view s) bool isValidSchemeName(std::string_view s)
{ {
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
static std::regex regex(schemeNameRegex, std::regex::ECMAScript); static std::regex regex(schemeNameRegex, std::regex::ECMAScript);
return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default); return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default);