mirror of
https://github.com/NixOS/nix.git
synced 2025-11-09 20:16:03 +01:00
libutil: Use Boost.URL in parseURL
Boost.URL is a significantly more RFC-compliant parser than what libutil currently has, which is a bundle of incomprehensible regexes. One aspect of this change is that RFC4007 ZoneId IPv6 literals are represented in URIs according to RFC6874 [1]. Previously they were represented naively like so: [fe80::818c:da4d:8975:415c\%enp0s25]. This is not entirely correct, because the percent itself has to be pct-encoded: > "%" is always treated as an escape character in a URI, so, according to the established URI syntax [RFC3986] any occurrences of literal "%" symbols in a URI MUST be percent-encoded and represented in the form "%25". Thus, the scoped address fe80::a%en1 would appear in a URI as http://[fe80::a%25en1]. [1]: https://datatracker.ietf.org/doc/html/rfc6874 Co-authored-by: Jörg Thalheim <joerg@thalheim.io>
This commit is contained in:
parent
d020f21a2a
commit
bd1d2d1041
4 changed files with 81 additions and 45 deletions
|
|
@ -12,40 +12,70 @@ std::regex refRegex(refRegexS, std::regex::ECMAScript);
|
|||
std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript);
|
||||
std::regex revRegex(revRegexS, std::regex::ECMAScript);
|
||||
|
||||
ParsedURL parseURL(const std::string & url)
|
||||
/**
|
||||
* Drop the trailing shevron (`^`) and everything after it, as used by the output installable syntax.
|
||||
*
|
||||
* FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
|
||||
* get used. That code should actually use ExtendedOutputsSpec::parseOpt.
|
||||
*/
|
||||
static std::string_view dropShevronSuffix(std::string_view url)
|
||||
{
|
||||
static std::regex uriRegex(
|
||||
"((" + schemeNameRegex + "):" + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex
|
||||
+ ")))" + "(?:\\?(" + queryRegex + "))?" + "(?:#(" + fragmentRegex + "))?",
|
||||
std::regex::ECMAScript);
|
||||
auto shevron = url.rfind("^");
|
||||
if (shevron == std::string_view::npos)
|
||||
return url;
|
||||
return url.substr(0, shevron);
|
||||
}
|
||||
|
||||
std::smatch match;
|
||||
/**
|
||||
* Percent encode spaces in the url.
|
||||
*/
|
||||
static std::string percentEncodeSpaces(std::string_view url)
|
||||
{
|
||||
return replaceStrings(std::string(url), " ", percentEncode(" "));
|
||||
}
|
||||
|
||||
if (std::regex_match(url, match, uriRegex)) {
|
||||
std::string scheme = match[2];
|
||||
auto authority = match[3].matched ? std::optional<std::string>(match[3]) : std::nullopt;
|
||||
std::string path = match[4].matched ? match[4] : match[5];
|
||||
auto & query = match[6];
|
||||
auto & fragment = match[7];
|
||||
ParsedURL parseURL(const std::string & url)
|
||||
try {
|
||||
/* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and
|
||||
shouldn't appear in normal URIs. */
|
||||
auto unparsedView = dropShevronSuffix(url);
|
||||
/* For back-compat literal spaces are allowed. */
|
||||
auto withFixedSpaces = percentEncodeSpaces(unparsedView);
|
||||
auto urlView = boost::urls::url_view(withFixedSpaces);
|
||||
|
||||
auto transportIsFile = parseUrlScheme(scheme).transport == "file";
|
||||
if (!urlView.has_scheme())
|
||||
throw BadURL("'%s' doesn't have a scheme", url);
|
||||
|
||||
if (authority && *authority != "" && transportIsFile)
|
||||
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
|
||||
auto scheme = urlView.scheme();
|
||||
auto authority = [&]() -> std::optional<std::string> {
|
||||
if (urlView.has_authority())
|
||||
return percentDecode(urlView.authority().buffer());
|
||||
return std::nullopt;
|
||||
}();
|
||||
|
||||
if (transportIsFile && path.empty())
|
||||
path = "/";
|
||||
auto transportIsFile = parseUrlScheme(scheme).transport == "file";
|
||||
if (authority && *authority != "" && transportIsFile)
|
||||
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
|
||||
|
||||
return ParsedURL{
|
||||
.scheme = scheme,
|
||||
.authority = authority,
|
||||
.path = percentDecode(path),
|
||||
.query = decodeQuery(query),
|
||||
.fragment = percentDecode(std::string(fragment))};
|
||||
}
|
||||
auto path = urlView.path(); /* Does pct-decoding */
|
||||
auto fragment = urlView.fragment(); /* Does pct-decoding */
|
||||
|
||||
else
|
||||
throw BadURL("'%s' is not a valid URL", url);
|
||||
if (transportIsFile && path.empty())
|
||||
path = "/";
|
||||
|
||||
/* Get the raw query. Store URI supports smuggling doubly nested queries, where
|
||||
the inner &/? are pct-encoded. */
|
||||
auto query = std::string_view(urlView.encoded_query());
|
||||
|
||||
return ParsedURL{
|
||||
.scheme = scheme,
|
||||
.authority = authority,
|
||||
.path = path,
|
||||
.query = decodeQuery(std::string(query)),
|
||||
.fragment = fragment,
|
||||
};
|
||||
} catch (boost::system::system_error & e) {
|
||||
throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
|
||||
}
|
||||
|
||||
std::string percentDecode(std::string_view in)
|
||||
|
|
@ -64,20 +94,25 @@ std::string percentEncode(std::string_view s, std::string_view keep)
|
|||
}
|
||||
|
||||
StringMap decodeQuery(const std::string & query)
|
||||
{
|
||||
try {
|
||||
/* For back-compat literal spaces are allowed. */
|
||||
auto withFixedSpaces = percentEncodeSpaces(query);
|
||||
|
||||
StringMap result;
|
||||
|
||||
for (const auto & s : tokenizeString<Strings>(query, "&")) {
|
||||
auto e = s.find('=');
|
||||
if (e == std::string::npos) {
|
||||
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", s, "=");
|
||||
auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
|
||||
for (auto && [key, value, value_specified] : encodedQuery) {
|
||||
if (!value_specified) {
|
||||
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
|
||||
continue;
|
||||
}
|
||||
|
||||
result.emplace(s.substr(0, e), percentDecode(std::string_view(s).substr(e + 1)));
|
||||
result.emplace(key.decode(), value.decode());
|
||||
}
|
||||
|
||||
return result;
|
||||
} catch (boost::system::system_error & e) {
|
||||
throw BadURL("invalid URI query '%s': %s", query, e.code().message());
|
||||
}
|
||||
|
||||
const static std::string allowedInQuery = ":@/?";
|
||||
|
|
@ -150,6 +185,7 @@ std::string fixGitURL(const std::string & url)
|
|||
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1
|
||||
bool isValidSchemeName(std::string_view s)
|
||||
{
|
||||
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
|
||||
static std::regex regex(schemeNameRegex, std::regex::ECMAScript);
|
||||
|
||||
return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue