1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-11-16 15:32:43 +01:00
nix/src/libutil/url.cc
Sergei Zimmerman e8e9376a7b
libfetchers: Remove badGitRefRegex and use libgit2 for reference validation
Fixes usage of `#` symbol in the reference name.
This also seems to identify several deficiencies in the libgit2 refname
validation code wrt to DEL symbol and a singular `@` symbol [1].

[1]: https://git-scm.com/docs/git-check-ref-format#_description
2025-08-11 02:38:45 +03:00

275 lines
8.6 KiB
C++

#include "nix/util/url.hh"
#include "nix/util/url-parts.hh"
#include "nix/util/util.hh"
#include "nix/util/split.hh"
#include "nix/util/canon-path.hh"
#include <boost/url.hpp>
namespace nix {
std::regex refRegex(refRegexS, std::regex::ECMAScript);
std::regex revRegex(revRegexS, std::regex::ECMAScript);
/**
* Drop trailing shevron for output installable syntax.
*
* FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does
* get used. That code should actually use ExtendedOutputsSpec::parseOpt.
*/
static std::string_view dropShevronSuffix(std::string_view url)
{
auto shevron = url.rfind("^");
if (shevron == std::string_view::npos)
return url;
return url.substr(0, shevron);
}
/**
* Percent encode spaces in the url.
*/
static std::string percentEncodeSpaces(std::string_view url)
{
return replaceStrings(std::string(url), " ", percentEncode(" "));
}
ParsedURL::Authority ParsedURL::Authority::parse(std::string_view encodedAuthority)
{
auto parsed = boost::urls::parse_authority(encodedAuthority);
if (!parsed)
throw BadURL("invalid URL authority: '%s': %s", encodedAuthority, parsed.error().message());
auto hostType = [&]() {
switch (parsed->host_type()) {
case boost::urls::host_type::ipv4:
return HostType::IPv4;
case boost::urls::host_type::ipv6:
return HostType::IPv6;
case boost::urls::host_type::ipvfuture:
return HostType::IPvFuture;
case boost::urls::host_type::none:
case boost::urls::host_type::name:
return HostType::Name;
}
unreachable();
}();
auto port = [&]() -> std::optional<uint16_t> {
if (!parsed->has_port())
return std::nullopt;
/* If the port number is non-zero and representable. */
if (auto portNumber = parsed->port_number())
return portNumber;
throw BadURL("port '%s' is invalid", parsed->port());
}();
return {
.hostType = hostType,
.host = parsed->host_address(),
.user = parsed->has_userinfo() ? parsed->user() : std::optional<std::string>{},
.password = parsed->has_password() ? parsed->password() : std::optional<std::string>{},
.port = port,
};
}
std::ostream & operator<<(std::ostream & os, const ParsedURL::Authority & self)
{
if (self.user) {
os << percentEncode(*self.user);
if (self.password)
os << ":" << percentEncode(*self.password);
os << "@";
}
using HostType = ParsedURL::Authority::HostType;
switch (self.hostType) {
case HostType::Name:
os << percentEncode(self.host);
break;
case HostType::IPv4:
os << self.host;
break;
case HostType::IPv6:
case HostType::IPvFuture:
/* Reencode percent sign for RFC4007 ScopeId literals. */
os << "[" << percentEncode(self.host, ":") << "]";
}
if (self.port)
os << ":" << *self.port;
return os;
}
std::string ParsedURL::Authority::to_string() const
{
std::ostringstream oss;
oss << *this;
return std::move(oss).str();
}
ParsedURL parseURL(const std::string & url)
try {
/* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and
shouldn't appear in normal URIs. */
auto unparsedView = dropShevronSuffix(url);
/* For back-compat literal spaces are allowed. */
auto withFixedSpaces = percentEncodeSpaces(unparsedView);
auto urlView = boost::urls::url_view(withFixedSpaces);
if (!urlView.has_scheme())
throw BadURL("'%s' doesn't have a scheme", url);
auto scheme = urlView.scheme();
auto authority = [&]() -> std::optional<ParsedURL::Authority> {
if (urlView.has_authority())
return ParsedURL::Authority::parse(urlView.authority().buffer());
return std::nullopt;
}();
/* 3.2.2. Host (RFC3986):
* If the URI scheme defines a default for host, then that default
* applies when the host subcomponent is undefined or when the
* registered name is empty (zero length). For example, the "file" URI
* scheme is defined so that no authority, an empty host, and
* "localhost" all mean the end-user's machine, whereas the "http"
* scheme considers a missing authority or empty host invalid. */
auto transportIsFile = parseUrlScheme(scheme).transport == "file";
if (authority && authority->host.size() && transportIsFile)
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority);
auto path = urlView.path(); /* Does pct-decoding */
auto fragment = urlView.fragment(); /* Does pct-decoding */
if (transportIsFile && path.empty())
path = "/";
/* Get the raw query. Store URI supports smuggling doubly nested queries, where
the inner &/? are pct-encoded. */
auto query = std::string_view(urlView.encoded_query());
return ParsedURL{
.scheme = scheme,
.authority = authority,
.path = path,
.query = decodeQuery(std::string(query)),
.fragment = fragment,
};
} catch (boost::system::system_error & e) {
throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
}
std::string percentDecode(std::string_view in)
{
auto pctView = boost::urls::make_pct_string_view(in);
if (pctView.has_value())
return pctView->decode();
auto error = pctView.error();
throw BadURL("invalid URI parameter '%s': %s", in, error.message());
}
std::string percentEncode(std::string_view s, std::string_view keep)
{
return boost::urls::encode(
s, [keep](char c) { return boost::urls::unreserved_chars(c) || keep.find(c) != keep.npos; });
}
StringMap decodeQuery(const std::string & query)
try {
/* For back-compat literal spaces are allowed. */
auto withFixedSpaces = percentEncodeSpaces(query);
StringMap result;
auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces);
for (auto && [key, value, value_specified] : encodedQuery) {
if (!value_specified) {
warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "=");
continue;
}
result.emplace(key.decode(), value.decode());
}
return result;
} catch (boost::system::system_error & e) {
throw BadURL("invalid URI query '%s': %s", query, e.code().message());
}
const static std::string allowedInQuery = ":@/?";
const static std::string allowedInPath = ":@/";
std::string encodeQuery(const StringMap & ss)
{
std::string res;
bool first = true;
for (auto & [name, value] : ss) {
if (!first)
res += '&';
first = false;
res += percentEncode(name, allowedInQuery);
res += '=';
res += percentEncode(value, allowedInQuery);
}
return res;
}
std::string ParsedURL::to_string() const
{
return scheme + ":" + (authority ? "//" + authority->to_string() : "") + percentEncode(path, allowedInPath)
+ (query.empty() ? "" : "?" + encodeQuery(query)) + (fragment.empty() ? "" : "#" + percentEncode(fragment));
}
std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
{
os << url.to_string();
return os;
}
ParsedURL ParsedURL::canonicalise()
{
ParsedURL res(*this);
res.path = CanonPath(res.path).abs();
return res;
}
/**
* Parse a URL scheme of the form '(applicationScheme\+)?transportScheme'
* into a tuple '(applicationScheme, transportScheme)'
*
* > parseUrlScheme("http") == ParsedUrlScheme{ {}, "http"}
* > parseUrlScheme("tarball+http") == ParsedUrlScheme{ {"tarball"}, "http"}
*/
ParsedUrlScheme parseUrlScheme(std::string_view scheme)
{
auto application = splitPrefixTo(scheme, '+');
auto transport = scheme;
return ParsedUrlScheme{
.application = application,
.transport = transport,
};
}
std::string fixGitURL(const std::string & url)
{
std::regex scpRegex("([^/]*)@(.*):(.*)");
if (!hasPrefix(url, "/") && std::regex_match(url, scpRegex))
return std::regex_replace(url, scpRegex, "ssh://$1@$2/$3");
if (hasPrefix(url, "file:"))
return url;
if (url.find("://") == std::string::npos) {
return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url}).to_string();
}
return url;
}
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1
bool isValidSchemeName(std::string_view s)
{
const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)";
static std::regex regex(schemeNameRegex, std::regex::ECMAScript);
return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default);
}
} // namespace nix