1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-12-07 01:21:00 +01:00

Merge pull request #13819 from obsidiansystems/relative-url

Implement `parseURLRelative`, use in `HttpBinaryCacheStore`
This commit is contained in:
Sergei Zimmerman 2025-08-27 03:34:57 +03:00 committed by GitHub
commit 8ee74792fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 278 additions and 30 deletions

View file

@ -113,7 +113,7 @@ static void fetchTree(
auto s = state.coerceToString(attr.pos, *attr.value, context, "", false, false).toOwned(); auto s = state.coerceToString(attr.pos, *attr.value, context, "", false, false).toOwned();
attrs.emplace( attrs.emplace(
state.symbols[attr.name], state.symbols[attr.name],
params.isFetchGit && state.symbols[attr.name] == "url" ? fixGitURL(s) : s); params.isFetchGit && state.symbols[attr.name] == "url" ? fixGitURL(s).to_string() : s);
} else if (attr.value->type() == nBool) } else if (attr.value->type() == nBool)
attrs.emplace(state.symbols[attr.name], Explicit<bool>{attr.value->boolean()}); attrs.emplace(state.symbols[attr.name], Explicit<bool>{attr.value->boolean()});
else if (attr.value->type() == nInt) { else if (attr.value->type() == nInt) {
@ -175,7 +175,7 @@ static void fetchTree(
if (params.isFetchGit) { if (params.isFetchGit) {
fetchers::Attrs attrs; fetchers::Attrs attrs;
attrs.emplace("type", "git"); attrs.emplace("type", "git");
attrs.emplace("url", fixGitURL(url)); attrs.emplace("url", fixGitURL(url).to_string());
if (!attrs.contains("exportIgnore") if (!attrs.contains("exportIgnore")
&& (!attrs.contains("submodules") || !*fetchers::maybeGetBoolAttr(attrs, "submodules"))) { && (!attrs.contains("submodules") || !*fetchers::maybeGetBoolAttr(attrs, "submodules"))) {
attrs.emplace("exportIgnore", Explicit<bool>{true}); attrs.emplace("exportIgnore", Explicit<bool>{true});

View file

@ -179,7 +179,7 @@ Fetch::Fetch(git_repository * repo, git_oid rev)
const auto remoteUrl = lfs::getLfsEndpointUrl(repo); const auto remoteUrl = lfs::getLfsEndpointUrl(repo);
this->url = nix::parseURL(nix::fixGitURL(remoteUrl)).canonicalise(); this->url = nix::fixGitURL(remoteUrl).canonicalise();
} }
bool Fetch::shouldFetch(const CanonPath & path) const bool Fetch::shouldFetch(const CanonPath & path) const

View file

@ -233,9 +233,7 @@ struct GitInputScheme : InputScheme
Input input{settings}; Input input{settings};
input.attrs = attrs; input.attrs = attrs;
auto url = fixGitURL(getStrAttr(attrs, "url")); input.attrs["url"] = fixGitURL(getStrAttr(attrs, "url")).to_string();
parseURL(url);
input.attrs["url"] = url;
getShallowAttr(input); getShallowAttr(input);
getSubmodulesAttr(input); getSubmodulesAttr(input);
getAllRefsAttr(input); getAllRefsAttr(input);

View file

@ -154,22 +154,17 @@ protected:
FileTransferRequest makeRequest(const std::string & path) FileTransferRequest makeRequest(const std::string & path)
{ {
/* FIXME path is not a path, but a full relative or absolute /* Otherwise the last path fragment will get discarded. */
auto cacheUriWithTrailingSlash = config->cacheUri;
if (!cacheUriWithTrailingSlash.path.empty())
cacheUriWithTrailingSlash.path += "/";
/* path is not a path, but a full relative or absolute
URL, e.g. we've seen in the wild NARINFO files have a URL URL, e.g. we've seen in the wild NARINFO files have a URL
field which is field which is
`nar/15f99rdaf26k39knmzry4xd0d97wp6yfpnfk1z9avakis7ipb9yg.nar?hash=zphkqn2wg8mnvbkixnl2aadkbn0rcnfj` `nar/15f99rdaf26k39knmzry4xd0d97wp6yfpnfk1z9avakis7ipb9yg.nar?hash=zphkqn2wg8mnvbkixnl2aadkbn0rcnfj`
(note the query param) and that gets passed here. (note the query param) and that gets passed here. */
return FileTransferRequest(parseURLRelative(path, cacheUriWithTrailingSlash));
What should actually happen is that we have two parsed URLs
(if we support relative URLs), and then we combined them with
a URL `operator/` which would be like
`std::filesystem::path`'s equivalent operator, which properly
combines the the URLs, whether the right is relative or
absolute. */
return FileTransferRequest(parseURL(
hasPrefix(path, "https://") || hasPrefix(path, "http://") || hasPrefix(path, "file://")
? path
: config->cacheUri.to_string() + "/" + path));
} }
void getFile(const std::string & path, Sink & sink) override void getFile(const std::string & path, Sink & sink) override

View file

@ -290,6 +290,194 @@ TEST(parseURL, parsesHttpUrlWithEmptyPort)
ASSERT_EQ("http://www.example.org/file.tar.gz?foo=bar", parsed.to_string()); ASSERT_EQ("http://www.example.org/file.tar.gz?foo=bar", parsed.to_string());
} }
/* ----------------------------------------------------------------------------
* parseURLRelative
* --------------------------------------------------------------------------*/
TEST(parseURLRelative, resolvesRelativePath)
{
    // A plain relative path replaces the last segment of the base path.
    const auto baseUrl = parseURL("http://example.org/dir/page.html");

    const ParsedURL want{
        .scheme = "http",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"},
        .path = "/dir/subdir/file.txt",
        .query = {},
        .fragment = "",
    };

    const auto got = parseURLRelative("subdir/file.txt", baseUrl);
    ASSERT_EQ(got, want);
}
TEST(parseURLRelative, baseUrlIpv6AddressWithoutZoneId)
{
    // The base authority is an IPv6 literal without a zone ID; it must
    // come through resolution unchanged.
    const auto baseUrl = parseURL("http://[fe80::818c:da4d:8975:415c]/dir/page.html");

    const ParsedURL want{
        .scheme = "http",
        .authority = ParsedURL::Authority{.hostType = HostType::IPv6, .host = "fe80::818c:da4d:8975:415c"},
        .path = "/dir/subdir/file.txt",
        .query = {},
        .fragment = "",
    };

    const auto got = parseURLRelative("subdir/file.txt", baseUrl);
    ASSERT_EQ(got, want);
}
TEST(parseURLRelative, resolvesRelativePathIpv6AddressWithZoneId)
{
    // The base authority carries an IPv6 zone ID (written `%25<zone>` in
    // the URL, `%<zone>` once decoded) plus an explicit port; both must
    // survive resolution of a relative path.
    //
    // Note: `%` needs no escaping in a C++ string literal; `\%` is not a
    // standard escape sequence.
    ParsedURL base = parseURL("http://[fe80::818c:da4d:8975:415c%25enp0s25]:8080/dir/page.html");
    auto parsed = parseURLRelative("subdir/file2.txt", base);
    ParsedURL expected{
        .scheme = "http",
        .authority =
            ParsedURL::Authority{.hostType = HostType::IPv6, .host = "fe80::818c:da4d:8975:415c%enp0s25", .port = 8080},
        .path = "/dir/subdir/file2.txt",
        .query = {},
        .fragment = "",
    };
    ASSERT_EQ(parsed, expected);
}
TEST(parseURLRelative, resolvesRelativePathWithDot)
{
    // A leading `./` segment is removed during resolution.
    const auto baseUrl = parseURL("http://example.org/dir/page.html");

    const ParsedURL want{
        .scheme = "http",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"},
        .path = "/dir/subdir/file.txt",
        .query = {},
        .fragment = "",
    };

    const auto got = parseURLRelative("./subdir/file.txt", baseUrl);
    ASSERT_EQ(got, want);
}
TEST(parseURLRelative, resolvesParentDirectory)
{
    // `../` climbs one directory above the base document's directory;
    // the explicit port of the base is kept.
    const auto baseUrl = parseURL("http://example.org:234/dir/page.html");

    const ParsedURL want{
        .scheme = "http",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org", .port = 234},
        .path = "/up.txt",
        .query = {},
        .fragment = "",
    };

    const auto got = parseURLRelative("../up.txt", baseUrl);
    ASSERT_EQ(got, want);
}
TEST(parseURLRelative, replacesPathWithAbsoluteRelative)
{
    // A reference starting with `/` replaces the whole base path while
    // scheme and authority are inherited.
    const auto baseUrl = parseURL("http://example.org/dir/page.html");

    const ParsedURL want{
        .scheme = "http",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"},
        .path = "/rooted.txt",
        .query = {},
        .fragment = "",
    };

    const auto got = parseURLRelative("/rooted.txt", baseUrl);
    ASSERT_EQ(got, want);
}
TEST(parseURLRelative, keepsQueryAndFragmentFromRelative)
{
    // Query and fragment come from the relative reference; the query
    // parameters of the base URL (`z=3`) are discarded.
    const auto baseUrl = parseURL("https://www.example.org/path/index.html?z=3");

    const ParsedURL want{
        .scheme = "https",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
        .path = "/path/other.html",
        .query = {{"x", "1"}, {"y", "2"}},
        .fragment = "frag",
    };

    const auto got = parseURLRelative("other.html?x=1&y=2#frag", baseUrl);
    ASSERT_EQ(got, want);
}
TEST(parseURLRelative, absOverride)
{
    // Resolving an absolute URL must give the same result as parsing
    // that URL on its own.
    const auto baseUrl = parseURL("http://example.org/path/page.html");
    const std::string_view absolute = "https://127.0.0.1.org/secure";

    const auto viaRelative = parseURLRelative(absolute, baseUrl);
    const auto viaDirect = parseURL(absolute);

    ASSERT_EQ(viaRelative, viaDirect);
}
TEST(parseURLRelative, absOverrideWithZoneId)
{
    // An absolute URL whose host is an IPv6 literal with a zone ID must
    // resolve to the same value as parsing it directly.
    //
    // Note: `%` needs no escaping in a C++ string literal; `\%` is not a
    // standard escape sequence.
    ParsedURL base = parseURL("http://example.org/path/page.html");
    std::string_view abs = "https://[fe80::818c:da4d:8975:415c%25enp0s25]/secure?foo=bar";
    auto parsed = parseURLRelative(abs, base);
    auto parsedAbs = parseURL(abs);
    ASSERT_EQ(parsed, parsedAbs);
}
TEST(parseURLRelative, bothWithoutAuthority)
{
    // Neither URL has an authority component; the overriding `mailto:`
    // URL must resolve to the same value as parsing it directly.
    const auto baseUrl = parseURL("mailto:mail-base@bar.baz?bcc=alice@asdf.com");
    const std::string_view overriding = "mailto:mail-override@foo.bar?subject=url-testing";

    const auto viaRelative = parseURLRelative(overriding, baseUrl);
    const auto viaDirect = parseURL(overriding);

    ASSERT_EQ(viaRelative, viaDirect);
}
TEST(parseURLRelative, emptyRelative)
{
    // An empty reference keeps the base's path and query but drops its
    // fragment.
    //
    // Note: `%` needs no escaping in a C++ string literal; `\%` is not a
    // standard escape sequence.
    ParsedURL base = parseURL("https://www.example.org/path/index.html?a%20b=5%206&x%20y=34#frag");
    auto parsed = parseURLRelative("", base);
    ParsedURL expected{
        .scheme = "https",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
        .path = "/path/index.html",
        .query = {{"a b", "5 6"}, {"x y", "34"}},
        .fragment = "",
    };
    // Sanity check: the base really did have a fragment to drop.
    EXPECT_EQ(base.fragment, "frag");
    EXPECT_EQ(parsed, expected);
}
TEST(parseURLRelative, fragmentRelative)
{
    // A fragment-only reference keeps the base's path and query and
    // replaces only the fragment.
    //
    // Note: `%` needs no escaping in a C++ string literal; `\%` is not a
    // standard escape sequence.
    ParsedURL base = parseURL("https://www.example.org/path/index.html?a%20b=5%206&x%20y=34#frag");
    auto parsed = parseURLRelative("#frag2", base);
    ParsedURL expected{
        .scheme = "https",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
        .path = "/path/index.html",
        .query = {{"a b", "5 6"}, {"x y", "34"}},
        .fragment = "frag2",
    };
    EXPECT_EQ(parsed, expected);
}
TEST(parseURLRelative, queryRelative)
{
    // A query-only reference keeps the base's path, replaces the query,
    // and drops the base's fragment.
    //
    // Note: `%` needs no escaping in a C++ string literal; `\%` is not a
    // standard escape sequence.
    ParsedURL base = parseURL("https://www.example.org/path/index.html?a%20b=5%206&x%20y=34#frag");
    auto parsed = parseURLRelative("?asdf%20qwer=1%202%203", base);
    ParsedURL expected{
        .scheme = "https",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
        .path = "/path/index.html",
        .query = {{"asdf qwer", "1 2 3"}},
        .fragment = "",
    };
    EXPECT_EQ(parsed, expected);
}
TEST(parseURLRelative, queryFragmentRelative)
{
    // A reference with only a query and a fragment keeps the base's
    // path and replaces both the query and the fragment.
    //
    // Note: `%` needs no escaping in a C++ string literal; `\%` is not a
    // standard escape sequence.
    ParsedURL base = parseURL("https://www.example.org/path/index.html?a%20b=5%206&x%20y=34#frag");
    auto parsed = parseURLRelative("?asdf%20qwer=1%202%203#frag2", base);
    ParsedURL expected{
        .scheme = "https",
        .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
        .path = "/path/index.html",
        .query = {{"asdf qwer", "1 2 3"}},
        .fragment = "frag2",
    };
    EXPECT_EQ(parsed, expected);
}
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* decodeQuery * decodeQuery
* --------------------------------------------------------------------------*/ * --------------------------------------------------------------------------*/

View file

@ -117,6 +117,16 @@ std::string encodeQuery(const StringMap & query);
*/ */
ParsedURL parseURL(std::string_view url, bool lenient = false); ParsedURL parseURL(std::string_view url, bool lenient = false);
/**
* Like `parseURL`, but also accepts relative URLs, which are resolved
* against the given base URL.
*
* This is specified in [IETF RFC 3986, section 5](https://datatracker.ietf.org/doc/html/rfc3986#section-5)
*
* Behavior should also match the `new URL(url, base)` JavaScript constructor.
*
* The fragment of `base` is never carried over: the result has the
* fragment of `url`, or an empty fragment if `url` has none.
*
* @param url The URL reference to resolve. It may be relative (a path,
* a `?query`, a `#fragment`, or empty) or a complete URL of its own.
* @param base The already-parsed URL that `url` is resolved against.
* @return The resolved URL.
* @throws BadURL if `url` is not a valid URL reference or cannot be
* resolved against `base`.
*/
ParsedURL parseURLRelative(std::string_view url, const ParsedURL & base);
/** /**
* Although that's not really standardized anywhere, a number of tools * Although that's not really standardized anywhere, a number of tools
* use a scheme of the form 'x+y' in urls, where y is the transport layer * use a scheme of the form 'x+y' in urls, where y is the transport layer
@ -136,7 +146,7 @@ ParsedUrlScheme parseUrlScheme(std::string_view scheme);
/* Detects scp-style uris (e.g. git@github.com:NixOS/nix) and fixes /* Detects scp-style uris (e.g. git@github.com:NixOS/nix) and fixes
them by removing the `:` and assuming a scheme of `ssh://`. Also them by removing the `:` and assuming a scheme of `ssh://`. Also
changes absolute paths into file:// URLs. */ changes absolute paths into file:// URLs. */
std::string fixGitURL(const std::string & url); ParsedURL fixGitURL(const std::string & url);
/** /**
* Whether a string is valid as RFC 3986 scheme name. * Whether a string is valid as RFC 3986 scheme name.

View file

@ -108,6 +108,8 @@ static std::string percentEncodeCharSet(std::string_view s, auto charSet)
return res; return res;
} }
static ParsedURL fromBoostUrlView(boost::urls::url_view url, bool lenient);
ParsedURL parseURL(std::string_view url, bool lenient) ParsedURL parseURL(std::string_view url, bool lenient)
try { try {
/* Account for several non-standard properties of nix urls (for back-compat): /* Account for several non-standard properties of nix urls (for back-compat):
@ -149,10 +151,15 @@ try {
}(); }();
} }
auto urlView = boost::urls::url_view(lenient ? fixedEncodedUrl : url); return fromBoostUrlView(boost::urls::url_view(lenient ? fixedEncodedUrl : url), lenient);
} catch (boost::system::system_error & e) {
throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
}
static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient)
{
if (!urlView.has_scheme()) if (!urlView.has_scheme())
throw BadURL("'%s' doesn't have a scheme", url); throw BadURL("'%s' doesn't have a scheme", urlView.buffer());
auto scheme = urlView.scheme(); auto scheme = urlView.scheme();
auto authority = [&]() -> std::optional<ParsedURL::Authority> { auto authority = [&]() -> std::optional<ParsedURL::Authority> {
@ -170,7 +177,7 @@ try {
* scheme considers a missing authority or empty host invalid. */ * scheme considers a missing authority or empty host invalid. */
auto transportIsFile = parseUrlScheme(scheme).transport == "file"; auto transportIsFile = parseUrlScheme(scheme).transport == "file";
if (authority && authority->host.size() && transportIsFile) if (authority && authority->host.size() && transportIsFile)
throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority); throw BadURL("file:// URL '%s' has unexpected authority '%s'", urlView.buffer(), *authority);
auto path = urlView.path(); /* Does pct-decoding */ auto path = urlView.path(); /* Does pct-decoding */
auto fragment = urlView.fragment(); /* Does pct-decoding */ auto fragment = urlView.fragment(); /* Does pct-decoding */
@ -189,8 +196,58 @@ try {
.query = decodeQuery(query, lenient), .query = decodeQuery(query, lenient),
.fragment = fragment, .fragment = fragment,
}; };
} catch (boost::system::system_error & e) { }
throw BadURL("'%s' is not a valid URL: %s", url, e.code().message());
/**
 * Resolve the possibly-relative URL reference `urlS` against `base`
 * (RFC 3986, section 5) and return the result as a `ParsedURL`.
 *
 * @param urlS URL reference to resolve; may be relative or absolute.
 * @param base Already-parsed URL to resolve against.
 * @return The resolved URL. Its fragment is the fragment of `urlS`, or
 *         empty if `urlS` has none.
 * @throws BadURL if `base` cannot be converted to a Boost URL, or if
 *         `urlS` cannot be parsed or resolved. A trace naming both
 *         inputs is added before the exception is rethrown.
 */
ParsedURL parseURLRelative(std::string_view urlS, const ParsedURL & base)
try {
    /* Rebuild `base` as a mutable Boost URL; `resolve()` below rewrites
       it in place. */
    boost::urls::url resolved;

    try {
        resolved.set_scheme(base.scheme);
        if (base.authority) {
            const auto & authority = *base.authority;
            resolved.set_host_address(authority.host);
            if (authority.user)
                resolved.set_user(*authority.user);
            if (authority.password)
                resolved.set_password(*authority.password);
            if (authority.port)
                resolved.set_port_number(*authority.port);
        }
        resolved.set_path(base.path);
        resolved.set_encoded_query(encodeQuery(base.query));
        resolved.set_fragment(base.fragment);
    } catch (const boost::system::system_error & e) {
        throw BadURL("'%s' is not a valid URL: %s", base.to_string(), e.code().message());
    }

    boost::urls::url_view url;
    try {
        /* Parsing the reference throws on malformed input; `.value()`
           turns a failed resolution into an exception as well. */
        url = urlS;
        resolved.resolve(url).value();
    } catch (const boost::system::system_error & e) {
        throw BadURL("'%s' is not a valid URL: %s", urlS, e.code().message());
    }

    auto ret = fromBoostUrlView(resolved, /*lenient=*/false);

    /* Hack: Boost `url_view` supports Zone IDs, but `url` does not.
       Just manually take the authority from the original URL to work
       around it. See https://github.com/boostorg/url/issues/919 for
       details.

       Only do this when the reference actually inherits the base's
       authority, i.e. it has neither a scheme nor an authority of its
       own. Per RFC 3986 section 5.2.2, a reference with its own scheme
       (e.g. `mailto:user@host`) keeps its own authority, which may be
       absent, and must not pick up the base's. */
    if (!url.has_scheme() && !url.has_authority()) {
        ret.authority = base.authority;
    }

    /* Hack, work around fragment of base URL improperly being preserved
       https://github.com/boostorg/url/issues/920 */
    ret.fragment = url.has_fragment() ? std::string{url.fragment()} : "";

    return ret;
} catch (BadURL & e) {
    e.addTrace({}, "while resolving possibly-relative url '%s' against base URL '%s'", urlS, base);
    throw;
}
std::string percentDecode(std::string_view in) std::string percentDecode(std::string_view in)
@ -287,17 +344,17 @@ ParsedUrlScheme parseUrlScheme(std::string_view scheme)
}; };
} }
std::string fixGitURL(const std::string & url) ParsedURL fixGitURL(const std::string & url)
{ {
std::regex scpRegex("([^/]*)@(.*):(.*)"); std::regex scpRegex("([^/]*)@(.*):(.*)");
if (!hasPrefix(url, "/") && std::regex_match(url, scpRegex)) if (!hasPrefix(url, "/") && std::regex_match(url, scpRegex))
return std::regex_replace(url, scpRegex, "ssh://$1@$2/$3"); return parseURL(std::regex_replace(url, scpRegex, "ssh://$1@$2/$3"));
if (hasPrefix(url, "file:")) if (hasPrefix(url, "file:"))
return url; return parseURL(url);
if (url.find("://") == std::string::npos) { if (url.find("://") == std::string::npos) {
return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url}).to_string(); return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url});
} }
return url; return parseURL(url);
} }
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1 // https://www.rfc-editor.org/rfc/rfc3986#section-3.1