diff --git a/src/libexpr/primops/fetchTree.cc b/src/libexpr/primops/fetchTree.cc index 274f758a7..d58d76d75 100644 --- a/src/libexpr/primops/fetchTree.cc +++ b/src/libexpr/primops/fetchTree.cc @@ -113,7 +113,7 @@ static void fetchTree( auto s = state.coerceToString(attr.pos, *attr.value, context, "", false, false).toOwned(); attrs.emplace( state.symbols[attr.name], - params.isFetchGit && state.symbols[attr.name] == "url" ? fixGitURL(s) : s); + params.isFetchGit && state.symbols[attr.name] == "url" ? fixGitURL(s).to_string() : s); } else if (attr.value->type() == nBool) attrs.emplace(state.symbols[attr.name], Explicit{attr.value->boolean()}); else if (attr.value->type() == nInt) { @@ -175,7 +175,7 @@ static void fetchTree( if (params.isFetchGit) { fetchers::Attrs attrs; attrs.emplace("type", "git"); - attrs.emplace("url", fixGitURL(url)); + attrs.emplace("url", fixGitURL(url).to_string()); if (!attrs.contains("exportIgnore") && (!attrs.contains("submodules") || !*fetchers::maybeGetBoolAttr(attrs, "submodules"))) { attrs.emplace("exportIgnore", Explicit{true}); diff --git a/src/libfetchers/git-lfs-fetch.cc b/src/libfetchers/git-lfs-fetch.cc index f555a9a4c..bd9752711 100644 --- a/src/libfetchers/git-lfs-fetch.cc +++ b/src/libfetchers/git-lfs-fetch.cc @@ -179,7 +179,7 @@ Fetch::Fetch(git_repository * repo, git_oid rev) const auto remoteUrl = lfs::getLfsEndpointUrl(repo); - this->url = nix::parseURL(nix::fixGitURL(remoteUrl)).canonicalise(); + this->url = nix::fixGitURL(remoteUrl).canonicalise(); } bool Fetch::shouldFetch(const CanonPath & path) const diff --git a/src/libfetchers/git.cc b/src/libfetchers/git.cc index bd1e1fffe..c19e8d7db 100644 --- a/src/libfetchers/git.cc +++ b/src/libfetchers/git.cc @@ -233,9 +233,7 @@ struct GitInputScheme : InputScheme Input input{settings}; input.attrs = attrs; - auto url = fixGitURL(getStrAttr(attrs, "url")); - parseURL(url); - input.attrs["url"] = url; + input.attrs["url"] = fixGitURL(getStrAttr(attrs, "url")).to_string(); getShallowAttr(input); getSubmodulesAttr(input); getAllRefsAttr(input); diff --git a/src/libstore/http-binary-cache-store.cc b/src/libstore/http-binary-cache-store.cc index 940dcec2e..ab799617e 100644 --- a/src/libstore/http-binary-cache-store.cc +++ b/src/libstore/http-binary-cache-store.cc @@ -154,22 +154,17 @@ protected: FileTransferRequest makeRequest(const std::string & path) { - /* FIXME path is not a path, but a full relative or absolute + /* Otherwise the last path fragment will get discarded. */ + auto cacheUriWithTrailingSlash = config->cacheUri; + if (!cacheUriWithTrailingSlash.path.empty()) + cacheUriWithTrailingSlash.path += "/"; + + /* path is not a path, but a full relative or absolute URL, e.g. we've seen in the wild NARINFO files have a URL field which is `nar/15f99rdaf26k39knmzry4xd0d97wp6yfpnfk1z9avakis7ipb9yg.nar?hash=zphkqn2wg8mnvbkixnl2aadkbn0rcnfj` - (note the query param) and that gets passed here. - - What should actually happen is that we have two parsed URLs - (if we support relative URLs), and then we combined them with - a URL `operator/` which would be like - `std::filesystem::path`'s equivalent operator, which properly - combines the the URLs, whether the right is relative or - absolute. */ - return FileTransferRequest(parseURL( - hasPrefix(path, "https://") || hasPrefix(path, "http://") || hasPrefix(path, "file://") - ? path - : config->cacheUri.to_string() + "/" + path)); + (note the query param) and that gets passed here. */ + return FileTransferRequest(parseURLRelative(path, cacheUriWithTrailingSlash)); } void getFile(const std::string & path, Sink & sink) override diff --git a/src/libutil-tests/url.cc b/src/libutil-tests/url.cc index ae383eb65..b776ba671 100644 --- a/src/libutil-tests/url.cc +++ b/src/libutil-tests/url.cc @@ -290,6 +290,194 @@ TEST(parseURL, parsesHttpUrlWithEmptyPort) ASSERT_EQ("http://www.example.org/file.tar.gz?foo=bar", parsed.to_string()); } +/* ---------------------------------------------------------------------------- + * parseURLRelative + * --------------------------------------------------------------------------*/ + +TEST(parseURLRelative, resolvesRelativePath) +{ + ParsedURL base = parseURL("http://example.org/dir/page.html"); + auto parsed = parseURLRelative("subdir/file.txt", base); + ParsedURL expected{ + .scheme = "http", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"}, + .path = "/dir/subdir/file.txt", + .query = {}, + .fragment = "", + }; + ASSERT_EQ(parsed, expected); +} + +TEST(parseURLRelative, baseUrlIpv6AddressWithoutZoneId) +{ + ParsedURL base = parseURL("http://[fe80::818c:da4d:8975:415c]/dir/page.html"); + auto parsed = parseURLRelative("subdir/file.txt", base); + ParsedURL expected{ + .scheme = "http", + .authority = ParsedURL::Authority{.hostType = HostType::IPv6, .host = "fe80::818c:da4d:8975:415c"}, + .path = "/dir/subdir/file.txt", + .query = {}, + .fragment = "", + }; + ASSERT_EQ(parsed, expected); +} + +TEST(parseURLRelative, resolvesRelativePathIpv6AddressWithZoneId) +{ + ParsedURL base = parseURL("http://[fe80::818c:da4d:8975:415c\%25enp0s25]:8080/dir/page.html"); + auto parsed = parseURLRelative("subdir/file2.txt", base); + ParsedURL expected{ + .scheme = "http", + .authority = Authority{.hostType = HostType::IPv6, .host = "fe80::818c:da4d:8975:415c\%enp0s25", .port = 8080}, + .path = "/dir/subdir/file2.txt", + .query = {}, + .fragment = "", + }; + + ASSERT_EQ(parsed, expected); +} + +TEST(parseURLRelative, resolvesRelativePathWithDot) +{ + ParsedURL base = parseURL("http://example.org/dir/page.html"); + auto parsed = parseURLRelative("./subdir/file.txt", base); + ParsedURL expected{ + .scheme = "http", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"}, + .path = "/dir/subdir/file.txt", + .query = {}, + .fragment = "", + }; + ASSERT_EQ(parsed, expected); +} + +TEST(parseURLRelative, resolvesParentDirectory) +{ + ParsedURL base = parseURL("http://example.org:234/dir/page.html"); + auto parsed = parseURLRelative("../up.txt", base); + ParsedURL expected{ + .scheme = "http", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org", .port = 234}, + .path = "/up.txt", + .query = {}, + .fragment = "", + }; + ASSERT_EQ(parsed, expected); +} + +TEST(parseURLRelative, replacesPathWithAbsoluteRelative) +{ + ParsedURL base = parseURL("http://example.org/dir/page.html"); + auto parsed = parseURLRelative("/rooted.txt", base); + ParsedURL expected{ + .scheme = "http", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"}, + .path = "/rooted.txt", + .query = {}, + .fragment = "", + }; + ASSERT_EQ(parsed, expected); +} + +TEST(parseURLRelative, keepsQueryAndFragmentFromRelative) +{ + // But discard query params on base URL + ParsedURL base = parseURL("https://www.example.org/path/index.html?z=3"); + auto parsed = parseURLRelative("other.html?x=1&y=2#frag", base); + ParsedURL expected{ + .scheme = "https", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"}, + .path = "/path/other.html", + .query = {{"x", "1"}, {"y", "2"}}, + .fragment = "frag", + }; + ASSERT_EQ(parsed, expected); +} + +TEST(parseURLRelative, absOverride) +{ + ParsedURL base = parseURL("http://example.org/path/page.html"); + std::string_view abs = "https://127.0.0.1.org/secure"; + auto parsed = parseURLRelative(abs, base); + auto parsedAbs = parseURL(abs); + ASSERT_EQ(parsed, parsedAbs); +} + +TEST(parseURLRelative, absOverrideWithZoneId) +{ + ParsedURL base = parseURL("http://example.org/path/page.html"); + std::string_view abs = "https://[fe80::818c:da4d:8975:415c\%25enp0s25]/secure?foo=bar"; + auto parsed = parseURLRelative(abs, base); + auto parsedAbs = parseURL(abs); + ASSERT_EQ(parsed, parsedAbs); +} + +TEST(parseURLRelative, bothWithoutAuthority) +{ + ParsedURL base = parseURL("mailto:mail-base@bar.baz?bcc=alice@asdf.com"); + std::string_view over = "mailto:mail-override@foo.bar?subject=url-testing"; + auto parsed = parseURLRelative(over, base); + auto parsedOverride = parseURL(over); + ASSERT_EQ(parsed, parsedOverride); +} + +TEST(parseURLRelative, emptyRelative) +{ + ParsedURL base = parseURL("https://www.example.org/path/index.html?a\%20b=5\%206&x\%20y=34#frag"); + auto parsed = parseURLRelative("", base); + ParsedURL expected{ + .scheme = "https", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"}, + .path = "/path/index.html", + .query = {{"a b", "5 6"}, {"x y", "34"}}, + .fragment = "", + }; + EXPECT_EQ(base.fragment, "frag"); + EXPECT_EQ(parsed, expected); +} + +TEST(parseURLRelative, fragmentRelative) +{ + ParsedURL base = parseURL("https://www.example.org/path/index.html?a\%20b=5\%206&x\%20y=34#frag"); + auto parsed = parseURLRelative("#frag2", base); + ParsedURL expected{ + .scheme = "https", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"}, + .path = "/path/index.html", + .query = {{"a b", "5 6"}, {"x y", "34"}}, + .fragment = "frag2", + }; + EXPECT_EQ(parsed, expected); +} + +TEST(parseURLRelative, queryRelative) +{ + ParsedURL base = parseURL("https://www.example.org/path/index.html?a\%20b=5\%206&x\%20y=34#frag"); + auto parsed = parseURLRelative("?asdf\%20qwer=1\%202\%203", base); + ParsedURL expected{ + .scheme = "https", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"}, + .path = "/path/index.html", + .query = {{"asdf qwer", "1 2 3"}}, + .fragment = "", + }; + EXPECT_EQ(parsed, expected); +} + +TEST(parseURLRelative, queryFragmentRelative) +{ + ParsedURL base = parseURL("https://www.example.org/path/index.html?a\%20b=5\%206&x\%20y=34#frag"); + auto parsed = parseURLRelative("?asdf\%20qwer=1\%202\%203#frag2", base); + ParsedURL expected{ + .scheme = "https", + .authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"}, + .path = "/path/index.html", + .query = {{"asdf qwer", "1 2 3"}}, + .fragment = "frag2", + }; + EXPECT_EQ(parsed, expected); +} + /* ---------------------------------------------------------------------------- * decodeQuery * --------------------------------------------------------------------------*/ diff --git a/src/libutil/include/nix/util/url.hh b/src/libutil/include/nix/util/url.hh index 3262b44b7..54bd1e533 100644 --- a/src/libutil/include/nix/util/url.hh +++ b/src/libutil/include/nix/util/url.hh @@ -117,6 +117,16 @@ std::string encodeQuery(const StringMap & query); */ ParsedURL parseURL(std::string_view url, bool lenient = false); +/** + * Like `parseURL`, but also accepts relative URLs, which are resolved + * against the given base URL. + * + * This is specified in [IETF RFC 3986, section 5](https://datatracker.ietf.org/doc/html/rfc3986#section-5) + * + * Behavior should also match the `new URL(url, base)` JavaScript constructor. + */ +ParsedURL parseURLRelative(std::string_view url, const ParsedURL & base); + /** * Although that’s not really standardized anywhere, an number of tools * use a scheme of the form 'x+y' in urls, where y is the “transport layer” @@ -136,7 +146,7 @@ ParsedUrlScheme parseUrlScheme(std::string_view scheme); /* Detects scp-style uris (e.g. git@github.com:NixOS/nix) and fixes them by removing the `:` and assuming a scheme of `ssh://`. Also changes absolute paths into file:// URLs. */ -std::string fixGitURL(const std::string & url); +ParsedURL fixGitURL(const std::string & url); /** * Whether a string is valid as RFC 3986 scheme name. diff --git a/src/libutil/url.cc b/src/libutil/url.cc index 73e8cc181..ff0b7a71b 100644 --- a/src/libutil/url.cc +++ b/src/libutil/url.cc @@ -108,6 +108,8 @@ static std::string percentEncodeCharSet(std::string_view s, auto charSet) return res; } +static ParsedURL fromBoostUrlView(boost::urls::url_view url, bool lenient); + ParsedURL parseURL(std::string_view url, bool lenient) try { /* Account for several non-standard properties of nix urls (for back-compat): @@ -149,10 +151,15 @@ try { }(); } - auto urlView = boost::urls::url_view(lenient ? fixedEncodedUrl : url); + return fromBoostUrlView(boost::urls::url_view(lenient ? fixedEncodedUrl : url), lenient); +} catch (boost::system::system_error & e) { + throw BadURL("'%s' is not a valid URL: %s", url, e.code().message()); +} +static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient) +{ if (!urlView.has_scheme()) - throw BadURL("'%s' doesn't have a scheme", url); + throw BadURL("'%s' doesn't have a scheme", urlView.buffer()); auto scheme = urlView.scheme(); auto authority = [&]() -> std::optional { @@ -170,7 +177,7 @@ try { * scheme considers a missing authority or empty host invalid. */ auto transportIsFile = parseUrlScheme(scheme).transport == "file"; if (authority && authority->host.size() && transportIsFile) - throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority); + throw BadURL("file:// URL '%s' has unexpected authority '%s'", urlView.buffer(), *authority); auto path = urlView.path(); /* Does pct-decoding */ auto fragment = urlView.fragment(); /* Does pct-decoding */ @@ -189,8 +196,58 @@ try { .query = decodeQuery(query, lenient), .fragment = fragment, }; -} catch (boost::system::system_error & e) { - throw BadURL("'%s' is not a valid URL: %s", url, e.code().message()); +} + +ParsedURL parseURLRelative(std::string_view urlS, const ParsedURL & base) +try { + + boost::urls::url resolved; + + try { + resolved.set_scheme(base.scheme); + if (base.authority) { + auto & authority = *base.authority; + resolved.set_host_address(authority.host); + if (authority.user) + resolved.set_user(*authority.user); + if (authority.password) + resolved.set_password(*authority.password); + if (authority.port) + resolved.set_port_number(*authority.port); + } + resolved.set_path(base.path); + resolved.set_encoded_query(encodeQuery(base.query)); + resolved.set_fragment(base.fragment); + } catch (boost::system::system_error & e) { + throw BadURL("'%s' is not a valid URL: %s", base.to_string(), e.code().message()); + } + + boost::urls::url_view url; + try { + url = urlS; + resolved.resolve(url).value(); + } catch (boost::system::system_error & e) { + throw BadURL("'%s' is not a valid URL: %s", urlS, e.code().message()); + } + + auto ret = fromBoostUrlView(resolved, /*lenient=*/false); + + /* Hack: Boost `url_view` supports Zone IDs, but `url` does not. + Just manually take the authority from the original URL to work + around it. See https://github.com/boostorg/url/issues/919 for + details. */ + if (!url.has_authority()) { + ret.authority = base.authority; + } + + /* Hack, work around fragment of base URL improperly being preserved + https://github.com/boostorg/url/issues/920 */ + ret.fragment = url.has_fragment() ? std::string{url.fragment()} : ""; + + return ret; +} catch (BadURL & e) { + e.addTrace({}, "while resolving possibly-relative url '%s' against base URL '%s'", urlS, base); + throw; } std::string percentDecode(std::string_view in) @@ -287,17 +344,17 @@ ParsedUrlScheme parseUrlScheme(std::string_view scheme) }; } -std::string fixGitURL(const std::string & url) +ParsedURL fixGitURL(const std::string & url) { std::regex scpRegex("([^/]*)@(.*):(.*)"); if (!hasPrefix(url, "/") && std::regex_match(url, scpRegex)) - return std::regex_replace(url, scpRegex, "ssh://$1@$2/$3"); + return parseURL(std::regex_replace(url, scpRegex, "ssh://$1@$2/$3")); if (hasPrefix(url, "file:")) - return url; + return parseURL(url); if (url.find("://") == std::string::npos) { - return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url}).to_string(); + return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url}); } - return url; + return parseURL(url); } // https://www.rfc-editor.org/rfc/rfc3986#section-3.1