Fix ParsedURL handling of %2F in URL paths

See the new extensive doxygen in `url.hh`.
This fixes fetching gitlab: flakes.

Paths are now stored as a std::vector of individual path
segments, which can themselves contain path separators '/' (%2F).
This is necessary to make GitLab's /projects/ API work.

Co-authored-by: John Ericson <John.Ericson@Obsidian.Systems>
Co-authored-by: Sergei Zimmerman <sergei@zimmerman.foo>
Jörg Thalheim 2025-08-26 12:49:28 +02:00 committed by Sergei Zimmerman
parent 6839f3de55
commit c436b7a32a
19 changed files with 446 additions and 117 deletions
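
As a concrete illustration of the new path representation (a minimal sketch, not part of the commit, assuming the `parseURL`/`ParsedURL` API documented in the `url.hh` changes below; it mirrors the `gitlabNamespacedProjectUrls` test added by this commit):

#include <cassert>
#include <string>
#include <vector>

#include "nix/util/url.hh"

using namespace nix;

void gitlabExample()
{
    auto url = parseURL(
        "https://gitlab.example.com/api/v4/projects/group%2Fproject/repository/archive.tar.gz");

    // The %2F stays inside a single decoded path segment instead of being
    // conflated with the '/' separators between segments.
    assert((url.path
            == std::vector<std::string>{
                "", "api", "v4", "projects", "group/project", "repository", "archive.tar.gz"}));

    // Rendering re-encodes the interior slash, so the URL round-trips unchanged.
    assert(
        url.to_string()
        == "https://gitlab.example.com/api/v4/projects/group%2Fproject/repository/archive.tar.gz");
}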


@ -69,7 +69,8 @@ static LfsApiInfo getLfsApi(const ParsedURL & url)
args.push_back("--");
args.push_back("git-lfs-authenticate");
args.push_back(url.path);
// FIXME %2F encode slashes? Does this command take/accept percent encoding?
args.push_back(url.renderPath(/*encode=*/false));
args.push_back("download");
auto [status, output] = runProgram({.program = "ssh", .args = args});


@ -462,8 +462,8 @@ struct GitInputScheme : InputScheme
// Why are we checking for bare repository?
// well if it's a bare repository we want to force a git fetch rather than copying the folder
bool isBareRepository = url.scheme == "file" && pathExists(url.path) && !pathExists(url.path + "/.git");
//
auto isBareRepository = [](PathView path) { return pathExists(path) && !pathExists(path + "/.git"); };
// FIXME: here we turn a possibly relative path into an absolute path.
// This allows relative git flake inputs to be resolved against the
// **current working directory** (as in POSIX), which tends to work out
@ -472,8 +472,10 @@ struct GitInputScheme : InputScheme
//
// See: https://discourse.nixos.org/t/57783 and #9708
//
if (url.scheme == "file" && !forceHttp && !isBareRepository) {
if (!isAbsolute(url.path)) {
if (url.scheme == "file" && !forceHttp && !isBareRepository(renderUrlPathEnsureLegal(url.path))) {
auto path = renderUrlPathEnsureLegal(url.path);
if (!isAbsolute(path)) {
warn(
"Fetching Git repository '%s', which uses a path relative to the current directory. "
"This is not supported and will stop working in a future release. "
@ -483,10 +485,10 @@ struct GitInputScheme : InputScheme
// If we don't check here for the path existence, then we can give libgit2 any directory
// and it will initialize them as git directories.
if (!pathExists(url.path)) {
throw Error("The path '%s' does not exist.", url.path);
if (!pathExists(path)) {
throw Error("The path '%s' does not exist.", path);
}
repoInfo.location = std::filesystem::absolute(url.path);
repoInfo.location = std::filesystem::absolute(path);
} else {
if (url.scheme == "file")
/* Query parameters are meaningless for file://, but


@ -38,7 +38,7 @@ struct GitArchiveInputScheme : InputScheme
if (url.scheme != schemeName())
return {};
auto path = tokenizeString<std::vector<std::string>>(url.path, "/");
const auto & path = url.path;
std::optional<Hash> rev;
std::optional<std::string> ref;
@ -139,12 +139,12 @@ struct GitArchiveInputScheme : InputScheme
auto repo = getStrAttr(input.attrs, "repo");
auto ref = input.getRef();
auto rev = input.getRev();
auto path = owner + "/" + repo;
std::vector<std::string> path{owner, repo};
assert(!(ref && rev));
if (ref)
path += "/" + *ref;
path.push_back(*ref);
if (rev)
path += "/" + rev->to_string(HashFormat::Base16, false);
path.push_back(rev->to_string(HashFormat::Base16, false));
auto url = ParsedURL{
.scheme = std::string{schemeName()},
.path = path,


@ -14,7 +14,7 @@ struct IndirectInputScheme : InputScheme
if (url.scheme != "flake")
return {};
auto path = tokenizeString<std::vector<std::string>>(url.path, "/");
const auto & path = url.path;
std::optional<Hash> rev;
std::optional<std::string> ref;
@ -82,16 +82,15 @@ struct IndirectInputScheme : InputScheme
ParsedURL toURL(const Input & input) const override
{
ParsedURL url;
url.scheme = "flake";
url.path = getStrAttr(input.attrs, "id");
ParsedURL url{
.scheme = "flake",
.path = {getStrAttr(input.attrs, "id")},
};
if (auto ref = input.getRef()) {
url.path += '/';
url.path += *ref;
url.path.push_back(*ref);
};
if (auto rev = input.getRev()) {
url.path += '/';
url.path += rev->gitRev();
url.path.push_back(rev->gitRev());
};
return url;
}


@ -120,7 +120,7 @@ struct MercurialInputScheme : InputScheme
{
auto url = parseURL(getStrAttr(input.attrs, "url"));
if (url.scheme == "file" && !input.getRef() && !input.getRev())
return url.path;
return renderUrlPathEnsureLegal(url.path);
return {};
}
@ -152,7 +152,7 @@ struct MercurialInputScheme : InputScheme
{
auto url = parseURL(getStrAttr(input.attrs, "url"));
bool isLocal = url.scheme == "file";
return {isLocal, isLocal ? url.path : url.to_string()};
return {isLocal, isLocal ? renderUrlPathEnsureLegal(url.path) : url.to_string()};
}
StorePath fetchToStore(ref<Store> store, Input & input) const


@ -20,7 +20,7 @@ struct PathInputScheme : InputScheme
Input input{settings};
input.attrs.insert_or_assign("type", "path");
input.attrs.insert_or_assign("path", url.path);
input.attrs.insert_or_assign("path", renderUrlPathEnsureLegal(url.path));
for (auto & [name, value] : url.query)
if (name == "rev" || name == "narHash")
@ -74,7 +74,7 @@ struct PathInputScheme : InputScheme
query.erase("__final");
return ParsedURL{
.scheme = "path",
.path = getStrAttr(input.attrs, "path"),
.path = splitString<std::vector<std::string>>(getStrAttr(input.attrs, "path"), "/"),
.query = query,
};
}


@ -107,19 +107,19 @@ DownloadFileResult downloadFile(
}
static DownloadTarballResult downloadTarball_(
const Settings & settings, const std::string & url, const Headers & headers, const std::string & displayPrefix)
const Settings & settings, const std::string & urlS, const Headers & headers, const std::string & displayPrefix)
{
auto url = parseURL(urlS);
// Some friendly error messages for common mistakes.
// Namely, let's catch when the URL is a local file path, but
// it is not in fact a tarball.
if (url.rfind("file://", 0) == 0) {
// Remove "file://" prefix to get the local file path
std::string localPath = url.substr(7);
if (!std::filesystem::exists(localPath)) {
if (url.scheme == "file") {
std::filesystem::path localPath = renderUrlPathEnsureLegal(url.path);
if (!exists(localPath)) {
throw Error("tarball '%s' does not exist.", localPath);
}
if (std::filesystem::is_directory(localPath)) {
if (is_directory(localPath)) {
if (std::filesystem::exists(localPath + "/.git")) {
throw Error(
"tarball '%s' is a git repository, not a tarball. Please use `git+file` as the scheme.", localPath);
@ -128,7 +128,7 @@ static DownloadTarballResult downloadTarball_(
}
}
Cache::Key cacheKey{"tarball", {{"url", url}}};
Cache::Key cacheKey{"tarball", {{"url", urlS}}};
auto cached = settings.getCache()->lookupExpired(cacheKey);
@ -153,7 +153,7 @@ static DownloadTarballResult downloadTarball_(
auto _res = std::make_shared<Sync<FileTransferResult>>();
auto source = sinkToSource([&](Sink & sink) {
FileTransferRequest req(parseURL(url));
FileTransferRequest req(url);
req.expectedETag = cached ? getStrAttr(cached->value, "etag") : "";
getFileTransfer()->download(std::move(req), sink, [_res](FileTransferResult r) { *_res->lock() = r; });
});
@ -166,7 +166,7 @@ static DownloadTarballResult downloadTarball_(
/* Note: if the download is cached, `importTarball()` will receive
no data, which causes it to import an empty tarball. */
auto archive = hasSuffix(toLower(parseURL(url).path), ".zip") ? ({
auto archive = !url.path.empty() && hasSuffix(toLower(url.path.back()), ".zip") ? ({
/* In streaming mode, libarchive doesn't handle
symlinks in zip files correctly (#10649). So write
the entire file to disk so libarchive can access it
@ -180,7 +180,7 @@ static DownloadTarballResult downloadTarball_(
}
TarArchive{path};
})
: TarArchive{*source};
: TarArchive{*source};
auto tarballCache = getTarballCache();
auto parseSink = tarballCache->getFileSystemObjectSink();
auto lastModified = unpackTarfileToSink(archive, *parseSink);
@ -234,8 +234,11 @@ struct CurlInputScheme : InputScheme
{
const StringSet transportUrlSchemes = {"file", "http", "https"};
bool hasTarballExtension(std::string_view path) const
bool hasTarballExtension(const ParsedURL & url) const
{
if (url.path.empty())
return false;
const auto & path = url.path.back();
return hasSuffix(path, ".zip") || hasSuffix(path, ".tar") || hasSuffix(path, ".tgz")
|| hasSuffix(path, ".tar.gz") || hasSuffix(path, ".tar.xz") || hasSuffix(path, ".tar.bz2")
|| hasSuffix(path, ".tar.zst");
@ -336,7 +339,7 @@ struct FileInputScheme : CurlInputScheme
auto parsedUrlScheme = parseUrlScheme(url.scheme);
return transportUrlSchemes.count(std::string(parsedUrlScheme.transport))
&& (parsedUrlScheme.application ? parsedUrlScheme.application.value() == schemeName()
: (!requireTree && !hasTarballExtension(url.path)));
: (!requireTree && !hasTarballExtension(url)));
}
std::pair<ref<SourceAccessor>, Input> getAccessor(ref<Store> store, const Input & _input) const override
@ -373,7 +376,7 @@ struct TarballInputScheme : CurlInputScheme
return transportUrlSchemes.count(std::string(parsedUrlScheme.transport))
&& (parsedUrlScheme.application ? parsedUrlScheme.application.value() == schemeName()
: (requireTree || hasTarballExtension(url.path)));
: (requireTree || hasTarballExtension(url)));
}
std::pair<ref<SourceAccessor>, Input> getAccessor(ref<Store> store, const Input & _input) const override


@ -143,7 +143,7 @@ std::pair<FlakeRef, std::string> parsePathFlakeRefWithFragment(
auto parsedURL = ParsedURL{
.scheme = "git+file",
.authority = ParsedURL::Authority{},
.path = flakeRoot,
.path = splitString<std::vector<std::string>>(flakeRoot, "/"),
.query = query,
.fragment = fragment,
};
@ -172,7 +172,13 @@ std::pair<FlakeRef, std::string> parsePathFlakeRefWithFragment(
return fromParsedURL(
fetchSettings,
{.scheme = "path", .authority = ParsedURL::Authority{}, .path = path, .query = query, .fragment = fragment},
{
.scheme = "path",
.authority = ParsedURL::Authority{},
.path = splitString<std::vector<std::string>>(path, "/"),
.query = query,
.fragment = fragment,
},
isFlake);
}
@ -193,7 +199,7 @@ parseFlakeIdRef(const fetchers::Settings & fetchSettings, const std::string & ur
auto parsedURL = ParsedURL{
.scheme = "flake",
.authority = ParsedURL::Authority{},
.path = match[1],
.path = splitString<std::vector<std::string>>(match[1].str(), "/"),
};
return std::make_pair(
@ -211,8 +217,12 @@ std::optional<std::pair<FlakeRef, std::string>> parseURLFlakeRef(
{
try {
auto parsed = parseURL(url, /*lenient=*/true);
if (baseDir && (parsed.scheme == "path" || parsed.scheme == "git+file") && !isAbsolute(parsed.path))
parsed.path = absPath(parsed.path, *baseDir);
if (baseDir && (parsed.scheme == "path" || parsed.scheme == "git+file")) {
/* Here we know that the path must not contain encoded '/' or NUL bytes. */
auto path = renderUrlPathEnsureLegal(parsed.path);
if (!isAbsolute(path))
parsed.path = splitString<std::vector<std::string>>(absPath(path, *baseDir), "/");
}
return fromParsedURL(fetchSettings, std::move(parsed), isFlake);
} catch (BadURL &) {
return std::nullopt;


@ -27,16 +27,21 @@ std::optional<std::string> getNameFromURL(const ParsedURL & url)
return match.str(2);
}
/* This is not right, because special chars like slashes within the
path fragments should be percent encoded, but I don't think any
of the regexes above care. */
auto path = concatStringsSep("/", url.path);
/* If this is a github/gitlab/sourcehut flake, use the repo name */
if (std::regex_match(url.scheme, gitProviderRegex) && std::regex_match(url.path, match, secondPathSegmentRegex))
if (std::regex_match(url.scheme, gitProviderRegex) && std::regex_match(path, match, secondPathSegmentRegex))
return match.str(1);
/* If it is a regular git flake, use the directory name */
if (std::regex_match(url.scheme, gitSchemeRegex) && std::regex_match(url.path, match, lastPathSegmentRegex))
if (std::regex_match(url.scheme, gitSchemeRegex) && std::regex_match(path, match, lastPathSegmentRegex))
return match.str(1);
/* If there is no fragment, take the last element of the path */
if (std::regex_match(url.path, match, lastPathSegmentRegex))
if (std::regex_match(path, match, lastPathSegmentRegex))
return match.str(1);
/* If even that didn't work, the URL does not contain enough info to determine a useful name */


@ -33,7 +33,7 @@ INSTANTIATE_TEST_SUITE_P(
"s3://my-bucket/my-key.txt",
{
.bucket = "my-bucket",
.key = "my-key.txt",
.key = {"my-key.txt"},
},
"basic_s3_bucket",
},
@ -41,7 +41,7 @@ INSTANTIATE_TEST_SUITE_P(
"s3://prod-cache/nix/store/abc123.nar.xz?region=eu-west-1",
{
.bucket = "prod-cache",
.key = "nix/store/abc123.nar.xz",
.key = {"nix", "store", "abc123.nar.xz"},
.region = "eu-west-1",
},
"with_region",
@ -50,7 +50,7 @@ INSTANTIATE_TEST_SUITE_P(
"s3://bucket/key?region=us-west-2&profile=prod&endpoint=custom.s3.com&scheme=https&region=us-east-1",
{
.bucket = "bucket",
.key = "key",
.key = {"key"},
.profile = "prod",
.region = "us-west-2", //< using the first parameter (decodeQuery ignores dupicates)
.scheme = "https",
@ -62,7 +62,7 @@ INSTANTIATE_TEST_SUITE_P(
"s3://cache/file.txt?profile=production&region=ap-southeast-2",
{
.bucket = "cache",
.key = "file.txt",
.key = {"file.txt"},
.profile = "production",
.region = "ap-southeast-2",
},
@ -72,13 +72,14 @@ INSTANTIATE_TEST_SUITE_P(
"s3://bucket/key?endpoint=https://minio.local&scheme=http",
{
.bucket = "bucket",
.key = "key",
.key = {"key"},
/* TODO: Figure out what AWS SDK is doing when both endpointOverride and scheme are set. */
.scheme = "http",
.endpoint =
ParsedURL{
.scheme = "https",
.authority = ParsedURL::Authority{.host = "minio.local"},
.path = {""},
},
},
"with_absolute_endpoint_uri",
@ -101,6 +102,7 @@ struct S3ToHttpsConversionTestCase
{
ParsedS3URL input;
ParsedURL expected;
std::string expectedRendered;
std::string description;
};
@ -113,6 +115,7 @@ TEST_P(S3ToHttpsConversionTest, ConvertsCorrectly)
const auto & testCase = GetParam();
auto result = testCase.input.toHttpsUrl();
EXPECT_EQ(result, testCase.expected) << "Failed for: " << testCase.description;
EXPECT_EQ(result.to_string(), testCase.expectedRendered);
}
INSTANTIATE_TEST_SUITE_P(
@ -122,71 +125,77 @@ INSTANTIATE_TEST_SUITE_P(
S3ToHttpsConversionTestCase{
ParsedS3URL{
.bucket = "my-bucket",
.key = "my-key.txt",
.key = {"my-key.txt"},
},
ParsedURL{
.scheme = "https",
.authority = ParsedURL::Authority{.host = "s3.us-east-1.amazonaws.com"},
.path = "/my-bucket/my-key.txt",
.path = {"", "my-bucket", "my-key.txt"},
},
"https://s3.us-east-1.amazonaws.com/my-bucket/my-key.txt",
"basic_s3_default_region",
},
S3ToHttpsConversionTestCase{
ParsedS3URL{
.bucket = "prod-cache",
.key = "nix/store/abc123.nar.xz",
.key = {"nix", "store", "abc123.nar.xz"},
.region = "eu-west-1",
},
ParsedURL{
.scheme = "https",
.authority = ParsedURL::Authority{.host = "s3.eu-west-1.amazonaws.com"},
.path = "/prod-cache/nix/store/abc123.nar.xz",
.path = {"", "prod-cache", "nix", "store", "abc123.nar.xz"},
},
"https://s3.eu-west-1.amazonaws.com/prod-cache/nix/store/abc123.nar.xz",
"with_eu_west_1_region",
},
S3ToHttpsConversionTestCase{
ParsedS3URL{
.bucket = "bucket",
.key = "key",
.key = {"key"},
.scheme = "http",
.endpoint = ParsedURL::Authority{.host = "custom.s3.com"},
},
ParsedURL{
.scheme = "http",
.authority = ParsedURL::Authority{.host = "custom.s3.com"},
.path = "/bucket/key",
.path = {"", "bucket", "key"},
},
"http://custom.s3.com/bucket/key",
"custom_endpoint_authority",
},
S3ToHttpsConversionTestCase{
ParsedS3URL{
.bucket = "bucket",
.key = "key",
.key = {"key"},
.endpoint =
ParsedURL{
.scheme = "http",
.authority = ParsedURL::Authority{.host = "server", .port = 9000},
.path = {""},
},
},
ParsedURL{
.scheme = "http",
.authority = ParsedURL::Authority{.host = "server", .port = 9000},
.path = "/bucket/key",
.path = {"", "bucket", "key"},
},
"http://server:9000/bucket/key",
"custom_endpoint_with_port",
},
S3ToHttpsConversionTestCase{
ParsedS3URL{
.bucket = "bucket",
.key = "path/to/file.txt",
.key = {"path", "to", "file.txt"},
.region = "ap-southeast-2",
.scheme = "https",
},
ParsedURL{
.scheme = "https",
.authority = ParsedURL::Authority{.host = "s3.ap-southeast-2.amazonaws.com"},
.path = "/bucket/path/to/file.txt",
.path = {"", "bucket", "path", "to", "file.txt"},
},
"https://s3.ap-southeast-2.amazonaws.com/bucket/path/to/file.txt",
"complex_path_and_region",
}),
[](const ::testing::TestParamInfo<S3ToHttpsConversionTestCase> & info) { return info.param.description; });


@ -815,7 +815,7 @@ struct curlFileTransfer : public FileTransfer
S3Helper s3Helper(profile, region, scheme, endpoint);
// FIXME: implement ETag
auto s3Res = s3Helper.getObject(parsed.bucket, parsed.key);
auto s3Res = s3Helper.getObject(parsed.bucket, encodeUrlPath(parsed.key));
FileTransferResult res;
if (!s3Res.data)
throw FileTransferError(NotFound, {}, "S3 object '%s' does not exist", request.uri);


@ -27,7 +27,7 @@ HttpBinaryCacheStoreConfig::HttpBinaryCacheStoreConfig(
+ (!_cacheUri.empty() ? _cacheUri
: throw UsageError("`%s` Store requires a non-empty authority in Store URL", scheme))))
{
while (!cacheUri.path.empty() && cacheUri.path.back() == '/')
while (!cacheUri.path.empty() && cacheUri.path.back() == "")
cacheUri.path.pop_back();
}
@ -37,7 +37,7 @@ StoreReference HttpBinaryCacheStoreConfig::getReference() const
.variant =
StoreReference::Specified{
.scheme = cacheUri.scheme,
.authority = (cacheUri.authority ? cacheUri.authority->to_string() : "") + cacheUri.path,
.authority = cacheUri.renderAuthorityAndPath(),
},
.params = cacheUri.query,
};
@ -157,7 +157,7 @@ protected:
/* Otherwise the last path fragment will get discarded. */
auto cacheUriWithTrailingSlash = config->cacheUri;
if (!cacheUriWithTrailingSlash.path.empty())
cacheUriWithTrailingSlash.path += "/";
cacheUriWithTrailingSlash.path.push_back("");
/* path is not a path, but a full relative or absolute
URL, e.g. we've seen in the wild NARINFO files have a URL


@ -54,7 +54,12 @@ struct S3Helper
struct ParsedS3URL
{
std::string bucket;
std::string key;
/**
* @see ParsedURL::path. This is a vector for the same reason.
* Unlike ParsedURL::path this doesn't include the leading empty segment,
* since the bucket name is necessary.
*/
std::vector<std::string> key;
std::optional<std::string> profile;
std::optional<std::string> region;
std::optional<std::string> scheme;
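
For illustration, a minimal sketch (not part of the diff; the include path is an assumption) of how the vector-valued key relates to the rendered HTTPS URL, mirroring the `with_eu_west_1_region` test case above:

#include "nix/store/s3-url.hh" // assumed location of ParsedS3URL

using namespace nix;

void s3KeyExample()
{
    // Unlike ParsedURL::path, the key has no leading empty segment.
    ParsedS3URL s3{
        .bucket = "prod-cache",
        .key = {"nix", "store", "abc123.nar.xz"},
        .region = "eu-west-1",
    };

    // toHttpsUrl() prepends the leading empty segment and the bucket:
    //   path == {"", "prod-cache", "nix", "store", "abc123.nar.xz"}
    // which renders as
    //   https://s3.eu-west-1.amazonaws.com/prod-cache/nix/store/abc123.nar.xz
    auto https = s3.toHttpsUrl();
}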


@ -77,12 +77,22 @@ struct StoreReference
*/
std::string render(bool withParams = true) const;
std::string to_string() const
{
return render();
}
/**
* Parse a URI into a store reference.
*/
static StoreReference parse(const std::string & uri, const Params & extraParams = Params{});
};
static inline std::ostream & operator<<(std::ostream & os, const StoreReference & ref)
{
return os << ref.render();
}
/**
* Split URI into protocol+hierarchy part and its parameter set.
*/


@ -3,6 +3,9 @@
#include "nix/util/url.hh"
#include "nix/util/util.hh"
#include "nix/util/canon-path.hh"
#include "nix/util/strings-inline.hh"
#include <ranges>
namespace nix {
@ -24,10 +27,6 @@ try {
|| parsed.authority->hostType != ParsedURL::Authority::HostType::Name)
throw BadURL("URI has a missing or invalid bucket name");
std::string_view key = parsed.path;
/* Make the key a relative path. */
splitPrefix(key, "/");
/* TODO: Validate the key against:
* https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines
*/
@ -41,10 +40,14 @@ try {
};
auto endpoint = getOptionalParam("endpoint");
if (parsed.path.size() <= 1 || !parsed.path.front().empty())
throw BadURL("URI has a missing or invalid key");
auto path = std::views::drop(parsed.path, 1) | std::ranges::to<std::vector<std::string>>();
return ParsedS3URL{
.bucket = parsed.authority->host,
.key = std::string{key},
.key = std::move(path),
.profile = getOptionalParam("profile"),
.region = getOptionalParam("region"),
.scheme = getOptionalParam("scheme"),
@ -78,26 +81,35 @@ ParsedURL ParsedS3URL::toHttpsUrl() const
overloaded{
[&](const std::monostate &) {
// No custom endpoint, use standard AWS S3 endpoint
std::vector<std::string> path{""};
path.push_back(bucket);
path.insert(path.end(), key.begin(), key.end());
return ParsedURL{
.scheme = std::string{schemeStr},
.authority = ParsedURL::Authority{.host = "s3." + regionStr + ".amazonaws.com"},
.path = (CanonPath::root / bucket / CanonPath(key)).abs(),
.path = std::move(path),
};
},
[&](const ParsedURL::Authority & auth) {
// Endpoint is just an authority (hostname/port)
std::vector<std::string> path{""};
path.push_back(bucket);
path.insert(path.end(), key.begin(), key.end());
return ParsedURL{
.scheme = std::string{schemeStr},
.authority = auth,
.path = (CanonPath::root / bucket / CanonPath(key)).abs(),
.path = std::move(path),
};
},
[&](const ParsedURL & endpointUrl) {
// Endpoint is already a ParsedURL (e.g., http://server:9000)
auto path = endpointUrl.path;
path.push_back(bucket);
path.insert(path.end(), key.begin(), key.end());
return ParsedURL{
.scheme = endpointUrl.scheme,
.authority = endpointUrl.authority,
.path = (CanonPath(endpointUrl.path) / bucket / CanonPath(key)).abs(),
.path = std::move(path),
};
},
},


@ -48,13 +48,11 @@ StoreReference StoreReference::parse(const std::string & uri, const StoreReferen
auto parsedUri = parseURL(uri, /*lenient=*/true);
params.insert(parsedUri.query.begin(), parsedUri.query.end());
auto baseURI = parsedUri.authority.value_or(ParsedURL::Authority{}).to_string() + parsedUri.path;
return {
.variant =
Specified{
.scheme = std::move(parsedUri.scheme),
.authority = std::move(baseURI),
.authority = parsedUri.renderAuthorityAndPath(),
},
.params = std::move(params),
};


@ -18,7 +18,7 @@ TEST(parseURL, parsesSimpleHttpUrl)
ParsedURL expected{
.scheme = "http",
.authority = Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/file.tar.gz",
.path = {"", "file.tar.gz"},
.query = (StringMap) {},
.fragment = "",
};
@ -35,7 +35,7 @@ TEST(parseURL, parsesSimpleHttpsUrl)
ParsedURL expected{
.scheme = "https",
.authority = Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/file.tar.gz",
.path = {"", "file.tar.gz"},
.query = (StringMap) {},
.fragment = "",
};
@ -52,7 +52,7 @@ TEST(parseURL, parsesSimpleHttpUrlWithQueryAndFragment)
ParsedURL expected{
.scheme = "https",
.authority = Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/file.tar.gz",
.path = {"", "file.tar.gz"},
.query = (StringMap) {{"download", "fast"}, {"when", "now"}},
.fragment = "hello",
};
@ -69,7 +69,7 @@ TEST(parseURL, parsesSimpleHttpUrlWithComplexFragment)
ParsedURL expected{
.scheme = "http",
.authority = Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/file.tar.gz",
.path = {"", "file.tar.gz"},
.query = (StringMap) {{"field", "value"}},
.fragment = "?foo=bar#",
};
@ -85,7 +85,7 @@ TEST(parseURL, parsesFilePlusHttpsUrl)
ParsedURL expected{
.scheme = "file+https",
.authority = Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/video.mp4",
.path = {"", "video.mp4"},
.query = (StringMap) {},
.fragment = "",
};
@ -108,7 +108,7 @@ TEST(parseURL, parseIPv4Address)
ParsedURL expected{
.scheme = "http",
.authority = Authority{.hostType = HostType::IPv4, .host = "127.0.0.1", .port = 8080},
.path = "/file.tar.gz",
.path = {"", "file.tar.gz"},
.query = (StringMap) {{"download", "fast"}, {"when", "now"}},
.fragment = "hello",
};
@ -125,7 +125,7 @@ TEST(parseURL, parseScopedRFC6874IPv6Address)
ParsedURL expected{
.scheme = "http",
.authority = Authority{.hostType = HostType::IPv6, .host = "fe80::818c:da4d:8975:415c%enp0s25", .port = 8080},
.path = "",
.path = {""},
.query = (StringMap) {},
.fragment = "",
};
@ -147,7 +147,7 @@ TEST(parseURL, parseIPv6Address)
.host = "2a02:8071:8192:c100:311d:192d:81ac:11ea",
.port = 8080,
},
.path = "",
.path = {""},
.query = (StringMap) {},
.fragment = "",
};
@ -178,7 +178,7 @@ TEST(parseURL, parseUserPassword)
.password = "pass",
.port = 8080,
},
.path = "/file.tar.gz",
.path = {"", "file.tar.gz"},
.query = (StringMap) {},
.fragment = "",
};
@ -195,11 +195,12 @@ TEST(parseURL, parseFileURLWithQueryAndFragment)
ParsedURL expected{
.scheme = "file",
.authority = Authority{},
.path = "/none/of//your/business",
.path = {"", "none", "of", "", "your", "business"},
.query = (StringMap) {},
.fragment = "",
};
ASSERT_EQ(parsed.renderPath(), "/none/of//your/business");
ASSERT_EQ(parsed, expected);
ASSERT_EQ(s, parsed.to_string());
}
@ -212,9 +213,10 @@ TEST(parseURL, parseFileURL)
ParsedURL expected{
.scheme = "file",
.authority = std::nullopt,
.path = "/none/of/your/business/",
.path = {"", "none", "of", "your", "business", ""},
};
ASSERT_EQ(parsed.renderPath(), "/none/of/your/business/");
ASSERT_EQ(parsed, expected);
ASSERT_EQ(s, parsed.to_string());
}
@ -227,10 +229,11 @@ TEST(parseURL, parseFileURLWithAuthority)
ParsedURL expected{
.scheme = "file",
.authority = Authority{.host = ""},
.path = "///of/your/business//",
.path = {"", "", "", "of", "your", "business", "", ""},
};
ASSERT_EQ(parsed.authority, expected.authority);
ASSERT_EQ(parsed.path, expected.path);
ASSERT_EQ(parsed.renderPath(), "///of/your/business//");
ASSERT_EQ(parsed, expected);
ASSERT_EQ(s, parsed.to_string());
}
@ -243,9 +246,10 @@ TEST(parseURL, parseFileURLNoLeadingSlash)
ParsedURL expected{
.scheme = "file",
.authority = std::nullopt,
.path = "none/of/your/business/",
.path = {"none", "of", "your", "business", ""},
};
ASSERT_EQ(parsed.renderPath(), "none/of/your/business/");
ASSERT_EQ(parsed, expected);
ASSERT_EQ("file:none/of/your/business/", parsed.to_string());
}
@ -258,9 +262,10 @@ TEST(parseURL, parseHttpTrailingSlash)
ParsedURL expected{
.scheme = "http",
.authority = Authority{.host = "example.com"},
.path = "/",
.path = {"", ""},
};
ASSERT_EQ(parsed.renderPath(), "/");
ASSERT_EQ(parsed, expected);
ASSERT_EQ(s, parsed.to_string());
}
@ -306,7 +311,7 @@ TEST(parseURL, parseFTPUrl)
ParsedURL expected{
.scheme = "ftp",
.authority = Authority{.hostType = HostType::Name, .host = "ftp.nixos.org"},
.path = "/downloads/nixos.iso",
.path = {"", "downloads", "nixos.iso"},
.query = (StringMap) {},
.fragment = "",
};
@ -342,7 +347,7 @@ TEST(parseURL, parsesHttpUrlWithEmptyPort)
ParsedURL expected{
.scheme = "http",
.authority = Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/file.tar.gz",
.path = {"", "file.tar.gz"},
.query = (StringMap) {{"foo", "bar"}},
.fragment = "",
};
@ -362,7 +367,7 @@ TEST(parseURLRelative, resolvesRelativePath)
ParsedURL expected{
.scheme = "http",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"},
.path = "/dir/subdir/file.txt",
.path = {"", "dir", "subdir", "file.txt"},
.query = {},
.fragment = "",
};
@ -376,7 +381,7 @@ TEST(parseURLRelative, baseUrlIpv6AddressWithoutZoneId)
ParsedURL expected{
.scheme = "http",
.authority = ParsedURL::Authority{.hostType = HostType::IPv6, .host = "fe80::818c:da4d:8975:415c"},
.path = "/dir/subdir/file.txt",
.path = {"", "dir", "subdir", "file.txt"},
.query = {},
.fragment = "",
};
@ -390,7 +395,7 @@ TEST(parseURLRelative, resolvesRelativePathIpv6AddressWithZoneId)
ParsedURL expected{
.scheme = "http",
.authority = Authority{.hostType = HostType::IPv6, .host = "fe80::818c:da4d:8975:415c%enp0s25", .port = 8080},
.path = "/dir/subdir/file2.txt",
.path = {"", "dir", "subdir", "file2.txt"},
.query = {},
.fragment = "",
};
@ -405,7 +410,7 @@ TEST(parseURLRelative, resolvesRelativePathWithDot)
ParsedURL expected{
.scheme = "http",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"},
.path = "/dir/subdir/file.txt",
.path = {"", "dir", "subdir", "file.txt"},
.query = {},
.fragment = "",
};
@ -419,7 +424,21 @@ TEST(parseURLRelative, resolvesParentDirectory)
ParsedURL expected{
.scheme = "http",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org", .port = 234},
.path = "/up.txt",
.path = {"", "up.txt"},
.query = {},
.fragment = "",
};
ASSERT_EQ(parsed, expected);
}
TEST(parseURLRelative, resolvesParentDirectoryNotTrickedByEscapedSlash)
{
ParsedURL base = parseURL("http://example.org:234/dir\%2Ffirst-trick/another-dir\%2Fsecond-trick/page.html");
auto parsed = parseURLRelative("../up.txt", base);
ParsedURL expected{
.scheme = "http",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org", .port = 234},
.path = {"", "dir/first-trick", "up.txt"},
.query = {},
.fragment = "",
};
@ -433,7 +452,7 @@ TEST(parseURLRelative, replacesPathWithAbsoluteRelative)
ParsedURL expected{
.scheme = "http",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "example.org"},
.path = "/rooted.txt",
.path = {"", "rooted.txt"},
.query = {},
.fragment = "",
};
@ -448,7 +467,7 @@ TEST(parseURLRelative, keepsQueryAndFragmentFromRelative)
ParsedURL expected{
.scheme = "https",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/path/other.html",
.path = {"", "path", "other.html"},
.query = {{"x", "1"}, {"y", "2"}},
.fragment = "frag",
};
@ -489,7 +508,7 @@ TEST(parseURLRelative, emptyRelative)
ParsedURL expected{
.scheme = "https",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/path/index.html",
.path = {"", "path", "index.html"},
.query = {{"a b", "5 6"}, {"x y", "34"}},
.fragment = "",
};
@ -504,7 +523,7 @@ TEST(parseURLRelative, fragmentRelative)
ParsedURL expected{
.scheme = "https",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/path/index.html",
.path = {"", "path", "index.html"},
.query = {{"a b", "5 6"}, {"x y", "34"}},
.fragment = "frag2",
};
@ -518,7 +537,7 @@ TEST(parseURLRelative, queryRelative)
ParsedURL expected{
.scheme = "https",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/path/index.html",
.path = {"", "path", "index.html"},
.query = {{"asdf qwer", "1 2 3"}},
.fragment = "",
};
@ -532,7 +551,7 @@ TEST(parseURLRelative, queryFragmentRelative)
ParsedURL expected{
.scheme = "https",
.authority = ParsedURL::Authority{.hostType = HostType::Name, .host = "www.example.org"},
.path = "/path/index.html",
.path = {"", "path", "index.html"},
.query = {{"asdf qwer", "1 2 3"}},
.fragment = "frag2",
};
@ -648,6 +667,25 @@ TEST(percentEncode, yen)
ASSERT_EQ(percentDecode(e), s);
}
TEST(parseURL, gitlabNamespacedProjectUrls)
{
// Test GitLab URL patterns with namespaced projects
// These should preserve %2F encoding in the path
auto s = "https://gitlab.example.com/api/v4/projects/group%2Fsubgroup%2Fproject/repository/archive.tar.gz";
auto parsed = parseURL(s);
ParsedURL expected{
.scheme = "https",
.authority = Authority{.hostType = HostType::Name, .host = "gitlab.example.com"},
.path = {"", "api", "v4", "projects", "group/subgroup/project", "repository", "archive.tar.gz"},
.query = {},
.fragment = "",
};
ASSERT_EQ(parsed, expected);
ASSERT_EQ(s, parsed.to_string());
}
TEST(nix, isValidSchemeName)
{
ASSERT_TRUE(isValidSchemeName("http"));


@ -1,7 +1,10 @@
#pragma once
///@file
#include <span>
#include "nix/util/error.hh"
#include "nix/util/canon-path.hh"
namespace nix {
@ -65,6 +68,7 @@ struct ParsedURL
};
std::string scheme;
/**
* Optional parsed authority component of the URL.
*
@ -75,16 +79,155 @@ struct ParsedURL
* part of the URL.
*/
std::optional<Authority> authority;
std::string path;
/**
* @note Unlike Unix paths, URLs provide a way to escape path
* separators, in the form of the `%2F` encoding of `/`. That means
* that if one percent-decodes the path into a single string, that
* decoding will be *lossy*, because `/` and `%2F` both become `/`.
* The right thing to do is instead split up the path on `/`, and
* then percent decode each part.
*
* For example, the path
* ```
* foo/bar%2Fbaz/quux
* ```
* is parsed as
* ```
* {"foo, "bar/baz", "quux"}
* ```
*
* The splitting and joining we do assume that the separator (`/` in this case) only goes *between* elements.
*
* That means the parsed representation will begin with an empty
* element to make an initial `/`, and will end with an empty
* element to make a trailing `/`. As a result, the elements of this
* vector mostly, but *not always*, correspond to segments of the
* path.
*
* Examples:
*
* - ```
* https://foo.com/bar
* ```
* has path
* ```
* {"", "bar"}
* ```
*
* - ```
* https://foo.com/bar/
* ```
* has path
* ```
* {"", "bar", ""}
* ```
*
* - ```
* https://foo.com//bar///
* ```
* has path
* ```
* {"", "", "bar", "", "", ""}
* ```
*
* - ```
* https://foo.com
* ```
* has path
* ```
* {""}
* ```
*
* - ```
* https://foo.com/
* ```
* has path
* ```
* {"", ""}
* ```
*
* - ```
* tel:01234
* ```
* has path `{"01234"}` (and no authority)
*
* - ```
* foo:/01234
* ```
* has path `{"", "01234"}` (and no authority)
*
* Note that both trailing and leading slashes are, in general,
* semantically significant.
*
* For trailing slashes, the main example affecting many schemes is
* that a relative reference like `baz` resolves differently depending
* on whether the base URL has a trailing slash:
*
* - against `https://foo.com/bar` it resolves to `https://foo.com/baz`
*
* - against `https://foo.com/bar/` it resolves to `https://foo.com/bar/baz`
*
* See `parseURLRelative` for more details.
*
* For leading slashes, there are some requirements to be aware of.
*
* - When there is an authority, the path *must* start with a leading
* slash. Otherwise the path will not be separated from the
* authority, and will not round trip through the parser:
*
* ```
* {.scheme="https", .authority.host = "foo", .path={"bad"}}
* ```
* will render to `https://foobar`, but that would parse back as
* ```
* {.scheme="https", .authority.host = "foobar", .path={}}
* ```
*
* - When there is no authority, the path must *not* begin with two
* slashes. Otherwise, there will be another parser round trip
* issue:
*
* ```
* {.scheme="https", .path={"", "", "bad"}}
* ```
* will render to `https://bad`, but that would parse back as
* ```
* {.scheme="https", .authority.host = "bad", .path={}}
* ```
*
* These invariants will be checked in `to_string` and
* `renderAuthorityAndPath`.
*/
std::vector<std::string> path;
StringMap query;
std::string fragment;
/**
* Render just the middle part of a URL, without the `//` which
* indicates whether the authority is present.
*
* @note This is a somewhat ad-hoc operation, but it comes up with
* some frequency, probably due to the current design of
* `StoreReference` in `nix-store`.
*/
std::string renderAuthorityAndPath() const;
std::string to_string() const;
/**
* Render the path to a string.
*
* @param encode Whether to percent encode path segments.
*/
std::string renderPath(bool encode = false) const;
auto operator<=>(const ParsedURL & other) const noexcept = default;
/**
* Remove `.` and `..` path elements.
* Remove `.` and `..` path segments.
*/
ParsedURL canonicalise();
};
@ -96,6 +239,22 @@ MakeError(BadURL, Error);
std::string percentDecode(std::string_view in);
std::string percentEncode(std::string_view s, std::string_view keep = "");
/**
* Get the path part of the URL as an absolute or relative Path.
*
* @throws if any path component contains a slash (which would have
* been encoded as `%2F` in the rendered URL). This is because OS file
* paths have no escape sequences --- file names cannot contain a
* `/`.
*/
Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath);
/**
* Percent-encode a URL path. Encoding interior slashes as `%2F` is
* the most important case.
*/
std::string encodeUrlPath(std::span<const std::string> urlPath);
/**
* @param lenient @see parseURL
*/
@ -114,6 +273,12 @@ std::string encodeQuery(const StringMap & query);
* @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
*
* @throws BadURL
*
* The WHATWG specification of the URL constructor in JavaScript is
* also a useful reference:
* https://url.spec.whatwg.org/#concept-basic-url-parser. Note, however,
* that it includes various scheme-specific normalizations / extra steps
* that we do not implement.
*/
ParsedURL parseURL(std::string_view url, bool lenient = false);
@ -123,7 +288,11 @@ ParsedURL parseURL(std::string_view url, bool lenient = false);
*
* This is specified in [IETF RFC 3986, section 5](https://datatracker.ietf.org/doc/html/rfc3986#section-5)
*
* Behavior should also match the `new URL(url, base)` JavaScript constructor.
* @throws BadURL
*
* Behavior should also match the `new URL(url, base)` JavaScript
* constructor, except for extra steps specific to the HTTP scheme. See
* `parseURL` for link to the relevant WHATWG standard.
*/
ParsedURL parseURLRelative(std::string_view url, const ParsedURL & base);
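
To make the new path helpers (`renderUrlPathEnsureLegal`, `encodeUrlPath`) concrete, a small usage sketch (not part of the diff, assuming only the declarations added in this header):

#include "nix/util/url.hh"

using namespace nix;

void pathHelpersExample()
{
    auto url = parseURL("https://example.org/projects/group%2Fproject");
    // url.path == {"", "projects", "group/project"}

    // encodeUrlPath() re-encodes interior slashes, so the %2F survives rendering:
    //   "/projects/group%2Fproject"
    auto rendered = encodeUrlPath(url.path);

    // renderUrlPathEnsureLegal() produces an OS path, so it rejects any segment
    // containing '/', since a file name cannot contain one.
    try {
        auto osPath = renderUrlPathEnsureLegal(url.path); // throws BadURL here
    } catch (BadURL &) {
        // expected for this URL
    }
}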


@ -3,6 +3,7 @@
#include "nix/util/util.hh"
#include "nix/util/split.hh"
#include "nix/util/canon-path.hh"
#include "nix/util/strings-inline.hh"
#include <boost/url.hpp>
@ -179,11 +180,14 @@ static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient)
if (authority && authority->host.size() && transportIsFile)
throw BadURL("file:// URL '%s' has unexpected authority '%s'", urlView.buffer(), *authority);
auto path = urlView.path(); /* Does pct-decoding */
auto fragment = urlView.fragment(); /* Does pct-decoding */
if (transportIsFile && path.empty())
path = "/";
boost::core::string_view encodedPath = urlView.encoded_path();
if (transportIsFile && encodedPath.empty())
encodedPath = "/";
auto path = std::views::transform(splitString<std::vector<std::string_view>>(encodedPath, "/"), percentDecode)
| std::ranges::to<std::vector<std::string>>();
/* Get the raw query. Store URI supports smuggling doubly nested queries, where
the inner &/? are pct-encoded. */
@ -192,7 +196,7 @@ static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient)
return ParsedURL{
.scheme = scheme,
.authority = authority,
.path = path,
.path = std::move(path),
.query = decodeQuery(query, lenient),
.fragment = fragment,
};
@ -215,7 +219,7 @@ try {
if (authority.port)
resolved.set_port_number(*authority.port);
}
resolved.set_path(base.path);
resolved.set_encoded_path(encodeUrlPath(base.path));
resolved.set_encoded_query(encodeQuery(base.query));
resolved.set_fragment(base.fragment);
} catch (boost::system::system_error & e) {
@ -291,7 +295,15 @@ try {
}
const static std::string allowedInQuery = ":@/?";
const static std::string allowedInPath = ":@/";
const static std::string allowedInPath = ":@";
std::string encodeUrlPath(std::span<const std::string> urlPath)
{
std::vector<std::string> encodedPath;
for (auto & p : urlPath)
encodedPath.push_back(percentEncode(p, allowedInPath));
return concatStringsSep("/", encodedPath);
}
std::string encodeQuery(const StringMap & ss)
{
@ -308,10 +320,62 @@ std::string encodeQuery(const StringMap & ss)
return res;
}
Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath)
{
for (const auto & comp : urlPath) {
/* This is only really valid for UNIX. Windows has more restrictions. */
if (comp.contains('/'))
throw BadURL("URL path component '%s' contains '/', which is not allowed in file names", comp);
if (comp.contains(char(0)))
throw BadURL("URL path component '%s' contains NUL byte which is not allowed", comp);
}
return concatStringsSep("/", urlPath);
}
std::string ParsedURL::renderPath(bool encode) const
{
if (encode)
return encodeUrlPath(path);
return concatStringsSep("/", path);
}
std::string ParsedURL::renderAuthorityAndPath() const
{
std::string res;
/* The following assertions correspond to 3.3. Path [rfc3986]. URL parser
will never violate these properties, but hand-constructed ParsedURLs might. */
if (authority.has_value()) {
/* If a URI contains an authority component, then the path component
must either be empty or begin with a slash ("/") character. */
assert(path.empty() || path.front().empty());
res += authority->to_string();
} else if (std::ranges::equal(std::views::take(path, 2), std::views::repeat("", 2))) {
/* If a URI does not contain an authority component, then the path cannot begin
with two slash characters ("//") */
unreachable();
}
res += encodeUrlPath(path);
return res;
}
std::string ParsedURL::to_string() const
{
return scheme + ":" + (authority ? "//" + authority->to_string() : "") + percentEncode(path, allowedInPath)
+ (query.empty() ? "" : "?" + encodeQuery(query)) + (fragment.empty() ? "" : "#" + percentEncode(fragment));
std::string res;
res += scheme;
res += ":";
if (authority.has_value())
res += "//";
res += renderAuthorityAndPath();
if (!query.empty()) {
res += "?";
res += encodeQuery(query);
}
if (!fragment.empty()) {
res += "#";
res += percentEncode(fragment);
}
return res;
}
std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
@ -323,7 +387,7 @@ std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
ParsedURL ParsedURL::canonicalise()
{
ParsedURL res(*this);
res.path = CanonPath(res.path).abs();
res.path = splitString<std::vector<std::string>>(CanonPath(renderPath()).abs(), "/");
return res;
}
@ -352,7 +416,11 @@ ParsedURL fixGitURL(const std::string & url)
if (hasPrefix(url, "file:"))
return parseURL(url);
if (url.find("://") == std::string::npos) {
return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url});
return ParsedURL{
.scheme = "file",
.authority = ParsedURL::Authority{},
.path = splitString<std::vector<std::string>>(url, "/"),
};
}
return parseURL(url);
}