1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-11-21 17:59:36 +01:00
nix/src/libutil/include/nix/util/url.hh
John Ericson d2f1860ee5 Revert "Improve Git URI handling"
I (@Ericson2314) messed up. We were supposed to test the status quo
before landing any new changes, and also there is one change that is not
quite right (relative paths).

I am reverting for now, and then backporting the test suite to the old
situation.

This reverts commit 04ad66af5f.
2025-09-01 16:13:32 -04:00

403 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
///@file
#include <ranges>
#include <span>
#include "nix/util/error.hh"
#include "nix/util/canon-path.hh"
namespace nix {
/**
 * The components of a URL, parsed according to RFC3986.
 *
 * @note Every field is stored in percent-decoded form.
 */
struct ParsedURL
{
    /**
     * Parsed form of the authority component of a URL.
     *
     * An authority is made up of user information, a hostname and an
     * optional port number. Passwords in the userinfo are not yet
     * supported and are ignored.
     *
     * @todo Maybe support passwords in userinfo part of the url for auth.
     */
    struct Authority
    {
        enum class HostType {
            Name, ///< Registered name (can be empty)
            IPv4,
            IPv6,
            IPvFuture
        };

        static Authority parse(std::string_view encodedAuthority);

        auto operator<=>(const Authority & other) const = default;

        std::string to_string() const;

        friend std::ostream & operator<<(std::ostream & os, const Authority & self);

        /**
         * Which kind of host `host` holds, following RFC3986 section
         * 3.2.2 ("Host").
         */
        HostType hostType = HostType::Name;

        /**
         * The host subcomponent: either a registered name or an
         * IPv{4,6,Future} literal address.
         *
         * The brackets enclosing an IPv6 literal are already stripped,
         * and percent-encoded characters in the hostname are already
         * decoded.
         */
        std::string host;

        /** User part of the userinfo, percent-decoded. */
        std::optional<std::string> user;

        /**
         * Password subcomponent of the authority (if specified).
         *
         * @warning RFC3986 deprecates the password syntax, but the
         * field is necessary to make the parse -> to_string roundtrip.
         * We don't use it anywhere (at least intentionally).
         * @todo Warn about unused password subcomponent.
         */
        std::optional<std::string> password;

        /** Port subcomponent (if specified). The default value is determined by the scheme. */
        std::optional<uint16_t> port;
    };

    std::string scheme;

    /**
     * The parsed authority component of the URL, if there is one.
     *
     * IMPORTANT: An empty authority (i.e. one with an empty host string)
     * and a missing authority (std::nullopt) are drastically different
     * cases. This matters in particular for "file:///path/to/file" URLs
     * as defined by RFC8089. An authority is present exactly when `//`
     * follows the `<scheme>:` part of the URL.
     */
    std::optional<Authority> authority;

    /**
     * The path, split on `/`, with each element percent-decoded.
     *
     * @note Unlike Unix paths, URLs provide a way to escape path
     * separators, in the form of the `%2F` encoding of `/`. Percent-decoding
     * the whole path into a single string would therefore be *lossy*,
     * because `/` and `%2F` both become `/`. The right thing to do is
     * to split the path on `/` first, and only then percent decode
     * each part.
     *
     * For example, the path
     * ```
     * foo/bar%2Fbaz/quux
     * ```
     * is parsed as
     * ```
     * {"foo", "bar/baz", "quux"}
     * ```
     *
     * The splitting and joining assume that the separator (`/` in this
     * case) only goes *between* elements.
     *
     * Consequently, the parsed representation begins with an empty
     * element to make an initial `/`, and ends with an empty element
     * to make a trailing `/`. So the elements of this vector mostly,
     * but *not always*, correspond to segments of the path.
     *
     * Examples:
     *
     * - ```
     *   https://foo.com/bar
     *   ```
     *   has path
     *   ```
     *   {"", "bar"}
     *   ```
     *
     * - ```
     *   https://foo.com/bar/
     *   ```
     *   has path
     *   ```
     *   {"", "bar", ""}
     *   ```
     *
     * - ```
     *   https://foo.com//bar///
     *   ```
     *   has path
     *   ```
     *   {"", "", "bar", "", "", ""}
     *   ```
     *
     * - ```
     *   https://foo.com
     *   ```
     *   has path
     *   ```
     *   {""}
     *   ```
     *
     * - ```
     *   https://foo.com/
     *   ```
     *   has path
     *   ```
     *   {"", ""}
     *   ```
     *
     * - ```
     *   tel:01234
     *   ```
     *   has path `{"01234"}` (and no authority)
     *
     * - ```
     *   foo:/01234
     *   ```
     *   has path `{"", "01234"}` (and no authority)
     *
     * Note that both trailing and leading slashes are, in general,
     * semantically significant.
     *
     * For trailing slashes, the main example affecting many schemes is
     * that `./baz` resolves differently against a base URL depending
     * on the presence/absence of a trailing slash:
     *
     * - against `https://foo.com/bar` it is `https://foo.com/baz`
     *
     * - against `https://foo.com/bar/` it is `https://foo.com/bar/baz`
     *
     * See `parseURLRelative` for more details.
     *
     * For leading slashes, there are some requirements to be aware of.
     *
     * - When there is an authority, the path *must* start with a leading
     *   slash. Otherwise the path will not be separated from the
     *   authority, and will not round trip through the parser:
     *
     *   ```
     *   {.scheme="https", .authority.host = "foo", .path={"bad"}}
     *   ```
     *   will render to `https://foobad`, but that would parse back as
     *   ```
     *   {.scheme="https", .authority.host = "foobad", .path={""}}
     *   ```
     *
     * - When there is no authority, the path must *not* begin with two
     *   slashes. Otherwise, there will be another parser round trip
     *   issue:
     *
     *   ```
     *   {.scheme="https", .path={"", "", "bad"}}
     *   ```
     *   will render to `https://bad`, but that would parse back as
     *   ```
     *   {.scheme="https", .authority.host = "bad", .path={""}}
     *   ```
     *
     * These invariants will be checked in `to_string` and
     * `renderAuthorityAndPath`.
     */
    std::vector<std::string> path;

    StringMap query;

    std::string fragment;

    /**
     * Render only the middle part of a URL, leaving out the `//` that
     * indicates whether the authority is present.
     *
     * @note This is kind of an ad-hoc operation, but it ends up coming
     * up with some frequency, probably due to the current design of
     * `StoreReference` in `nix-store`.
     */
    std::string renderAuthorityAndPath() const;

    std::string to_string() const;

    /**
     * Render the path to a string.
     *
     * @param encode Whether to percent encode path segments.
     */
    std::string renderPath(bool encode = false) const;

    auto operator<=>(const ParsedURL & other) const noexcept = default;

    /**
     * Remove `.` and `..` path segments.
     */
    ParsedURL canonicalise();

    /**
     * Get a range of path segments (the substrings separated by '/'
     * characters).
     *
     * The result is a lazy view that refers to `path`, so it must not
     * outlive this object.
     *
     * @param skipEmpty Skip all empty path segments
     */
    auto pathSegments(bool skipEmpty) const &
    {
        // A segment is kept unless we are skipping empties and it is empty.
        return path | std::views::filter([skipEmpty](std::string_view segment) {
                   return !skipEmpty || !segment.empty();
               });
    }
};
/**
 * Stream insertion operator for `ParsedURL`.
 *
 * NOTE(review): presumably writes the same text as `ParsedURL::to_string()`;
 * the definition is not in this header, so confirm there.
 */
std::ostream & operator<<(std::ostream & os, const ParsedURL & url);
/**
 * Declares the `BadURL` error type; `parseURL` and `parseURLRelative`
 * are documented as throwing it.
 *
 * NOTE(review): `MakeError` is defined outside this header; presumably it
 * declares `BadURL` as a subclass of `Error` — confirm against the macro.
 */
MakeError(BadURL, Error);
/**
 * Percent-decode a string.
 *
 * NOTE(review): the definition is not in this header; presumably `%XX`
 * escapes in `in` are replaced by the bytes they denote — confirm the
 * behavior on malformed escapes against the implementation.
 */
std::string percentDecode(std::string_view in);
/**
 * Percent-encode a string.
 *
 * NOTE(review): the definition is not in this header; presumably `keep`
 * lists additional characters that are left unescaped (none by default)
 * — confirm against the implementation.
 */
std::string percentEncode(std::string_view s, std::string_view keep = "");
/**
 * Get the path part of the URL as an absolute or relative Path.
 *
 * @param urlPath The path elements to render (the same shape as
 * `ParsedURL::path`).
 *
 * @throws if any path component contains a slash (which would have
 * been escaped as `%2F` in the rendered URL). This is because OS file
 * paths have no escape sequences --- file names cannot contain a
 * `/`.
 */
Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath);
/**
 * Percent-encode a path given as a sequence of path elements.
 *
 * Encoding "interior slashes" (a `/` inside an element) as `%2F` is
 * the most important part; see the documentation of `ParsedURL::path`
 * for why such slashes must stay distinguishable from separators.
 *
 * @param urlPath The path elements to encode.
 */
std::string encodeUrlPath(std::span<const std::string> urlPath);
/**
 * Decode the query component of a URL into a `StringMap`.
 *
 * @param query The query text to decode.
 * @param lenient @see parseURL
 */
StringMap decodeQuery(std::string_view query, bool lenient = false);
/**
 * Encode a `StringMap` as a URL query string.
 *
 * NOTE(review): presumably the inverse of `decodeQuery`; the definition
 * is not in this header, so confirm the exact escaping there.
 */
std::string encodeQuery(const StringMap & query);
/**
 * Parse a URL into a ParsedURL.
 *
 * @param url The URL text to parse.
 * @param lenient Also allow some long-supported Nix URIs that are not quite compliant with RFC3986.
 * Here are the deviations:
 * - Fragments can contain unescaped (not URL encoded) '^', '"' or space literals.
 * - Queries may contain unescaped '"' or spaces.
 *
 * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
 *
 * @throws BadURL
 *
 * The WHATWG specification of the URL constructor in JavaScript is
 * also a useful reference:
 * https://url.spec.whatwg.org/#concept-basic-url-parser. Note, however,
 * that it includes various scheme-specific normalizations / extra steps
 * that we do not implement.
 */
ParsedURL parseURL(std::string_view url, bool lenient = false);
/**
 * Like `parseURL`, but also accepts relative URLs, which are resolved
 * against the given base URL.
 *
 * This is specified in [IETF RFC 3986, section 5](https://datatracker.ietf.org/doc/html/rfc3986#section-5)
 *
 * @param url The URL text to parse; may be a relative reference.
 * @param base The base URL that a relative `url` is resolved against.
 *
 * @throws BadURL
 *
 * Behavior should also match the `new URL(url, base)` JavaScript
 * constructor, except for extra steps specific to the HTTP scheme. See
 * `parseURL` for link to the relevant WHATWG standard.
 */
ParsedURL parseURLRelative(std::string_view url, const ParsedURL & base);
/**
 * Although that's not really standardized anywhere, a number of tools
 * use a scheme of the form 'x+y' in urls, where y is the “transport layer”
 * scheme, and x is the “application layer” scheme.
 *
 * For example git uses `git+https` to designate remotes using a Git
 * protocol over http.
 *
 * NOTE(review): both members are `std::string_view`s, presumably pointing
 * into the string passed to `parseUrlScheme`; if so, that string must
 * outlive this struct — confirm against the implementation.
 */
struct ParsedUrlScheme
{
/** The “application layer” part (the `x` in `x+y`); optional. */
std::optional<std::string_view> application;
/** The “transport layer” part (the `y` in `x+y`). */
std::string_view transport;
};
/**
 * Split a URL scheme into the parts described by `ParsedUrlScheme`.
 *
 * @param scheme The scheme to split, e.g. `git+https`.
 *
 * NOTE(review): the result holds `std::string_view`s; presumably they
 * refer to `scheme`, so the viewed storage must stay alive while the
 * result is used — confirm against the implementation.
 */
ParsedUrlScheme parseUrlScheme(std::string_view scheme);
/**
 * Detects scp-style URIs (e.g. `git@github.com:NixOS/nix`) and fixes
 * them by removing the `:` and assuming a scheme of `ssh://`. Also
 * changes absolute paths into `file://` URLs.
 *
 * @param url The Git URL, scp-style remote, or absolute path to fix up.
 * @return The resulting URL in parsed form.
 */
ParsedURL fixGitURL(const std::string & url);
/**
 * Whether a string is valid as an RFC 3986 scheme name.
 * The colon `:` is part of the URI, not of the scheme name, and is
 * therefore rejected.
 * See https://www.rfc-editor.org/rfc/rfc3986#section-3.1
 *
 * Does not check whether the scheme is understood, as that's context-dependent.
 *
 * @param scheme The candidate scheme name, without the trailing `:`.
 * @return `true` if `scheme` is syntactically a valid scheme name.
 */
bool isValidSchemeName(std::string_view scheme);
/**
* Either a ParsedURL or a verbatim string, but the string must be a valid
* ParsedURL. This is necessary because in certain cases URI must be passed
* verbatim (e.g. in builtin fetchers), since those are specified by the user.
* In those cases normalizations performed by the ParsedURL might be surprising
* and undesirable, since Nix must be a universal client that has to work with
* various broken services that might interpret URLs in quirky and non-standard ways.
*
* One of those examples is space-as-plus encoding that is very widespread, but it's
* not strictly RFC3986 compliant. We must preserve that information verbatim.
*
* Though we perform parsing and validation for internal needs.
*/
struct ValidURL : private ParsedURL
{
std::optional<std::string> encoded;
ValidURL(std::string str)
: ParsedURL(parseURL(str, /*lenient=*/false))
, encoded(std::move(str))
{
}
ValidURL(std::string_view str)
: ValidURL(std::string{str})
{
}
ValidURL(ParsedURL parsed)
: ParsedURL{std::move(parsed)}
{
}
/**
* Get the encoded URL (if specified) verbatim or encode the parsed URL.
*/
std::string to_string() const
{
return encoded.or_else([&]() -> std::optional<std::string> { return ParsedURL::to_string(); }).value();
}
const ParsedURL & parsed() const &
{
return *this;
}
std::string_view scheme() const &
{
return ParsedURL::scheme;
}
const auto & path() const &
{
return ParsedURL::path;
}
};
/**
 * Stream insertion operator for `ValidURL`.
 *
 * NOTE(review): presumably writes `url.to_string()`; the definition is
 * not in this header, so confirm there.
 */
std::ostream & operator<<(std::ostream & os, const ValidURL & url);
} // namespace nix