nix/src/libutil/include/nix/util/url.hh

#pragma once
///@file

#include <ranges>
#include <span>

#include "nix/util/error.hh"
#include "nix/util/canon-path.hh"

namespace nix {

/**
 * Represents a parsed RFC3986 URL.
 *
 * @note All fields are already percent decoded.
 */
struct ParsedURL
{
    /**
     * Parsed representation of a URL authority.
     *
     * It consists of user information, hostname and an optional port number.
     * Note that passwords in the userinfo are not yet supported and are ignored.
     *
     * @todo Maybe support passwords in userinfo part of the url for auth.
     */
    struct Authority
    {
        enum class HostType {
            Name, //< Registered name (can be empty)
            IPv4,
            IPv6,
            IPvFuture
        };

        static Authority parse(std::string_view encodedAuthority);
        auto operator<=>(const Authority & other) const = default;
        std::string to_string() const;
        friend std::ostream & operator<<(std::ostream & os, const Authority & self);

        /**
         * Type of the host subcomponent, as specified by rfc3986 3.2.2. Host.
         */
        HostType hostType = HostType::Name;

        /**
         * Host subcomponent. Either a registered name or IPv{4,6,Future} literal addresses.
         *
         * IPv6 enclosing brackets are already stripped. Percent encoded characters
         * in the hostname are decoded.
         */
        std::string host;

        /** Percent-decoded user part of the userinfo. */
        std::optional<std::string> user;

        /**
         * Password subcomponent of the authority (if specified).
         *
         * @warning As per the rfc3986, the password syntax is deprecated,
         * but it's necessary to make the parse -> to_string roundtrip.
         * We don't use it anywhere (at least intentionally).
         * @todo Warn about unused password subcomponent.
         */
        std::optional<std::string> password;

        /** Port subcomponent (if specified). Default value is determined by the scheme. */
        std::optional<uint16_t> port;
    };

    std::string scheme;

    /**
     * Optional parsed authority component of the URL.
     *
     * IMPORTANT: An empty authority (i.e. one with an empty host string) and
     * a missing authority (std::nullopt) are drastically different cases. This
     * is especially important for "file:///path/to/file" URLs defined by RFC8089.
     * The presence of the authority is indicated by `//` following the <scheme>:
     * part of the URL.
     */
    std::optional<Authority> authority;

    /**
     * @note Unlike Unix paths, URLs provide a way to escape path
     * separators, in the form of the `%2F` encoding of `/`. That means
     * that if one percent-decodes the path into a single string, that
     * decoding will be *lossy*, because `/` and `%2F` both become `/`.
     * The right thing to do is instead split up the path on `/`, and
     * then percent decode each part.
     *
     * For an example, the path
     * ```
     * foo/bar%2Fbaz/quux
     * ```
     * is parsed as
     * ```
     * {"foo, "bar/baz", "quux"}
     * ```
     *
     * We're doing splitting and joining that assumes the separator (`/` in this case) only goes *between* elements.
     *
     * That means the parsed representation will begin with an empty
     * element to make an initial `/`, and will end with an ementy
     * element to make a trailing `/`. That means that elements of this
     * vector mostly, but *not always*, correspond to segments of the
     * path.
     *
     * Examples:
     *
     * - ```
     *   https://foo.com/bar
     *   ```
     *   has path
     *   ```
     *   {"", "bar"}
     *   ```
     *
     * - ```
     *   https://foo.com/bar/
     *   ```
     *   has path
     *   ```
     *   {"", "bar", ""}
     *   ```
     *
     * - ```
     *   https://foo.com//bar///
     *   ```
     *   has path
     *   ```
     *   {"", "", "bar", "", "", ""}
     *   ```
     *
     * - ```
     *   https://foo.com
     *   ```
     *   has path
     *   ```
     *   {""}
     *   ```
     *
     * - ```
     *   https://foo.com/
     *   ```
     *   has path
     *   ```
     *   {"", ""}
     *   ```
     *
     * - ```
     *   tel:01234
     *   ```
     *   has path `{"01234"}` (and no authority)
     *
     * - ```
     *   foo:/01234
     *   ```
     *   has path `{"", "01234"}` (and no authority)
     *
     * Note that both trailing and leading slashes are, in general,
     * semantically significant.
     *
     * For trailing slashes, the main example affecting many schemes is
     * that `../baz` resolves against a base URL different depending on
     * the presence/absence of a trailing slash:
     *
     * - `https://foo.com/bar` is `https://foo.com/baz`
     *
     * - `https://foo.com/bar/` is `https://foo.com/bar/baz`
     *
     * See `parseURLRelative` for more details.
     *
     * For leading slashes, there are some requirements to be aware of.
     *
     * - When there is an authority, the path *must* start with a leading
     *   slash. Otherwise the path will not be separated from the
     *   authority, and will not round trip though the parser:
     *
     *   ```
     *   {.scheme="https", .authority.host = "foo", .path={"bad"}}
     *   ```
     *   will render to `https://foobar`. but that would parse back as as
     *   ```
     *   {.scheme="https", .authority.host = "foobar", .path={}}
     *   ```
     *
     * - When there is no authority, the path must *not* begin with two
     *   slashes. Otherwise, there will be another parser round trip
     *   issue:
     *
     *   ```
     *   {.scheme="https", .path={"", "", "bad"}}
     *   ```
     *   will render to `https://bad`. but that would parse back as as
     *   ```
     *   {.scheme="https", .authority.host = "bad", .path={}}
     *   ```
     *
     * These invariants will be checked in `to_string` and
     * `renderAuthorityAndPath`.
     */
    std::vector<std::string> path;

    StringMap query;

    std::string fragment;

    /**
     * Render just the middle part of a URL, without the `//` which
     * indicates whether the authority is present.
     *
     * @note This is kind of an ad-hoc
     * operation, but it ends up coming up with some frequency, probably
     * due to the current design of `StoreReference` in `nix-store`.
     */
    std::string renderAuthorityAndPath() const;

    std::string to_string() const;

    /**
     * Render the path to a string.
     *
     * @param encode Whether to percent encode path segments.
     */
    std::string renderPath(bool encode = false) const;

    auto operator<=>(const ParsedURL & other) const noexcept = default;

    /**
     * Remove `.` and `..` path segments.
     */
    ParsedURL canonicalise();

    /**
     * Get a range of path segments (the substrings separated by '/' characters).
     *
     * @param skipEmpty Skip all empty path segments
     */
    auto pathSegments(bool skipEmpty) const &
    {
        return std::views::filter(path, [skipEmpty](std::string_view segment) {
            if (skipEmpty)
                return !segment.empty();
            return true;
        });
    }
};

std::ostream & operator<<(std::ostream & os, const ParsedURL & url);

MakeError(BadURL, Error);

std::string percentDecode(std::string_view in);
std::string percentEncode(std::string_view s, std::string_view keep = "");

/**
 * Get the path part of the URL as an absolute or relative Path.
 *
 * @throws if any path component contains an slash (which would have
 * been escaped `%2F` in the rendered URL). This is because OS file
 * paths have no escape sequences --- file names cannot contain a
 * `/`.
 */
Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath);

/**
 * Percent encode path. `%2F` for "interior slashes" is the most
 * important.
 */
std::string encodeUrlPath(std::span<const std::string> urlPath);

/**
 * @param lenient @see parseURL
 */
StringMap decodeQuery(std::string_view query, bool lenient = false);

std::string encodeQuery(const StringMap & query);

/**
 * Parse a URL into a ParsedURL.
 *
 * @parm lenient Also allow some long-supported Nix URIs that are not quite compliant with RFC3986.
 * Here are the deviations:
 * - Fragments can contain unescaped (not URL encoded) '^', '"' or space literals.
 * - Queries may contain unescaped '"' or spaces.
 *
 * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
 *
 * @throws BadURL
 *
 * The WHATWG specification of the URL constructor in Java Script is
 * also a useful reference:
 * https://url.spec.whatwg.org/#concept-basic-url-parser. Note, however,
 * that it includes various scheme-specific normalizations / extra steps
 * that we do not implement.
 */
ParsedURL parseURL(std::string_view url, bool lenient = false);

/**
 * Like `parseURL`, but also accepts relative URLs, which are resolved
 * against the given base URL.
 *
 * This is specified in [IETF RFC 3986, section 5](https://datatracker.ietf.org/doc/html/rfc3986#section-5)
 *
 * @throws BadURL
 *
 * Behavior should also match the `new URL(url, base)` JavaScript
 * constructor, except for extra steps specific to the HTTP scheme. See
 * `parseURL` for link to the relevant WHATWG standard.
 */
ParsedURL parseURLRelative(std::string_view url, const ParsedURL & base);

/**
 * Although that’s not really standardized anywhere, an number of tools
 * use a scheme of the form 'x+y' in urls, where y is the “transport layer”
 * scheme, and x is the “application layer” scheme.
 *
 * For example git uses `git+https` to designate remotes using a Git
 * protocol over http.
 */
struct ParsedUrlScheme
{
    std::optional<std::string_view> application;
    std::string_view transport;
};

ParsedUrlScheme parseUrlScheme(std::string_view scheme);

/* Detects scp-style uris (e.g. git@github.com:NixOS/nix) and fixes
   them by removing the `:` and assuming a scheme of `ssh://`. Also
   changes absolute paths into file:// URLs. */
ParsedURL fixGitURL(const std::string & url);

/**
 * Whether a string is valid as RFC 3986 scheme name.
 * Colon `:` is part of the URI; not the scheme name, and therefore rejected.
 * See https://www.rfc-editor.org/rfc/rfc3986#section-3.1
 *
 * Does not check whether the scheme is understood, as that's context-dependent.
 */
bool isValidSchemeName(std::string_view scheme);

/**
 * Either a ParsedURL or a verbatim string, but the string must be a valid
 * ParsedURL. This is necessary because in certain cases URI must be passed
 * verbatim (e.g. in builtin fetchers), since those are specified by the user.
 * In those cases normalizations performed by the ParsedURL might be surprising
 * and undesirable, since Nix must be a universal client that has to work with
 * various broken services that might interpret URLs in quirky and non-standard ways.
 *
 * One of those examples is space-as-plus encoding that is very widespread, but it's
 * not strictly RFC3986 compliant. We must preserve that information verbatim.
 *
 * Though we perform parsing and validation for internal needs.
 */
struct ValidURL : private ParsedURL
{
    std::optional<std::string> encoded;

    ValidURL(std::string str)
        : ParsedURL(parseURL(str, /*lenient=*/false))
        , encoded(std::move(str))
    {
    }

    ValidURL(std::string_view str)
        : ValidURL(std::string{str})
    {
    }

    ValidURL(ParsedURL parsed)
        : ParsedURL{std::move(parsed)}
    {
    }

    /**
     * Get the encoded URL (if specified) verbatim or encode the parsed URL.
     */
    std::string to_string() const
    {
        return encoded.or_else([&]() -> std::optional<std::string> { return ParsedURL::to_string(); }).value();
    }

    const ParsedURL & parsed() const &
    {
        return *this;
    }

    std::string_view scheme() const &
    {
        return ParsedURL::scheme;
    }

    const auto & path() const &
    {
        return ParsedURL::path;
    }
};

std::ostream & operator<<(std::ostream & os, const ValidURL & url);

} // namespace nix