Fix ParsedURL handling of %2F in URL paths

See the new extensive doxygen in `url.hh`. This fixes fetching gitlab: flakes. Paths are now stored as a std::vector of individual path segments, which can themselves contain path separators '/' (%2F). This is necessary to make GitLab's /projects/ API work. Co-authored-by: John Ericson <John.Ericson@Obsidian.Systems> Co-authored-by: Sergei Zimmerman <sergei@zimmerman.foo>
2025-11-16 15:32:43 +01:00 · 2025-08-26 12:49:28 +02:00 · 2025-08-26 12:49:28 +02:00 · c436b7a32a
commit c436b7a32a
parent 6839f3de55
19 changed files with 446 additions and 117 deletions
--- a/src/libutil/include/nix/util/url.hh
+++ b/src/libutil/include/nix/util/url.hh
@ -1,7 +1,10 @@
 #pragma once
 ///@file

+#include <span>
+
 #include "nix/util/error.hh"
+#include "nix/util/canon-path.hh"

 namespace nix {

@ -65,6 +68,7 @@ struct ParsedURL
    };

    std::string scheme;
+
    /**
     * Optional parsed authority component of the URL.
     *
@ -75,16 +79,155 @@ struct ParsedURL
     * part of the URL.
     */
    std::optional<Authority> authority;
-    std::string path;
+
+    /**
+     * @note Unlike Unix paths, URLs provide a way to escape path
+     * separators, in the form of the `%2F` encoding of `/`. That means
+     * that if one percent-decodes the path into a single string, that
+     * decoding will be *lossy*, because `/` and `%2F` both become `/`.
+     * The right thing to do is instead split up the path on `/`, and
+     * then percent decode each part.
+     *
+     * For an example, the path
+     * ```
+     * foo/bar%2Fbaz/quux
+     * ```
+     * is parsed as
+     * ```
+     * {"foo", "bar/baz", "quux"}
+     * ```
+     *
+     * We're doing splitting and joining that assumes the separator (`/` in this case) only goes *between* elements.
+     *
+     * That means the parsed representation will begin with an empty
+     * element to make an initial `/`, and will end with an empty
+     * element to make a trailing `/`. That means that elements of this
+     * vector mostly, but *not always*, correspond to segments of the
+     * path.
+     *
+     * Examples:
+     *
+     * - ```
+     *   https://foo.com/bar
+     *   ```
+     *   has path
+     *   ```
+     *   {"", "bar"}
+     *   ```
+     *
+     * - ```
+     *   https://foo.com/bar/
+     *   ```
+     *   has path
+     *   ```
+     *   {"", "bar", ""}
+     *   ```
+     *
+     * - ```
+     *   https://foo.com//bar///
+     *   ```
+     *   has path
+     *   ```
+     *   {"", "", "bar", "", "", ""}
+     *   ```
+     *
+     * - ```
+     *   https://foo.com
+     *   ```
+     *   has path
+     *   ```
+     *   {""}
+     *   ```
+     *
+     * - ```
+     *   https://foo.com/
+     *   ```
+     *   has path
+     *   ```
+     *   {"", ""}
+     *   ```
+     *
+     * - ```
+     *   tel:01234
+     *   ```
+     *   has path `{"01234"}` (and no authority)
+     *
+     * - ```
+     *   foo:/01234
+     *   ```
+     *   has path `{"", "01234"}` (and no authority)
+     *
+     * Note that both trailing and leading slashes are, in general,
+     * semantically significant.
+     *
+     * For trailing slashes, the main example affecting many schemes is
+     * that a relative reference such as `./baz` resolves differently
+     * depending on the presence/absence of a trailing slash in the
+     * base URL:
+     *
+     * - against `https://foo.com/bar` it is `https://foo.com/baz`
+     *
+     * - against `https://foo.com/bar/` it is `https://foo.com/bar/baz`
+     *
+     * See `parseURLRelative` for more details.
+     *
+     * For leading slashes, there are some requirements to be aware of.
+     *
+     * - When there is an authority, the path *must* start with a leading
+     *   slash. Otherwise the path will not be separated from the
+     *   authority, and will not round trip through the parser:
+     *
+     *   ```
+     *   {.scheme="https", .authority.host = "foo", .path={"bar"}}
+     *   ```
+     *   will render to `https://foobar`, but that would parse back as
+     *   ```
+     *   {.scheme="https", .authority.host = "foobar", .path={}}
+     *   ```
+     *
+     * - When there is no authority, the path must *not* begin with two
+     *   slashes. Otherwise, there will be another parser round trip
+     *   issue:
+     *
+     *   ```
+     *   {.scheme="https", .path={"", "", "bad"}}
+     *   ```
+     *   will render to `https://bad`, but that would parse back as
+     *   ```
+     *   {.scheme="https", .authority.host = "bad", .path={}}
+     *   ```
+     *
+     * These invariants will be checked in `to_string` and
+     * `renderAuthorityAndPath`.
+     */
+    std::vector<std::string> path;
+
    StringMap query;
+
    std::string fragment;

+    /**
+     * Render just the middle part of a URL, without the `//` which
+     * indicates whether the authority is present.
+     *
+     * @note This is kind of an ad-hoc
+     * operation, but it ends up coming up with some frequency, probably
+     * due to the current design of `StoreReference` in `nix-store`.
+     */
+    std::string renderAuthorityAndPath() const;
+
    std::string to_string() const;

+    /**
+     * Render the path to a string.
+     *
+     * @param encode Whether to percent encode path segments.
+     */
+    std::string renderPath(bool encode = false) const;
+
    auto operator<=>(const ParsedURL & other) const noexcept = default;

    /**
-     * Remove `.` and `..` path elements.
+     * Remove `.` and `..` path segments.
     */
    ParsedURL canonicalise();
 };
@ -96,6 +239,22 @@ MakeError(BadURL, Error);
 std::string percentDecode(std::string_view in);
 std::string percentEncode(std::string_view s, std::string_view keep = "");

+/**
+ * Get the path part of the URL as an absolute or relative Path.
+ *
+ * @throws BadURL if any path component contains a slash (which would have
+ * been escaped `%2F` in the rendered URL). This is because OS file
+ * paths have no escape sequences --- file names cannot contain a
+ * `/`.
+ */
+Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath);
+
+/**
+ * Percent encode path. `%2F` for "interior slashes" is the most
+ * important.
+ */
+std::string encodeUrlPath(std::span<const std::string> urlPath);
+
 /**
 * @param lenient @see parseURL
 */
@ -114,6 +273,12 @@ std::string encodeQuery(const StringMap & query);
 * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
 *
 * @throws BadURL
+ *
+ * The WHATWG specification of the URL constructor in JavaScript is
+ * also a useful reference:
+ * https://url.spec.whatwg.org/#concept-basic-url-parser. Note, however,
+ * that it includes various scheme-specific normalizations / extra steps
+ * that we do not implement.
 */
 ParsedURL parseURL(std::string_view url, bool lenient = false);

@ -123,7 +288,11 @@ ParsedURL parseURL(std::string_view url, bool lenient = false);
 *
 * This is specified in [IETF RFC 3986, section 5](https://datatracker.ietf.org/doc/html/rfc3986#section-5)
 *
- * Behavior should also match the `new URL(url, base)` JavaScript constructor.
+ * @throws BadURL
+ *
+ * Behavior should also match the `new URL(url, base)` JavaScript
+ * constructor, except for extra steps specific to the HTTP scheme. See
+ * `parseURL` for a link to the relevant WHATWG standard.
 */
 ParsedURL parseURLRelative(std::string_view url, const ParsedURL & base);

--- a/src/libutil/url.cc
+++ b/src/libutil/url.cc
@ -3,6 +3,7 @@
 #include "nix/util/util.hh"
 #include "nix/util/split.hh"
 #include "nix/util/canon-path.hh"
+#include "nix/util/strings-inline.hh"

 #include <boost/url.hpp>

@ -179,11 +180,14 @@ static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient)
    if (authority && authority->host.size() && transportIsFile)
        throw BadURL("file:// URL '%s' has unexpected authority '%s'", urlView.buffer(), *authority);

-    auto path = urlView.path();         /* Does pct-decoding */
    auto fragment = urlView.fragment(); /* Does pct-decoding */

-    if (transportIsFile && path.empty())
-        path = "/";
+    boost::core::string_view encodedPath = urlView.encoded_path();
+    if (transportIsFile && encodedPath.empty())
+        encodedPath = "/";
+
+    auto path = std::views::transform(splitString<std::vector<std::string_view>>(encodedPath, "/"), percentDecode)
+                | std::ranges::to<std::vector<std::string>>();

    /* Get the raw query. Store URI supports smuggling doubly nested queries, where
       the inner &/? are pct-encoded. */
@ -192,7 +196,7 @@ static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient)
    return ParsedURL{
        .scheme = scheme,
        .authority = authority,
-        .path = path,
+        .path = std::move(path),
        .query = decodeQuery(query, lenient),
        .fragment = fragment,
    };
@ -215,7 +219,7 @@ try {
            if (authority.port)
                resolved.set_port_number(*authority.port);
        }
-        resolved.set_path(base.path);
+        resolved.set_encoded_path(encodeUrlPath(base.path));
        resolved.set_encoded_query(encodeQuery(base.query));
        resolved.set_fragment(base.fragment);
    } catch (boost::system::system_error & e) {
@ -291,7 +295,15 @@ try {
 }

 const static std::string allowedInQuery = ":@/?";
-const static std::string allowedInPath = ":@/";
+const static std::string allowedInPath = ":@";
+
+std::string encodeUrlPath(std::span<const std::string> urlPath)
+{
+    std::vector<std::string> encodedPath;
+    for (auto & p : urlPath)
+        encodedPath.push_back(percentEncode(p, allowedInPath));
+    return concatStringsSep("/", encodedPath);
+}

 std::string encodeQuery(const StringMap & ss)
 {
@ -308,10 +320,62 @@ std::string encodeQuery(const StringMap & ss)
    return res;
 }

+Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath)
+{
+    for (const auto & comp : urlPath) {
+        /* This is only really valid for UNIX. Windows has more restrictions. */
+        if (comp.contains('/'))
+            throw BadURL("URL path component '%s' contains '/', which is not allowed in file names", comp);
+        if (comp.contains(char(0)))
+            throw BadURL("URL path component '%s' contains NUL byte which is not allowed", comp);
+    }
+
+    return concatStringsSep("/", urlPath);
+}
+
+std::string ParsedURL::renderPath(bool encode) const
+{
+    if (encode)
+        return encodeUrlPath(path);
+    return concatStringsSep("/", path);
+}
+
+std::string ParsedURL::renderAuthorityAndPath() const
+{
+    std::string res;
+    /* The following assertions correspond to 3.3. Path [rfc3986]. URL parser
+       will never violate these properties, but hand-constructed ParsedURLs might. */
+    if (authority.has_value()) {
+        /* If a URI contains an authority component, then the path component
+           must either be empty or begin with a slash ("/") character. */
+        assert(path.empty() || path.front().empty());
+        res += authority->to_string();
+    } else if (std::ranges::equal(std::views::take(path, 2), std::views::repeat("", 2))) {
+        /* If a URI does not contain an authority component, then the path cannot begin
+           with two slash characters ("//") */
+        unreachable();
+    }
+    res += encodeUrlPath(path);
+    return res;
+}
+
 std::string ParsedURL::to_string() const
 {
-    return scheme + ":" + (authority ? "//" + authority->to_string() : "") + percentEncode(path, allowedInPath)
-           + (query.empty() ? "" : "?" + encodeQuery(query)) + (fragment.empty() ? "" : "#" + percentEncode(fragment));
+    std::string res;
+    res += scheme;
+    res += ":";
+    if (authority.has_value())
+        res += "//";
+    res += renderAuthorityAndPath();
+    if (!query.empty()) {
+        res += "?";
+        res += encodeQuery(query);
+    }
+    if (!fragment.empty()) {
+        res += "#";
+        res += percentEncode(fragment);
+    }
+    return res;
 }

 std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
@ -323,7 +387,7 @@ std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
 ParsedURL ParsedURL::canonicalise()
 {
    ParsedURL res(*this);
-    res.path = CanonPath(res.path).abs();
+    res.path = splitString<std::vector<std::string>>(CanonPath(renderPath()).abs(), "/");
    return res;
 }

@ -352,7 +416,11 @@ ParsedURL fixGitURL(const std::string & url)
    if (hasPrefix(url, "file:"))
        return parseURL(url);
    if (url.find("://") == std::string::npos) {
-        return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url});
+        return ParsedURL{
+            .scheme = "file",
+            .authority = ParsedURL::Authority{},
+            .path = splitString<std::vector<std::string>>(url, "/"),
+        };
    }
    return parseURL(url);
 }