mirror of
https://github.com/NixOS/nix.git
synced 2025-11-16 15:32:43 +01:00
Fix ParsedURL handling of %2F in URL paths
See the new extensive doxygen in `url.hh`. This fixes fetching gitlab: flakes. Paths are now stored as a std::vector of individual path segments, which can themselves contain path separators '/' (%2F). This is necessary to make the Gitlab's /projects/ API work. Co-authored-by: John Ericson <John.Ericson@Obsidian.Systems> Co-authored-by: Sergei Zimmerman <sergei@zimmerman.foo>
This commit is contained in:
parent
6839f3de55
commit
c436b7a32a
19 changed files with 446 additions and 117 deletions
|
|
@ -1,7 +1,10 @@
|
|||
#pragma once
|
||||
///@file
|
||||
|
||||
#include <span>
|
||||
|
||||
#include "nix/util/error.hh"
|
||||
#include "nix/util/canon-path.hh"
|
||||
|
||||
namespace nix {
|
||||
|
||||
|
|
@ -65,6 +68,7 @@ struct ParsedURL
|
|||
};
|
||||
|
||||
std::string scheme;
|
||||
|
||||
/**
|
||||
* Optional parsed authority component of the URL.
|
||||
*
|
||||
|
|
@ -75,16 +79,155 @@ struct ParsedURL
|
|||
* part of the URL.
|
||||
*/
|
||||
std::optional<Authority> authority;
|
||||
std::string path;
|
||||
|
||||
/**
|
||||
* @note Unlike Unix paths, URLs provide a way to escape path
|
||||
* separators, in the form of the `%2F` encoding of `/`. That means
|
||||
* that if one percent-decodes the path into a single string, that
|
||||
* decoding will be *lossy*, because `/` and `%2F` both become `/`.
|
||||
* The right thing to do is instead split up the path on `/`, and
|
||||
* then percent decode each part.
|
||||
*
|
||||
* For an example, the path
|
||||
* ```
|
||||
* foo/bar%2Fbaz/quux
|
||||
* ```
|
||||
* is parsed as
|
||||
* ```
|
||||
* {"foo", "bar/baz", "quux"}
|
||||
* ```
|
||||
*
|
||||
* We're doing splitting and joining that assumes the separator (`/` in this case) only goes *between* elements.
|
||||
*
|
||||
* That means the parsed representation will begin with an empty
|
||||
* element to make an initial `/`, and will end with an empty
|
||||
* element to make a trailing `/`. That means that elements of this
|
||||
* vector mostly, but *not always*, correspond to segments of the
|
||||
* path.
|
||||
*
|
||||
* Examples:
|
||||
*
|
||||
* - ```
|
||||
* https://foo.com/bar
|
||||
* ```
|
||||
* has path
|
||||
* ```
|
||||
* {"", "bar"}
|
||||
* ```
|
||||
*
|
||||
* - ```
|
||||
* https://foo.com/bar/
|
||||
* ```
|
||||
* has path
|
||||
* ```
|
||||
* {"", "bar", ""}
|
||||
* ```
|
||||
*
|
||||
* - ```
|
||||
* https://foo.com//bar///
|
||||
* ```
|
||||
* has path
|
||||
* ```
|
||||
* {"", "", "bar", "", "", ""}
|
||||
* ```
|
||||
*
|
||||
* - ```
|
||||
* https://foo.com
|
||||
* ```
|
||||
* has path
|
||||
* ```
|
||||
* {""}
|
||||
* ```
|
||||
*
|
||||
* - ```
|
||||
* https://foo.com/
|
||||
* ```
|
||||
* has path
|
||||
* ```
|
||||
* {"", ""}
|
||||
* ```
|
||||
*
|
||||
* - ```
|
||||
* tel:01234
|
||||
* ```
|
||||
* has path `{"01234"}` (and no authority)
|
||||
*
|
||||
* - ```
|
||||
* foo:/01234
|
||||
* ```
|
||||
* has path `{"", "01234"}` (and no authority)
|
||||
*
|
||||
* Note that both trailing and leading slashes are, in general,
|
||||
* semantically significant.
|
||||
*
|
||||
* For trailing slashes, the main example affecting many schemes is
|
||||
* that `./baz` resolves against a base URL differently depending on
|
||||
* the presence/absence of a trailing slash:
|
||||
*
|
||||
* - `https://foo.com/bar` is `https://foo.com/baz`
|
||||
*
|
||||
* - `https://foo.com/bar/` is `https://foo.com/bar/baz`
|
||||
*
|
||||
* See `parseURLRelative` for more details.
|
||||
*
|
||||
* For leading slashes, there are some requirements to be aware of.
|
||||
*
|
||||
* - When there is an authority, the path *must* start with a leading
|
||||
* slash. Otherwise the path will not be separated from the
|
||||
* authority, and will not round trip through the parser:
|
||||
*
|
||||
* ```
|
||||
* {.scheme="https", .authority.host = "foo", .path={"bar"}}
|
||||
* ```
|
||||
* will render to `https://foobar`, but that would parse back as
|
||||
* ```
|
||||
* {.scheme="https", .authority.host = "foobar", .path={""}}
|
||||
* ```
|
||||
*
|
||||
* - When there is no authority, the path must *not* begin with two
|
||||
* slashes. Otherwise, there will be another parser round trip
|
||||
* issue:
|
||||
*
|
||||
* ```
|
||||
* {.scheme="https", .path={"", "", "bad"}}
|
||||
* ```
|
||||
* will render to `https://bad`, but that would parse back as
|
||||
* ```
|
||||
* {.scheme="https", .authority.host = "bad", .path={""}}
|
||||
* ```
|
||||
*
|
||||
* These invariants will be checked in `to_string` and
|
||||
* `renderAuthorityAndPath`.
|
||||
*/
|
||||
std::vector<std::string> path;
|
||||
|
||||
StringMap query;
|
||||
|
||||
std::string fragment;
|
||||
|
||||
/**
|
||||
* Render just the middle part of a URL, without the `//` which
|
||||
* indicates whether the authority is present.
|
||||
*
|
||||
* @note This is kind of an ad-hoc
|
||||
* operation, but it ends up coming up with some frequency, probably
|
||||
* due to the current design of `StoreReference` in `nix-store`.
|
||||
*/
|
||||
std::string renderAuthorityAndPath() const;
|
||||
|
||||
std::string to_string() const;
|
||||
|
||||
/**
|
||||
* Render the path to a string.
|
||||
*
|
||||
* @param encode Whether to percent encode path segments.
|
||||
*/
|
||||
std::string renderPath(bool encode = false) const;
|
||||
|
||||
auto operator<=>(const ParsedURL & other) const noexcept = default;
|
||||
|
||||
/**
|
||||
* Remove `.` and `..` path elements.
|
||||
* Remove `.` and `..` path segments.
|
||||
*/
|
||||
ParsedURL canonicalise();
|
||||
};
|
||||
|
|
@ -96,6 +239,22 @@ MakeError(BadURL, Error);
|
|||
std::string percentDecode(std::string_view in);
|
||||
std::string percentEncode(std::string_view s, std::string_view keep = "");
|
||||
|
||||
/**
|
||||
* Get the path part of the URL as an absolute or relative Path.
|
||||
*
|
||||
* @throws BadURL if any path component contains a slash (which would have
|
||||
* been escaped `%2F` in the rendered URL). This is because OS file
|
||||
* paths have no escape sequences --- file names cannot contain a
|
||||
* `/`.
|
||||
*/
|
||||
Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath);
|
||||
|
||||
/**
|
||||
* Percent encode path. `%2F` for "interior slashes" is the most
|
||||
* important.
|
||||
*/
|
||||
std::string encodeUrlPath(std::span<const std::string> urlPath);
|
||||
|
||||
/**
|
||||
* @param lenient @see parseURL
|
||||
*/
|
||||
|
|
@ -114,6 +273,12 @@ std::string encodeQuery(const StringMap & query);
|
|||
* @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874.
|
||||
*
|
||||
* @throws BadURL
|
||||
*
|
||||
* The WHATWG specification of the URL constructor in JavaScript is
|
||||
* also a useful reference:
|
||||
* https://url.spec.whatwg.org/#concept-basic-url-parser. Note, however,
|
||||
* that it includes various scheme-specific normalizations / extra steps
|
||||
* that we do not implement.
|
||||
*/
|
||||
ParsedURL parseURL(std::string_view url, bool lenient = false);
|
||||
|
||||
|
|
@ -123,7 +288,11 @@ ParsedURL parseURL(std::string_view url, bool lenient = false);
|
|||
*
|
||||
* This is specified in [IETF RFC 3986, section 5](https://datatracker.ietf.org/doc/html/rfc3986#section-5)
|
||||
*
|
||||
* Behavior should also match the `new URL(url, base)` JavaScript constructor.
|
||||
* @throws BadURL
|
||||
*
|
||||
* Behavior should also match the `new URL(url, base)` JavaScript
|
||||
* constructor, except for extra steps specific to the HTTP scheme. See
|
||||
* `parseURL` for link to the relevant WHATWG standard.
|
||||
*/
|
||||
ParsedURL parseURLRelative(std::string_view url, const ParsedURL & base);
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "nix/util/util.hh"
|
||||
#include "nix/util/split.hh"
|
||||
#include "nix/util/canon-path.hh"
|
||||
#include "nix/util/strings-inline.hh"
|
||||
|
||||
#include <boost/url.hpp>
|
||||
|
||||
|
|
@ -179,11 +180,14 @@ static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient)
|
|||
if (authority && authority->host.size() && transportIsFile)
|
||||
throw BadURL("file:// URL '%s' has unexpected authority '%s'", urlView.buffer(), *authority);
|
||||
|
||||
auto path = urlView.path(); /* Does pct-decoding */
|
||||
auto fragment = urlView.fragment(); /* Does pct-decoding */
|
||||
|
||||
if (transportIsFile && path.empty())
|
||||
path = "/";
|
||||
boost::core::string_view encodedPath = urlView.encoded_path();
|
||||
if (transportIsFile && encodedPath.empty())
|
||||
encodedPath = "/";
|
||||
|
||||
auto path = std::views::transform(splitString<std::vector<std::string_view>>(encodedPath, "/"), percentDecode)
|
||||
| std::ranges::to<std::vector<std::string>>();
|
||||
|
||||
/* Get the raw query. Store URI supports smuggling doubly nested queries, where
|
||||
the inner &/? are pct-encoded. */
|
||||
|
|
@ -192,7 +196,7 @@ static ParsedURL fromBoostUrlView(boost::urls::url_view urlView, bool lenient)
|
|||
return ParsedURL{
|
||||
.scheme = scheme,
|
||||
.authority = authority,
|
||||
.path = path,
|
||||
.path = std::move(path),
|
||||
.query = decodeQuery(query, lenient),
|
||||
.fragment = fragment,
|
||||
};
|
||||
|
|
@ -215,7 +219,7 @@ try {
|
|||
if (authority.port)
|
||||
resolved.set_port_number(*authority.port);
|
||||
}
|
||||
resolved.set_path(base.path);
|
||||
resolved.set_encoded_path(encodeUrlPath(base.path));
|
||||
resolved.set_encoded_query(encodeQuery(base.query));
|
||||
resolved.set_fragment(base.fragment);
|
||||
} catch (boost::system::system_error & e) {
|
||||
|
|
@ -291,7 +295,15 @@ try {
|
|||
}
|
||||
|
||||
const static std::string allowedInQuery = ":@/?";
|
||||
const static std::string allowedInPath = ":@/";
|
||||
const static std::string allowedInPath = ":@";
|
||||
|
||||
std::string encodeUrlPath(std::span<const std::string> urlPath)
|
||||
{
|
||||
std::vector<std::string> encodedPath;
|
||||
for (auto & p : urlPath)
|
||||
encodedPath.push_back(percentEncode(p, allowedInPath));
|
||||
return concatStringsSep("/", encodedPath);
|
||||
}
|
||||
|
||||
std::string encodeQuery(const StringMap & ss)
|
||||
{
|
||||
|
|
@ -308,10 +320,62 @@ std::string encodeQuery(const StringMap & ss)
|
|||
return res;
|
||||
}
|
||||
|
||||
Path renderUrlPathEnsureLegal(const std::vector<std::string> & urlPath)
|
||||
{
|
||||
for (const auto & comp : urlPath) {
|
||||
/* This is only really valid for UNIX. Windows has more restrictions. */
|
||||
if (comp.contains('/'))
|
||||
throw BadURL("URL path component '%s' contains '/', which is not allowed in file names", comp);
|
||||
if (comp.contains(char(0)))
|
||||
throw BadURL("URL path component '%s' contains NUL byte which is not allowed", comp);
|
||||
}
|
||||
|
||||
return concatStringsSep("/", urlPath);
|
||||
}
|
||||
|
||||
std::string ParsedURL::renderPath(bool encode) const
|
||||
{
|
||||
if (encode)
|
||||
return encodeUrlPath(path);
|
||||
return concatStringsSep("/", path);
|
||||
}
|
||||
|
||||
std::string ParsedURL::renderAuthorityAndPath() const
|
||||
{
|
||||
std::string res;
|
||||
/* The following assertions correspond to 3.3. Path [rfc3986]. URL parser
|
||||
will never violate these properties, but hand-constructed ParsedURLs might. */
|
||||
if (authority.has_value()) {
|
||||
/* If a URI contains an authority component, then the path component
|
||||
must either be empty or begin with a slash ("/") character. */
|
||||
assert(path.empty() || path.front().empty());
|
||||
res += authority->to_string();
|
||||
} else if (std::ranges::equal(std::views::take(path, 2), std::views::repeat("", 2))) {
|
||||
/* If a URI does not contain an authority component, then the path cannot begin
|
||||
with two slash characters ("//") */
|
||||
unreachable();
|
||||
}
|
||||
res += encodeUrlPath(path);
|
||||
return res;
|
||||
}
|
||||
|
||||
std::string ParsedURL::to_string() const
|
||||
{
|
||||
return scheme + ":" + (authority ? "//" + authority->to_string() : "") + percentEncode(path, allowedInPath)
|
||||
+ (query.empty() ? "" : "?" + encodeQuery(query)) + (fragment.empty() ? "" : "#" + percentEncode(fragment));
|
||||
std::string res;
|
||||
res += scheme;
|
||||
res += ":";
|
||||
if (authority.has_value())
|
||||
res += "//";
|
||||
res += renderAuthorityAndPath();
|
||||
if (!query.empty()) {
|
||||
res += "?";
|
||||
res += encodeQuery(query);
|
||||
}
|
||||
if (!fragment.empty()) {
|
||||
res += "#";
|
||||
res += percentEncode(fragment);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
|
||||
|
|
@ -323,7 +387,7 @@ std::ostream & operator<<(std::ostream & os, const ParsedURL & url)
|
|||
ParsedURL ParsedURL::canonicalise()
|
||||
{
|
||||
ParsedURL res(*this);
|
||||
res.path = CanonPath(res.path).abs();
|
||||
res.path = splitString<std::vector<std::string>>(CanonPath(renderPath()).abs(), "/");
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
@ -352,7 +416,11 @@ ParsedURL fixGitURL(const std::string & url)
|
|||
if (hasPrefix(url, "file:"))
|
||||
return parseURL(url);
|
||||
if (url.find("://") == std::string::npos) {
|
||||
return (ParsedURL{.scheme = "file", .authority = ParsedURL::Authority{}, .path = url});
|
||||
return ParsedURL{
|
||||
.scheme = "file",
|
||||
.authority = ParsedURL::Authority{},
|
||||
.path = splitString<std::vector<std::string>>(url, "/"),
|
||||
};
|
||||
}
|
||||
return parseURL(url);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue