From ffc9bfb66d0b8d5da4e94ab1ac4580657b6352eb Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Fri, 18 Jul 2025 21:20:13 +0300 Subject: [PATCH 1/6] lib{store,flake}-tests: Add test for spaces in URIs These cases do not seem to be covered by the test suite at all. --- src/libflake-tests/flakeref.cc | 7 +++++++ .../data/store-reference/local_3.txt | 1 + src/libstore-tests/store-reference.cc | 14 ++++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 src/libstore-tests/data/store-reference/local_3.txt diff --git a/src/libflake-tests/flakeref.cc b/src/libflake-tests/flakeref.cc index eafe74a2d..b8f1ef7c9 100644 --- a/src/libflake-tests/flakeref.cc +++ b/src/libflake-tests/flakeref.cc @@ -48,6 +48,13 @@ TEST(parseFlakeRef, path) ASSERT_EQ(flakeref.to_string(), "path:/foo/bar?revCount=123"); ASSERT_EQ(fragment, "bla"); } + + { + auto s = "/foo bar/baz?dir=bla space"; + auto flakeref = parseFlakeRef(fetchSettings, s); + ASSERT_EQ(flakeref.to_string(), "path:/foo%20bar/baz?dir=bla%20space"); + ASSERT_EQ(flakeref.toAttrs().at("dir"), fetchers::Attr("bla space")); + } } TEST(to_string, doesntReencodeUrl) diff --git a/src/libstore-tests/data/store-reference/local_3.txt b/src/libstore-tests/data/store-reference/local_3.txt new file mode 100644 index 000000000..2a67a3426 --- /dev/null +++ b/src/libstore-tests/data/store-reference/local_3.txt @@ -0,0 +1 @@ +local://?root=/foo bar/baz \ No newline at end of file diff --git a/src/libstore-tests/store-reference.cc b/src/libstore-tests/store-reference.cc index f8c3587d2..01b75f3d2 100644 --- a/src/libstore-tests/store-reference.cc +++ b/src/libstore-tests/store-reference.cc @@ -85,10 +85,24 @@ static StoreReference localExample_2{ }, }; +static StoreReference localExample_3{ + .variant = + StoreReference::Specified{ + .scheme = "local", + }, + .params = + { + {"root", "/foo bar/baz"}, + }, +}; + URI_TEST(local_1, localExample_1) URI_TEST(local_2, localExample_2) +/* Test path with spaces */ +URI_TEST(local_3, localExample_3) + URI_TEST_READ(local_shorthand_1, localExample_1) URI_TEST_READ(local_shorthand_2, localExample_2) From d9053390cec37cfb199cb3db256e43657a097465 Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Fri, 18 Jul 2025 21:20:24 +0300 Subject: [PATCH 2/6] libutil-test-support: Add HasSubstrIgnoreANSIMatcher This matcher is useful for checking error messages, which always contain ANSI escapes. --- .../include/nix/util/tests/gmock-matchers.hh | 56 +++++++++++++++++++ .../include/nix/util/tests/meson.build | 1 + 2 files changed, 57 insertions(+) create mode 100644 src/libutil-test-support/include/nix/util/tests/gmock-matchers.hh diff --git a/src/libutil-test-support/include/nix/util/tests/gmock-matchers.hh b/src/libutil-test-support/include/nix/util/tests/gmock-matchers.hh new file mode 100644 index 000000000..27d765e9e --- /dev/null +++ b/src/libutil-test-support/include/nix/util/tests/gmock-matchers.hh @@ -0,0 +1,56 @@ +#pragma once +///@file + +#include "nix/util/terminal.hh" +#include + +namespace nix::testing { + +namespace internal { + +/** + * GMock matcher that matches substring while stripping off all ANSI escapes. + * Useful for checking exceptions messages in unit tests. + */ +class HasSubstrIgnoreANSIMatcher +{ +public: + explicit HasSubstrIgnoreANSIMatcher(std::string substring) + : substring(std::move(substring)) + { + } + + bool MatchAndExplain(const char * s, ::testing::MatchResultListener * listener) const + { + return s != nullptr && MatchAndExplain(std::string(s), listener); + } + + template + bool MatchAndExplain(const MatcheeStringType & s, [[maybe_unused]] ::testing::MatchResultListener * listener) const + { + return filterANSIEscapes(s, /*filterAll=*/true).find(substring) != substring.npos; + } + + void DescribeTo(::std::ostream * os) const + { + *os << "has substring " << substring; + } + + void DescribeNegationTo(::std::ostream * os) const + { + *os << "has no substring " << substring; + } + +private: + std::string substring; +}; + +} // namespace internal + +inline ::testing::PolymorphicMatcher +HasSubstrIgnoreANSIMatcher(const std::string & substring) +{ + return ::testing::MakePolymorphicMatcher(internal::HasSubstrIgnoreANSIMatcher(substring)); +} + +} // namespace nix::testing diff --git a/src/libutil-test-support/include/nix/util/tests/meson.build b/src/libutil-test-support/include/nix/util/tests/meson.build index f77dedff7..e6697b517 100644 --- a/src/libutil-test-support/include/nix/util/tests/meson.build +++ b/src/libutil-test-support/include/nix/util/tests/meson.build @@ -4,6 +4,7 @@ include_dirs = [include_directories('../../..')] headers = files( 'characterization.hh', + 'gmock-matchers.hh', 'gtest-with-params.hh', 'hash.hh', 'nix_api_util.hh', From ad449c0288a539dc8443671bbd879d7d95cecb74 Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Fri, 18 Jul 2025 21:20:36 +0300 Subject: [PATCH 3/6] libutil: Refactor percentDecode,percentEncode to use Boost.URL The myriad of hand-rolled URL parsing and validation code is a constant source of problems. Regexes are not a great way of writing parsers and there's a history of getting them wrong. Boost.URL is a good library we can outsource most of the heavy lifting to. --- packaging/dependencies.nix | 1 + src/libutil-tests/url.cc | 10 ++++++++++ src/libutil/meson.build | 2 +- src/libutil/url.cc | 41 ++++++++++++-------------------------- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/packaging/dependencies.nix b/packaging/dependencies.nix index 7ce3bf125..dda9ef8dc 100644 --- a/packaging/dependencies.nix +++ b/packaging/dependencies.nix @@ -62,6 +62,7 @@ scope: { "--with-context" "--with-coroutine" "--with-iostreams" + "--with-url" ]; enableIcu = false; }).overrideAttrs diff --git a/src/libutil-tests/url.cc b/src/libutil-tests/url.cc index 2a2bba880..8f2033ded 100644 --- a/src/libutil-tests/url.cc +++ b/src/libutil-tests/url.cc @@ -1,5 +1,7 @@ #include "nix/util/url.hh" +#include "nix/util/tests/gmock-matchers.hh" #include +#include namespace nix { @@ -289,6 +291,14 @@ TEST(percentDecode, trailingPercent) ASSERT_EQ(d, s); } +TEST(percentDecode, incompleteEncoding) +{ + ASSERT_THAT( + []() { percentDecode("%1"); }, + ::testing::ThrowsMessage( + testing::HasSubstrIgnoreANSIMatcher("error: invalid URI parameter '%1': incomplete pct-encoding"))); +} + /* ---------------------------------------------------------------------------- * percentEncode * --------------------------------------------------------------------------*/ diff --git a/src/libutil/meson.build b/src/libutil/meson.build index f48c8f3d7..55419265a 100644 --- a/src/libutil/meson.build +++ b/src/libutil/meson.build @@ -57,7 +57,7 @@ deps_private += blake3 boost = dependency( 'boost', - modules : ['context', 'coroutine', 'iostreams'], + modules : ['context', 'coroutine', 'iostreams', 'url'], include_type: 'system', version: '>=1.82.0' ) diff --git a/src/libutil/url.cc b/src/libutil/url.cc index eac0b188e..67043285c 100644 --- a/src/libutil/url.cc +++ b/src/libutil/url.cc @@ -4,6 +4,8 @@ #include "nix/util/split.hh" #include "nix/util/canon-path.hh" +#include + namespace nix { std::regex refRegex(refRegexS, std::regex::ECMAScript); @@ -48,21 +50,17 @@ ParsedURL parseURL(const std::string & url) std::string percentDecode(std::string_view in) { - std::string decoded; - for (size_t i = 0; i < in.size();) { - if (in[i] == '%') { - if (i + 2 >= in.size()) - throw BadURL("invalid URI parameter '%s'", in); - try { - decoded += std::stoul(std::string(in, i + 1, 2), 0, 16); - i += 3; - } catch (...) { - throw BadURL("invalid URI parameter '%s'", in); - } - } else - decoded += in[i++]; - } - return decoded; + auto pctView = boost::urls::make_pct_string_view(in); + if (pctView.has_value()) + return pctView->decode(); + auto error = pctView.error(); + throw BadURL("invalid URI parameter '%s': %s", in, error.message()); +} + +std::string percentEncode(std::string_view s, std::string_view keep) +{ + return boost::urls::encode( + s, [keep](char c) { return boost::urls::unreserved_chars(c) || keep.find(c) != keep.npos; }); } StringMap decodeQuery(const std::string & query) @@ -85,19 +83,6 @@ StringMap decodeQuery(const std::string & query) const static std::string allowedInQuery = ":@/?"; const static std::string allowedInPath = ":@/"; -std::string percentEncode(std::string_view s, std::string_view keep) -{ - std::string res; - for (auto & c : s) - // unreserved + keep - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || strchr("-._~", c) - || keep.find(c) != std::string::npos) - res += c; - else - res += fmt("%%%02X", c & 0xFF); - return res; -} - std::string encodeQuery(const StringMap & ss) { std::string res; From d020f21a2a5dcd771988694cf634841fbcf310cd Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Fri, 18 Jul 2025 21:20:46 +0300 Subject: [PATCH 4/6] libutil: Use default operator== for ParsedURL The default comparison operator can be generated by the compiler since C++20. --- src/libutil/include/nix/util/url.hh | 2 +- src/libutil/url.cc | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/libutil/include/nix/util/url.hh b/src/libutil/include/nix/util/url.hh index 8980b4ce3..e29226720 100644 --- a/src/libutil/include/nix/util/url.hh +++ b/src/libutil/include/nix/util/url.hh @@ -15,7 +15,7 @@ struct ParsedURL std::string to_string() const; - bool operator==(const ParsedURL & other) const noexcept; + bool operator==(const ParsedURL & other) const noexcept = default; /** * Remove `.` and `..` path elements. diff --git a/src/libutil/url.cc b/src/libutil/url.cc index 67043285c..7f31d0f1c 100644 --- a/src/libutil/url.cc +++ b/src/libutil/url.cc @@ -110,12 +110,6 @@ std::ostream & operator<<(std::ostream & os, const ParsedURL & url) return os; } -bool ParsedURL::operator==(const ParsedURL & other) const noexcept -{ - return scheme == other.scheme && authority == other.authority && path == other.path && query == other.query - && fragment == other.fragment; -} - ParsedURL ParsedURL::canonicalise() { ParsedURL res(*this); From bd1d2d1041a321284efcf22e11beb86ede08648d Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Fri, 18 Jul 2025 21:20:59 +0300 Subject: [PATCH 5/6] libutil: Use Boost.URL in parseURL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boost.URL is a significantly more RFC-compliant parser than what libutil currently has a bundle of incomprehensible regexes. One aspect of this change is that RFC4007 ZoneId IPv6 literals are represented in URIs according to RFC6874 [1]. Previously they were represented naively like so: [fe80::818c:da4d:8975:415c\%enp0s25]. This is not entirely correct, because the percent itself has to be pct-encoded: > "%" is always treated as an escape character in a URI, so, according to the established URI syntax [RFC3986] any occurrences of literal "%" symbols in a URI MUST be percent-encoded and represented in the form "%25". Thus, the scoped address fe80::a%en1 would appear in a URI as http://[fe80::a%25en1]. [1]: https://datatracker.ietf.org/doc/html/rfc6874 Co-authored-by: Jörg Thalheim --- src/libutil-tests/url.cc | 4 +- src/libutil/include/nix/util/url-parts.hh | 11 --- src/libutil/include/nix/util/url.hh | 11 +++ src/libutil/url.cc | 100 +++++++++++++++------- 4 files changed, 81 insertions(+), 45 deletions(-) diff --git a/src/libutil-tests/url.cc b/src/libutil-tests/url.cc index 8f2033ded..5e9b81f46 100644 --- a/src/libutil-tests/url.cc +++ b/src/libutil-tests/url.cc @@ -124,9 +124,9 @@ TEST(parseURL, parseIPv4Address) ASSERT_EQ(parsed, expected); } -TEST(parseURL, parseScopedRFC4007IPv6Address) +TEST(parseURL, parseScopedRFC6874IPv6Address) { - auto s = "http://[fe80::818c:da4d:8975:415c\%enp0s25]:8080"; + auto s = "http://[fe80::818c:da4d:8975:415c\%25enp0s25]:8080"; auto parsed = parseURL(s); ParsedURL expected{ diff --git a/src/libutil/include/nix/util/url-parts.hh b/src/libutil/include/nix/util/url-parts.hh index bf1215b6d..72c901b5d 100644 --- a/src/libutil/include/nix/util/url-parts.hh +++ b/src/libutil/include/nix/util/url-parts.hh @@ -8,21 +8,10 @@ namespace nix { // URI stuff. const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])"; -const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)"; -const static std::string ipv6AddressSegmentRegex = "[0-9a-fA-F:]+(?:%\\w+)?"; -const static std::string ipv6AddressRegex = "(?:\\[" + ipv6AddressSegmentRegex + "\\]|" + ipv6AddressSegmentRegex + ")"; const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])"; const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])"; -const static std::string hostnameRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + ")*)"; -const static std::string hostRegex = "(?:" + ipv6AddressRegex + "|" + hostnameRegex + ")"; -const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|:)*)"; -const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?"; const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])"; -const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*"; const static std::string fragmentRegex = "(?:" + pcharRegex + "|[/? \"^])*"; -const static std::string segmentRegex = "(?:" + pcharRegex + "*)"; -const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)"; -const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)"; /// A Git ref (i.e. branch or tag name). /// \todo check that this is correct. diff --git a/src/libutil/include/nix/util/url.hh b/src/libutil/include/nix/util/url.hh index e29226720..1c51ab797 100644 --- a/src/libutil/include/nix/util/url.hh +++ b/src/libutil/include/nix/util/url.hh @@ -34,6 +34,17 @@ StringMap decodeQuery(const std::string & query); std::string encodeQuery(const StringMap & query); +/** + * Parse a Nix URL into a ParsedURL. + * + * Nix URI is mostly compliant with RFC3986, but with some deviations: + * - Literal spaces are allowed and don't have to be percent encoded. + * This is mostly done for backward compatibility. + * + * @note IPv6 ZoneId literals (RFC4007) are represented in URIs according to RFC6874. + * + * @throws BadURL + */ ParsedURL parseURL(const std::string & url); /** diff --git a/src/libutil/url.cc b/src/libutil/url.cc index 7f31d0f1c..2f9c7736a 100644 --- a/src/libutil/url.cc +++ b/src/libutil/url.cc @@ -12,40 +12,70 @@ std::regex refRegex(refRegexS, std::regex::ECMAScript); std::regex badGitRefRegex(badGitRefRegexS, std::regex::ECMAScript); std::regex revRegex(revRegexS, std::regex::ECMAScript); -ParsedURL parseURL(const std::string & url) +/** + * Drop trailing shevron for output installable syntax. + * + * FIXME: parseURL shouldn't really be used for parsing the OutputSpec, but it does + * get used. That code should actually use ExtendedOutputsSpec::parseOpt. + */ +static std::string_view dropShevronSuffix(std::string_view url) { - static std::regex uriRegex( - "((" + schemeNameRegex + "):" + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex - + ")))" + "(?:\\?(" + queryRegex + "))?" + "(?:#(" + fragmentRegex + "))?", - std::regex::ECMAScript); + auto shevron = url.rfind("^"); + if (shevron == std::string_view::npos) + return url; + return url.substr(0, shevron); +} - std::smatch match; +/** + * Percent encode spaces in the url. + */ +static std::string percentEncodeSpaces(std::string_view url) +{ + return replaceStrings(std::string(url), " ", percentEncode(" ")); +} - if (std::regex_match(url, match, uriRegex)) { - std::string scheme = match[2]; - auto authority = match[3].matched ? std::optional(match[3]) : std::nullopt; - std::string path = match[4].matched ? match[4] : match[5]; - auto & query = match[6]; - auto & fragment = match[7]; +ParsedURL parseURL(const std::string & url) +try { + /* Drop the shevron suffix used for the flakerefs. Shevron character is reserved and + shouldn't appear in normal URIs. */ + auto unparsedView = dropShevronSuffix(url); + /* For back-compat literal spaces are allowed. */ + auto withFixedSpaces = percentEncodeSpaces(unparsedView); + auto urlView = boost::urls::url_view(withFixedSpaces); - auto transportIsFile = parseUrlScheme(scheme).transport == "file"; + if (!urlView.has_scheme()) + throw BadURL("'%s' doesn't have a scheme", url); - if (authority && *authority != "" && transportIsFile) - throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority); + auto scheme = urlView.scheme(); + auto authority = [&]() -> std::optional { + if (urlView.has_authority()) + return percentDecode(urlView.authority().buffer()); + return std::nullopt; + }(); - if (transportIsFile && path.empty()) - path = "/"; + auto transportIsFile = parseUrlScheme(scheme).transport == "file"; + if (authority && *authority != "" && transportIsFile) + throw BadURL("file:// URL '%s' has unexpected authority '%s'", url, *authority); - return ParsedURL{ - .scheme = scheme, - .authority = authority, - .path = percentDecode(path), - .query = decodeQuery(query), - .fragment = percentDecode(std::string(fragment))}; - } + auto path = urlView.path(); /* Does pct-decoding */ + auto fragment = urlView.fragment(); /* Does pct-decoding */ - else - throw BadURL("'%s' is not a valid URL", url); + if (transportIsFile && path.empty()) + path = "/"; + + /* Get the raw query. Store URI supports smuggling doubly nested queries, where + the inner &/? are pct-encoded. */ + auto query = std::string_view(urlView.encoded_query()); + + return ParsedURL{ + .scheme = scheme, + .authority = authority, + .path = path, + .query = decodeQuery(std::string(query)), + .fragment = fragment, + }; +} catch (boost::system::system_error & e) { + throw BadURL("'%s' is not a valid URL: %s", url, e.code().message()); } std::string percentDecode(std::string_view in) @@ -64,20 +94,25 @@ std::string percentEncode(std::string_view s, std::string_view keep) } StringMap decodeQuery(const std::string & query) -{ +try { + /* For back-compat literal spaces are allowed. */ + auto withFixedSpaces = percentEncodeSpaces(query); + StringMap result; - for (const auto & s : tokenizeString(query, "&")) { - auto e = s.find('='); - if (e == std::string::npos) { - warn("dubious URI query '%s' is missing equal sign '%s', ignoring", s, "="); + auto encodedQuery = boost::urls::params_encoded_view(withFixedSpaces); + for (auto && [key, value, value_specified] : encodedQuery) { + if (!value_specified) { + warn("dubious URI query '%s' is missing equal sign '%s', ignoring", std::string_view(key), "="); continue; } - result.emplace(s.substr(0, e), percentDecode(std::string_view(s).substr(e + 1))); + result.emplace(key.decode(), value.decode()); } return result; +} catch (boost::system::system_error & e) { + throw BadURL("invalid URI query '%s': %s", query, e.code().message()); } const static std::string allowedInQuery = ":@/?"; @@ -150,6 +185,7 @@ std::string fixGitURL(const std::string & url) // https://www.rfc-editor.org/rfc/rfc3986#section-3.1 bool isValidSchemeName(std::string_view s) { + const static std::string schemeNameRegex = "(?:[a-z][a-z0-9+.-]*)"; static std::regex regex(schemeNameRegex, std::regex::ECMAScript); return std::regex_match(s.begin(), s.end(), regex, std::regex_constants::match_default); From a54284cbc706474d091284d8e7470b803a8ff001 Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Fri, 18 Jul 2025 21:21:12 +0300 Subject: [PATCH 6/6] rl-next: Add release note about IPv6 Scoped Addresses in URIs --- doc/manual/rl-next/rfc4007-zone-id-in-uri-rfc6874.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 doc/manual/rl-next/rfc4007-zone-id-in-uri-rfc6874.md diff --git a/doc/manual/rl-next/rfc4007-zone-id-in-uri-rfc6874.md b/doc/manual/rl-next/rfc4007-zone-id-in-uri-rfc6874.md new file mode 100644 index 000000000..d5bc4736f --- /dev/null +++ b/doc/manual/rl-next/rfc4007-zone-id-in-uri-rfc6874.md @@ -0,0 +1,6 @@ +--- +synopsis: "Represent IPv6 RFC4007 ZoneId literals in conformance with RFC6874" +prs: [13445] +--- + +Prior versions of Nix since [#4646](https://github.com/NixOS/nix/pull/4646) accepted [IPv6 scoped addresses](https://datatracker.ietf.org/doc/html/rfc4007) in URIs like [store references](@docroot@/store/types/index.md#store-url-format) in the textual representation with a literal percent character: `[fe80::1%18]`. This was ambiguous, because the the percent literal `%` is reserved by [RFC3986](https://datatracker.ietf.org/doc/html/rfc3986), since it's used to indicate percent encoding. Nix now requires that the percent `%` symbol is percent-encoded as `%25`. This implements [RFC6874](https://datatracker.ietf.org/doc/html/rfc6874), which defines the representation of zone identifiers in URIs. The example from above now has to be specified as `[fe80::1%2518]`.