From bb8b50e4f34bce38351aece24843f3088cc48471 Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Thu, 15 May 2025 22:28:41 +0000 Subject: [PATCH 1/2] libutil: Add `LRUCache::getOrNullptr` For heavier objects it doesn't make sense to return a std::optional with the copy of the data, when it can be used by const reference. (cherry picked from commit 4711720efe3e1cf27d0e80f818b5d02ca1075457) --- src/libutil/include/nix/util/lru-cache.hh | 46 +++++++++++++++++------ 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/src/libutil/include/nix/util/lru-cache.hh b/src/libutil/include/nix/util/lru-cache.hh index 80ec6f8c4..23cfa91e1 100644 --- a/src/libutil/include/nix/util/lru-cache.hh +++ b/src/libutil/include/nix/util/lru-cache.hh @@ -33,6 +33,18 @@ private: Data data; LRU lru; + /** + * Move this item to the back of the LRU list. + */ + void promote(LRU::iterator it) + { + /* Think of std::list iterators as stable pointers to the list node, + * which never get invalidated. Thus, we can reuse the same lru list + * element and just splice it to the back of the list without the need + * to update its value in the key -> list iterator map. */ + lru.splice(/*pos=*/lru.end(), /*other=*/lru, it); + } + public: LRUCache(size_t capacity) @@ -83,7 +95,9 @@ public: /** * Look up an item in the cache. If it exists, it becomes the most * recently used item. - * */ + * + * @returns corresponding cache entry, std::nullopt if it's not in the cache + */ template std::optional get(const K & key) { @@ -91,20 +105,30 @@ public: if (i == data.end()) return {}; - /** - * Move this item to the back of the LRU list. - * - * Think of std::list iterators as stable pointers to the list node, - * which never get invalidated. Thus, we can reuse the same lru list - * element and just splice it to the back of the list without the need - * to update its value in the key -> list iterator map. - */ auto & [it, value] = i->second; - lru.splice(/*pos=*/lru.end(), /*other=*/lru, it.it); - + promote(it.it); return value; } + /** + * Look up an item in the cache. If it exists, it becomes the most + * recently used item. + * + * @returns mutable pointer to the corresponding cache entry, nullptr if + * it's not in the cache + */ + template + Value * getOrNullptr(const K & key) + { + auto i = data.find(key); + if (i == data.end()) + return nullptr; + + auto & [it, value] = i->second; + promote(it.it); + return &value; + } + size_t size() const noexcept { return data.size(); From ef8dc34bd00f8e8936f8b0cfd1efe7bb202a4971 Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Thu, 15 May 2025 23:07:25 +0000 Subject: [PATCH 2/2] libexpr: Actually cache line information in PosTable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous code had a sneaky bug due to which no caching actually happened: ```cpp auto linesForInput = (*lines)[origin->offset]; ``` That should have been: ```cpp auto & linesForInput = (*lines)[origin->offset]; ``` See [1]. Now that it also makes sense to make the cache bound in side in order not to memoize all the sources without freeing any memory. The default cache size has been chosen somewhat arbitrarily to be ~64k origins. For reference, 25.05 nixpkgs has ~50k .nix files. Simple benchmark: ```nix let pkgs = import { }; in builtins.foldl' (acc: el: acc + el.line) 0 ( builtins.genList (x: builtins.unsafeGetAttrPos "gcc" pkgs) 10000 ) ``` (After) ``` $ hyperfine "result/bin/nix eval -f ./test.nix" Benchmark 1: result/bin/nix eval -f ./test.nix Time (mean ± σ): 292.7 ms ± 3.9 ms [User: 131.0 ms, System: 120.5 ms] Range (min … max): 288.1 ms … 300.5 ms 10 runs ``` (Before) ``` hyperfine "nix eval -f ./test.nix" Benchmark 1: nix eval -f ./test.nix Time (mean ± σ): 666.7 ms ± 6.4 ms [User: 428.3 ms, System: 191.2 ms] Range (min … max): 659.7 ms … 681.3 ms 10 runs ``` If the origin happens to be a `all-packages.nix` or similar in size then the difference is much more dramatic. [1]: https://www.github.com/lix-project/lix/commit/22e3f0e9875082be7f4eec8e3caeb134a7f1c05f (cherry picked from commit 5ea81f5b8f36f351981f53d27e85c6e4733c2b3e) --- src/libutil/include/nix/util/pos-table.hh | 18 ++++++++++- src/libutil/pos-table.cc | 38 ++++++++++++++++------- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/libutil/include/nix/util/pos-table.hh b/src/libutil/include/nix/util/pos-table.hh index 8ef7bd893..d944b1353 100644 --- a/src/libutil/include/nix/util/pos-table.hh +++ b/src/libutil/include/nix/util/pos-table.hh @@ -4,6 +4,7 @@ #include #include +#include "nix/util/lru-cache.hh" #include "nix/util/pos-idx.hh" #include "nix/util/position.hh" #include "nix/util/sync.hh" @@ -37,10 +38,20 @@ public: }; private: + /** + * Vector of byte offsets (in the virtual input buffer) of initial line character's position. + * Sorted by construction. Binary search over it allows for efficient translation of arbitrary + * byte offsets in the virtual input buffer to its line + column position. + */ using Lines = std::vector; + /** + * Cache from byte offset in the virtual buffer of Origins -> @ref Lines in that origin. + */ + using LinesCache = LRUCache; std::map origins; - mutable Sync> lines; + + mutable Sync linesCache; const Origin * resolve(PosIdx p) const { @@ -56,6 +67,11 @@ private: } public: + PosTable(std::size_t linesCacheCapacity = 65536) + : linesCache(linesCacheCapacity) + { + } + Origin addOrigin(Pos::Origin origin, size_t size) { uint32_t offset = 0; diff --git a/src/libutil/pos-table.cc b/src/libutil/pos-table.cc index cdee4adb0..e24aff4b1 100644 --- a/src/libutil/pos-table.cc +++ b/src/libutil/pos-table.cc @@ -15,21 +15,35 @@ Pos PosTable::operator[](PosIdx p) const const auto offset = origin->offsetOf(p); Pos result{0, 0, origin->origin}; - auto lines = this->lines.lock(); - auto linesForInput = (*lines)[origin->offset]; + auto linesCache = this->linesCache.lock(); - if (linesForInput.empty()) { - auto source = result.getSource().value_or(""); - const char * begin = source.data(); - for (Pos::LinesIterator it(source), end; it != end; it++) - linesForInput.push_back(it->data() - begin); - if (linesForInput.empty()) - linesForInput.push_back(0); + /* Try the origin's line cache */ + const auto * linesForInput = linesCache->getOrNullptr(origin->offset); + + auto fillCacheForOrigin = [](std::string_view content) { + auto contentLines = Lines(); + + const char * begin = content.data(); + for (Pos::LinesIterator it(content), end; it != end; it++) + contentLines.push_back(it->data() - begin); + if (contentLines.empty()) + contentLines.push_back(0); + + return contentLines; + }; + + /* Calculate line offsets and fill the cache */ + if (!linesForInput) { + auto originContent = result.getSource().value_or(""); + linesCache->upsert(origin->offset, fillCacheForOrigin(originContent)); + linesForInput = linesCache->getOrNullptr(origin->offset); } - // as above: the first line starts at byte 0 and is always present - auto lineStartOffset = std::prev(std::upper_bound(linesForInput.begin(), linesForInput.end(), offset)); - result.line = 1 + (lineStartOffset - linesForInput.begin()); + assert(linesForInput); + + // as above: the first line starts at byte 0 and is always present + auto lineStartOffset = std::prev(std::upper_bound(linesForInput->begin(), linesForInput->end(), offset)); + result.line = 1 + (lineStartOffset - linesForInput->begin()); result.column = 1 + (offset - *lineStartOffset); return result; }