From 5ea81f5b8f36f351981f53d27e85c6e4733c2b3e Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Thu, 15 May 2025 23:07:25 +0000 Subject: [PATCH] libexpr: Actually cache line information in PosTable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous code had a sneaky bug due to which no caching actually happened: ```cpp auto linesForInput = (*lines)[origin->offset]; ``` That should have been: ```cpp auto & linesForInput = (*lines)[origin->offset]; ``` See [1]. Now that it also makes sense to make the cache bound in side in order not to memoize all the sources without freeing any memory. The default cache size has been chosen somewhat arbitrarily to be ~64k origins. For reference, 25.05 nixpkgs has ~50k .nix files. Simple benchmark: ```nix let pkgs = import { }; in builtins.foldl' (acc: el: acc + el.line) 0 ( builtins.genList (x: builtins.unsafeGetAttrPos "gcc" pkgs) 10000 ) ``` (After) ``` $ hyperfine "result/bin/nix eval -f ./test.nix" Benchmark 1: result/bin/nix eval -f ./test.nix Time (mean ± σ): 292.7 ms ± 3.9 ms [User: 131.0 ms, System: 120.5 ms] Range (min … max): 288.1 ms … 300.5 ms 10 runs ``` (Before) ``` hyperfine "nix eval -f ./test.nix" Benchmark 1: nix eval -f ./test.nix Time (mean ± σ): 666.7 ms ± 6.4 ms [User: 428.3 ms, System: 191.2 ms] Range (min … max): 659.7 ms … 681.3 ms 10 runs ``` If the origin happens to be a `all-packages.nix` or similar in size then the difference is much more dramatic. [1]: https://www.github.com/lix-project/lix/commit/22e3f0e9875082be7f4eec8e3caeb134a7f1c05f --- src/libutil/include/nix/util/pos-table.hh | 18 ++++++++++- src/libutil/pos-table.cc | 38 ++++++++++++++++------- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/libutil/include/nix/util/pos-table.hh b/src/libutil/include/nix/util/pos-table.hh index ef170e0f1..f64466c21 100644 --- a/src/libutil/include/nix/util/pos-table.hh +++ b/src/libutil/include/nix/util/pos-table.hh @@ -4,6 +4,7 @@ #include #include +#include "nix/util/lru-cache.hh" #include "nix/util/pos-idx.hh" #include "nix/util/position.hh" #include "nix/util/sync.hh" @@ -37,10 +38,20 @@ public: }; private: + /** + * Vector of byte offsets (in the virtual input buffer) of initial line character's position. + * Sorted by construction. Binary search over it allows for efficient translation of arbitrary + * byte offsets in the virtual input buffer to its line + column position. + */ using Lines = std::vector; + /** + * Cache from byte offset in the virtual buffer of Origins -> @ref Lines in that origin. + */ + using LinesCache = LRUCache; std::map origins; - mutable Sync> lines; + + mutable Sync linesCache; const Origin * resolve(PosIdx p) const { @@ -56,6 +67,11 @@ private: } public: + PosTable(std::size_t linesCacheCapacity = 65536) + : linesCache(linesCacheCapacity) + { + } + Origin addOrigin(Pos::Origin origin, size_t size) { uint32_t offset = 0; diff --git a/src/libutil/pos-table.cc b/src/libutil/pos-table.cc index 5a61ffbc5..e50b12873 100644 --- a/src/libutil/pos-table.cc +++ b/src/libutil/pos-table.cc @@ -15,21 +15,35 @@ Pos PosTable::operator[](PosIdx p) const const auto offset = origin->offsetOf(p); Pos result{0, 0, origin->origin}; - auto lines = this->lines.lock(); - auto linesForInput = (*lines)[origin->offset]; + auto linesCache = this->linesCache.lock(); - if (linesForInput.empty()) { - auto source = result.getSource().value_or(""); - const char * begin = source.data(); - for (Pos::LinesIterator it(source), end; it != end; it++) - linesForInput.push_back(it->data() - begin); - if (linesForInput.empty()) - linesForInput.push_back(0); + /* Try the origin's line cache */ + const auto * linesForInput = linesCache->getOrNullptr(origin->offset); + + auto fillCacheForOrigin = [](std::string_view content) { + auto contentLines = Lines(); + + const char * begin = content.data(); + for (Pos::LinesIterator it(content), end; it != end; it++) + contentLines.push_back(it->data() - begin); + if (contentLines.empty()) + contentLines.push_back(0); + + return contentLines; + }; + + /* Calculate line offsets and fill the cache */ + if (!linesForInput) { + auto originContent = result.getSource().value_or(""); + linesCache->upsert(origin->offset, fillCacheForOrigin(originContent)); + linesForInput = linesCache->getOrNullptr(origin->offset); } - // as above: the first line starts at byte 0 and is always present - auto lineStartOffset = std::prev(std::upper_bound(linesForInput.begin(), linesForInput.end(), offset)); - result.line = 1 + (lineStartOffset - linesForInput.begin()); + assert(linesForInput); + + // as above: the first line starts at byte 0 and is always present + auto lineStartOffset = std::prev(std::upper_bound(linesForInput->begin(), linesForInput->end(), offset)); + result.line = 1 + (lineStartOffset - linesForInput->begin()); result.column = 1 + (offset - *lineStartOffset); return result; }