perf(libstore/find-cycles): avoid copying StringSet on every chunk

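Previously, `CycleEdgeScanSink::operator()` copied the entire `getResult()` `StringSet` twice on every 64KB chunk (once before and once after scanning) to detect newly found hashes. For large files, this created O(n * chunks) overhead, where n is the number of hashes found so far. Now we track which hashes have been recorded for the current file in `recordedForCurrentFile`, avoiding the set copies. The loop still walks `getResult()` after each chunk, but `recordedForCurrentFile.insert(hash).second` is true only for hashes not yet recorded for this file, so detecting a new hash costs a single set insertion rather than a lookup in a freshly copied set.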
2025-11-12 13:36:02 +01:00 · 2025-10-11 19:33:38 +00:00 · 2025-10-11 19:33:38 +00:00 · 5803daa940
commit 5803daa940
parent f7c4bcee9d
2 changed files with 11 additions and 8 deletions
--- a/src/libstore/build/find-cycles.cc
+++ b/src/libstore/build/find-cycles.cc
@@ -26,22 +26,21 @@ CycleEdgeScanSink::CycleEdgeScanSink(
 void CycleEdgeScanSink::setCurrentPath(const std::string & path)
 {
    currentFilePath = path;
    // Clear tracking for new file
    recordedForCurrentFile.clear();
 }
 void CycleEdgeScanSink::operator()(std::string_view data)
 {
    // Track what hashes we've already seen
    auto seenBefore = getResult();
    // Call parent's operator() to do the actual hash searching
    // This reuses all the proven buffer boundary handling logic
    RefScanSink::operator()(data);
-    // Check for newly found hashes
+    // Check which hashes have been found and not yet recorded for this file
-    auto seenAfter = getResult();
+    // getResult() returns the set of ALL hashes found so far
-    for (const auto & hash : seenAfter) {
+    for (const auto & hash : getResult()) {
-        if (seenBefore.find(hash) == seenBefore.end()) {
+        if (recordedForCurrentFile.insert(hash).second) {
-            // This hash was just found in the current file
+            // This hash was just found and not yet recorded for current file
            // Create an edge from current file to the target
            auto targetPath = storeDir + hash;
--- a/src/libstore/build/find-cycles.hh
+++ b/src/libstore/build/find-cycles.hh
@@ -38,6 +38,10 @@ class CycleEdgeScanSink : public RefScanSink
    std::map<std::string, StorePath> hashPathMap;
    std::string storeDir;
    // Track hashes we've already recorded for current file
    // to avoid duplicates
    StringSet recordedForCurrentFile;
 public:
    StoreCycleEdgeVec edges;