From 7a7b9fdf1cb7ff618e937816a8767854923aca7b Mon Sep 17 00:00:00 2001 From: Bernardo Meurer Costa Date: Sun, 12 Oct 2025 00:40:12 +0000 Subject: [PATCH] perf(libstore/find-cycles): optimize transformEdgesToMultiedges to O(n) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the multi-pass O(n²) algorithm with a single-pass O(n) approach using hashmaps to track path endpoints. Before: - First pass: O(n*m) to join each edge with existing multiedges - Second pass: O(m²) repeated merging until no more merges possible - Required multiple iterations through the entire dataset Now: - Single O(n) pass through edges - Two hashmaps (pathStartingAt, pathEndingAt) enable O(1) lookups - Immediately identifies connections without linear searches - Handles all cases in one pass: * Extending existing paths (prepend/append) * Joining two separate paths * Forming cycles when a path connects to itself --- src/libstore/build/find-cycles.cc | 133 ++++++++++-------- .../include/nix/store/build/find-cycles.hh | 4 +- 2 files changed, 80 insertions(+), 57 deletions(-) diff --git a/src/libstore/build/find-cycles.cc b/src/libstore/build/find-cycles.cc index 3ade944f4..c39757fd7 100644 --- a/src/libstore/build/find-cycles.cc +++ b/src/libstore/build/find-cycles.cc @@ -6,7 +6,9 @@ # include "nix/util/archive.hh" // For caseHackSuffix #endif +#include #include +#include namespace nix { @@ -143,67 +145,88 @@ void transformEdgesToMultiedges(StoreCycleEdgeVec & edges, StoreCycleEdgeVec & m { debug("transformEdgesToMultiedges: processing %lu edges", edges.size()); - // First pass: join edges to multiedges - for (auto & edge2 : edges) { - bool edge2Joined = false; + // Maps to track path endpoints for efficient joining + // Key: node name, Value: index into multiedges vector + std::map pathStartingAt; // Maps start node -> path index + std::map pathEndingAt; // Maps end node -> path index - for (auto & edge1 : multiedges) { - // Check if edge1.back() == edge2.front() - // This means we can extend edge1 by appending edge2 - if (edge1.back() == edge2.front()) { - // Append all but the first element of edge2 (to avoid duplication) - for (size_t i = 1; i < edge2.size(); i++) { - edge1.push_back(edge2[i]); - } - edge2Joined = true; - break; - } + for (auto & edge : edges) { + if (edge.empty()) + continue; - // Check if edge2.back() == edge1.front() - // This means we can extend edge1 by prepending edge2 - if (edge2.back() == edge1.front()) { - // Prepend all but the last element of edge2 (to avoid duplication) - for (int i = edge2.size() - 2; i >= 0; i--) { - edge1.push_front(edge2[i]); - } - edge2Joined = true; - break; - } - } + const std::string & edgeStart = edge.front(); + const std::string & edgeEnd = edge.back(); - if (!edge2Joined) { - multiedges.push_back(edge2); + // Check if this edge can connect to existing paths + auto startIt = pathEndingAt.find(edgeStart); + auto endIt = pathStartingAt.find(edgeEnd); + + bool canPrepend = (startIt != pathEndingAt.end()); + bool canAppend = (endIt != pathStartingAt.end()); + + if (canPrepend && canAppend && startIt->second == endIt->second) { + // Edge connects a path to itself - append it to form a cycle + size_t pathIdx = startIt->second; + auto & path = multiedges[pathIdx]; + // Append all but first element of edge (first element is duplicate) + path.insert(path.end(), std::next(edge.begin()), edge.end()); + // Update the end point (start point stays the same for a cycle) + pathEndingAt.erase(startIt); + pathEndingAt[edgeEnd] = pathIdx; + } else if (canPrepend && canAppend) { + // Edge joins two different paths - merge them + size_t prependIdx = startIt->second; + size_t appendIdx = endIt->second; + auto & prependPath = multiedges[prependIdx]; + auto & appendPath = multiedges[appendIdx]; + + // Save endpoint before modifying appendPath + const std::string appendPathEnd = appendPath.back(); + const std::string appendPathStart = appendPath.front(); + + // Append edge (without first element) to prependPath + prependPath.insert(prependPath.end(), std::next(edge.begin()), edge.end()); + // Append appendPath (without first element) to prependPath + prependPath.insert(prependPath.end(), std::next(appendPath.begin()), appendPath.end()); + + // Update maps: prependPath now ends where appendPath ended + pathEndingAt.erase(startIt); + pathEndingAt[appendPathEnd] = prependIdx; + pathStartingAt.erase(appendPathStart); + + // Mark appendPath for removal by clearing it + appendPath.clear(); + } else if (canPrepend) { + // Edge extends an existing path at its end + size_t pathIdx = startIt->second; + auto & path = multiedges[pathIdx]; + // Append all but first element of edge (first element is duplicate) + path.insert(path.end(), std::next(edge.begin()), edge.end()); + // Update the end point + pathEndingAt.erase(startIt); + pathEndingAt[edgeEnd] = pathIdx; + } else if (canAppend) { + // Edge extends an existing path at its start + size_t pathIdx = endIt->second; + auto & path = multiedges[pathIdx]; + // Prepend all but last element of edge (last element is duplicate) + path.insert(path.begin(), edge.begin(), std::prev(edge.end())); + // Update the start point + pathStartingAt.erase(endIt); + pathStartingAt[edgeStart] = pathIdx; + } else { + // Edge doesn't connect to anything - start a new path + size_t newIdx = multiedges.size(); + multiedges.push_back(edge); + pathStartingAt[edgeStart] = newIdx; + pathEndingAt[edgeEnd] = newIdx; } } - // Second pass: merge multiedges that can now be connected - // After joining edges, some multiedges might now be connectable - bool merged = true; - while (merged) { - merged = false; - for (size_t i = 0; i < multiedges.size() && !merged; i++) { - for (size_t j = i + 1; j < multiedges.size() && !merged; j++) { - auto & path1 = multiedges[i]; - auto & path2 = multiedges[j]; - - if (path1.back() == path2.front()) { - // Append path2 to path1 - for (size_t k = 1; k < path2.size(); k++) { - path1.push_back(path2[k]); - } - multiedges.erase(multiedges.begin() + j); - merged = true; - } else if (path2.back() == path1.front()) { - // Prepend path2 to path1 - for (int k = path2.size() - 2; k >= 0; k--) { - path1.push_front(path2[k]); - } - multiedges.erase(multiedges.begin() + j); - merged = true; - } - } - } - } + // Remove empty paths (those that were merged into others) + multiedges.erase( + std::remove_if(multiedges.begin(), multiedges.end(), [](const StoreCycleEdge & p) { return p.empty(); }), + multiedges.end()); debug("transformEdgesToMultiedges: result has %lu multiedges", multiedges.size()); } diff --git a/src/libstore/include/nix/store/build/find-cycles.hh b/src/libstore/include/nix/store/build/find-cycles.hh index e04d8a7f3..ad0fb2221 100644 --- a/src/libstore/include/nix/store/build/find-cycles.hh +++ b/src/libstore/include/nix/store/build/find-cycles.hh @@ -95,8 +95,8 @@ void walkAndScanPath(const std::filesystem::path & path, CycleEdgeScanSink & sin * longer paths like [A→B→C→A]. This makes it easier to visualize the * actual cycle paths. * - * The algorithm is greedy: it tries to extend each edge by finding - * matching edges to prepend or append. + * Uses hashmaps to track path endpoints, enabling O(n) joining of edges + * where n is the number of input edges. * * @param edges Input edges to transform * @param multiedges Output parameter with connected paths