1
1
Fork 0
mirror of https://github.com/NixOS/nix.git synced 2025-11-12 13:36:02 +01:00

perf(libstore/find-cycles): avoid copying StringSet on every chunk

Previously, `CycleEdgeScanSink::operator()` copied the entire
`getResult()` `StringSet` twice on every 64KB chunk to detect newly
found hashes. For large files, this created O(n * chunks) overhead.

Now we track which hashes have already been recorded for the current file
in `recordedForCurrentFile`, avoiding the set copies. `insert(hash).second`
is true only the first time a hash is inserted for that file, so each check
is a single set insertion instead of two full-set copies per chunk.
This commit is contained in:
Bernardo Meurer Costa 2025-10-11 19:33:38 +00:00
parent f7c4bcee9d
commit 5803daa940
No known key found for this signature in database
2 changed files with 11 additions and 8 deletions

View file

@ -26,22 +26,21 @@ CycleEdgeScanSink::CycleEdgeScanSink(
void CycleEdgeScanSink::setCurrentPath(const std::string & path)
{
currentFilePath = path;
// Clear tracking for new file
recordedForCurrentFile.clear();
}
void CycleEdgeScanSink::operator()(std::string_view data)
{
// Track what hashes we've already seen
auto seenBefore = getResult();
// Call parent's operator() to do the actual hash searching
// This reuses all the proven buffer boundary handling logic
RefScanSink::operator()(data);
// Check for newly found hashes
auto seenAfter = getResult();
for (const auto & hash : seenAfter) {
if (seenBefore.find(hash) == seenBefore.end()) {
// This hash was just found in the current file
// Check which hashes have been found and not yet recorded for this file
// getResult() returns the set of ALL hashes found so far
for (const auto & hash : getResult()) {
if (recordedForCurrentFile.insert(hash).second) {
// This hash was just found and not yet recorded for current file
// Create an edge from current file to the target
auto targetPath = storeDir + hash;

View file

@ -38,6 +38,10 @@ class CycleEdgeScanSink : public RefScanSink
std::map<std::string, StorePath> hashPathMap;
std::string storeDir;
// Track hashes we've already recorded for current file
// to avoid duplicates
StringSet recordedForCurrentFile;
public:
StoreCycleEdgeVec edges;