mirror of
https://github.com/NixOS/nix.git
synced 2025-11-12 13:36:02 +01:00
perf(libstore/find-cycles): avoid copying StringSet on every chunk
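Previously, `CycleEdgeScanSink::operator()` copied the entire `StringSet` returned by `getResult()` twice for every 64 KB chunk (once before and once after scanning) in order to detect newly found hashes. For large files, this created O(n * chunks) overhead, where n is the number of hashes found so far. Now the hashes that have already been recorded for the current file are tracked in `recordedForCurrentFile`, which avoids the set copies. `recordedForCurrentFile.insert(hash).second` is true only for hashes not yet recorded for the current file, so detecting a new hash takes a single set insertion instead of a comparison against a copied set.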
This commit is contained in:
parent
f7c4bcee9d
commit
5803daa940
2 changed files with 11 additions and 8 deletions
|
|
@ -26,22 +26,21 @@ CycleEdgeScanSink::CycleEdgeScanSink(
|
||||||
void CycleEdgeScanSink::setCurrentPath(const std::string & path)
|
void CycleEdgeScanSink::setCurrentPath(const std::string & path)
|
||||||
{
|
{
|
||||||
currentFilePath = path;
|
currentFilePath = path;
|
||||||
|
// Clear tracking for new file
|
||||||
|
recordedForCurrentFile.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void CycleEdgeScanSink::operator()(std::string_view data)
|
void CycleEdgeScanSink::operator()(std::string_view data)
|
||||||
{
|
{
|
||||||
// Track what hashes we've already seen
|
|
||||||
auto seenBefore = getResult();
|
|
||||||
|
|
||||||
// Call parent's operator() to do the actual hash searching
|
// Call parent's operator() to do the actual hash searching
|
||||||
// This reuses all the proven buffer boundary handling logic
|
// This reuses all the proven buffer boundary handling logic
|
||||||
RefScanSink::operator()(data);
|
RefScanSink::operator()(data);
|
||||||
|
|
||||||
// Check for newly found hashes
|
// Check which hashes have been found and not yet recorded for this file
|
||||||
auto seenAfter = getResult();
|
// getResult() returns the set of ALL hashes found so far
|
||||||
for (const auto & hash : seenAfter) {
|
for (const auto & hash : getResult()) {
|
||||||
if (seenBefore.find(hash) == seenBefore.end()) {
|
if (recordedForCurrentFile.insert(hash).second) {
|
||||||
// This hash was just found in the current file
|
// This hash was just found and not yet recorded for current file
|
||||||
// Create an edge from current file to the target
|
// Create an edge from current file to the target
|
||||||
auto targetPath = storeDir + hash;
|
auto targetPath = storeDir + hash;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,10 @@ class CycleEdgeScanSink : public RefScanSink
|
||||||
std::map<std::string, StorePath> hashPathMap;
|
std::map<std::string, StorePath> hashPathMap;
|
||||||
std::string storeDir;
|
std::string storeDir;
|
||||||
|
|
||||||
|
// Track hashes we've already recorded for current file
|
||||||
|
// to avoid duplicates
|
||||||
|
StringSet recordedForCurrentFile;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
StoreCycleEdgeVec edges;
|
StoreCycleEdgeVec edges;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue