From 5e220271e2dbafb5205684354057aeaa4a58a5c6 Mon Sep 17 00:00:00 2001
From: Bernardo Meurer Costa
Date: Sat, 25 Oct 2025 22:38:43 +0000
Subject: [PATCH 1/2] feat(libstore): add scanForReferencesDeep for per-file reference tracking

Introduces `scanForReferencesDeep` to provide per-file granularity when
scanning for store path references, enabling better diagnostics for
cycle detection and `nix why-depends --precise`.
---
 src/libstore-tests/references.cc              | 143 ++++++++++++++++++
 .../include/nix/store/path-references.hh      |  57 +++++++
 src/libstore/path-references.cc               |  90 +++++++++++
 3 files changed, 290 insertions(+)

diff --git a/src/libstore-tests/references.cc b/src/libstore-tests/references.cc
index 27ecad08f..9cecd573e 100644
--- a/src/libstore-tests/references.cc
+++ b/src/libstore-tests/references.cc
@@ -1,4 +1,6 @@
 #include "nix/store/references.hh"
+#include "nix/store/path-references.hh"
+#include "nix/util/memory-source-accessor.hh"
 
 #include
 
@@ -79,4 +81,145 @@ TEST(references, scan)
     }
 }
 
+TEST(references, scanForReferencesDeep)
+{
+    using File = MemorySourceAccessor::File;
+
+    // Create store paths to search for
+    StorePath path1{"dc04vv14dak1c1r48qa0m23vr9jy8sm0-foo"};
+    StorePath path2{"zc842j0rz61mjsp3h3wp5ly71ak6qgdn-bar"};
+    StorePath path3{"a5cn2i4b83gnsm60d38l3kgb8qfplm11-baz"};
+
+    StorePathSet refs{path1, path2, path3};
+
+    std::string_view hash1 = path1.hashPart();
+    std::string_view hash2 = path2.hashPart();
+    std::string_view hash3 = path3.hashPart();
+
+    // Create an in-memory file system with various reference patterns
+    auto accessor = make_ref();
+    accessor->root = File::Directory{
+        .contents{
+            {
+                // file1.txt: contains hash1
+                "file1.txt",
+                File::Regular{
+                    .contents = "This file references " + hash1 + " in its content",
+                },
+            },
+            {
+                // file2.txt: contains hash2 and hash3
+                "file2.txt",
+                File::Regular{
+                    .contents = "Multiple refs: " + hash2 + " and also " + hash3,
+                },
+            },
+            {
+                // file3.txt: contains no references
+                "file3.txt",
+                File::Regular{
+                    .contents = "This file has no store path references at all",
+                },
+            },
+            {
+                // subdir: a subdirectory
+                "subdir",
+                File::Directory{
+                    .contents{
+                        {
+                            // subdir/file4.txt: contains hash1 again
+                            "file4.txt",
+                            File::Regular{
+                                .contents = "Subdirectory file with " + hash1,
+                            },
+                        },
+                    },
+                },
+            },
+            {
+                // link1: a symlink that contains a reference in its target
+                "link1",
+                File::Symlink{
+                    .target = hash2 + "-target",
+                },
+            },
+        },
+    };
+
+    // Test the callback-based API
+    {
+        std::map foundRefs;
+
+        scanForReferencesDeep(*accessor, CanonPath::root, refs, [&](FileRefScanResult result) {
+            foundRefs[std::move(result.filePath)] = std::move(result.foundRefs);
+        });
+
+        // Verify we found the expected references
+        EXPECT_EQ(foundRefs.size(), 4); // file1, file2, file4, link1
+
+        // Check file1.txt found path1
+        {
+            CanonPath f1Path("/file1.txt");
+            auto it = foundRefs.find(f1Path);
+            ASSERT_TRUE(it != foundRefs.end());
+            EXPECT_EQ(it->second.size(), 1);
+            EXPECT_TRUE(it->second.count(path1));
+        }
+
+        // Check file2.txt found path2 and path3
+        {
+            CanonPath f2Path("/file2.txt");
+            auto it = foundRefs.find(f2Path);
+            ASSERT_TRUE(it != foundRefs.end());
+            EXPECT_EQ(it->second.size(), 2);
+            EXPECT_TRUE(it->second.count(path2));
+            EXPECT_TRUE(it->second.count(path3));
+        }
+
+        // Check file3.txt is not in results (no refs)
+        {
+            CanonPath f3Path("/file3.txt");
+            EXPECT_FALSE(foundRefs.count(f3Path));
+        }
+
+        // Check subdir/file4.txt found path1
+        {
+            CanonPath f4Path("/subdir/file4.txt");
+            auto it = foundRefs.find(f4Path);
+            ASSERT_TRUE(it != foundRefs.end());
+            EXPECT_EQ(it->second.size(), 1);
+            EXPECT_TRUE(it->second.count(path1));
+        }
+
+        // Check symlink found path2
+        {
+            CanonPath linkPath("/link1");
+            auto it = foundRefs.find(linkPath);
+            ASSERT_TRUE(it != foundRefs.end());
+            EXPECT_EQ(it->second.size(), 1);
+            EXPECT_TRUE(it->second.count(path2));
+        }
+    }
+
+    // Test the map-based convenience API
+    {
+        auto results = scanForReferencesDeep(*accessor, CanonPath::root, refs);
+
+        EXPECT_EQ(results.size(), 4); // file1, file2, file4, link1
+
+        // Verify all expected files are in the results
+        EXPECT_TRUE(results.count(CanonPath("/file1.txt")));
+        EXPECT_TRUE(results.count(CanonPath("/file2.txt")));
+        EXPECT_TRUE(results.count(CanonPath("/subdir/file4.txt")));
+        EXPECT_TRUE(results.count(CanonPath("/link1")));
+        EXPECT_FALSE(results.count(CanonPath("/file3.txt"))); // file3.txt has no refs, so it is never reported
+
+        // Verify the references found in each file are correct
+        EXPECT_EQ(results.at(CanonPath("/file1.txt")), StorePathSet{path1});
+        EXPECT_EQ(results.at(CanonPath("/file2.txt")), StorePathSet({path2, path3}));
+        EXPECT_EQ(results.at(CanonPath("/subdir/file4.txt")), StorePathSet{path1});
+        EXPECT_EQ(results.at(CanonPath("/link1")), StorePathSet{path2});
+    }
+}
+
 } // namespace nix
diff --git a/src/libstore/include/nix/store/path-references.hh b/src/libstore/include/nix/store/path-references.hh
index 66d0da268..6aa506da4 100644
--- a/src/libstore/include/nix/store/path-references.hh
+++ b/src/libstore/include/nix/store/path-references.hh
@@ -3,6 +3,10 @@
 
 #include "nix/store/references.hh"
 #include "nix/store/path.hh"
+#include "nix/util/source-accessor.hh"
+
+#include
+#include
 
 namespace nix {
 
@@ -21,4 +25,57 @@ public:
     StorePathSet getResultPaths();
 };
 
+/**
+ * Result of scanning a single file (or symlink target) for references.
+ */
+struct FileRefScanResult
+{
+    CanonPath filePath;     ///< The file that was scanned
+    StorePathSet foundRefs; ///< Which store paths were found in this file
+};
+
+/**
+ * Scan a store path tree and report which references appear in which files.
+ *
+ * This is like scanForReferences() but provides per-file granularity.
+ * Useful for cycle detection and detailed dependency analysis like `nix why-depends --precise`.
+ *
+ * The function walks the tree using the provided accessor and streams the contents
+ * of each regular file, and the target of each symlink, through a fresh
+ * PathRefScanSink. For each such entry containing at least one reference, the callback
+ * is invoked with its path and the references found; unsupported file types raise an Error.
+ *
+ * Note: This function only searches for the hash part of store paths (e.g.,
+ * "dc04vv14dak1c1r48qa0m23vr9jy8sm0"), not the name part. A store path like
+ * "/nix/store/dc04vv14dak1c1r48qa0m23vr9jy8sm0-foo" will be detected if the
+ * hash appears anywhere in the scanned content, regardless of the "-foo" suffix.
+ *
+ * @param accessor Source accessor to read the tree
+ * @param rootPath Root path to scan
+ * @param refs Set of store paths to search for
+ * @param callback Called for each file or symlink that contains at least one reference
+ */
+void scanForReferencesDeep(
+    SourceAccessor & accessor,
+    const CanonPath & rootPath,
+    const StorePathSet & refs,
+    std::function callback);
+
+/**
+ * Scan a store path tree and return which references appear in which files.
+ *
+ * This is a convenience wrapper around the callback-based scanForReferencesDeep()
+ * that collects all results into a map for efficient lookups.
+ *
+ * Note: This function only searches for the hash part of store paths, not the name part.
+ * See the callback-based overload for details.
+ *
+ * @param accessor Source accessor to read the tree
+ * @param rootPath Root path to scan
+ * @param refs Set of store paths to search for
+ * @return Map from file paths to the set of references found in each file (entries without references are omitted)
+ */
+std::map
+scanForReferencesDeep(SourceAccessor & accessor, const CanonPath & rootPath, const StorePathSet & refs);
+
 } // namespace nix
diff --git a/src/libstore/path-references.cc b/src/libstore/path-references.cc
index 8b167e902..3d783bbe4 100644
--- a/src/libstore/path-references.cc
+++ b/src/libstore/path-references.cc
@@ -1,11 +1,15 @@
 #include "nix/store/path-references.hh"
 #include "nix/util/hash.hh"
 #include "nix/util/archive.hh"
+#include "nix/util/source-accessor.hh"
+#include "nix/util/canon-path.hh"
+#include "nix/util/logging.hh"
 
 #include
 #include
 #include
 #include
+#include
 
 namespace nix {
 
@@ -54,4 +58,90 @@ StorePathSet scanForReferences(Sink & toTee, const Path & path, const StorePathS
     return refsSink.getResultPaths();
 }
 
+void scanForReferencesDeep(
+    SourceAccessor & accessor,
+    const CanonPath & rootPath,
+    const StorePathSet & refs,
+    std::function callback)
+{
+    // Recursive tree walker; the explicit `self` object parameter lets the lambda call itself
+    auto walk = [&](this auto & self, const CanonPath & path) -> void {
+        auto stat = accessor.lstat(path);
+
+        switch (stat.type) {
+        case SourceAccessor::tRegular: {
+            // Create a fresh sink for each file to independently detect references.
+            // RefScanSink accumulates found hashes globally - once a hash is found,
+            // it remains in the result set. If we reused the same sink across files,
+            // we couldn't distinguish which files contain which references, as a hash
+            // found in an earlier file wouldn't be reported when found in later files.
+            PathRefScanSink sink = PathRefScanSink::fromPaths(refs);
+
+            // Scan this file by streaming its contents through the sink
+            accessor.readFile(path, sink);
+
+            // Get the references found in this file
+            auto foundRefs = sink.getResultPaths();
+
+            // Report if we found anything in this file
+            if (!foundRefs.empty()) {
+                debug("scanForReferencesDeep: found %d references in %s", foundRefs.size(), path.abs());
+                callback(FileRefScanResult{.filePath = path, .foundRefs = std::move(foundRefs)});
+            }
+            break;
+        }
+
+        case SourceAccessor::tDirectory: {
+            // Recursively scan directory contents; the directory itself is never reported
+            auto entries = accessor.readDirectory(path);
+            for (const auto & [name, entryType] : entries) {
+                self(path / name);
+            }
+            break;
+        }
+
+        case SourceAccessor::tSymlink: {
+            // Create a fresh sink for the symlink target (same reason as regular files)
+            PathRefScanSink sink = PathRefScanSink::fromPaths(refs);
+
+            // Scan the symlink's target string (not the file it points to) for references
+            auto target = accessor.readLink(path);
+            sink(std::string_view(target));
+
+            // Get the references found in this symlink target
+            auto foundRefs = sink.getResultPaths();
+
+            if (!foundRefs.empty()) {
+                debug("scanForReferencesDeep: found %d references in symlink %s", foundRefs.size(), path.abs());
+                callback(FileRefScanResult{.filePath = path, .foundRefs = std::move(foundRefs)});
+            }
+            break;
+        }
+
+        case SourceAccessor::tChar:
+        case SourceAccessor::tBlock:
+        case SourceAccessor::tSocket:
+        case SourceAccessor::tFifo:
+        case SourceAccessor::tUnknown:
+        default: // anything other than a regular file, directory or symlink aborts the whole scan
+            throw Error("file '%s' has an unsupported type", path.abs());
+        }
+    };
+
+    // Start the recursive walk from the root
+    walk(rootPath);
+}
+
+std::map
+scanForReferencesDeep(SourceAccessor & accessor, const CanonPath & rootPath, const StorePathSet & refs)
+{
+    std::map results;
+
+    scanForReferencesDeep(accessor, rootPath, refs, [&](FileRefScanResult result) {
+        results[std::move(result.filePath)] = std::move(result.foundRefs);
+    });
+
+    return results;
+}
+
 } // namespace nix
From 6129aee988132742837d36fd4cf995bfe85b3198 Mon Sep 17 00:00:00 2001
From: Bernardo Meurer Costa
Date: Sat, 25 Oct 2025 22:55:14 +0000
Subject: [PATCH 2/2] refactor(nix/why-depends): use scanForReferencesDeep for --precise mode

Replaces manual tree-walking and reference scanning with the new
scanForReferencesDeep function.
---
 src/nix/why-depends.cc | 79 +++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 43 deletions(-)

diff --git a/src/nix/why-depends.cc b/src/nix/why-depends.cc
index dc30fabd7..29da9e953 100644
--- a/src/nix/why-depends.cc
+++ b/src/nix/why-depends.cc
@@ -1,5 +1,6 @@
 #include "nix/cmd/command.hh"
 #include "nix/store/store-api.hh"
+#include "nix/store/path-references.hh"
 #include "nix/util/source-accessor.hh"
 #include "nix/main/shared.hh"
 
@@ -191,7 +192,7 @@ struct CmdWhyDepends : SourceExprCommand, MixOperateOnOptions
             /* Sort the references by distance to `dependency` to
                ensure that the shortest path is printed first. */
             std::multimap refs;
-            StringSet hashes;
+            StorePathSet refPaths; // store paths (not bare hash strings) to look for in --precise mode
 
             for (auto & ref : node.refs) {
                 if (ref == node.path && packagePath != dependencyPath)
@@ -200,7 +201,7 @@ struct CmdWhyDepends : SourceExprCommand, MixOperateOnOptions
                 if (node2.dist == inf)
                     continue;
                 refs.emplace(node2.dist, &node2);
-                hashes.insert(std::string(node2.path.hashPart()));
+                refPaths.insert(node2.path);
             }
 
             /* For each reference, find the files and symlinks that
@@ -209,58 +210,50 @@ struct CmdWhyDepends : SourceExprCommand, MixOperateOnOptions
 
             auto accessor = store->requireStoreObjectAccessor(node.path);
 
-            auto visitPath = [&](this auto && recur, const CanonPath & p) -> void {
-                auto st = accessor->maybeLstat(p);
-                assert(st);
+            auto getColour = [&](const std::string & hash) {
+                return hash == dependencyPathHash ? ANSI_GREEN : ANSI_BLUE;
+            };
 
-                auto p2 = p.isRoot() ? p.abs() : p.rel();
+            if (precise) {
+                // scanForReferencesDeep reports each file/symlink mentioning any of refPaths; each hit is re-read below for display context
+                scanForReferencesDeep(*accessor, CanonPath::root, refPaths, [&](FileRefScanResult result) {
+                    auto p2 = result.filePath.isRoot() ? result.filePath.abs() : result.filePath.rel();
+                    auto st = accessor->lstat(result.filePath);
 
-                auto getColour = [&](const std::string & hash) {
-                    return hash == dependencyPathHash ? ANSI_GREEN : ANSI_BLUE;
-                };
+                    if (st.type == SourceAccessor::Type::tRegular) {
+                        auto contents = accessor->readFile(result.filePath);
 
-                if (st->type == SourceAccessor::Type::tDirectory) {
-                    auto names = accessor->readDirectory(p);
-                    for (auto & [name, type] : names)
-                        recur(p / name);
-                }
-
-                else if (st->type == SourceAccessor::Type::tRegular) {
-                    auto contents = accessor->readFile(p);
-
-                    for (auto & hash : hashes) {
-                        auto pos = contents.find(hash);
-                        if (pos != std::string::npos) {
-                            size_t margin = 32;
-                            auto pos2 = pos >= margin ? pos - margin : 0;
-                            hits[hash].emplace_back(
-                                fmt("%s: …%s…",
+                        // For each reference found in this file, extract context
+                        for (auto & foundRef : result.foundRefs) {
+                            std::string hash(foundRef.hashPart());
+                            auto pos = contents.find(hash); // only the first occurrence in the file is shown
+                            if (pos != std::string::npos) {
+                                size_t margin = 32; // bytes of context kept on each side of the hash
+                                auto pos2 = pos >= margin ? pos - margin : 0;
+                                hits[hash].emplace_back(fmt(
+                                    "%s: …%s…",
                                     p2,
                                     hilite(
                                         filterPrintable(std::string(contents, pos2, pos - pos2 + hash.size() + margin)),
                                         pos - pos2,
                                         StorePath::HashLen,
                                         getColour(hash))));
+                            }
+                        }
+                    } else if (st.type == SourceAccessor::Type::tSymlink) {
+                        auto target = accessor->readLink(result.filePath);
+
+                        // For each reference found in this symlink, show it
+                        for (auto & foundRef : result.foundRefs) {
+                            std::string hash(foundRef.hashPart());
+                            auto pos = target.find(hash);
+                            if (pos != std::string::npos)
+                                hits[hash].emplace_back(
+                                    fmt("%s -> %s", p2, hilite(target, pos, StorePath::HashLen, getColour(hash))));
                         }
                     }
-                }
-
-                else if (st->type == SourceAccessor::Type::tSymlink) {
-                    auto target = accessor->readLink(p);
-
-                    for (auto & hash : hashes) {
-                        auto pos = target.find(hash);
-                        if (pos != std::string::npos)
-                            hits[hash].emplace_back(
-                                fmt("%s -> %s", p2, hilite(target, pos, StorePath::HashLen, getColour(hash))));
-                    }
-                }
-            };
-
-            // FIXME: should use scanForReferences().
-
-            if (precise)
-                visitPath(CanonPath::root);
+                });
+            }
 
             for (auto & ref : refs) {
                 std::string hash(ref.second->path.hashPart());