diff --git a/tools/dir2graph b/tools/dir2graph index 0530a93..054ac61 100755 --- a/tools/dir2graph +++ b/tools/dir2graph @@ -1,102 +1,103 @@ #!/usr/bin/env python3 # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import sys from typing import Iterator, Tuple, Union import click from swh.model.from_disk import Content, Directory from swh.model.identifiers import CoreSWHID, ObjectType def swhid_of_node(obj: Union[Content, Directory]): return CoreSWHID( object_type=ObjectType[obj.object_type.upper()], object_id=obj.hash, ) def walk_model(root: Directory,) -> Iterator[Tuple[CoreSWHID, Iterator[CoreSWHID]]]: """recursively visit a model.from_disk object Yield pairs (SWHID, neighbors) where SWHID is the identifier of a node and neighbors an iterator over SWHID of nodes directly reachable from it. So you can obtain all graph nodes by only looking at the first element of the pair, and edges by joining the first element with each of the neighbors. Note that no deduplication is applied, so both nodes and edges can be yielded multiple times if they do in fact appear multiple times in the graph. """ - def walk_neighbors(node, to_visit): + def walk_neighbors(node): for child in node.values(): - to_visit.insert(0, child) yield swhid_of_node(child) to_visit = [root] while to_visit: node = to_visit.pop() swhid = swhid_of_node(node) - yield (swhid, walk_neighbors(node, to_visit)) + yield (swhid, walk_neighbors(node)) + for child in node.values(): + to_visit.insert(0, child) @click.command() @click.argument( "directory", required=True, type=click.Path(exists=True, file_okay=False, dir_okay=True), ) @click.option( "-n", "--nodes-output", type=click.Path(file_okay=True, dir_okay=False, writable=True), help="output file where to store nodes as SWHIDs" " (if not given, node SWHIDs will not be output)." ' Use "-" for stdout.' " Default: output node SWHIDs to stdout.", ) @click.option( "-e", "--edges-output", type=click.Path(file_okay=True, dir_okay=False, writable=True), help="output file where to store edges as SWHID pairs" " (if not given, edge SWHIDs will not be output)" ' Use "-" for stdout.' " Default: do not output edge SWHIDs.", ) def main(directory, nodes_output, edges_output): """Recursively identifies the content of a directory. Outputs SWHID identifiers as both nodes (one SWHID per object) and edges (pairs of SWHIDs (parent, child) corresponding to the filesystem hierarchy). """ nodes_file = sys.stdout edges_file = None if nodes_output: if nodes_output == "-": nodes_file = sys.stdout else: nodes_file = open(nodes_output, "w") if edges_output: if edges_output == "-": edges_file = sys.stdout else: edges_file = open(edges_output, "w") root = Directory.from_disk(path=directory.encode()) for swhid, neighbors in walk_model(root): if nodes_file: nodes_file.write(f"{swhid}\n") if edges_file: for child_swhid in neighbors: edges_file.write(f"{swhid} {child_swhid}\n") if __name__ == "__main__": main()