diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py --- a/swh/scanner/benchmark_algos.py +++ b/swh/scanner/benchmark_algos.py @@ -70,10 +70,6 @@ def stopngo(source_tree: Tree, api_url: str, counter: collections.Counter): - def set_children_known(node): - for child_node in node.iterate(): - child_node.known = True - nodes = [] nodes.append(source_tree) @@ -87,7 +83,7 @@ if not node.known: nodes.extend(list(node.children.values())) else: - set_children_known(node) + set_children_status(node, [CONTENT, DIRECTORY], True) def set_father_status(node, known): @@ -105,12 +101,14 @@ set_father_status(parent, known) -def set_children_status(node, node_type, known, status: Status = Status.unset): +def set_children_status( + node: Tree, node_types: Iterable[str], known: bool, status: Status = Status.unset +): """ - Recursively change father known and visited status of a given node + Recursively change the status of the children of the provided node """ for child_node in node.iterate(): - if child_node.otype == node_type and child_node.status == status: + if child_node.otype in node_types and child_node.status == status: child_node.known = known @@ -149,7 +147,8 @@ # update directory status dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"] dir_.status = Status.queried - set_children_status(dir_, DIRECTORY, dir_.known) + if dir_.known: + set_children_status(dir_, [DIRECTORY], True) def directory_priority(source_tree: Tree, api_url: str, counter: collections.Counter): @@ -161,9 +160,6 @@ ) ) unset_dirs.reverse() - # insert root if it has no contents - if source_tree.has_contents: - unset_dirs.append(source_tree) for dir_ in unset_dirs: # if the directory is known set all the downstream file contents to known @@ -171,7 +167,7 @@ dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"] dir_.status = Status.queried if dir_.known: - set_children_status(dir_, CONTENT, True) + set_children_status(dir_, [CONTENT], True) else: set_father_status(dir_, False) @@ -249,8 +245,7 @@ for child_node in node.iterate(): nodes.remove(child_node) - all_nodes = [node for node in source_tree.iterate()] - all_nodes.insert(0, source_tree) + all_nodes = [node for node in source_tree.iterate_bfs()] parsed_nodes = query_swhids(all_nodes, api_url) for node in all_nodes: diff --git a/swh/scanner/model.py b/swh/scanner/model.py --- a/swh/scanner/model.py +++ b/swh/scanner/model.py @@ -185,12 +185,15 @@ def iterate_bfs(self) -> Iterator[Tree]: """Get nodes in BFS order """ - nodes = set(node for node in self.children.values()) - for node in nodes: - yield node - for node in nodes: - if node.otype == DIRECTORY: - yield from node.iterate_bfs() + nodes = [] + nodes.append(self) + + while len(nodes) > 0: + for node in nodes.copy(): + yield node + nodes.remove(node) + if node.otype == DIRECTORY: + nodes.extend(list(node.children.values())) def get_files_from_dir(self, dir_path: Path) -> List: """