diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -142,7 +142,9 @@ "--policy", default="auto", show_default=True, - type=click.Choice(["auto", "bfs", "greedybfs", "filepriority", "dirpriority"]), + type=click.Choice( + ["auto", "bfs", "greedybfs", "filepriority", "dirpriority", "randomdir"] + ), help="The scan policy.", ) @click.option( @@ -178,6 +180,8 @@ dirpriority: scan all the source code directories and check only unknown directory contents. + randomdir: scan the source code using a random Merkle search on directories. + Other information about software artifacts could be specified with the -e/ --extra-info option:\n \b diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py --- a/swh/scanner/policy.py +++ b/swh/scanner/policy.py @@ -4,16 +4,20 @@ # See top-level LICENSE file for more information import abc -from typing import no_type_check +import itertools +from typing import Iterable, List, no_type_check from swh.core.utils import grouper -from swh.model.from_disk import Directory +from swh.loader.core import discovery +from swh.model import from_disk +from swh.model.from_disk import model +from swh.model.model import Sha1Git from .client import QUERY_LIMIT, Client from .data import MerkleNodeInfo -def source_size(source_tree: Directory): +def source_size(source_tree: from_disk.Directory): """return the size of a source tree as the number of nodes it contains""" return sum(1 for n in source_tree.iter_tree(dedup=False)) @@ -23,10 +27,10 @@ data: MerkleNodeInfo """information about contents and directories of the merkle tree""" - source_tree: Directory + source_tree: from_disk.Directory """representation of a source code project directory in the merkle tree""" - def __init__(self, source_tree: Directory, data: MerkleNodeInfo): + def __init__(self, source_tree: from_disk.Directory, data: MerkleNodeInfo): self.source_tree = source_tree self.data = data @@ -232,20 +236,85 @@ "known" ] - def has_contents(self, directory: Directory): + def has_contents(self, directory: from_disk.Directory): """Check if the directory given in input has contents""" for entry in directory.entries: if entry["type"] == "file": return True return False - def get_contents(self, dir_: Directory): + def get_contents(self, dir_: from_disk.Directory): """Get all the contents of a given directory""" for _, node in list(dir_.items()): if node.object_type == "content": yield node +class WebAPIConnection(discovery.ArchiveDiscoveryInterface): + """Use the web APIs to query the archive""" + + def __init__( + self, + contents: List[model.Content], + skipped_contents: List[model.SkippedContent], + directories: List[model.Directory], + client: Client, + ) -> None: + super().__init__(contents, skipped_contents, directories) + self.client = client + + self.sha_to_swhid = {} + self.swhid_to_sha = {} + for content in contents: + self.sha_to_swhid[content.sha1_git] = str(content.swhid()) + self.swhid_to_sha[str(content.swhid())] = content.sha1_git + + for directory in directories: + self.sha_to_swhid[directory.id] = str(directory.swhid()) + self.swhid_to_sha[str(directory.swhid())] = directory.id + + async def content_missing(self, contents: List[Sha1Git]) -> List[Sha1Git]: + """List content missing from the archive by sha1""" + return await self._missing(contents) + + async def skipped_content_missing( + self, skipped_contents: List[Sha1Git] + ) -> Iterable[Sha1Git]: + """List skipped content missing from the archive by sha1""" + # TODO what should we do about skipped contents? + return skipped_contents + + async def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]: + """List directories missing from the archive by sha1""" + return await self._missing(directories) + + async def _missing(self, shas): + res = await self.client.known([self.sha_to_swhid[o] for o in shas]) + return [self.swhid_to_sha[k] for k, v in res.items() if not v["known"]] + + +class RandomDirSamplingPriority(Policy): + """Check the Merkle tree querying random directories. Set all ancestors to + unknown for unknown directories, otherwise set all descendants to known. + Finally check all the remaining file contents. + """ + + @no_type_check + async def run(self, client: Client): + contents, skipped_contents, directories = from_disk.iter_directory( + self.source_tree + ) + + get_unknowns = discovery.filter_known_objects( + WebAPIConnection(contents, skipped_contents, directories, client), + ) + + unknowns = set(itertools.chain(*await get_unknowns)) + + for obj in itertools.chain(contents, skipped_contents, directories): + self.data[obj.swhid()]["known"] = obj not in unknowns + + class QueryAll(Policy): """Check the status of every node in the Merkle tree.""" diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -21,6 +21,7 @@ GreedyBFS, LazyBFS, QueryAll, + RandomDirSamplingPriority, source_size, ) @@ -66,6 +67,8 @@ return FilePriority(source_tree, nodes_data) elif policy == "dirpriority": return DirectoryPriority(source_tree, nodes_data) + elif policy == "randomdir": + return RandomDirSamplingPriority(source_tree, nodes_data) else: raise Exception(f"policy '{policy}' not found")