diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -137,9 +137,9 @@
 @click.option(
     "-p",
     "--policy",
-    default="bfs",
+    default="auto",
     show_default=True,
-    type=click.Choice(["bfs", "filepriority", "dirpriority"]),
+    type=click.Choice(["auto", "bfs", "filepriority", "dirpriority"]),
     help="The scan policy.",
 )
 @click.pass_context
diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py
--- a/swh/scanner/policy.py
+++ b/swh/scanner/policy.py
@@ -16,6 +16,9 @@
 from .data import MerkleNodeInfo
 from .exceptions import error_response
 
+# Maximum number of SWHIDs posted to the "known/" endpoint in one request.
+QUERY_LIMIT = 1000
+
 
 async def swhids_discovery(
     swhids: List[str], session: aiohttp.ClientSession, api_url: str,
@@ -38,12 +41,11 @@
 
     """
     endpoint = api_url + "known/"
-    chunk_size = 1000
     requests = []
 
     def get_chunk(swhids):
-        for i in range(0, len(swhids), chunk_size):
-            yield swhids[i : i + chunk_size]
+        for i in range(0, len(swhids), QUERY_LIMIT):
+            yield swhids[i : i + QUERY_LIMIT]
 
     async def make_request(swhids):
         async with session.post(endpoint, json=swhids) as resp:
@@ -52,7 +54,7 @@
 
             return await resp.json()
 
-    if len(swhids) > chunk_size:
+    if len(swhids) > QUERY_LIMIT:
         for swhids_chunk in get_chunk(swhids):
             requests.append(asyncio.create_task(make_request(swhids_chunk)))
 
@@ -248,3 +250,18 @@
         for _, node in list(dir_.items()):
             if node.object_type == CONTENT:
                 yield node
+
+
+class QueryAll(Policy):
+    """Policy that looks up every node of the source tree via swhids_discovery."""
+
+    @no_type_check
+    async def run(
+        self, session: aiohttp.ClientSession, api_url: str,
+    ):
+        """Look up all nodes and store each node's "known" flag in ``self.data``."""
+        all_nodes = list(self.source_tree.iter_tree())
+        all_swhids = [str(node.swhid()) for node in all_nodes]
+        swhids_res = await swhids_discovery(all_swhids, session, api_url)
+        for node, swhid in zip(all_nodes, all_swhids):
+            self.data[node.swhid()]["known"] = swhids_res[swhid]["known"]
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -13,7 +13,7 @@
 
 from .data import MerkleNodeInfo
 from .output import Output
-from .policy import DirectoryPriority, FilePriority, LazyBFS
+from .policy import QUERY_LIMIT, DirectoryPriority, FilePriority, LazyBFS, QueryAll
 
 
 async def run(config: Dict[str, Any], policy) -> None:
@@ -35,8 +35,21 @@
         await policy.run(session, api_url)
 
 
+def source_size(source_tree: Directory) -> int:
+    """Count the nodes yielded by ``source_tree.iter_tree(dedup=False)``."""
+    return sum(1 for _ in source_tree.iter_tree(dedup=False))
+
+
 def get_policy_obj(source_tree: Directory, nodes_data: MerkleNodeInfo, policy: str):
-    if policy == "bfs":
+    if policy == "auto":
+        # Small trees (at most QUERY_LIMIT nodes) are queried in full;
+        # larger ones fall back to the lazy BFS policy.
+        return (
+            QueryAll(source_tree, nodes_data)
+            if source_size(source_tree) <= QUERY_LIMIT
+            else LazyBFS(source_tree, nodes_data)
+        )
+    elif policy == "bfs":
         return LazyBFS(source_tree, nodes_data)
     elif policy == "filepriority":
         return FilePriority(source_tree, nodes_data)
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -7,8 +7,8 @@
 import pytest
 
 from swh.scanner.data import MerkleNodeInfo
-from swh.scanner.policy import DirectoryPriority, FilePriority, LazyBFS
-from swh.scanner.scanner import run
+from swh.scanner.policy import DirectoryPriority, FilePriority, LazyBFS, QueryAll
+from swh.scanner.scanner import get_policy_obj, run
 
 from .data import unknown_swhids
 
@@ -18,6 +18,10 @@
     assert not app.debug
 
 
+def test_get_policy_obj_auto(source_tree, nodes_data):
+    assert isinstance(get_policy_obj(source_tree, nodes_data, "auto"), QueryAll)
+
+
 def test_scanner_result_bfs(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}