diff --git a/swh/scanner/backend.py b/swh/scanner/backend.py --- a/swh/scanner/backend.py +++ b/swh/scanner/backend.py @@ -7,8 +7,7 @@ from .db import Db from .exceptions import LargePayloadExc - -LIMIT = 1000 +from .policy import QUERY_LIMIT def create_app(db: Db): @@ -20,9 +19,10 @@ def known(): swhids = request.get_json() - if len(swhids) > LIMIT: + if len(swhids) > QUERY_LIMIT: raise LargePayloadExc( - f"The maximum number of SWHIDs this endpoint can receive is {LIMIT}" + f"The maximum number of SWHIDs this endpoint can receive is " + f"{QUERY_LIMIT}" ) cur = db.conn.cursor() diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -137,15 +137,30 @@ @click.option( "-p", "--policy", - default="bfs", + default="auto", show_default=True, - type=click.Choice(["bfs", "filepriority", "dirpriority"]), + type=click.Choice(["auto", "bfs", "filepriority", "dirpriority"]), help="The scan policy.", ) @click.pass_context def scan(ctx, root_path, api_url, patterns, out_fmt, interactive, policy): """Scan a source code project to discover files and directories already - present in the archive""" + present in the archive. + + The source code project can be checked using different policies that can be set + using the -p/--policy option: + + auto: it selects the best policy based on the source code, for codebase(s) with + at most 1000 file/dir contents all the nodes will be queried. + + bfs: scan the source code in the BFS order, checking unknown directories only. + + filepriority: scan all the source code file contents, checking only unset + directories. (useful if the codebase contains a lot of source files) + + dirpriority: scan all the source code directories and check only unknown + directory contents. 
+ """ import swh.scanner.scanner as scanner config = setup_config(ctx, api_url) diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py --- a/swh/scanner/policy.py +++ b/swh/scanner/policy.py @@ -16,6 +16,10 @@ from .data import MerkleNodeInfo from .exceptions import error_response +# Maximum number of SWHIDs that can be requested by a single call to the +# Web API endpoint /known/ +QUERY_LIMIT = 1000 + async def swhids_discovery( swhids: List[str], session: aiohttp.ClientSession, api_url: str, @@ -38,12 +42,11 @@ """ endpoint = api_url + "known/" - chunk_size = 1000 requests = [] def get_chunk(swhids): - for i in range(0, len(swhids), chunk_size): - yield swhids[i : i + chunk_size] + for i in range(0, len(swhids), QUERY_LIMIT): + yield swhids[i : i + QUERY_LIMIT] async def make_request(swhids): async with session.post(endpoint, json=swhids) as resp: @@ -52,7 +55,7 @@ return await resp.json() - if len(swhids) > chunk_size: + if len(swhids) > QUERY_LIMIT: for swhids_chunk in get_chunk(swhids): requests.append(asyncio.create_task(make_request(swhids_chunk))) @@ -86,6 +89,11 @@ class LazyBFS(Policy): + """Read nodes in the Merkle tree using the BFS algorithm. + Look up only directories that are unknown; otherwise set all the downstream + contents to known. + """ + async def run( self, session: aiohttp.ClientSession, api_url: str, ): @@ -112,6 +120,12 @@ class FilePriority(Policy): + """Check the Merkle tree querying all the file contents and set all the upstream + directories to unknown in the case a file content is unknown. + Finally check all the directories whose status is still unknown and set all the + sub-directories of known directories to known. 
+ """ + @no_type_check async def run( self, session: aiohttp.ClientSession, api_url: str, @@ -169,6 +183,13 @@ class DirectoryPriority(Policy): + """Check the Merkle tree querying all the directories that have at least one file + content and set all the upstream directories to unknown in the case a directory + is unknown; otherwise set all the downstream contents to known. + Finally check the status of empty directories and all the remaining file + contents. + """ + @no_type_check async def run( self, session: aiohttp.ClientSession, api_url: str, @@ -248,3 +269,18 @@ for _, node in list(dir_.items()): if node.object_type == CONTENT: yield node + + +class QueryAll(Policy): + """Check the status of every node in the Merkle tree. + """ + + @no_type_check + async def run( + self, session: aiohttp.ClientSession, api_url: str, + ): + all_nodes = [node for node in self.source_tree.iter_tree()] + all_swhids = [str(node.swhid()) for node in all_nodes] + swhids_res = await swhids_discovery(all_swhids, session, api_url) + for node in all_nodes: + self.data[node.swhid()]["known"] = swhids_res[str(node.swhid())]["known"] diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -13,7 +13,7 @@ from .data import MerkleNodeInfo from .output import Output -from .policy import DirectoryPriority, FilePriority, LazyBFS +from .policy import QUERY_LIMIT, DirectoryPriority, FilePriority, LazyBFS, QueryAll async def run(config: Dict[str, Any], policy) -> None: @@ -35,8 +35,18 @@ await policy.run(session, api_url) +def source_size(source_tree: Directory): + return len([n for n in source_tree.iter_tree(dedup=False)]) + + def get_policy_obj(source_tree: Directory, nodes_data: MerkleNodeInfo, policy: str): - if policy == "bfs": + if policy == "auto": + return ( + QueryAll(source_tree, nodes_data) + if source_size(source_tree) <= QUERY_LIMIT + else LazyBFS(source_tree, nodes_data) + ) + elif policy == "bfs": return 
LazyBFS(source_tree, nodes_data) elif policy == "filepriority": return FilePriority(source_tree, nodes_data) diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -14,6 +14,7 @@ from swh.model.cli import model_of_dir from swh.scanner.data import MerkleNodeInfo +from swh.scanner.policy import QUERY_LIMIT from .data import present_swhids from .flask_api import create_app @@ -72,6 +73,21 @@ return model_of_dir(str(test_sample_folder).encode()) +@pytest.fixture(scope="function") +def big_source_tree(tmp_path): + """Generate a model.from_disk.Directory from a "big" temporary directory + (more than 1000 nodes) + """ + dir_ = tmp_path / "big-directory" + dir_.mkdir() + for i in range(0, QUERY_LIMIT + 1): + file_ = dir_ / f"file_{i}.org" + file_.touch() + dir_obj = model_of_dir(str(dir_).encode()) + assert len(dir_obj) > QUERY_LIMIT + return dir_obj + + @pytest.fixture(scope="function") def source_tree_policy(test_sample_folder_policy): """Generate a model.from_disk.Directory object from the test sample diff --git a/swh/scanner/tests/test_backend.py b/swh/scanner/tests/test_backend.py --- a/swh/scanner/tests/test_backend.py +++ b/swh/scanner/tests/test_backend.py @@ -3,8 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.scanner.backend import LIMIT, create_app +from swh.scanner.backend import create_app from swh.scanner.db import Db +from swh.scanner.policy import QUERY_LIMIT from .data import present_swhids @@ -13,7 +14,7 @@ tmp_dbfile = tmp_path / "tmp_db.sqlite" db = Db(tmp_dbfile) cur = db.conn.cursor() - db.create_from(test_swhids_sample, LIMIT, cur) + db.create_from(test_swhids_sample, QUERY_LIMIT, cur) app = create_app(db) @@ -31,7 +32,7 @@ db = Db(tmp_dbfile) cur = db.conn.cursor() - db.create_from(test_swhids_sample, LIMIT, cur) + db.create_from(test_swhids_sample, QUERY_LIMIT, cur) 
app = create_app(db) @@ -52,7 +53,7 @@ swhids = [swhid for n in range(1001)] db = Db(tmp_dbfile) cur = db.conn.cursor() - db.create_from(test_swhids_sample, LIMIT, cur) + db.create_from(test_swhids_sample, QUERY_LIMIT, cur) app = create_app(db) diff --git a/swh/scanner/tests/test_db.py b/swh/scanner/tests/test_db.py --- a/swh/scanner/tests/test_db.py +++ b/swh/scanner/tests/test_db.py @@ -4,18 +4,17 @@ # See top-level LICENSE file for more information from swh.scanner.db import Db +from swh.scanner.policy import QUERY_LIMIT from .data import present_swhids -CHUNK_SIZE = 1000 - def test_db_create_from(tmp_path, test_swhids_sample): tmp_dbfile = tmp_path / "tmp_db.sqlite" db = Db(tmp_dbfile) cur = db.conn.cursor() - db.create_from(test_swhids_sample, CHUNK_SIZE, cur) + db.create_from(test_swhids_sample, QUERY_LIMIT, cur) for swhid in present_swhids: cur = db.conn.cursor() @@ -30,7 +29,7 @@ db = Db(tmp_dbfile) cur = db.conn.cursor() - db.create_from(test_swhids_sample, CHUNK_SIZE, cur) + db.create_from(test_swhids_sample, QUERY_LIMIT, cur) for swhid in swhids: cur = db.conn.cursor() diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py --- a/swh/scanner/tests/test_scanner.py +++ b/swh/scanner/tests/test_scanner.py @@ -7,8 +7,8 @@ import pytest from swh.scanner.data import MerkleNodeInfo -from swh.scanner.policy import DirectoryPriority, FilePriority, LazyBFS -from swh.scanner.scanner import run +from swh.scanner.policy import DirectoryPriority, FilePriority, LazyBFS, QueryAll +from swh.scanner.scanner import get_policy_obj, run from .data import unknown_swhids @@ -18,6 +18,16 @@ assert not app.debug +def test_get_policy_obj_auto(source_tree, nodes_data): + assert isinstance(get_policy_obj(source_tree, nodes_data, "auto"), QueryAll) + + +def test_get_policy_obj_bfs(big_source_tree, nodes_data): + # check that the policy object is the LazyBFS if the source tree contains more than + # 1000 nodes + assert 
isinstance(get_policy_obj(big_source_tree, nodes_data, "auto"), LazyBFS) + + def test_scanner_result_bfs(live_server, event_loop, source_tree): api_url = url_for("index", _external=True) config = {"web-api": {"url": api_url, "auth-token": None}} @@ -58,3 +68,17 @@ assert nodes_data[node.swhid()]["known"] is False else: assert nodes_data[node.swhid()]["known"] is True + + +def test_scanner_result_query_all(live_server, event_loop, source_tree): + api_url = url_for("index", _external=True) + config = {"web-api": {"url": api_url, "auth-token": None}} + + nodes_data = MerkleNodeInfo() + policy = QueryAll(source_tree, nodes_data) + event_loop.run_until_complete(run(config, policy)) + for node in source_tree.iter_tree(): + if str(node.swhid()) in unknown_swhids: + assert nodes_data[node.swhid()]["known"] is False + else: + assert nodes_data[node.swhid()]["known"] is True