diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py index bf69213..4547abd 100644 --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -1,241 +1,249 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import os from typing import Any, Dict, Optional import click from importlib_metadata import version import yaml from swh.core import config from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group from .exceptions import DBError # Config for the "serve" option BACKEND_DEFAULT_PORT = 5011 # All generic config code should reside in swh.core.config CONFIG_ENVVAR = "SWH_CONFIG_FILE" DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml") DEFAULT_CONFIG: Dict[str, Any] = { "web-api": { "url": "https://archive.softwareheritage.org/api/1/", "auth-token": None, } } CONFIG_FILE_HELP = f"""Configuration file: \b The CLI option or the environment variable will fail if invalid. CLI option is checked first. Then, environment variable {CONFIG_ENVVAR} is checked. Then, if cannot load the default path, a set of default values are used. Default config path is {DEFAULT_CONFIG_PATH}. Default config values are: \b {yaml.dump(DEFAULT_CONFIG)}""" SCANNER_HELP = f"""Software Heritage Scanner tools. 
{CONFIG_FILE_HELP}""" def setup_config(ctx, api_url): config = ctx.obj["config"] if api_url: if not api_url.endswith("/"): api_url += "/" config["web-api"]["url"] = api_url return config @swh_cli_group.group( name="scanner", context_settings=CONTEXT_SETTINGS, help=SCANNER_HELP, ) @click.option( "-C", "--config-file", default=None, type=click.Path(exists=False, dir_okay=False, path_type=str), help="""YAML configuration file""", ) @click.version_option( version=version("swh.scanner"), prog_name="swh.scanner", ) @click.pass_context def scanner(ctx, config_file: Optional[str]): env_config_path = os.environ.get(CONFIG_ENVVAR) # read_raw_config do not fail if file does not exist, so check it beforehand # while enforcing loading priority if config_file: if not config.config_exists(config_file): raise click.BadParameter( f"File '{config_file}' cannot be opened.", param_hint="--config-file" ) elif env_config_path: if not config.config_exists(env_config_path): raise click.BadParameter( f"File '{env_config_path}' cannot be opened.", param_hint=CONFIG_ENVVAR ) config_file = env_config_path elif config.config_exists(DEFAULT_CONFIG_PATH): config_file = DEFAULT_CONFIG_PATH conf = DEFAULT_CONFIG if config_file is not None: conf = config.read_raw_config(config.config_basepath(config_file)) conf = config.merge_configs(DEFAULT_CONFIG, conf) ctx.ensure_object(dict) ctx.obj["config"] = conf @scanner.command(name="scan") @click.argument("root_path", required=True, type=click.Path(exists=True)) @click.option( "-u", "--api-url", default=None, metavar="API_URL", show_default=True, help="URL for the api request", ) @click.option( "--exclude", "-x", "patterns", metavar="PATTERN", multiple=True, help="Exclude directories using glob patterns \ (e.g., ``*.git`` to exclude all .git directories)", ) @click.option( "-f", "--output-format", "out_fmt", default="text", show_default=True, type=click.Choice(["text", "json", "ndjson", "sunburst"], case_sensitive=False), help="The output format", ) 
@click.option( "-i", "--interactive", is_flag=True, help="Show the result in a dashboard" ) +@click.option( + "-p", + "--policy", + default="bfs", + show_default=True, + type=click.Choice(["bfs", "filepriority", "dirpriority"]), + help="The scan policy.", +) @click.pass_context -def scan(ctx, root_path, api_url, patterns, out_fmt, interactive): +def scan(ctx, root_path, api_url, patterns, out_fmt, interactive, policy): """Scan a source code project to discover files and directories already present in the archive""" import swh.scanner.scanner as scanner config = setup_config(ctx, api_url) - scanner.scan(config, root_path, patterns, out_fmt, interactive) + scanner.scan(config, root_path, patterns, out_fmt, interactive, policy) @scanner.group("db", help="Manage local knowledge base for swh-scanner") @click.pass_context def db(ctx): pass @db.command("import") @click.option( "-i", "--input", "input_file", metavar="INPUT_FILE", required=True, type=click.File("r"), help="A file containing SWHIDs", ) @click.option( "-o", "--output", "output_file_db", metavar="OUTPUT_DB_FILE", required=True, show_default=True, help="The name of the generated sqlite database", ) @click.option( "-s", "--chunk-size", "chunk_size", default="10000", metavar="SIZE", show_default=True, type=int, help="The chunk size ", ) @click.pass_context def import_(ctx, chunk_size, input_file, output_file_db): """Create SQLite database of known SWHIDs from a textual list of SWHIDs""" from .db import Db db = Db(output_file_db) cur = db.conn.cursor() try: db.create_from(input_file, chunk_size, cur) db.close() except DBError as e: ctx.fail("Failed to import SWHIDs into database: {0}".format(e)) @db.command("serve") @click.option( "-h", "--host", metavar="HOST", default="127.0.0.1", show_default=True, help="The host of the API server", ) @click.option( "-p", "--port", metavar="PORT", default=f"{BACKEND_DEFAULT_PORT}", show_default=True, help="The port of the API server", ) @click.option( "-f", "--db-file", 
"db_file", metavar="DB_FILE", default="SWHID_DB.sqlite", show_default=True, type=click.Path(exists=True), help="An sqlite database file (it can be generated with: 'swh scanner db import')", ) @click.pass_context def serve(ctx, host, port, db_file): """Start an API service using the sqlite database generated with the "db import" option.""" import swh.scanner.backend as backend from .db import Db db = Db(db_file) backend.run(host, port, db) db.close() def main(): return scanner(auto_envvar_prefix="SWH_SCANNER") if __name__ == "__main__": main() diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py new file mode 100644 index 0000000..a107200 --- /dev/null +++ b/swh/scanner/policy.py @@ -0,0 +1,250 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import abc +import asyncio +import itertools +from typing import Dict, List, no_type_check + +import aiohttp + +from swh.model.from_disk import Directory +from swh.model.identifiers import CONTENT, DIRECTORY + +from .data import MerkleNodeInfo +from .exceptions import error_response + + +async def swhids_discovery( + swhids: List[str], session: aiohttp.ClientSession, api_url: str, +) -> Dict[str, Dict[str, bool]]: + """API Request to get information about the SoftWare Heritage persistent + IDentifiers (SWHIDs) given in input. 
+ + Args: + swhids: a list of SWHIDS + api_url: url for the API request + + Returns: + A dictionary with: + + key: + SWHID searched + value: + value['known'] = True if the SWHID is found + value['known'] = False if the SWHID is not found + + """ + endpoint = api_url + "known/" + chunk_size = 1000 + requests = [] + + def get_chunk(swhids): + for i in range(0, len(swhids), chunk_size): + yield swhids[i : i + chunk_size] + + async def make_request(swhids): + async with session.post(endpoint, json=swhids) as resp: + if resp.status != 200: + error_response(resp.reason, resp.status, endpoint) + + return await resp.json() + + if len(swhids) > chunk_size: + for swhids_chunk in get_chunk(swhids): + requests.append(asyncio.create_task(make_request(swhids_chunk))) + + res = await asyncio.gather(*requests) + # concatenate list of dictionaries + return dict(itertools.chain.from_iterable(e.items() for e in res)) + else: + return await make_request(swhids) + + +class Policy(metaclass=abc.ABCMeta): + + data: MerkleNodeInfo + """information about contents and directories of the merkle tree""" + + source_tree: Directory + """representation of a source code project directory in the merkle tree""" + + def __init__(self, source_tree: Directory, data: MerkleNodeInfo): + self.data = data + self.source_tree = source_tree + for node in source_tree.iter_tree(): + self.data[node.swhid()] = {"known": None} # type: ignore + + @abc.abstractmethod + async def run( + self, session: aiohttp.ClientSession, api_url: str, + ): + """Scan a source code project""" + raise NotImplementedError("Must implement run method") + + +class LazyBFS(Policy): + async def run( + self, session: aiohttp.ClientSession, api_url: str, + ): + queue = [] + queue.append(self.source_tree) + + while queue: + swhids = [str(node.swhid()) for node in queue] + swhids_res = await swhids_discovery(swhids, session, api_url) + for node in queue.copy(): + queue.remove(node) + self.data[node.swhid()]["known"] = 
swhids_res[str(node.swhid())][ + "known" + ] + if node.object_type == DIRECTORY: + if not self.data[node.swhid()]["known"]: + children = [n[1] for n in list(node.items())] + queue.extend(children) + else: + for sub_node in node.iter_tree(): + if sub_node == node: + continue + self.data[sub_node.swhid()]["known"] = True # type: ignore + + +class FilePriority(Policy): + @no_type_check + async def run( + self, session: aiohttp.ClientSession, api_url: str, + ): + # get all the files + all_contents = list( + filter( + lambda node: node.object_type == CONTENT, self.source_tree.iter_tree() + ) + ) + all_contents.reverse() # check deepest node first + + # query the backend to get all file contents status + cnt_swhids = [str(node.swhid()) for node in all_contents] + cnt_status_res = await swhids_discovery(cnt_swhids, session, api_url) + # set all the file contents status + for cnt in all_contents: + self.data[cnt.swhid()]["known"] = cnt_status_res[str(cnt.swhid())]["known"] + # set all the upstream directories of unknown file contents to unknown + if not self.data[cnt.swhid()]["known"]: + parent = cnt.parents[0] + while parent: + self.data[parent.swhid()]["known"] = False + parent = parent.parents[0] if parent.parents else None + + # get all unset directories and check their status + # (update children directories accordingly) + unset_dirs = list( + filter( + lambda node: node.object_type == DIRECTORY + and self.data[node.swhid()]["known"] is None, + self.source_tree.iter_tree(), + ) + ) + + # check unset directories + for dir_ in unset_dirs: + if self.data[dir_.swhid()]["known"] is None: + # update directory status + dir_status = await swhids_discovery( + [str(dir_.swhid())], session, api_url + ) + dir_known = dir_status[str(dir_.swhid())]["known"] + self.data[dir_.swhid()]["known"] = dir_known + if dir_known: + sub_dirs = list( + filter( + lambda n: n.object_type == DIRECTORY + and self.data[n.swhid()]["known"] is None, + dir_.iter_tree(), + ) + ) + for node in sub_dirs: 
+ self.data[node.swhid()]["known"] = True + + +class DirectoryPriority(Policy): + @no_type_check + async def run( + self, session: aiohttp.ClientSession, api_url: str, + ): + # get all directory contents that have at least one file content + unknown_dirs = list( + filter( + lambda dir_: dir_.object_type == DIRECTORY and self.has_contents(dir_), + self.source_tree.iter_tree(), + ) + ) + unknown_dirs.reverse() # check deepest node first + + for dir_ in unknown_dirs: + if self.data[dir_.swhid()]["known"] is None: + dir_status = await swhids_discovery( + [str(dir_.swhid())], session, api_url + ) + dir_known = dir_status[str(dir_.swhid())]["known"] + self.data[dir_.swhid()]["known"] = dir_known + # set all the downstream file contents to known + if dir_known: + for cnt in self.get_contents(dir_): + self.data[cnt.swhid()]["known"] = True + # otherwise set all the upstream directories to unknown + else: + parent = dir_.parents[0] + while parent: + self.data[parent.swhid()]["known"] = False + parent = parent.parents[0] if parent.parents else None + + # get remaining directories that have no file contents + empty_dirs = list( + filter( + lambda n: n.object_type == DIRECTORY + and not self.has_contents(n) + and self.data[n.swhid()]["known"] is None, + self.source_tree.iter_tree(), + ) + ) + empty_dirs_swhids = [str(n.swhid()) for n in empty_dirs] + empty_dir_status = await swhids_discovery(empty_dirs_swhids, session, api_url) + + # update status of directories that have no file contents + for dir_ in empty_dirs: + self.data[dir_.swhid()]["known"] = empty_dir_status[str(dir_.swhid())][ + "known" + ] + + # check unknown file contents + unknown_cnts = list( + filter( + lambda n: n.object_type == CONTENT + and self.data[n.swhid()]["known"] is None, + self.source_tree.iter_tree(), + ) + ) + unknown_cnts_swhids = [str(n.swhid()) for n in unknown_cnts] + unknown_cnts_status = await swhids_discovery( + unknown_cnts_swhids, session, api_url + ) + + for cnt in unknown_cnts: + 
self.data[cnt.swhid()]["known"] = unknown_cnts_status[str(cnt.swhid())][ + "known" + ] + + def has_contents(self, directory: Directory): + """Check if the directory given in input has contents""" + for entry in directory.entries: + if entry["type"] == "file": + return True + return False + + def get_contents(self, dir_: Directory): + """Get all the contents of a given directory""" + for _, node in list(dir_.items()): + if node.object_type == CONTENT: + yield node diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py index 6b1e365..2e77b5b 100644 --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -1,140 +1,71 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import asyncio -import itertools -from typing import Any, Dict, Iterable, List +from typing import Any, Dict, Iterable import aiohttp from swh.model.cli import model_of_dir from swh.model.from_disk import Directory -from swh.model.identifiers import DIRECTORY from .data import MerkleNodeInfo -from .exceptions import error_response from .output import Output +from .policy import DirectoryPriority, FilePriority, LazyBFS -async def lazy_bfs( - source_tree: Directory, - data: MerkleNodeInfo, - session: aiohttp.ClientSession, - api_url: str, -): - - queue = [] - queue.append(source_tree) - - while queue: - swhids = [str(node.swhid()) for node in queue] - swhids_res = await swhids_discovery(swhids, session, api_url) - for node in queue.copy(): - queue.remove(node) - data[node.swhid()]["known"] = swhids_res[str(node.swhid())]["known"] - if node.object_type == DIRECTORY: - if not data[node.swhid()]["known"]: - children = [n[1] for n in list(node.items())] - queue.extend(children) - else: - for sub_node in node.iter_tree(dedup=False): - if sub_node == node: - continue - data[sub_node.swhid()]["known"] = 
True # type: ignore - - -async def swhids_discovery( - swhids: List[str], session: aiohttp.ClientSession, api_url: str, -) -> Dict[str, Dict[str, bool]]: - """API Request to get information about the SoftWare Heritage persistent - IDentifiers (SWHIDs) given in input. - - Args: - swhids: a list of SWHIDS - api_url: url for the API request - - Returns: - A dictionary with: - - key: - SWHID searched - value: - value['known'] = True if the SWHID is found - value['known'] = False if the SWHID is not found - - """ - endpoint = api_url + "known/" - chunk_size = 1000 - requests = [] - - def get_chunk(swhids): - for i in range(0, len(swhids), chunk_size): - yield swhids[i : i + chunk_size] - - async def make_request(swhids): - async with session.post(endpoint, json=swhids) as resp: - if resp.status != 200: - error_response(resp.reason, resp.status, endpoint) - - return await resp.json() - - if len(swhids) > chunk_size: - for swhids_chunk in get_chunk(swhids): - requests.append(asyncio.create_task(make_request(swhids_chunk))) - - res = await asyncio.gather(*requests) - # concatenate list of dictionaries - return dict(itertools.chain.from_iterable(e.items() for e in res)) - else: - return await make_request(swhids) - - -async def run( - config: Dict[str, Any], source_tree: Directory, nodes_data: MerkleNodeInfo -) -> None: - """Start scanning from the given root. - - It fills the source tree with the path discovered. +async def run(config: Dict[str, Any], policy) -> None: + """Scan a given source code according to the policy given in input. 
Args: root: the root path to scan api_url: url for the API request """ api_url = config["web-api"]["url"] if config["web-api"]["auth-token"]: headers = {"Authorization": f"Bearer {config['web-api']['auth-token']}"} else: headers = {} - for node in source_tree.iter_tree(): - nodes_data[node.swhid()] = {} # type: ignore - async with aiohttp.ClientSession(headers=headers, trust_env=True) as session: - await lazy_bfs(source_tree, nodes_data, session, api_url) + await policy.run(session, api_url) + + +def get_policy_obj(source_tree: Directory, nodes_data: MerkleNodeInfo, policy: str): + if policy == "bfs": + return LazyBFS(source_tree, nodes_data) + elif policy == "filepriority": + return FilePriority(source_tree, nodes_data) + elif policy == "dirpriority": + return DirectoryPriority(source_tree, nodes_data) + else: + raise Exception(f"policy '{policy}' not found") def scan( config: Dict[str, Any], root_path: str, exclude_patterns: Iterable[str], out_fmt: str, interactive: bool, + policy: str, ): """Scan a source code project to discover files and directories already present in the archive""" converted_patterns = [pattern.encode() for pattern in exclude_patterns] source_tree = model_of_dir(root_path.encode(), converted_patterns) nodes_data = MerkleNodeInfo() + policy = get_policy_obj(source_tree, nodes_data, policy) loop = asyncio.get_event_loop() - loop.run_until_complete(run(config, source_tree, nodes_data)) + loop.run_until_complete(run(config, policy)) out = Output(root_path, nodes_data, source_tree) if interactive: out.show("interactive") else: out.show(out_fmt) diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py index 7b91f16..3e5b56d 100644 --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -1,109 +1,134 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level 
LICENSE file for more information import asyncio import os from pathlib import Path import shutil import aiohttp from aioresponses import aioresponses # type: ignore import pytest from swh.model.cli import model_of_dir from swh.scanner.data import MerkleNodeInfo from .data import present_swhids from .flask_api import create_app @pytest.fixture def mock_aioresponse(): with aioresponses() as m: yield m @pytest.fixture def event_loop(): """Fixture that generate an asyncio event loop.""" loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) yield loop loop.close() @pytest.fixture async def aiosession(): """Fixture that generate an aiohttp Client Session.""" session = aiohttp.ClientSession() yield session session.detach() @pytest.fixture(scope="function") def test_sample_folder(datadir, tmp_path): """Location of the "data" folder""" archive_path = Path(os.path.join(datadir, "sample-folder.tgz")) assert archive_path.exists() shutil.unpack_archive(archive_path, extract_dir=tmp_path) test_sample_folder = Path(os.path.join(tmp_path, "sample-folder")) assert test_sample_folder.exists() return test_sample_folder +@pytest.fixture(scope="function") +def test_sample_folder_policy(datadir, tmp_path): + """Location of the sample source code project to test the scanner policies""" + archive_path = Path(os.path.join(datadir, "sample-folder-policy.tgz")) + assert archive_path.exists() + shutil.unpack_archive(archive_path, extract_dir=tmp_path) + test_sample_folder = Path(os.path.join(tmp_path, "sample-folder-policy")) + assert test_sample_folder.exists() + return test_sample_folder + + @pytest.fixture(scope="function") def source_tree(test_sample_folder): """Generate a model.from_disk.Directory object from the test sample folder """ return model_of_dir(str(test_sample_folder).encode()) +@pytest.fixture(scope="function") +def source_tree_policy(test_sample_folder_policy): + """Generate a model.from_disk.Directory object from the test sample + folder + """ + return 
model_of_dir(str(test_sample_folder_policy).encode()) + + @pytest.fixture(scope="function") def source_tree_dirs(source_tree): """Returns a list of all directories contained inside the test sample folder """ root = source_tree.data["path"] return list( map( lambda n: Path(n.data["path"].decode()).relative_to(Path(root.decode())), filter( lambda n: n.object_type == "directory" and not n.data["path"] == source_tree.data["path"], source_tree.iter_tree(dedup=False), ), ) ) @pytest.fixture(scope="function") def nodes_data(source_tree): """mock known status of file/dirs in test_sample_folder""" nodes_data = MerkleNodeInfo() for node in source_tree.iter_tree(): nodes_data[node.swhid()] = {"known": True} return nodes_data @pytest.fixture def test_swhids_sample(tmp_path): """Create and return the opened "swhids_sample" file, filled with present swhids present in data.py """ test_swhids_sample = Path(os.path.join(tmp_path, "swhids_sample.txt")) with open(test_swhids_sample, "w") as f: f.write("\n".join(swhid for swhid in present_swhids)) assert test_swhids_sample.exists() return open(test_swhids_sample, "r") @pytest.fixture(scope="session") -def app(): +def tmp_requests(tmpdir_factory): + requests_file = tmpdir_factory.mktemp("data").join("requests.json") + return requests_file + + +@pytest.fixture(scope="session") +def app(tmp_requests): """Flask backend API (used by live_server).""" - app = create_app() + app = create_app(tmp_requests) return app diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py index dd805d3..42b8e21 100644 --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -1,26 +1,27 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information correct_api_response = { "swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112": {"known": False}, 
"swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140": {"known": True}, "swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904": {"known": True}, } present_swhids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] # these SWHIDs are considered unknown by the fake backend (scanner.test.flask_api) unknown_swhids = [ - "swh:1:dir:0a7b61ef5780b03aa274d11069564980246445ce", # root directory + "swh:1:dir:fe8cd7076bef324eb8865f818ef08617879022ce", # root sample-folder-policy + "swh:1:dir:0a7b61ef5780b03aa274d11069564980246445ce", # root sample-folder "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119", # toexclude/example.txt "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326" diff --git a/swh/scanner/tests/data/sample-folder-policy.tgz b/swh/scanner/tests/data/sample-folder-policy.tgz new file mode 100644 index 0000000..be94a92 Binary files /dev/null and b/swh/scanner/tests/data/sample-folder-policy.tgz differ diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py index 8ab3196..3e8ec92 100644 --- a/swh/scanner/tests/flask_api.py +++ b/swh/scanner/tests/flask_api.py @@ -1,36 +1,39 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from flask import Flask, request from swh.scanner.exceptions import LargePayloadExc from .data import unknown_swhids -def create_app(): +def create_app(tmp_requests): app = Flask(__name__) @app.route("/") def index(): return "SWH scanner API" @app.route("/known/", methods=["POST"]) def known(): swhids = request.get_json() + with 
open(tmp_requests, "a") as f: + for swhid in swhids: + f.write(swhid + "\n") if len(swhids) > 900: raise LargePayloadExc( "The maximum number of SWHIDs this endpoint can receive is 900" ) res = {swhid: {"known": False} for swhid in swhids} for swhid in swhids: if swhid not in unknown_swhids: res[swhid]["known"] = True return res return app diff --git a/swh/scanner/tests/test_policy.py b/swh/scanner/tests/test_policy.py new file mode 100644 index 0000000..51a3d93 --- /dev/null +++ b/swh/scanner/tests/test_policy.py @@ -0,0 +1,145 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json + +from flask import url_for +import pytest + +from swh.model.identifiers import CoreSWHID, ObjectType +from swh.scanner.data import MerkleNodeInfo +from swh.scanner.exceptions import APIError +from swh.scanner.policy import ( + DirectoryPriority, + FilePriority, + LazyBFS, + swhids_discovery, +) + +from .data import correct_api_response + +aio_url = "http://example.org/api/known/" + + +def test_scanner_correct_api_request(mock_aioresponse, event_loop, aiosession): + mock_aioresponse.post( + aio_url, + status=200, + content_type="application/json", + body=json.dumps(correct_api_response), + ) + + actual_result = event_loop.run_until_complete( + swhids_discovery([], aiosession, "http://example.org/api/") + ) + + assert correct_api_response == actual_result + + +def test_scanner_raise_apierror(mock_aioresponse, event_loop, aiosession): + mock_aioresponse.post(aio_url, content_type="application/json", status=413) + + with pytest.raises(APIError): + event_loop.run_until_complete( + swhids_discovery([], aiosession, "http://example.org/api/") + ) + + +def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_server): + + api_url = url_for("index", _external=True) + 
request = [ + "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901) + ] # /known/ is limited at 900 + + with pytest.raises(APIError): + event_loop.run_until_complete(swhids_discovery(request, aiosession, api_url)) + + +def test_scanner_directory_priority_has_contents(source_tree): + nodes_data = MerkleNodeInfo() + policy = DirectoryPriority(source_tree, nodes_data) + assert policy.has_contents(source_tree[b"/bar/barfoo"]) + + +def get_backend_swhids_order(tmp_requests): + with open(tmp_requests, "r") as f: + backend_swhids_order = f.readlines() + + return [x.strip() for x in backend_swhids_order] + + +def test_lazybfs_policy( + live_server, aiosession, event_loop, source_tree_policy, tmp_requests +): + open(tmp_requests, "w").close() + api_url = url_for("index", _external=True) + + nodes_data = MerkleNodeInfo() + policy = LazyBFS(source_tree_policy, nodes_data) + event_loop.run_until_complete(policy.run(aiosession, api_url)) + + backend_swhids_requests = get_backend_swhids_order(tmp_requests) + + assert ( + backend_swhids_requests[0] + == "swh:1:dir:fe8cd7076bef324eb8865f818ef08617879022ce" + ) + + # the second request must contain 3 SWHIDs related to directories and one content + dir_count, cnt_count = 0, 0 + for swhid in backend_swhids_requests[1:5]: + if CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY: + dir_count += 1 + else: + cnt_count += 1 + + assert dir_count == 3 + assert cnt_count == 1 + + # the last swhid must be a content related to the unknown directory + # "sample-folder-policy/toexclude" + assert ( + backend_swhids_requests[5] + == "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119" + ) + + +def test_directory_priority_policy( + live_server, aiosession, event_loop, source_tree_policy, tmp_requests +): + open(tmp_requests, "w").close() + api_url = url_for("index", _external=True) + + nodes_data = MerkleNodeInfo() + policy = DirectoryPriority(source_tree_policy, nodes_data) + 
event_loop.run_until_complete(policy.run(aiosession, api_url)) + + backend_swhids_requests = get_backend_swhids_order(tmp_requests) + + for swhid in backend_swhids_requests[0:4]: + assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY + + for swhid in backend_swhids_requests[5:]: + assert CoreSWHID.from_string(swhid).object_type == ObjectType.CONTENT + + +def test_file_priority_policy( + live_server, aiosession, event_loop, source_tree_policy, tmp_requests +): + open(tmp_requests, "w").close() + api_url = url_for("index", _external=True) + + nodes_data = MerkleNodeInfo() + policy = FilePriority(source_tree_policy, nodes_data) + event_loop.run_until_complete(policy.run(aiosession, api_url)) + + backend_swhids_requests = get_backend_swhids_order(tmp_requests) + + for swhid in backend_swhids_requests[0:4]: + assert CoreSWHID.from_string(swhid).object_type == ObjectType.CONTENT + + for swhid in backend_swhids_requests[5:]: + assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py index 8f0beeb..9e5e59b 100644 --- a/swh/scanner/tests/test_scanner.py +++ b/swh/scanner/tests/test_scanner.py @@ -1,70 +1,60 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json - from flask import url_for import pytest from swh.scanner.data import MerkleNodeInfo -from swh.scanner.exceptions import APIError -from swh.scanner.scanner import run, swhids_discovery - -from .data import correct_api_response, unknown_swhids - -aio_url = "http://example.org/api/known/" - +from swh.scanner.policy import DirectoryPriority, FilePriority, LazyBFS +from swh.scanner.scanner import run -def 
test_scanner_correct_api_request(mock_aioresponse, event_loop, aiosession): - mock_aioresponse.post( - aio_url, - status=200, - content_type="application/json", - body=json.dumps(correct_api_response), - ) +from .data import unknown_swhids - actual_result = event_loop.run_until_complete( - swhids_discovery([], aiosession, "http://example.org/api/") - ) - - assert correct_api_response == actual_result +@pytest.mark.options(debug=False) +def test_app(app): + assert not app.debug -def test_scanner_raise_apierror(mock_aioresponse, event_loop, aiosession): - mock_aioresponse.post(aio_url, content_type="application/json", status=413) - with pytest.raises(APIError): - event_loop.run_until_complete( - swhids_discovery([], aiosession, "http://example.org/api/") - ) +def test_scanner_result_bfs(live_server, event_loop, source_tree): + api_url = url_for("index", _external=True) + config = {"web-api": {"url": api_url, "auth-token": None}} + nodes_data = MerkleNodeInfo() + policy = LazyBFS(source_tree, nodes_data) + event_loop.run_until_complete(run(config, policy)) + for node in source_tree.iter_tree(): + if str(node.swhid()) in unknown_swhids: + assert nodes_data[node.swhid()]["known"] is False + else: + assert nodes_data[node.swhid()]["known"] is True -def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_server): +def test_scanner_result_file_priority(live_server, event_loop, source_tree): api_url = url_for("index", _external=True) - request = [ - "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901) - ] # /known/ is limited at 900 - - with pytest.raises(APIError): - event_loop.run_until_complete(swhids_discovery(request, aiosession, api_url)) - + config = {"web-api": {"url": api_url, "auth-token": None}} -@pytest.mark.options(debug=False) -def test_app(app): - assert not app.debug + nodes_data = MerkleNodeInfo() + policy = FilePriority(source_tree, nodes_data) + event_loop.run_until_complete(run(config, policy)) + for node in 
source_tree.iter_tree(): + if str(node.swhid()) in unknown_swhids: + assert nodes_data[node.swhid()]["known"] is False + else: + assert nodes_data[node.swhid()]["known"] is True -def test_scanner_result(live_server, event_loop, source_tree): +def test_scanner_result_directory_priority(live_server, event_loop, source_tree): api_url = url_for("index", _external=True) config = {"web-api": {"url": api_url, "auth-token": None}} nodes_data = MerkleNodeInfo() - event_loop.run_until_complete(run(config, source_tree, nodes_data)) + policy = DirectoryPriority(source_tree, nodes_data) + event_loop.run_until_complete(run(config, policy)) for node in source_tree.iter_tree(): if str(node.swhid()) in unknown_swhids: assert nodes_data[node.swhid()]["known"] is False else: assert nodes_data[node.swhid()]["known"] is True