diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
index de262df..5f00d48 100644
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -1,267 +1,285 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
 import os
 from typing import Any, Dict, Optional
 
 import click
 from importlib_metadata import version
 import yaml
 
 from swh.core import config
 from swh.core.cli import CONTEXT_SETTINGS
 from swh.core.cli import swh as swh_cli_group
 
 from .exceptions import DBError
 
 # Config for the "serve" option
 BACKEND_DEFAULT_PORT = 5011
 
 # All generic config code should reside in swh.core.config
 CONFIG_ENVVAR = "SWH_CONFIG_FILE"
 DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml")
 
 DEFAULT_CONFIG: Dict[str, Any] = {
     "web-api": {
         "url": "https://archive.softwareheritage.org/api/1/",
         "auth-token": None,
     }
 }
 
 CONFIG_FILE_HELP = f"""Configuration file:
 
 \b
 The CLI option or the environment variable will fail if invalid.
 The CLI option is checked first.
 Then, the environment variable {CONFIG_ENVVAR} is checked.
 Then, if the default path cannot be loaded, a set of default values is used.
 Default config path is {DEFAULT_CONFIG_PATH}.
 Default config values are:
 
 \b
 {yaml.dump(DEFAULT_CONFIG)}"""
 SCANNER_HELP = f"""Software Heritage Scanner tools.
 
 {CONFIG_FILE_HELP}"""
 
 
 def setup_config(ctx, api_url):
     config = ctx.obj["config"]
     if api_url:
         if not api_url.endswith("/"):
             api_url += "/"
         config["web-api"]["url"] = api_url
 
     return config
 
 
 @swh_cli_group.group(
     name="scanner", context_settings=CONTEXT_SETTINGS, help=SCANNER_HELP,
 )
 @click.option(
     "-C",
     "--config-file",
     default=None,
     type=click.Path(exists=False, dir_okay=False, path_type=str),
     help="""YAML configuration file""",
 )
 @click.version_option(
     version=version("swh.scanner"), prog_name="swh.scanner",
 )
 @click.pass_context
 def scanner(ctx, config_file: Optional[str]):
     env_config_path = os.environ.get(CONFIG_ENVVAR)
 
     # read_raw_config does not fail if the file does not exist, so check it
     # beforehand while enforcing loading priority
     if config_file:
         if not config.config_exists(config_file):
             raise click.BadParameter(
                 f"File '{config_file}' cannot be opened.", param_hint="--config-file"
             )
     elif env_config_path:
         if not config.config_exists(env_config_path):
             raise click.BadParameter(
                 f"File '{env_config_path}' cannot be opened.", param_hint=CONFIG_ENVVAR
             )
         config_file = env_config_path
     elif config.config_exists(DEFAULT_CONFIG_PATH):
         config_file = DEFAULT_CONFIG_PATH
 
     conf = DEFAULT_CONFIG
     if config_file is not None:
         conf = config.read_raw_config(config.config_basepath(config_file))
         conf = config.merge_configs(DEFAULT_CONFIG, conf)
 
     ctx.ensure_object(dict)
     ctx.obj["config"] = conf
 
 
 @scanner.command(name="scan")
 @click.argument("root_path", required=True, type=click.Path(exists=True))
 @click.option(
     "-u",
     "--api-url",
     default=None,
     metavar="API_URL",
     show_default=True,
     help="URL for the API request",
 )
 @click.option(
     "--exclude",
     "-x",
     "patterns",
     metavar="PATTERN",
     multiple=True,
     help="Exclude directories using glob patterns \
     (e.g., ``*.git`` to exclude all .git directories)",
 )
 @click.option(
     "-f",
     "--output-format",
     "out_fmt",
     default="text",
     show_default=True,
     type=click.Choice(["text", "json", "ndjson", "sunburst"], case_sensitive=False),
     help="The output format",
 )
 @click.option(
     "-i", "--interactive", is_flag=True, help="Show the result in a dashboard"
 )
 @click.option(
     "-p",
     "--policy",
     default="auto",
     show_default=True,
     type=click.Choice(["auto", "bfs", "greedybfs", "filepriority", "dirpriority"]),
     help="The scan policy.",
 )
+@click.option(
+    "-e",
+    "--extra-info",
+    "extra_info",
+    multiple=True,
+    type=click.Choice(["origin"]),
+    help="Add selected additional information about known software artifacts.",
+)
 @click.pass_context
-def scan(ctx, root_path, api_url, patterns, out_fmt, interactive, policy):
+def scan(ctx, root_path, api_url, patterns, out_fmt, interactive, policy, extra_info):
     """Scan a source code project to discover files and directories already
     present in the archive.
 
     The source code project can be checked using different policies that can be set
-    using the -p/--policy option:
-
-    auto: it selects the best policy based on the source code, for codebase(s) with
-    less than 1000 file/dir contents all the nodes will be queried.
-
-    bfs: scan the source code in the BFS order, checking unknown directories only.
-
-    greedybfs: same as "bfs" policy, but lookup the status of source code artifacts in
-    chunks, in order to minimize the number of Web API round-trips with the archive.
-
-    filepriority: scan all the source code file contents, checking only unset
-    directories. (useful if the codebase contains a lot of source files)
-
-    dirpriority: scan all the source code directories and check only unknown
-    directory contents.
-    """
+    using the -p/--policy option:\n
+    \b
+    auto: select the best policy based on the source code; for codebases
+    with fewer than 1000 file/dir contents, all the nodes will be queried.
+
+    bfs: scan the source code in BFS order, checking unknown directories only.
+
+    \b
+    greedybfs: same as the "bfs" policy, but look up the status of source code
+    artifacts in chunks to minimize the number of Web API round-trips with the
+    archive.
+
+    \b
+    filepriority: scan all the source code file contents, checking only unset
+    directories. (useful if the codebase contains a lot of source files)
+
+    dirpriority: scan all the source code directories and check only unknown
+    directory contents.
+
+    Additional information about known software artifacts can be requested with
+    the -e/--extra-info option:\n
+    \b
+    origin: search the origin URL of each source code file/directory using the
+    in-memory compressed graph.
+    """
     import swh.scanner.scanner as scanner
 
     config = setup_config(ctx, api_url)
-    scanner.scan(config, root_path, patterns, out_fmt, interactive, policy)
+    extra_info = set(extra_info)
+    scanner.scan(config, root_path, patterns, out_fmt, interactive, policy, extra_info)
 
 
 @scanner.group("db", help="Manage local knowledge base for swh-scanner")
 @click.pass_context
 def db(ctx):
     pass
 
 
 @db.command("import")
 @click.option(
     "-i",
     "--input",
     "input_file",
     metavar="INPUT_FILE",
     required=True,
     type=click.File("r"),
     help="A file containing SWHIDs",
 )
 @click.option(
     "-o",
     "--output",
     "output_file_db",
     metavar="OUTPUT_DB_FILE",
     required=True,
     show_default=True,
     help="The name of the generated sqlite database",
 )
 @click.option(
     "-s",
     "--chunk-size",
     "chunk_size",
     default="10000",
     metavar="SIZE",
     show_default=True,
     type=int,
     help="The chunk size",
 )
 @click.pass_context
 def import_(ctx, chunk_size, input_file, output_file_db):
     """Create SQLite database of known SWHIDs from a textual list of SWHIDs"""
     from .db import Db
 
     db = Db(output_file_db)
     cur = db.conn.cursor()
     try:
         db.create_from(input_file, chunk_size, cur)
         db.close()
     except DBError as e:
         ctx.fail("Failed to import SWHIDs into database: {0}".format(e))
 
 
 @db.command("serve")
 @click.option(
     "-h",
     "--host",
     metavar="HOST",
     default="127.0.0.1",
     show_default=True,
     help="The host of the API server",
 )
 @click.option(
     "-p",
     "--port",
     metavar="PORT",
     default=f"{BACKEND_DEFAULT_PORT}",
     show_default=True,
     help="The port of the API server",
 )
 @click.option(
     "-f",
     "--db-file",
     "db_file",
     metavar="DB_FILE",
     default="SWHID_DB.sqlite",
     show_default=True,
     type=click.Path(exists=True),
     help="An sqlite database file (it can be generated with: 'swh scanner db import')",
 )
 @click.pass_context
 def serve(ctx, host, port, db_file):
     """Start an API service using the sqlite database generated with the
     "db import" option."""
     import swh.scanner.backend as backend
 
     from .db import Db
 
     db = Db(db_file)
     backend.run(host, port, db)
     db.close()
 
 
 def main():
     return scanner(auto_envvar_prefix="SWH_SCANNER")
 
 
 if __name__ == "__main__":
     main()
diff --git a/swh/scanner/client.py b/swh/scanner/client.py
new file mode 100644
index 0000000..d814b72
--- /dev/null
+++ b/swh/scanner/client.py
@@ -0,0 +1,98 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""
+Minimal async web client for the Software Heritage Web API.
+
+This module could be removed when
+`T2635 <https://forge.softwareheritage.org/T2635>`_ is implemented.
+"""
+
+import asyncio
+import itertools
+from typing import Any, Dict, List, Optional
+
+import aiohttp
+
+from swh.model.identifiers import CoreSWHID
+
+from .exceptions import error_response
+
+# Maximum number of SWHIDs that can be requested by a single call to the
+# Web API endpoint /known/
+QUERY_LIMIT = 1000
+
+KNOWN_EP = "known/"
+GRAPH_RANDOMWALK_EP = "graph/randomwalk/"
+
+
+class Client:
+    """Manage requests to the Software Heritage Web API."""
+
+    def __init__(self, api_url: str, session: aiohttp.ClientSession):
+        self.api_url = api_url
+        self.session = session
+
+    async def get_origin(self, swhid: CoreSWHID) -> Optional[Any]:
+        """Walk the compressed graph to discover the origin of a given swhid."""
+        endpoint = (
+            f"{self.api_url}{GRAPH_RANDOMWALK_EP}{str(swhid)}/ori/?direction="
+            f"backward&limit=-1&resolve_origins=true"
+        )
+        res = None
+        async with self.session.get(endpoint) as resp:
+            if resp.status == 200:
+                res = await resp.text()
+                res = res.rstrip()
+                return res
+            if resp.status != 404:
+                error_response(resp.reason, resp.status, endpoint)
+
+        return res
+
+    async def known(self, swhids: List[CoreSWHID]) -> Dict[str, Dict[str, bool]]:
+        """API request to get information about the SoftWare Heritage persistent
+        IDentifiers (SWHIDs) given in input.
+
+        Args:
+            swhids: a list of CoreSWHID instances
+
+        Returns:
+            A dictionary with:
+
+            key:
+                string SWHID searched
+            value:
+                value['known'] = True if the SWHID is found
+                value['known'] = False if the SWHID is not found
+
+        """
+        endpoint = self.api_url + KNOWN_EP
+        requests = []
+
+        def get_chunk(swhids):
+            for i in range(0, len(swhids), QUERY_LIMIT):
+                yield swhids[i : i + QUERY_LIMIT]
+
+        async def make_request(swhids):
+            swhids = [str(swhid) for swhid in swhids]
+            async with self.session.post(endpoint, json=swhids) as resp:
+                if resp.status != 200:
+                    error_response(resp.reason, resp.status, endpoint)
+
+                return await resp.json()
+
+        if len(swhids) > QUERY_LIMIT:
+            for swhids_chunk in get_chunk(swhids):
+                requests.append(asyncio.create_task(make_request(swhids_chunk)))
+
+            res = await asyncio.gather(*requests)
+            # concatenate list of dictionaries
+            return dict(itertools.chain.from_iterable(e.items() for e in res))
+        else:
+            return await make_request(swhids)
+ """ + + def __init__(self, api_url: str, session: aiohttp.ClientSession): + self.api_url = api_url + self.session = session + + async def get_origin(self, swhid: CoreSWHID) -> Optional[Any]: + """Walk the compressed graph to discover the origin of a given swhid + """ + endpoint = ( + f"{self.api_url}{GRAPH_RANDOMWALK_EP}{str(swhid)}/ori/?direction=" + f"backward&limit=-1&resolve_origins=true" + ) + res = None + async with self.session.get(endpoint) as resp: + if resp.status == 200: + res = await resp.text() + res = res.rstrip() + return res + if resp.status != 404: + error_response(resp.reason, resp.status, endpoint) + + return res + + async def known(self, swhids: List[CoreSWHID]) -> Dict[str, Dict[str, bool]]: + """API Request to get information about the SoftWare Heritage persistent + IDentifiers (SWHIDs) given in input. + + Args: + swhids: a list of CoreSWHID instances + api_url: url for the API request + + Returns: + A dictionary with: + + key: + string SWHID searched + value: + value['known'] = True if the SWHID is found + value['known'] = False if the SWHID is not found + + """ + endpoint = self.api_url + KNOWN_EP + requests = [] + + def get_chunk(swhids): + for i in range(0, len(swhids), QUERY_LIMIT): + yield swhids[i : i + QUERY_LIMIT] + + async def make_request(swhids): + swhids = [str(swhid) for swhid in swhids] + async with self.session.post(endpoint, json=swhids) as resp: + if resp.status != 200: + error_response(resp.reason, resp.status, endpoint) + + return await resp.json() + + if len(swhids) > QUERY_LIMIT: + for swhids_chunk in get_chunk(swhids): + requests.append(asyncio.create_task(make_request(swhids_chunk))) + + res = await asyncio.gather(*requests) + # concatenate list of dictionaries + return dict(itertools.chain.from_iterable(e.items() for e in res)) + else: + return await make_request(swhids) diff --git a/swh/scanner/data.py b/swh/scanner/data.py index 4db27f5..88ab170 100644 --- a/swh/scanner/data.py +++ b/swh/scanner/data.py @@ -1,107 +1,150 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple from swh.model.exceptions import ValidationError from swh.model.from_disk import Directory from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID +from .client import Client + +SUPPORTED_INFO = {"known", "origin"} + class MerkleNodeInfo(dict): """Store additional information about Merkle DAG nodes, using SWHIDs as keys""" def __setitem__(self, key, value): """The keys must be valid valid Software Heritage Persistent Identifiers while values must be dict. """ if not isinstance(key, CoreSWHID): raise ValidationError("keys must be valid SWHID(s)") if not isinstance(value, dict): raise ValidationError(f"values must be dict, not {type(value)}") super(MerkleNodeInfo, self).__setitem__(key, value) +def init_merkle_node_info(source_tree: Directory, data: MerkleNodeInfo, info: set): + """Populate the MerkleNodeInfo with the SWHIDs of the given source tree and the + attributes that will be stored. 
+ """ + if not info: + raise Exception("Data initialization requires node attributes values.") + nodes_info: Dict[str, Optional[str]] = {} + for ainfo in info: + if ainfo in SUPPORTED_INFO: + nodes_info[ainfo] = None + else: + raise Exception(f"Information {ainfo} is not supported.") + + for node in source_tree.iter_tree(): + data[node.swhid()] = nodes_info.copy() # type: ignore + + +async def add_origin(source_tree: Directory, data: MerkleNodeInfo, client: Client): + """Store origin information about software artifacts retrieved from the Software + Heritage graph service. + """ + queue = [] + queue.append(source_tree) + while queue: + for node in queue.copy(): + queue.remove(node) + node_ori = await client.get_origin(node.swhid()) + if node_ori: + data[node.swhid()]["origin"] = node_ori + if node.object_type == DIRECTORY: + for sub_node in node.iter_tree(): + data[sub_node.swhid()]["origin"] = node_ori # type: ignore + else: + if node.object_type == DIRECTORY: + children = [sub_node for sub_node in node.iter_tree()] + children.remove(node) + queue.extend(children) # type: ignore + + def get_directory_data( root_path: str, source_tree: Directory, nodes_data: MerkleNodeInfo, directory_data: Dict = {}, ) -> Dict[Path, dict]: """Get content information for each directory inside source_tree. Returns: A dictionary with a directory path as key and the relative contents information as values. """ def _get_directory_data( source_tree: Directory, nodes_data: MerkleNodeInfo, directory_data: Dict ): directories = list( filter( lambda n: n.object_type == DIRECTORY, map(lambda n: n[1], source_tree.items()), ) ) for node in directories: directory_info = directory_content(node, nodes_data) rel_path = Path(node.data["path"].decode()).relative_to(Path(root_path)) directory_data[rel_path] = directory_info if has_dirs(node): _get_directory_data(node, nodes_data, directory_data) _get_directory_data(source_tree, nodes_data, directory_data) return directory_data def directory_content(node: Directory, nodes_data: MerkleNodeInfo) -> Tuple[int, int]: """Count known contents inside the given directory. Returns: A tuple with the total number of contents inside the directory and the number of known contents. 
""" known_cnt = 0 node_contents = list( filter(lambda n: n.object_type == CONTENT, map(lambda n: n[1], node.items())) ) for sub_node in node_contents: if nodes_data[sub_node.swhid()]["known"]: known_cnt += 1 return (len(node_contents), known_cnt) def has_dirs(node: Directory) -> bool: """Check if the given directory has other directories inside.""" for _, sub_node in node.items(): if isinstance(sub_node, Directory): return True return False def get_content_from( node_path: bytes, source_tree: Directory, nodes_data: MerkleNodeInfo ) -> Dict[bytes, dict]: """Get content information from the given directory node.""" # root in model.from_disk.Directory should be accessed with b"" directory = source_tree[node_path if node_path != source_tree.data["path"] else b""] node_contents = list( filter( lambda n: n.object_type == CONTENT, map(lambda n: n[1], directory.items()) ) ) files_data = {} for node in node_contents: node_info = nodes_data[node.swhid()] node_info["swhid"] = str(node.swhid()) path_name = "path" if "path" in node.data.keys() else "data" files_data[node.data[path_name]] = node_info return files_data diff --git a/swh/scanner/exceptions.py b/swh/scanner/exceptions.py index b890a45..dd9eded 100644 --- a/swh/scanner/exceptions.py +++ b/swh/scanner/exceptions.py @@ -1,30 +1,32 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Optional + class InvalidObjectType(TypeError): pass class InvalidDirectoryPath(Exception): pass class LargePayloadExc(Exception): pass class DBError(Exception): pass class APIError(Exception): def __str__(self): return '"%s"' % self.args -def error_response(reason: str, status_code: int, api_url: str): +def error_response(reason: Optional[Any], status_code: int, api_url: str): error_msg = f"{status_code} {reason}: '{api_url}'" raise APIError(error_msg) diff --git a/swh/scanner/output.py b/swh/scanner/output.py index 6a2607a..5269d0f 100644 --- a/swh/scanner/output.py +++ b/swh/scanner/output.py @@ -1,108 +1,109 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from enum import Enum import json import os import sys from typing import Any import ndjson from swh.model.from_disk import Directory from .dashboard.dashboard import run_app from .data import MerkleNodeInfo, get_directory_data from .plot import generate_sunburst, offline_plot DEFAULT_OUTPUT = "text" class Color(Enum): BLUE = "\033[94m" GREEN = "\033[92m" RED = "\033[91m" END = "\033[0m" def colorize(text: str, color: Color): return color.value + text + Color.END.value class Output: def __init__( self, root_path: str, nodes_data: MerkleNodeInfo, source_tree: Directory ): self.root_path = root_path self.nodes_data = nodes_data self.source_tree = source_tree def show(self, mode=DEFAULT_OUTPUT): if mode == "text": isatty = sys.stdout.isatty() self.print_text(isatty) elif mode == "sunburst": directory_data = get_directory_data( self.root_path, self.source_tree, self.nodes_data ) sunburst_figure = generate_sunburst(directory_data, self.root_path) offline_plot(sunburst_figure) elif mode == "interactive": directory_data = get_directory_data( self.root_path, self.source_tree, self.nodes_data ) 
sunburst_figure = generate_sunburst(directory_data, self.root_path) run_app(sunburst_figure, self.source_tree, self.nodes_data) elif mode == "json": self.print_json() elif mode == "ndjson": self.print_ndjson() else: raise Exception(f"mode {mode} is not an output format") def get_path_name(self, node): return "path" if "path" in node.data.keys() else "data" def print_text(self, isatty: bool) -> None: def compute_level(node): node_path = str(node.data[self.get_path_name(node)]).split("/") source_path = str(self.source_tree.data["path"]).split("/") return len(node_path) - len(source_path) for node in self.source_tree.iter_tree(): self.print_node(node, isatty, compute_level(node)) def print_node(self, node: Any, isatty: bool, level: int) -> None: rel_path = os.path.basename(node.data[self.get_path_name(node)]) rel_path = rel_path.decode() begin = "│ " * level end = "/" if node.object_type == "directory" else "" if isatty: if not self.nodes_data[node.swhid()]["known"]: rel_path = colorize(rel_path, Color.RED) elif node.object_type == "directory": rel_path = colorize(rel_path, Color.BLUE) elif node.object_type == "content": rel_path = colorize(rel_path, Color.GREEN) print(f"{begin}{rel_path}{end}") def data_as_json(self): json = {} for node in self.source_tree.iter_tree(): - node_known = self.nodes_data[node.swhid()]["known"] rel_path = os.path.relpath( node.data[self.get_path_name(node)].decode(), self.source_tree.data["path"].decode(), ) - json[rel_path] = {"swhid": str(node.swhid()), "known": node_known} + json[rel_path] = {"swhid": str(node.swhid())} + for k, v in self.nodes_data[node.swhid()].items(): + json[rel_path][k] = v return json def print_json(self): print(json.dumps(self.data_as_json(), indent=4, sort_keys=True)) def print_ndjson(self): print(ndjson.dumps({k: v} for k, v in self.data_as_json().items())) diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py index 431b999..44cf053 100644 --- a/swh/scanner/policy.py +++ b/swh/scanner/policy.py @@ -1,332 +1,260 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc -import asyncio -import itertools -from typing import Dict, List, no_type_check - -import aiohttp +from typing import no_type_check from swh.core.utils import grouper from swh.model.from_disk import Directory -from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID +from swh.model.identifiers import CONTENT, DIRECTORY +from .client import QUERY_LIMIT, Client from .data import MerkleNodeInfo -from .exceptions import error_response - -# Maximum number of SWHIDs that can be requested by a single call to the -# Web API endpoint /known/ -QUERY_LIMIT = 1000 - - -async def swhids_discovery( - swhids: List[CoreSWHID], session: aiohttp.ClientSession, api_url: str, -) -> Dict[str, Dict[str, bool]]: - """API Request to get information about the SoftWare Heritage persistent - IDentifiers (SWHIDs) given in input. 
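With this change, `Output.data_as_json` no longer hardcodes the "known" attribute: every attribute stored in `MerkleNodeInfo` is copied into the per-path record. An illustrative record shape follows; the values are made up, and "origin" only appears when requested via --extra-info:

```python
# Illustrative shape of a per-path record produced by Output.data_as_json
# after this change: the "swhid" key plus every stored node attribute.
import json

record = {
    "foo/bar.py": {
        "swhid": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb",
        "known": True,
        "origin": "https://example.org/repo.git",  # made-up value
    }
}
print(json.dumps(record, indent=4, sort_keys=True))
```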
diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py
index 431b999..44cf053 100644
--- a/swh/scanner/policy.py
+++ b/swh/scanner/policy.py
@@ -1,332 +1,260 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import abc
-import asyncio
-import itertools
-from typing import Dict, List, no_type_check
-
-import aiohttp
+from typing import no_type_check
 
 from swh.core.utils import grouper
 from swh.model.from_disk import Directory
-from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID
+from swh.model.identifiers import CONTENT, DIRECTORY
 
+from .client import QUERY_LIMIT, Client
 from .data import MerkleNodeInfo
-from .exceptions import error_response
-
-# Maximum number of SWHIDs that can be requested by a single call to the
-# Web API endpoint /known/
-QUERY_LIMIT = 1000
-
-
-async def swhids_discovery(
-    swhids: List[CoreSWHID], session: aiohttp.ClientSession, api_url: str,
-) -> Dict[str, Dict[str, bool]]:
-    """API Request to get information about the SoftWare Heritage persistent
-    IDentifiers (SWHIDs) given in input.
-
-    Args:
-        swhids: a list of CoreSWHID instances
-        api_url: url for the API request
-
-    Returns:
-        A dictionary with:
-
-        key:
-            string SWHID searched
-        value:
-            value['known'] = True if the SWHID is found
-            value['known'] = False if the SWHID is not found
-
-    """
-    endpoint = api_url + "known/"
-    requests = []
-
-    def get_chunk(swhids):
-        for i in range(0, len(swhids), QUERY_LIMIT):
-            yield swhids[i : i + QUERY_LIMIT]
-
-    async def make_request(swhids):
-        swhids = [str(swhid) for swhid in swhids]
-        async with session.post(endpoint, json=swhids) as resp:
-            if resp.status != 200:
-                error_response(resp.reason, resp.status, endpoint)
-
-            return await resp.json()
-
-    if len(swhids) > QUERY_LIMIT:
-        for swhids_chunk in get_chunk(swhids):
-            requests.append(asyncio.create_task(make_request(swhids_chunk)))
-
-        res = await asyncio.gather(*requests)
-        # concatenate list of dictionaries
-        return dict(itertools.chain.from_iterable(e.items() for e in res))
-    else:
-        return await make_request(swhids)
 
 
 def source_size(source_tree: Directory):
     """Return the size of a source tree as the number of nodes it contains."""
     return sum(1 for n in source_tree.iter_tree(dedup=False))
 
 
 class Policy(metaclass=abc.ABCMeta):
 
     data: MerkleNodeInfo
     """information about contents and directories of the merkle tree"""
 
     source_tree: Directory
     """representation of a source code project directory in the merkle tree"""
 
     def __init__(self, source_tree: Directory, data: MerkleNodeInfo):
-        self.data = data
         self.source_tree = source_tree
-        for node in source_tree.iter_tree():
-            self.data[node.swhid()] = {"known": None}  # type: ignore
+        self.data = data
 
     @abc.abstractmethod
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         """Scan a source code project"""
         raise NotImplementedError("Must implement run method")
 
 
 class LazyBFS(Policy):
     """Read nodes in the merkle tree using the BFS algorithm.
     Look up only directories that are unknown; otherwise set all the downstream
    contents to known.
     """
 
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         queue = []
         queue.append(self.source_tree)
 
         while queue:
             swhids = [node.swhid() for node in queue]
-            swhids_res = await swhids_discovery(swhids, session, api_url)
+            swhids_res = await client.known(swhids)
             for node in queue.copy():
                 queue.remove(node)
                 self.data[node.swhid()]["known"] = swhids_res[str(node.swhid())][
                     "known"
                 ]
                 if node.object_type == DIRECTORY:
                     if not self.data[node.swhid()]["known"]:
                         children = [n[1] for n in list(node.items())]
                         queue.extend(children)
                     else:
                         for sub_node in node.iter_tree():
                             if sub_node == node:
                                 continue
                             self.data[sub_node.swhid()]["known"] = True  # type: ignore
 
 
 class GreedyBFS(Policy):
     """Query graph nodes in chunks (to maximize the Web API rate limit use) and set
     the downstream contents of known directories to known.
     """
 
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         ssize = source_size(self.source_tree)
         seen = []
 
-        async for nodes_chunk in self.get_nodes_chunks(session, api_url, ssize):
+        async for nodes_chunk in self.get_nodes_chunks(client, ssize):
             for node in nodes_chunk:
                 seen.append(node)
                 if len(seen) == ssize:
                     return
                 if node.object_type == DIRECTORY and self.data[node.swhid()]["known"]:
                     sub_nodes = [n for n in node.iter_tree(dedup=False)]
                     sub_nodes.remove(node)  # remove root node
                     for sub_node in sub_nodes:
                         seen.append(sub_node)
                         self.data[sub_node.swhid()]["known"] = True
 
     @no_type_check
-    async def get_nodes_chunks(
-        self, session: aiohttp.ClientSession, api_url: str, ssize: int
-    ):
+    async def get_nodes_chunks(self, client: Client, ssize: int):
         """Query chunks of QUERY_LIMIT nodes at once in order to fill the Web API
         rate limit. It queries all the nodes at once if the source code contains
         fewer than QUERY_LIMIT nodes.
         """
         nodes = self.source_tree.iter_tree(dedup=False)
         for nodes_chunk in grouper(nodes, QUERY_LIMIT):
             nodes_chunk = [n for n in nodes_chunk]
             swhids = [node.swhid() for node in nodes_chunk]
-            swhids_res = await swhids_discovery(swhids, session, api_url)
+            swhids_res = await client.known(swhids)
             for node in nodes_chunk:
                 swhid = node.swhid()
                 self.data[swhid]["known"] = swhids_res[str(swhid)]["known"]
             yield nodes_chunk
 
 
 class FilePriority(Policy):
     """Check the Merkle tree querying all the file contents and set all the upstream
     directories to unknown in the case a file content is unknown. Finally check all
     the directories whose status is still unknown and set all the sub-directories
     of known directories to known.
     """
 
     @no_type_check
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         # get all the files
         all_contents = list(
             filter(
                 lambda node: node.object_type == CONTENT, self.source_tree.iter_tree()
             )
         )
         all_contents.reverse()  # check deepest node first
 
         # query the backend to get all file contents status
         cnt_swhids = [node.swhid() for node in all_contents]
-        cnt_status_res = await swhids_discovery(cnt_swhids, session, api_url)
+        cnt_status_res = await client.known(cnt_swhids)
         # set all the file contents status
         for cnt in all_contents:
             self.data[cnt.swhid()]["known"] = cnt_status_res[str(cnt.swhid())]["known"]
             # set all the upstream directories of unknown file contents to unknown
             if not self.data[cnt.swhid()]["known"]:
                 parent = cnt.parents[0]
                 while parent:
                     self.data[parent.swhid()]["known"] = False
                     parent = parent.parents[0] if parent.parents else None
 
         # get all unset directories and check their status
         # (update children directories accordingly)
         unset_dirs = list(
             filter(
                 lambda node: node.object_type == DIRECTORY
                 and self.data[node.swhid()]["known"] is None,
                 self.source_tree.iter_tree(),
             )
         )
 
         # check unset directories
         for dir_ in unset_dirs:
             if self.data[dir_.swhid()]["known"] is None:
                 # update directory status
-                dir_status = await swhids_discovery([dir_.swhid()], session, api_url)
+                dir_status = await client.known([dir_.swhid()])
                 dir_known = dir_status[str(dir_.swhid())]["known"]
                 self.data[dir_.swhid()]["known"] = dir_known
                 if dir_known:
                     sub_dirs = list(
                         filter(
                             lambda n: n.object_type == DIRECTORY
                             and self.data[n.swhid()]["known"] is None,
                             dir_.iter_tree(),
                         )
                     )
                     for node in sub_dirs:
                         self.data[node.swhid()]["known"] = True
 
 
 class DirectoryPriority(Policy):
     """Check the Merkle tree querying all the directories that have at least one
     file content, and set all the upstream directories to unknown in the case a
     directory is unknown; otherwise set all the downstream contents to known.
     Finally check the status of empty directories and all the remaining file
     contents.
     """
 
     @no_type_check
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         # get all directory contents that have at least one file content
         unknown_dirs = list(
             filter(
                 lambda dir_: dir_.object_type == DIRECTORY and self.has_contents(dir_),
                 self.source_tree.iter_tree(),
             )
         )
         unknown_dirs.reverse()  # check deepest node first
 
         for dir_ in unknown_dirs:
             if self.data[dir_.swhid()]["known"] is None:
-                dir_status = await swhids_discovery([dir_.swhid()], session, api_url)
+                dir_status = await client.known([dir_.swhid()])
                 dir_known = dir_status[str(dir_.swhid())]["known"]
                 self.data[dir_.swhid()]["known"] = dir_known
                 # set all the downstream file contents to known
                 if dir_known:
                     for cnt in self.get_contents(dir_):
                         self.data[cnt.swhid()]["known"] = True
                 # otherwise set all the upstream directories to unknown
                 else:
                     parent = dir_.parents[0]
                     while parent:
                         self.data[parent.swhid()]["known"] = False
                         parent = parent.parents[0] if parent.parents else None
 
         # get remaining directories that have no file contents
         empty_dirs = list(
             filter(
                 lambda n: n.object_type == DIRECTORY
                 and not self.has_contents(n)
                 and self.data[n.swhid()]["known"] is None,
                 self.source_tree.iter_tree(),
             )
         )
         empty_dirs_swhids = [n.swhid() for n in empty_dirs]
-        empty_dir_status = await swhids_discovery(empty_dirs_swhids, session, api_url)
+        empty_dir_status = await client.known(empty_dirs_swhids)
 
         # update status of directories that have no file contents
         for dir_ in empty_dirs:
             self.data[dir_.swhid()]["known"] = empty_dir_status[str(dir_.swhid())][
                 "known"
             ]
 
         # check unknown file contents
         unknown_cnts = list(
             filter(
                 lambda n: n.object_type == CONTENT
                 and self.data[n.swhid()]["known"] is None,
                 self.source_tree.iter_tree(),
             )
         )
         unknown_cnts_swhids = [n.swhid() for n in unknown_cnts]
-        unknown_cnts_status = await swhids_discovery(
-            unknown_cnts_swhids, session, api_url
-        )
+        unknown_cnts_status = await client.known(unknown_cnts_swhids)
 
         for cnt in unknown_cnts:
             self.data[cnt.swhid()]["known"] = unknown_cnts_status[str(cnt.swhid())][
                 "known"
             ]
 
     def has_contents(self, directory: Directory):
         """Check if the directory given in input has contents"""
         for entry in directory.entries:
             if entry["type"] == "file":
                 return True
         return False
 
     def get_contents(self, dir_: Directory):
         """Get all the contents of a given directory"""
         for _, node in list(dir_.items()):
             if node.object_type == CONTENT:
                 yield node
 
 
 class QueryAll(Policy):
     """Check the status of every node in the Merkle tree."""
 
     @no_type_check
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         all_nodes = [node for node in self.source_tree.iter_tree()]
         all_swhids = [node.swhid() for node in all_nodes]
-        swhids_res = await swhids_discovery(all_swhids, session, api_url)
+        swhids_res = await client.known(all_swhids)
         for node in all_nodes:
             self.data[node.swhid()]["known"] = swhids_res[str(node.swhid())]["known"]
out.show("interactive") else: out.show(out_fmt) diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py index 42b8e21..12060dc 100644 --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -1,27 +1,32 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -correct_api_response = { +correct_known_api_response = { "swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112": {"known": False}, "swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140": {"known": True}, "swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904": {"known": True}, } +correct_origin_api_response = "https://bitbucket.org/chubbymaggie/bindead.git" + +sample_folder_root_swhid = "swh:1:dir:0a7b61ef5780b03aa274d11069564980246445ce" +fake_origin = {sample_folder_root_swhid: correct_origin_api_response} + present_swhids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] # these SWHIDs are considered known by the fake backend (scanner.test.flask_api) unknown_swhids = [ "swh:1:dir:fe8cd7076bef324eb8865f818ef08617879022ce", # root sample-folder-policy "swh:1:dir:0a7b61ef5780b03aa274d11069564980246445ce", # root sample-folder "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119", # toexclude/example.txt "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326" diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py index c521e63..42a8141 100644 --- a/swh/scanner/tests/flask_api.py +++ b/swh/scanner/tests/flask_api.py @@ -1,41 +1,48 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from flask import Flask, request +from flask import Flask, abort, request from swh.scanner.exceptions import LargePayloadExc from swh.scanner.policy import QUERY_LIMIT -from .data import unknown_swhids +from .data import fake_origin, unknown_swhids def create_app(tmp_requests): app = Flask(__name__) @app.route("/") def index(): return "SWH scanner API" @app.route("/known/", methods=["POST"]) def known(): swhids = request.get_json() with open(tmp_requests, "a") as f: for swhid in swhids: f.write(swhid + "\n") if len(swhids) > QUERY_LIMIT: raise LargePayloadExc( f"The maximum number of SWHIDs this endpoint can receive is " f"{QUERY_LIMIT}" ) res = {swhid: {"known": False} for swhid in swhids} for swhid in swhids: if swhid not in unknown_swhids: res[swhid]["known"] = True return res + @app.route("/graph/randomwalk//ori/", methods=["GET"]) + def randomwalk(swhid): + if swhid in fake_origin.keys(): + return fake_origin[swhid] + else: + abort(404) + return app diff --git a/swh/scanner/tests/test_client.py b/swh/scanner/tests/test_client.py new file mode 100644 index 0000000..6a85eec --- /dev/null +++ b/swh/scanner/tests/test_client.py @@ -0,0 +1,58 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# 
diff --git a/swh/scanner/tests/test_client.py b/swh/scanner/tests/test_client.py
new file mode 100644
index 0000000..6a85eec
--- /dev/null
+++ b/swh/scanner/tests/test_client.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+
+import pytest
+
+from swh.model.identifiers import CoreSWHID
+from swh.scanner.client import Client
+from swh.scanner.exceptions import APIError
+
+from .data import correct_known_api_response, correct_origin_api_response
+
+AIO_URL = "http://example.org/api/"
+KNOWN_URL = f"{AIO_URL}known/"
+ORIGIN_URL = f"{AIO_URL}graph/randomwalk/"
+
+
+def test_client_known_correct_api_request(mock_aioresponse, event_loop, aiosession):
+    mock_aioresponse.post(
+        KNOWN_URL,
+        status=200,
+        content_type="application/json",
+        body=json.dumps(correct_known_api_response),
+    )
+
+    client = Client(AIO_URL, aiosession)
+    actual_result = event_loop.run_until_complete(client.known([]))
+
+    assert correct_known_api_response == actual_result
+
+
+def test_client_known_raise_apierror(mock_aioresponse, event_loop, aiosession):
+    mock_aioresponse.post(KNOWN_URL, content_type="application/json", status=413)
+
+    client = Client(AIO_URL, aiosession)
+    with pytest.raises(APIError):
+        event_loop.run_until_complete(client.known([]))
+
+
+def test_client_get_origin_correct_api_request(
+    mock_aioresponse, event_loop, aiosession
+):
+    origin_url = (
+        f"{ORIGIN_URL}swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140/ori"
+        f"/?direction=backward&limit=-1&resolve_origins=true"
+    )
+    mock_aioresponse.get(
+        origin_url, status=200, body=correct_origin_api_response,
+    )
+
+    client = Client(AIO_URL, aiosession)
+    swhid = CoreSWHID.from_string("swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140")
+    actual_result = event_loop.run_until_complete(client.get_origin(swhid))
+
+    assert correct_origin_api_response == actual_result
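For reference, a hypothetical end-to-end invocation of the new `scan` entry point: the config mirrors `DEFAULT_CONFIG` from cli.py, the project path is made up, and "origin" is the only extra-info value the CLI accepts at this point ("known" is added internally by `scan` itself):

```python
# Hypothetical end-to-end use of the new scan() signature. The config mirrors
# DEFAULT_CONFIG from cli.py; the project path is made up.
from swh.scanner.scanner import scan

config = {
    "web-api": {
        "url": "https://archive.softwareheritage.org/api/1/",
        "auth-token": None,
    }
}

scan(
    config,
    root_path="/tmp/my-project",  # hypothetical checkout to scan
    exclude_patterns=["*.git"],
    out_fmt="json",               # same choices as the -f CLI option
    interactive=False,
    policy="auto",
    extra_info={"origin"},        # scan() adds "known" by itself
)
```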
diff --git a/swh/scanner/tests/test_data.py b/swh/scanner/tests/test_data.py
index 4a29751..2925c3e 100644
--- a/swh/scanner/tests/test_data.py
+++ b/swh/scanner/tests/test_data.py
@@ -1,44 +1,73 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from pathlib import Path
 
+from flask import url_for
 import pytest
 
 from swh.model.exceptions import ValidationError
+from swh.scanner.client import Client
 from swh.scanner.data import (
     MerkleNodeInfo,
+    add_origin,
     directory_content,
     get_directory_data,
     has_dirs,
+    init_merkle_node_info,
 )
 
+from .data import fake_origin
+
 
 def test_merkle_node_data_wrong_args():
     nodes_data = MerkleNodeInfo()
 
     with pytest.raises(ValidationError):
         nodes_data["wrong key"] = {"known": True}
 
     with pytest.raises(ValidationError):
         nodes_data["swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112"] = "wrong value"
 
 
+def test_init_merkle_supported_node_info(source_tree):
+    nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known", "origin"})
+    for _, node_attrs in nodes_data.items():
+        assert "known" in node_attrs and "origin" in node_attrs
+
+
+def test_init_merkle_not_supported_node_info(source_tree):
+    nodes_data = MerkleNodeInfo()
+    with pytest.raises(Exception):
+        init_merkle_node_info(source_tree, nodes_data, {"unsupported_info"})
+
+
+def test_add_origin(event_loop, live_server, aiosession, source_tree, nodes_data):
+    api_url = url_for("index", _external=True)
+    init_merkle_node_info(source_tree, nodes_data, {"known", "origin"})
+    client = Client(api_url, aiosession)
+
+    event_loop.run_until_complete(add_origin(source_tree, nodes_data, client))
+    for node, attrs in nodes_data.items():
+        assert attrs["origin"] == fake_origin[str(source_tree.swhid())]
+
+
 def test_get_directory_data(source_tree, nodes_data):
     root = Path(source_tree.data["path"].decode())
     dirs_data = get_directory_data(root, source_tree, nodes_data)
 
     assert len(dirs_data) == 5
 
 
 def test_directory_content(source_tree, nodes_data):
     foo_dir = source_tree[b"foo"]
     foo_content = directory_content(foo_dir, nodes_data)
     assert foo_content[0] == 3
     assert foo_content[1] == 3
 
 
 def test_has_dirs(source_tree):
     assert has_dirs(source_tree)
diff --git a/swh/scanner/tests/test_policy.py b/swh/scanner/tests/test_policy.py
index a60873c..937408c 100644
--- a/swh/scanner/tests/test_policy.py
+++ b/swh/scanner/tests/test_policy.py
@@ -1,168 +1,148 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import json
 
 from flask import url_for
 import pytest
 
 from swh.model.identifiers import CONTENT, CoreSWHID, ObjectType
-from swh.scanner.data import MerkleNodeInfo
-from swh.scanner.exceptions import APIError
+from swh.scanner.client import Client
+from swh.scanner.data import MerkleNodeInfo, init_merkle_node_info
 from swh.scanner.policy import (
     DirectoryPriority,
     FilePriority,
     GreedyBFS,
     LazyBFS,
     source_size,
-    swhids_discovery,
 )
 
-from .data import correct_api_response
-
-aio_url = "http://example.org/api/known/"
-
-
-def test_scanner_correct_api_request(mock_aioresponse, event_loop, aiosession):
-    mock_aioresponse.post(
-        aio_url,
-        status=200,
-        content_type="application/json",
-        body=json.dumps(correct_api_response),
-    )
-
-    actual_result = event_loop.run_until_complete(
-        swhids_discovery([], aiosession, "http://example.org/api/")
-    )
-
-    assert correct_api_response == actual_result
-
-
-def test_scanner_raise_apierror(mock_aioresponse, event_loop, aiosession):
-    mock_aioresponse.post(aio_url, content_type="application/json", status=413)
-
-    with pytest.raises(APIError):
-        event_loop.run_until_complete(
-            swhids_discovery([], aiosession, "http://example.org/api/")
-        )
-
 
 def test_scanner_directory_priority_has_contents(source_tree):
     nodes_data = MerkleNodeInfo()
     policy = DirectoryPriority(source_tree, nodes_data)
     assert policy.has_contents(source_tree[b"/bar/barfoo"])
 
 
 def get_backend_swhids_order(tmp_requests):
     with open(tmp_requests, "r") as f:
         backend_swhids_order = f.readlines()
 
     return [x.strip() for x in backend_swhids_order]
 
 
 def test_lazybfs_policy(
     live_server, aiosession, event_loop, source_tree_policy, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
     policy = LazyBFS(source_tree_policy, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     assert (
         backend_swhids_requests[0]
         == "swh:1:dir:fe8cd7076bef324eb8865f818ef08617879022ce"
     )
 
     # the second request must contain 3 SWHIDs related to directories and one content
     dir_count, cnt_count = 0, 0
     for swhid in backend_swhids_requests[1:5]:
         if CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY:
             dir_count += 1
         else:
             cnt_count += 1
 
     assert dir_count == 3
     assert cnt_count == 1
 
     # the last swhid must be a content related to the unknown directory
     # "sample-folder-policy/toexclude"
     assert (
         backend_swhids_requests[5]
         == "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119"
     )
 
 
 def test_directory_priority_policy(
     live_server, aiosession, event_loop, source_tree_policy, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
     policy = DirectoryPriority(source_tree_policy, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     for swhid in backend_swhids_requests[0:4]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY
 
     for swhid in backend_swhids_requests[5:]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.CONTENT
 
 
 def test_file_priority_policy(
     live_server, aiosession, event_loop, source_tree_policy, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
     policy = FilePriority(source_tree_policy, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     for swhid in backend_swhids_requests[0:4]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.CONTENT
 
     for swhid in backend_swhids_requests[5:]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY
 
 
 def test_greedy_bfs_policy(
     live_server, event_loop, aiosession, big_source_tree, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(big_source_tree, nodes_data, {"known"})
     policy = GreedyBFS(big_source_tree, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     last_swhid = backend_swhids_requests[-1]
     assert CoreSWHID.from_string(last_swhid).object_type == ObjectType.CONTENT
 
 
 @pytest.mark.asyncio
 async def test_greedy_bfs_get_nodes_chunks(live_server, aiosession, big_source_tree):
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(big_source_tree, nodes_data, {"known"})
     policy = GreedyBFS(big_source_tree, nodes_data)
+    client = Client(api_url, aiosession)
     chunks = [
         n_chunk
         async for n_chunk in policy.get_nodes_chunks(
-            aiosession, api_url, source_size(big_source_tree)
+            client, source_size(big_source_tree)
         )
     ]
     assert len(chunks) == 2
     assert chunks[1][-1].object_type == CONTENT
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
index 903ed0b..42ae86c 100644
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -1,84 +1,96 @@
 # Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from flask import url_for
 import pytest
 
-from swh.scanner.data import MerkleNodeInfo
+from swh.scanner.data import MerkleNodeInfo, init_merkle_node_info
 from swh.scanner.policy import DirectoryPriority, FilePriority, LazyBFS, QueryAll
 from swh.scanner.scanner import get_policy_obj, run
 
 from .data import unknown_swhids
 
 
 @pytest.mark.options(debug=False)
 def test_app(app):
     assert not app.debug
 
 
 def test_get_policy_obj_auto(source_tree, nodes_data):
     assert isinstance(get_policy_obj(source_tree, nodes_data, "auto"), QueryAll)
 
 
 def test_get_policy_obj_bfs(big_source_tree, nodes_data):
     # check that the policy object is the LazyBFS if the source tree contains more than
     # 1000 nodes
     assert isinstance(get_policy_obj(big_source_tree, nodes_data, "auto"), LazyBFS)
 
 
 def test_scanner_result_bfs(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = LazyBFS(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True
 
 
 def test_scanner_result_file_priority(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = FilePriority(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True
 
 
 def test_scanner_result_directory_priority(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = DirectoryPriority(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True
 
 
 def test_scanner_result_query_all(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = QueryAll(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True