diff --git a/requirements-swh.txt b/requirements-swh.txt index bb991b2..b05e153 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ # Add here internal Software Heritage dependencies, one per line. swh.core >= 0.3 -swh.model >= 1.0.0 +swh.model >= 2.6.4 diff --git a/swh/scanner/backend.py b/swh/scanner/backend.py index 6caa2c3..95169ce 100644 --- a/swh/scanner/backend.py +++ b/swh/scanner/backend.py @@ -1,41 +1,40 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from flask import Flask, request from .db import Db from .exceptions import LargePayloadExc LIMIT = 1000 def create_app(db: Db): """Backend for swh-scanner, implementing the /known endpoint of the - Software Heritage Web API""" + Software Heritage Web API""" app = Flask(__name__) @app.route("/api/1/known/", methods=["POST"]) def known(): swhids = request.get_json() if len(swhids) > LIMIT: raise LargePayloadExc( f"The maximum number of SWHIDs this endpoint can receive is {LIMIT}" ) cur = db.conn.cursor() res = {swhid: {"known": db.known(swhid, cur)} for swhid in swhids} cur.close() return res return app def run(host: str, port: int, db: Db): - """Serve the local database - """ + """Serve the local database""" app = create_app(db) app.run(host, port, debug=True) diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py index 8f039bb..bf69213 100644 --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -1,243 +1,241 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import os from typing import Any, Dict, Optional import click from importlib_metadata import version import yaml from swh.core import config from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group from .exceptions import DBError # Config for the "serve" option BACKEND_DEFAULT_PORT = 5011 # All generic config code should reside in swh.core.config CONFIG_ENVVAR = "SWH_CONFIG_FILE" DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml") DEFAULT_CONFIG: Dict[str, Any] = { "web-api": { "url": "https://archive.softwareheritage.org/api/1/", "auth-token": None, } } CONFIG_FILE_HELP = f"""Configuration file: \b The CLI option or the environment variable will fail if invalid. CLI option is checked first. Then, environment variable {CONFIG_ENVVAR} is checked. Then, if cannot load the default path, a set of default values are used. Default config path is {DEFAULT_CONFIG_PATH}. Default config values are: \b {yaml.dump(DEFAULT_CONFIG)}""" SCANNER_HELP = f"""Software Heritage Scanner tools. 
{CONFIG_FILE_HELP}""" def setup_config(ctx, api_url): config = ctx.obj["config"] if api_url: if not api_url.endswith("/"): api_url += "/" config["web-api"]["url"] = api_url return config @swh_cli_group.group( name="scanner", context_settings=CONTEXT_SETTINGS, help=SCANNER_HELP, ) @click.option( "-C", "--config-file", default=None, type=click.Path(exists=False, dir_okay=False, path_type=str), help="""YAML configuration file""", ) @click.version_option( version=version("swh.scanner"), prog_name="swh.scanner", ) @click.pass_context def scanner(ctx, config_file: Optional[str]): env_config_path = os.environ.get(CONFIG_ENVVAR) # read_raw_config do not fail if file does not exist, so check it beforehand # while enforcing loading priority if config_file: if not config.config_exists(config_file): raise click.BadParameter( f"File '{config_file}' cannot be opened.", param_hint="--config-file" ) elif env_config_path: if not config.config_exists(env_config_path): raise click.BadParameter( f"File '{env_config_path}' cannot be opened.", param_hint=CONFIG_ENVVAR ) config_file = env_config_path elif config.config_exists(DEFAULT_CONFIG_PATH): config_file = DEFAULT_CONFIG_PATH conf = DEFAULT_CONFIG if config_file is not None: conf = config.read_raw_config(config.config_basepath(config_file)) conf = config.merge_configs(DEFAULT_CONFIG, conf) ctx.ensure_object(dict) ctx.obj["config"] = conf @scanner.command(name="scan") @click.argument("root_path", required=True, type=click.Path(exists=True)) @click.option( "-u", "--api-url", default=None, metavar="API_URL", show_default=True, help="URL for the api request", ) @click.option( "--exclude", "-x", "patterns", metavar="PATTERN", multiple=True, help="Exclude directories using glob patterns \ (e.g., ``*.git`` to exclude all .git directories)", ) @click.option( "-f", "--output-format", "out_fmt", default="text", show_default=True, type=click.Choice(["text", "json", "ndjson", "sunburst"], case_sensitive=False), help="The output format", ) @click.option( "-i", "--interactive", is_flag=True, help="Show the result in a dashboard" ) @click.pass_context def scan(ctx, root_path, api_url, patterns, out_fmt, interactive): """Scan a source code project to discover files and directories already present in the archive""" import swh.scanner.scanner as scanner config = setup_config(ctx, api_url) scanner.scan(config, root_path, patterns, out_fmt, interactive) @scanner.group("db", help="Manage local knowledge base for swh-scanner") @click.pass_context def db(ctx): pass @db.command("import") @click.option( "-i", "--input", "input_file", metavar="INPUT_FILE", required=True, type=click.File("r"), help="A file containing SWHIDs", ) @click.option( "-o", "--output", "output_file_db", metavar="OUTPUT_DB_FILE", required=True, show_default=True, help="The name of the generated sqlite database", ) @click.option( "-s", "--chunk-size", "chunk_size", default="10000", metavar="SIZE", show_default=True, type=int, help="The chunk size ", ) @click.pass_context def import_(ctx, chunk_size, input_file, output_file_db): - """Create SQLite database of known SWHIDs from a textual list of SWHIDs - - """ + """Create SQLite database of known SWHIDs from a textual list of SWHIDs""" from .db import Db db = Db(output_file_db) cur = db.conn.cursor() try: db.create_from(input_file, chunk_size, cur) db.close() except DBError as e: ctx.fail("Failed to import SWHIDs into database: {0}".format(e)) @db.command("serve") @click.option( "-h", "--host", metavar="HOST", default="127.0.0.1", show_default=True, help="The 
host of the API server", ) @click.option( "-p", "--port", metavar="PORT", default=f"{BACKEND_DEFAULT_PORT}", show_default=True, help="The port of the API server", ) @click.option( "-f", "--db-file", "db_file", metavar="DB_FILE", default="SWHID_DB.sqlite", show_default=True, type=click.Path(exists=True), help="An sqlite database file (it can be generated with: 'swh scanner db import')", ) @click.pass_context def serve(ctx, host, port, db_file): """Start an API service using the sqlite database generated with the "db import" option.""" import swh.scanner.backend as backend from .db import Db db = Db(db_file) backend.run(host, port, db) db.close() def main(): return scanner(auto_envvar_prefix="SWH_SCANNER") if __name__ == "__main__": main() diff --git a/swh/scanner/dashboard/dashboard.py b/swh/scanner/dashboard/dashboard.py index 1743401..e9597fc 100644 --- a/swh/scanner/dashboard/dashboard.py +++ b/swh/scanner/dashboard/dashboard.py @@ -1,101 +1,103 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import dash from dash.dependencies import Input, Output import dash_bootstrap_components as dbc import dash_core_components as dcc import dash_html_components as html import plotly.graph_objects as go -from ..model import Tree +from swh.model.from_disk import Directory +from ..data import MerkleNodeInfo, get_content_from -def generate_table_body(dir_path: Path, source: Tree): + +def generate_table_body( + dir_path: bytes, source_tree: Directory, nodes_data: MerkleNodeInfo +): """ Generate the data_table from the path taken from the chart. For each file builds the html table rows showing the known status, a local link to the file and the relative SoftWare Heritage persistent IDentifier (SWHID). 
""" + contents = get_content_from(dir_path, source_tree, nodes_data) data = [] - for file_info in source.get_files_from_dir(dir_path): - for file_path, attr in file_info.items(): - file_path = Path(file_path) - file_name = file_path.parts[len(file_path.parts) - 1] - data.append( - html.Tr( - [ - html.Td("✔" if attr["known"] else ""), - html.Td( - html.A(file_name, href="file://" + str(file_path.resolve())) - ), - html.Td(attr["swhid"]), - ] - ) + for cnt, attr in contents.items(): + file_path = Path(cnt.decode()) + file_name = file_path.parts[len(file_path.parts) - 1] + full_file_path = Path(Path(dir_path.decode()), file_path) + data.append( + html.Tr( + [ + html.Td("✔" if attr["known"] else ""), + html.Td(html.A(file_name, href="file://" + str(full_file_path))), + html.Td(attr["swhid"]), + ] ) + ) return [html.Tbody(data)] -def run_app(graph_obj: go, source: Tree): +def run_app(graph_obj: go, source_tree: Directory, nodes_data: MerkleNodeInfo): app = dash.Dash(__name__) fig = go.Figure().add_trace(graph_obj) fig.update_layout(height=800,) table_header = [ html.Thead(html.Tr([html.Th("KNOWN"), html.Th("FILE NAME"), html.Th("SWHID")])) ] app.layout = html.Div( [ html.Div( [ html.Div( [dcc.Graph(id="sunburst_chart", figure=fig),], className="col", ), html.Div( [ html.H3(id="directory_title"), dbc.Table( id="files_table", hover=True, responsive=True, striped=True, ), ], className="col", ), ], className="row", ), ] ) @app.callback( [Output("files_table", "children"), Output("directory_title", "children")], [Input("sunburst_chart", "clickData")], ) def update_files_table(click_data): """ Callback that takes the input (directory path) from the chart and update the `files_table` children with the relative files. """ if click_data is not None: - raw_path = click_data["points"][0]["label"] - full_path = ( - source.path.joinpath(raw_path) - if raw_path != str(source.path) - else Path(raw_path) + full_path = click_data["points"][0]["label"] + return ( + table_header + + generate_table_body(full_path.encode(), source_tree, nodes_data), + full_path, ) - return table_header + generate_table_body(full_path, source), str(full_path) else: return "", "" app.run_server(debug=True, use_reloader=True) diff --git a/swh/scanner/data.py b/swh/scanner/data.py new file mode 100644 index 0000000..4db27f5 --- /dev/null +++ b/swh/scanner/data.py @@ -0,0 +1,107 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from pathlib import Path +from typing import Dict, Tuple + +from swh.model.exceptions import ValidationError +from swh.model.from_disk import Directory +from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID + + +class MerkleNodeInfo(dict): + """Store additional information about Merkle DAG nodes, using SWHIDs as keys""" + + def __setitem__(self, key, value): + """The keys must be valid valid Software Heritage Persistent Identifiers + while values must be dict. 
+ """ + if not isinstance(key, CoreSWHID): + raise ValidationError("keys must be valid SWHID(s)") + + if not isinstance(value, dict): + raise ValidationError(f"values must be dict, not {type(value)}") + + super(MerkleNodeInfo, self).__setitem__(key, value) + + +def get_directory_data( + root_path: str, + source_tree: Directory, + nodes_data: MerkleNodeInfo, + directory_data: Dict = {}, +) -> Dict[Path, dict]: + """Get content information for each directory inside source_tree. + + Returns: + A dictionary with a directory path as key and the relative + contents information as values. + """ + + def _get_directory_data( + source_tree: Directory, nodes_data: MerkleNodeInfo, directory_data: Dict + ): + directories = list( + filter( + lambda n: n.object_type == DIRECTORY, + map(lambda n: n[1], source_tree.items()), + ) + ) + for node in directories: + directory_info = directory_content(node, nodes_data) + rel_path = Path(node.data["path"].decode()).relative_to(Path(root_path)) + directory_data[rel_path] = directory_info + if has_dirs(node): + _get_directory_data(node, nodes_data, directory_data) + + _get_directory_data(source_tree, nodes_data, directory_data) + return directory_data + + +def directory_content(node: Directory, nodes_data: MerkleNodeInfo) -> Tuple[int, int]: + """Count known contents inside the given directory. + + Returns: + A tuple with the total number of contents inside the directory and the number + of known contents. + """ + known_cnt = 0 + node_contents = list( + filter(lambda n: n.object_type == CONTENT, map(lambda n: n[1], node.items())) + ) + for sub_node in node_contents: + if nodes_data[sub_node.swhid()]["known"]: + known_cnt += 1 + + return (len(node_contents), known_cnt) + + +def has_dirs(node: Directory) -> bool: + """Check if the given directory has other directories inside.""" + for _, sub_node in node.items(): + if isinstance(sub_node, Directory): + return True + return False + + +def get_content_from( + node_path: bytes, source_tree: Directory, nodes_data: MerkleNodeInfo +) -> Dict[bytes, dict]: + """Get content information from the given directory node.""" + # root in model.from_disk.Directory should be accessed with b"" + directory = source_tree[node_path if node_path != source_tree.data["path"] else b""] + node_contents = list( + filter( + lambda n: n.object_type == CONTENT, map(lambda n: n[1], directory.items()) + ) + ) + files_data = {} + for node in node_contents: + node_info = nodes_data[node.swhid()] + node_info["swhid"] = str(node.swhid()) + path_name = "path" if "path" in node.data.keys() else "data" + files_data[node.data[path_name]] = node_info + + return files_data diff --git a/swh/scanner/model.py b/swh/scanner/model.py deleted file mode 100644 index 6cf7ef6..0000000 --- a/swh/scanner/model.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright (C) 2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from __future__ import annotations - -from enum import Enum -import json -from pathlib import Path -import sys -from typing import Any, Dict, Iterator, List, Tuple - -import ndjson - -from swh.model.identifiers import CONTENT, DIRECTORY - -from .exceptions import InvalidDirectoryPath, InvalidObjectType -from .plot import generate_sunburst, offline_plot - - -class Color(Enum): - blue = "\033[94m" - green = "\033[92m" - red = "\033[91m" - end = "\033[0m" - - -def colorize(text: str, 
color: Color): - return color.value + text + Color.end.value - - -class Tree: - """Representation of a file system structure - """ - - def __init__(self, path: Path, father: Tree = None): - self.father = father - self.path = path - self.otype = DIRECTORY if path.is_dir() else CONTENT - self.swhid = "" - self.known = False - self.children: Dict[Path, Tree] = {} - - def add_node(self, path: Path, swhid: str, known: bool) -> None: - """Recursively add a new path. - """ - relative_path = path.relative_to(self.path) - - if relative_path == Path("."): - self.swhid = swhid - self.known = known - return - - new_path = self.path.joinpath(relative_path.parts[0]) - if new_path not in self.children: - self.children[new_path] = Tree(new_path, self) - - self.children[new_path].add_node(path, swhid, known) - - def show(self, fmt) -> None: - """Show tree in different formats""" - if fmt == "json": - print(json.dumps(self.to_dict(), indent=4, sort_keys=True)) - - if fmt == "ndjson": - print( - ndjson.dumps( - {str(Path(k).relative_to(self.path)): v} - for node in self.iterate() - for k, v in node.attributes.items() - ) - ) - - elif fmt == "text": - isatty = sys.stdout.isatty() - root_dir = self.path.relative_to(self.path.parent) - print(colorize(str(root_dir), Color.blue) if isatty else str(root_dir)) - self.print_children(isatty) - - elif fmt == "sunburst": - root = self.path - directories = self.get_directories_info(root) - sunburst = generate_sunburst(directories, root) - offline_plot(sunburst) - - def print_children(self, isatty: bool, inc: int = 1) -> None: - for path, node in self.children.items(): - self.print_node(node, isatty, inc) - if node.children: - node.print_children(isatty, inc + 1) - - def print_node(self, node: Any, isatty: bool, inc: int) -> None: - rel_path = str(node.path.relative_to(self.path)) - begin = "│ " * inc - end = "/" if node.otype == DIRECTORY else "" - - if isatty: - if not node.known: - rel_path = colorize(rel_path, Color.red) - elif node.otype == DIRECTORY: - rel_path = colorize(rel_path, Color.blue) - elif node.otype == CONTENT: - rel_path = colorize(rel_path, Color.green) - - print(f"{begin}{rel_path}{end}") - - @property - def attributes(self) -> Dict[str, Dict[str, Any]]: - """ - Get the attributes of the current node grouped by the relative path. - - Returns: - a dictionary containing a path as key and its known/unknown status and the - SWHID as values. - - """ - return {str(self.path): {"swhid": self.swhid, "known": self.known,}} - - def to_dict(self) -> Dict[str, Dict[str, Any]]: - """ - Recursively flatten the current tree nodes into a dictionary. - - For example, if you have the following structure: - - .. code-block:: none - - root { - subdir: { - file.txt - } - } - - The generated dictionary will be: - - .. 
code-block:: none - - { - "root": { - "swhid": "...", - "known": True/False - } - "root/subdir": { - "swhid": "...", - "known": True/False - } - "root/subdir/file.txt": { - "swhid": "...", - "known": True/False - } - } - """ - return { - str(Path(k).relative_to(self.path)): v - for node in self.iterate() - for k, v in node.attributes.items() - } - - def iterate(self) -> Iterator[Tree]: - """ - Recursively iterate through the children of the current node - - """ - for _, child_node in self.children.items(): - yield child_node - if child_node.otype == DIRECTORY: - yield from child_node.iterate() - - def get_files_from_dir(self, dir_path: Path) -> List: - """ - Retrieve files information about a specific directory path - - Returns: - A list containing the files attributes present inside the directory given - in input - """ - - def get_files(node): - files = [] - for _, node in node.children.items(): - if node.otype == CONTENT: - files.append(node.attributes) - return files - - if dir_path == self.path: - return get_files(self) - else: - for node in self.iterate(): - if node.path == dir_path: - return get_files(node) - raise InvalidDirectoryPath( - "The directory provided doesn't match any stored directory" - ) - - def _get_sub_dirs_info(self, root, directories): - """Fills the directories given in input with the contents information - stored inside the directory child, only if they have contents. - """ - for path, child_node in self.children.items(): - if child_node.otype == DIRECTORY: - rel_path = path.relative_to(root) - contents_info = child_node.count_contents() - # checks the first element of the tuple - # (the number of contents in a directory) - # if it is equal to zero it means that there are no contents - # in that directory. - if not contents_info[0] == 0: - directories[rel_path] = contents_info - if child_node.has_dirs(): - child_node._get_sub_dirs_info(root, directories) - - def get_directories_info(self, root: Path) -> Dict[Path, Tuple[int, int]]: - """Get information about all directories under the given root. - - Returns: - A dictionary with a directory path as key and the relative - contents information (the result of count_contents) as values. - - """ - directories = {root: self.count_contents()} - self._get_sub_dirs_info(root, directories) - return directories - - def count_contents(self) -> Tuple[int, int]: - """Count how many contents are present inside a directory. - If a directory has a SWHID returns as it has all the contents. - - Returns: - A tuple with the total number of the contents and the number - of contents known (the ones that have a persistent identifier). 
- - """ - contents = 0 - discovered = 0 - - if not self.otype == DIRECTORY: - raise InvalidObjectType( - "Can't count contents of the object type: %s" % self.otype - ) - - if self.known: - # to identify a directory with all files/directories present - return (1, 1) - else: - for _, child_node in self.children.items(): - if child_node.otype == CONTENT: - contents += 1 - if child_node.known: - discovered += 1 - - return (contents, discovered) - - def has_dirs(self) -> bool: - """Checks if node has directories - """ - for _, child_node in self.children.items(): - if child_node.otype == DIRECTORY: - return True - return False diff --git a/swh/scanner/output.py b/swh/scanner/output.py new file mode 100644 index 0000000..86cd485 --- /dev/null +++ b/swh/scanner/output.py @@ -0,0 +1,108 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from enum import Enum +import json +import os +import sys +from typing import Any + +import ndjson + +from swh.model.from_disk import Directory + +from .dashboard.dashboard import run_app +from .data import MerkleNodeInfo, get_directory_data +from .plot import generate_sunburst, offline_plot + +DEFAULT_OUTPUT = "text" + + +class Color(Enum): + BLUE = "\033[94m" + GREEN = "\033[92m" + RED = "\033[91m" + END = "\033[0m" + + +def colorize(text: str, color: Color): + return color.value + text + Color.END.value + + +class Output: + def __init__( + self, root_path: str, nodes_data: MerkleNodeInfo, source_tree: Directory + ): + self.root_path = root_path + self.nodes_data = nodes_data + self.source_tree = source_tree + + def show(self, mode=DEFAULT_OUTPUT): + if mode == "text": + isatty = sys.stdout.isatty() + self.print_text(isatty) + elif mode == "sunburst": + directory_data = get_directory_data( + self.root_path, self.source_tree, self.nodes_data + ) + sunburst_figure = generate_sunburst(directory_data, self.root_path) + offline_plot(sunburst_figure) + elif mode == "interactive": + directory_data = get_directory_data( + self.root_path, self.source_tree, self.nodes_data + ) + sunburst_figure = generate_sunburst(directory_data, self.root_path) + run_app(sunburst_figure, self.source_tree, self.nodes_data) + elif mode == "json": + self.print_json() + elif mode == "ndjson": + self.print_ndjson() + else: + raise Exception(f"mode {mode} is not an output format") + + def get_path_name(self, node): + return "path" if "path" in node.data.keys() else "data" + + def print_text(self, isatty: bool) -> None: + def compute_level(node): + node_path = str(node.data[self.get_path_name(node)]).split("/") + source_path = str(self.source_tree.data["path"]).split("/") + return len(node_path) - len(source_path) + + for node in self.source_tree.iter_tree(): + self.print_node(node, isatty, compute_level(node)) + + def print_node(self, node: Any, isatty: bool, level: int) -> None: + rel_path = os.path.basename(node.data[self.get_path_name(node)]) + rel_path = rel_path.decode() + begin = "│ " * level + end = "/" if node.object_type == "directory" else "" + + if isatty: + if not self.nodes_data[str(node.swhid())]["known"]: + rel_path = colorize(rel_path, Color.RED) + elif node.object_type == "directory": + rel_path = colorize(rel_path, Color.BLUE) + elif node.object_type == "content": + rel_path = colorize(rel_path, Color.GREEN) + + print(f"{begin}{rel_path}{end}") + + def data_as_json(self): + 
json = {} + for node in self.source_tree.iter_tree(): + node_known = self.nodes_data[node.swhid()]["known"] + rel_path = os.path.relpath( + node.data[self.get_path_name(node)].decode(), + self.source_tree.data["path"].decode(), + ) + json[rel_path] = {"swhid": str(node.swhid()), "known": node_known} + return json + + def print_json(self): + print(json.dumps(self.data_as_json(), indent=4, sort_keys=True)) + + def print_ndjson(self): + print(ndjson.dumps({k: v} for k, v in self.data_as_json().items())) diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py index 5c8b21b..ab7b4a9 100644 --- a/swh/scanner/plot.py +++ b/swh/scanner/plot.py @@ -1,278 +1,268 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ The purpose of this module is to display and to interact with the result of the scanner contained in the model. The `sunburst` function generates a navigable sunburst chart from the directories information retrieved from the model. The chart displays for each directory the total number of files and the percentage of file known. The size of the directory is defined by the total number of contents whereas the color gradient is generated relying on the percentage of contents known. """ from pathlib import Path from typing import Dict, List, Tuple import numpy as np import pandas as pd import plotly.graph_objects as go from plotly.offline import offline def build_hierarchical_df( dirs_dataframe: pd.DataFrame, levels: List[str], metrics_columns: List[str], root_name: str, ) -> pd.DataFrame: """ - Build a hierarchy of levels for Sunburst or Treemap charts. + Build a hierarchy of levels for Sunburst or Treemap charts. - For each directory the new dataframe will have the following - information: + For each directory the new dataframe will have the following + information: - id: the directory name - parent: the parent directory of id - contents: the total number of contents of the directory id and - the relative subdirectories - known: the percentage of contents known relative to computed - 'contents' + id: the directory name + parent: the parent directory of id + contents: the total number of contents of the directory id and + the relative subdirectories + known: the percentage of contents known relative to computed + 'contents' - Example: - Given the following dataframe: + Example: + Given the following dataframe: - .. code-block:: none + .. code-block:: none - lev0 lev1 contents known - '' '' 20 2 //root - kernel kernel/subdirker 5 0 - telnet telnet/subdirtel 10 4 + lev0 lev1 contents known + '' '' 20 2 //root + kernel kernel/subdirker 5 0 + telnet telnet/subdirtel 10 4 - The output hierarchical dataframe will be like the following: + The output hierarchical dataframe will be like the following: - .. code-block:: none + .. code-block:: none - id parent contents known - 20 10.00 - kernel/subdirker kernel 5 0.00 - telnet/subdirtel telnet 10 40.00 - total 20 10.00 - kernel total 5 0.00 - telnet total 10 40.00 - total 35 17.14 + id parent contents known + 20 10.00 + kernel/subdirker kernel 5 0.00 + telnet/subdirtel telnet 10 40.00 + total 20 10.00 + kernel total 5 0.00 + telnet total 10 40.00 + total 35 17.14 - To create the hierarchical dataframe we need to iterate through - the dataframe given in input relying on the number of levels. 
+ To create the hierarchical dataframe we need to iterate through + the dataframe given in input relying on the number of levels. - Based on the previous example we have to do two iterations: + Based on the previous example we have to do two iterations: - iteration 1 - The generated dataframe 'df_tree' will be: + iteration 1 + The generated dataframe 'df_tree' will be: - .. code-block:: none + .. code-block:: none - id parent contents known - 20 10.0 - kernel/subdirker kernel 5 0.0 - telnet/subdirtel telnet 10 40.0 + id parent contents known + 20 10.0 + kernel/subdirker kernel 5 0.0 + telnet/subdirtel telnet 10 40.0 - iteration 2 - The generated dataframe 'df_tree' will be: + iteration 2 + The generated dataframe 'df_tree' will be: - .. code-block:: none + .. code-block:: none - id parent contents known - total 20 10.0 - kernel total 5 0.0 - telnet total 10 40.0 + id parent contents known + total 20 10.0 + kernel total 5 0.0 + telnet total 10 40.0 - Note that since we have reached the last level, the parent given - to the directory id is the directory root. + Note that since we have reached the last level, the parent given + to the directory id is the directory root. - The 'total' row il computed by adding the number of contents of the - dataframe given in input and the average of the contents known on - the total number of contents. + The 'total' row is computed by adding the number of contents of the + dataframe given in input and the average of the contents known on + the total number of contents. """ def compute_known_percentage(contents: pd.Series, known: pd.Series) -> pd.Series: """This function compute the percentage of known contents and generate - the new known column with the percentage values.
- It also assures that if there is no contents inside a directory - the percentage is zero + It also ensures that if there are no contents inside a directory + the percentage is zero """ known_values = [] for idx, content_val in enumerate(contents): if content_val == 0: known_values.append(0) else: percentage = known[idx] / contents[idx] * 100 known_values.append(percentage) return pd.Series(np.array(known_values)) complete_df = pd.DataFrame(columns=["id", "parent", "contents", "known"]) # revert the level order to start from the deepest levels = [level for level in reversed(levels)] contents_col = metrics_columns[0] known_col = metrics_columns[1] df_tree_list = [] for i, level in enumerate(levels): df_tree = pd.DataFrame(columns=["id", "parent", "contents", "known"]) dfg = dirs_dataframe.groupby(levels[i:]).sum() dfg = dfg.reset_index() df_tree["id"] = dfg[level].copy() if i < len(levels) - 1: # copy the parent directories (one level above) df_tree["parent"] = dfg[levels[i + 1]].copy() else: # last level reached df_tree["parent"] = root_name # copy the contents column df_tree["contents"] = dfg[contents_col] # compute the percentage relative to the contents df_tree["known"] = compute_known_percentage(dfg[contents_col], dfg[known_col]) df_tree_list.append(df_tree) complete_df = complete_df.append(df_tree_list, ignore_index=True) # create the main parent total_contents = dirs_dataframe[contents_col].sum() total_known = dirs_dataframe[known_col].sum() total_avg = total_known / total_contents * 100 total = pd.Series( dict(id=root_name, parent="", contents=total_contents, known=total_avg) ) complete_df = complete_df.append(total, ignore_index=True) return complete_df -def compute_max_depth(dirs_path: List[Path], root: Path) -> int: +def compute_max_depth(dirs_path: List[Path]) -> int: """Compute the maximum depth level of the given directory paths. - Example: for `var/log/kernel/` the depth level is 3 + Example: for `var/log/kernel/` the depth level is 3 """ max_depth = 0 for dir_path in dirs_path: - if dir_path == root: - continue - - dir_depth = len(dir_path.parts) + dir_depth = len( + dir_path.parts[1:] if dir_path.parts[0] == "/" else dir_path.parts + ) if dir_depth > max_depth: max_depth = dir_depth return max_depth def generate_df_from_dirs( - dirs: Dict[Path, Tuple[int, int]], columns: List[str], root: Path, max_depth: int, + dirs: Dict[Path, Tuple[int, int]], columns: List[str], max_depth: int, ) -> pd.DataFrame: """Generate a dataframe from the directories given in input. Example: given the following directories as input .. code-block:: python dirs = { '/var/log/': (23, 2), '/var/log/kernel': (5, 0), '/var/log/telnet': (10, 3) } The generated dataframe will be: ..
code-block:: none lev0 lev1 lev2 contents known 'var' 'var/log' '' 23 2 'var' 'var/log' 'var/log/kernel' 5 0 'var' 'var/log' 'var/log/telnet' 10 3 """ def get_parents(path: Path): parts = path.parts[1:] if path.parts[0] == "/" else path.parts for i in range(1, len(parts) + 1): yield "/".join(parts[0:i]) def get_dirs_array(): for dir_path, contents_info in dirs.items(): empty_lvl = max_depth - len(dir_path.parts) - if dir_path == root: - # ignore the root but store contents information - yield [""] * (max_depth) + list(contents_info) - else: - yield list(get_parents(dir_path)) + [""] * empty_lvl + list( - contents_info - ) + yield list(get_parents(dir_path)) + [""] * empty_lvl + list(contents_info) df = pd.DataFrame( np.array([dir_array for dir_array in get_dirs_array()]), columns=columns ) df["contents"] = pd.to_numeric(df["contents"]) df["known"] = pd.to_numeric(df["known"]) return df def generate_sunburst( directories: Dict[Path, Tuple[int, int]], root: Path ) -> go.Sunburst: - """Generate a sunburst chart from the directories given in input. - - """ - max_depth = compute_max_depth(list(directories.keys()), root) + """Generate a sunburst chart from the directories given in input.""" + max_depth = compute_max_depth(list(directories.keys())) metrics_columns = ["contents", "known"] levels_columns = ["lev" + str(i) for i in range(max_depth)] df_columns = levels_columns + metrics_columns - dirs_df = generate_df_from_dirs(directories, df_columns, root, max_depth) + dirs_df = generate_df_from_dirs(directories, df_columns, max_depth) hierarchical_df = build_hierarchical_df( dirs_df, levels_columns, metrics_columns, str(root) ) sunburst = go.Sunburst( labels=hierarchical_df["id"], parents=hierarchical_df["parent"], values=hierarchical_df["contents"], branchvalues="total", marker=dict( colors=hierarchical_df["known"], colorscale="matter", cmid=50, showscale=True, ), hovertemplate="""%{label}
<br>Files: %{value}
<br>Known: %{color:.2f}%""", name="", ) return sunburst def offline_plot(graph_object: go): - """Plot a graph object to an html file - """ + """Plot a graph object to an html file""" fig = go.Figure() fig.add_trace(graph_object) offline.plot(fig, filename="chart.html") diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py index 8ce2b0d..6b1e365 100644 --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -1,233 +1,140 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import asyncio import itertools -import os -from pathlib import Path -from typing import Any, Dict, Iterable, Iterator, List, Pattern, Tuple, Union +from typing import Any, Dict, Iterable, List import aiohttp -from swh.model.from_disk import ( - Content, - Directory, - accept_all_directories, - extract_regex_objs, -) -from swh.model.identifiers import CoreSWHID, ObjectType +from swh.model.cli import model_of_dir +from swh.model.from_disk import Directory +from swh.model.identifiers import DIRECTORY -from .dashboard.dashboard import run_app +from .data import MerkleNodeInfo from .exceptions import error_response -from .model import Tree -from .plot import generate_sunburst +from .output import Output + + +async def lazy_bfs( + source_tree: Directory, + data: MerkleNodeInfo, + session: aiohttp.ClientSession, + api_url: str, +): + """Traverse the source tree level by level, one /known/ request per level; + a directory already in the archive marks its whole subtree as known + instead of being expanded.""" + queue = [] + queue.append(source_tree) + + while queue: + swhids = [str(node.swhid()) for node in queue] + swhids_res = await swhids_discovery(swhids, session, api_url) + for node in queue.copy(): + queue.remove(node) + data[node.swhid()]["known"] = swhids_res[str(node.swhid())]["known"] + if node.object_type == DIRECTORY: + if not data[node.swhid()]["known"]: + children = [n[1] for n in list(node.items())] + queue.extend(children) + else: + for sub_node in node.iter_tree(dedup=False): + if sub_node == node: + continue + data[sub_node.swhid()]["known"] = True # type: ignore async def swhids_discovery( swhids: List[str], session: aiohttp.ClientSession, api_url: str, ) -> Dict[str, Dict[str, bool]]: """API Request to get information about the SoftWare Heritage persistent IDentifiers (SWHIDs) given in input.
- - It is also used as a `dir_filter` function when generating the directory - object from `swh.model.from_disk` - - Returns: - False if the directory has to be ignored, True otherwise - - """ - path = Path(path_name.decode() if isinstance(path_name, bytes) else path_name) - - for sre_pattern in exclude_patterns: - if sre_pattern.match(bytes(path)): - return False - return True - - -def get_subpaths( - path: Path, exclude_patterns: Iterable[Pattern[bytes]] -) -> Iterator[Tuple[Path, str]]: - """Find the SoftWare Heritage persistent IDentifier (SWHID) of - the directories and files under a given path. - - Args: - path: the root path - - Yields: - pairs of: path, the relative SWHID - - """ - - def swhid_of(path: Path) -> str: - if path.is_dir(): - if exclude_patterns: - - def dir_filter(dirpath: bytes, *args) -> bool: - return directory_filter(dirpath, exclude_patterns) - - else: - dir_filter = accept_all_directories # type: ignore - - obj = Directory.from_disk( - path=bytes(path), dir_filter=dir_filter - ).get_data() - - return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"])) - else: - obj = Content.from_file(path=bytes(path)).get_data() - return str( - CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"]) - ) - - dirpath, dnames, fnames = next(os.walk(path)) - for node in itertools.chain(dnames, fnames): - sub_path = Path(dirpath).joinpath(node) - yield (sub_path, swhid_of(sub_path)) - - -async def parse_path( - path: Path, - session: aiohttp.ClientSession, - api_url: str, - exclude_patterns: Iterable[Pattern[bytes]], -) -> Iterator[Tuple[str, str, bool]]: - """Check if the sub paths of the given path are present in the - archive or not. - - Args: - path: the source path - api_url: url for the API request - - Returns: - a map containing tuples with: a subpath of the given path, - the SWHID of the subpath and the result of the api call - - """ - parsed_paths = dict(get_subpaths(path, exclude_patterns)) - parsed_swhids = await swhids_discovery( - list(parsed_paths.values()), session, api_url - ) - - def unpack(tup): - subpath, swhid = tup - return (subpath, swhid, parsed_swhids[swhid]["known"]) - - return map(unpack, parsed_paths.items()) - - async def run( - config: Dict[str, Any], - root: str, - source_tree: Tree, - exclude_patterns: Iterable[Pattern[bytes]], + config: Dict[str, Any], source_tree: Directory, nodes_data: MerkleNodeInfo ) -> None: """Start scanning from the given root. It fills the source tree with the path discovered. 
Args: root: the root path to scan api_url: url for the API request """ api_url = config["web-api"]["url"] - async def _scan(root, session, api_url, source_tree, exclude_patterns): - for path, obj_swhid, known in await parse_path( - root, session, api_url, exclude_patterns - ): - obj_type = CoreSWHID.from_string(obj_swhid).object_type - - if obj_type == ObjectType.CONTENT: - source_tree.add_node(path, obj_swhid, known) - elif obj_type == ObjectType.DIRECTORY and directory_filter( - path, exclude_patterns - ): - source_tree.add_node(path, obj_swhid, known) - if not known: - await _scan(path, session, api_url, source_tree, exclude_patterns) - if config["web-api"]["auth-token"]: headers = {"Authorization": f"Bearer {config['web-api']['auth-token']}"} else: headers = {} + for node in source_tree.iter_tree(): + nodes_data[node.swhid()] = {} # type: ignore + async with aiohttp.ClientSession(headers=headers, trust_env=True) as session: - await _scan(root, session, api_url, source_tree, exclude_patterns) + await lazy_bfs(source_tree, nodes_data, session, api_url) def scan( config: Dict[str, Any], root_path: str, exclude_patterns: Iterable[str], out_fmt: str, interactive: bool, ): """Scan a source code project to discover files and directories already present in the archive""" - converted_patterns = set(pattern.encode() for pattern in exclude_patterns) - sre_patterns = set() - if exclude_patterns: - sre_patterns = { - reg_obj - for reg_obj in extract_regex_objs(root_path.encode(), converted_patterns) - } - - source_tree = Tree(Path(root_path)) + converted_patterns = [pattern.encode() for pattern in exclude_patterns] + source_tree = model_of_dir(root_path.encode(), converted_patterns) + nodes_data = MerkleNodeInfo() + loop = asyncio.get_event_loop() - loop.run_until_complete(run(config, root_path, source_tree, sre_patterns)) + loop.run_until_complete(run(config, source_tree, nodes_data)) + out = Output(root_path, nodes_data, source_tree) if interactive: - root = Path(root_path) - directories = source_tree.get_directories_info(root) - figure = generate_sunburst(directories, root) - run_app(figure, source_tree) + out.show("interactive") else: - source_tree.show(out_fmt) + out.show(out_fmt) diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py index 4430257..7b91f16 100644 --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -1,158 +1,109 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import asyncio import os from pathlib import Path import shutil import aiohttp from aioresponses import aioresponses # type: ignore import pytest -from swh.model.cli import swhid_of_dir, swhid_of_file -from swh.scanner.model import Tree +from swh.model.cli import model_of_dir +from swh.scanner.data import MerkleNodeInfo from .data import present_swhids from .flask_api import create_app @pytest.fixture def mock_aioresponse(): with aioresponses() as m: yield m @pytest.fixture def event_loop(): """Fixture that generate an asyncio event loop.""" loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) yield loop loop.close() @pytest.fixture async def aiosession(): """Fixture that generate an aiohttp Client Session.""" session = aiohttp.ClientSession() yield session session.detach() @pytest.fixture(scope="function") -def temp_folder(tmp_path): - """Fixture that generates a temporary 
folder with the following - structure: - - .. code-block:: python - - root = { - subdir: { - subsubdir - filesample.txt - filesample2.txt - } - subdir2 - subfile.txt - } - """ - root = tmp_path - subdir = root / "subdir" - subdir.mkdir() - subsubdir = subdir / "subsubdir" - subsubdir.mkdir() - subdir2 = root / "subdir2" - subdir2.mkdir() - subfile = root / "subfile.txt" - subfile.touch() - filesample = subdir / "filesample.txt" - filesample.touch() - filesample2 = subdir / "filesample2.txt" - filesample2.touch() - - avail_path = { - subdir: str(swhid_of_dir(bytes(subdir))), - subsubdir: str(swhid_of_dir(bytes(subsubdir))), - subdir2: str(swhid_of_dir(bytes(subdir2))), - subfile: str(swhid_of_file(bytes(subfile))), - filesample: str(swhid_of_file(bytes(filesample))), - filesample2: str(swhid_of_file(bytes(filesample2))), - } - - return { - "root": root, - "paths": avail_path, - "filesample": filesample, - "filesample2": filesample2, - "subsubdir": subsubdir, - "subdir": subdir, - } +def test_sample_folder(datadir, tmp_path): + """Location of the "data" folder""" + archive_path = Path(os.path.join(datadir, "sample-folder.tgz")) + assert archive_path.exists() + shutil.unpack_archive(archive_path, extract_dir=tmp_path) + test_sample_folder = Path(os.path.join(tmp_path, "sample-folder")) + assert test_sample_folder.exists() + return test_sample_folder @pytest.fixture(scope="function") -def example_tree(temp_folder): - """Fixture that generate a Tree with the root present in the - session fixture "temp_folder". +def source_tree(test_sample_folder): + """Generate a model.from_disk.Directory object from the test sample + folder """ - example_tree = Tree(temp_folder["root"]) - assert example_tree.path == temp_folder["root"] - - return example_tree + return model_of_dir(str(test_sample_folder).encode()) @pytest.fixture(scope="function") -def example_dirs(example_tree, temp_folder): - """ - Fixture that fill the fixture example_tree with the values contained in - the fixture temp_folder and returns the directories information of the - filled example_tree. 
- +def source_tree_dirs(source_tree): + """Returns a list of all directories contained inside the test sample + folder """ - root = temp_folder["root"] - filesample_path = temp_folder["filesample"] - filesample2_path = temp_folder["filesample2"] - subsubdir_path = temp_folder["subsubdir"] - known_paths = [filesample_path, filesample2_path, subsubdir_path] + root = source_tree.data["path"] + return list( + map( + lambda n: Path(n.data["path"].decode()).relative_to(Path(root.decode())), + filter( + lambda n: n.object_type == "directory" + and not n.data["path"] == source_tree.data["path"], + source_tree.iter_tree(dedup=False), + ), + ) + ) - for path, swhid in temp_folder["paths"].items(): - if path in known_paths: - example_tree.add_node(path, swhid, True) - else: - example_tree.add_node(path, swhid, False) - return example_tree.get_directories_info(root) - - -@pytest.fixture -def test_sample_folder(datadir, tmp_path): - """Location of the "data" folder """ - archive_path = Path(os.path.join(datadir, "sample-folder.tgz")) - assert archive_path.exists() - shutil.unpack_archive(archive_path, extract_dir=tmp_path) - test_sample_folder = Path(os.path.join(tmp_path, "sample-folder")) - assert test_sample_folder.exists() - return test_sample_folder +@pytest.fixture(scope="function") +def nodes_data(source_tree): + """Mock the known status of files/dirs in test_sample_folder""" + nodes_data = MerkleNodeInfo() + for node in source_tree.iter_tree(): + nodes_data[node.swhid()] = {"known": True} + return nodes_data @pytest.fixture def test_swhids_sample(tmp_path): """Create and return the opened "swhids_sample" file, - filled with present swhids present in data.py + filled with the present swhids from data.py """ test_swhids_sample = Path(os.path.join(tmp_path, "swhids_sample.txt")) with open(test_swhids_sample, "w") as f: f.write("\n".join(swhid for swhid in present_swhids)) assert test_swhids_sample.exists() return open(test_swhids_sample, "r") @pytest.fixture(scope="session") def app(): """Flask backend API (used by live_server).""" app = create_app() return app diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py index 541a16a..dd805d3 100644 --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -1,21 +1,26 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information correct_api_response = { "swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112": {"known": False}, "swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140": {"known": True}, "swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904": {"known": True}, } -# present SWHIDs inside /data/sample-folder present_swhids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] +# these SWHIDs are considered unknown by the fake backend (scanner.tests.flask_api) +unknown_swhids = [ + "swh:1:dir:0a7b61ef5780b03aa274d11069564980246445ce", # root directory + "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119", # toexclude/example.txt + "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ +] to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326" diff --git a/swh/scanner/tests/flask_api.py
b/swh/scanner/tests/flask_api.py index cdcd212..8ab3196 100644 --- a/swh/scanner/tests/flask_api.py +++ b/swh/scanner/tests/flask_api.py @@ -1,36 +1,36 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from flask import Flask, request from swh.scanner.exceptions import LargePayloadExc -from .data import present_swhids +from .data import unknown_swhids def create_app(): app = Flask(__name__) @app.route("/") def index(): return "SWH scanner API" @app.route("/known/", methods=["POST"]) def known(): swhids = request.get_json() if len(swhids) > 900: raise LargePayloadExc( "The maximum number of SWHIDs this endpoint can receive is 900" ) res = {swhid: {"known": False} for swhid in swhids} for swhid in swhids: - if swhid in present_swhids: + if swhid not in unknown_swhids: res[swhid]["known"] = True return res return app diff --git a/swh/scanner/tests/test_dashboard.py b/swh/scanner/tests/test_dashboard.py index ab7bfd5..93c7663 100644 --- a/swh/scanner/tests/test_dashboard.py +++ b/swh/scanner/tests/test_dashboard.py @@ -1,51 +1,46 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import dash_html_components as html +from swh.model.identifiers import CoreSWHID, ObjectType from swh.scanner.dashboard.dashboard import generate_table_body +from swh.scanner.data import MerkleNodeInfo -def test_generate_table_body(example_tree, temp_folder): - subdir_path = temp_folder["subdir"] +def test_generate_table_body(source_tree): + chart_path = b"/bar/barfoo" + dir_path = source_tree[b"/bar/barfoo"].data["path"].decode() + nodes_data = MerkleNodeInfo() + # CoreSWHID of 'another-quote.org' + known_cnt_swhid = CoreSWHID( + object_type=ObjectType.CONTENT, + object_id=b"\x136\x93\xb1%\xba\xd2\xb4\xac1\x855\xb8I\x01\xeb\xb1\xf6\xb68", + ) + nodes_data[known_cnt_swhid] = {"known": True} - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, True) - - generated_body = generate_table_body(subdir_path, example_tree) + generated_body = generate_table_body(chart_path, source_tree, nodes_data) expected_body = [ html.Tbody( [ html.Tr( [ html.Td("✔"), html.Td( html.A( - children="filesample.txt", - href=f"file://{subdir_path}/filesample.txt", - ) - ), - html.Td("swh:1:cnt:e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"), - ] - ), - html.Tr( - [ - html.Td("✔"), - html.Td( - html.A( - children="filesample2.txt", - href=f"file://{subdir_path}/filesample2.txt", + children="another-quote.org", + href=f"file://{dir_path}/another-quote.org", ) ), - html.Td("swh:1:cnt:e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"), + html.Td("swh:1:cnt:133693b125bad2b4ac318535b84901ebb1f6b638"), ] ), ] ) ] # workaround: dash_html_component.__eq__ checks for object identity only assert str(generated_body) == str(expected_body) diff --git a/swh/scanner/tests/test_data.py b/swh/scanner/tests/test_data.py new file mode 100644 index 0000000..4a29751 --- /dev/null +++ b/swh/scanner/tests/test_data.py @@ -0,0 +1,44 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file 
for more information + +from pathlib import Path + +import pytest + +from swh.model.exceptions import ValidationError +from swh.scanner.data import ( + MerkleNodeInfo, + directory_content, + get_directory_data, + has_dirs, +) + + +def test_merkle_node_data_wrong_args(): + nodes_data = MerkleNodeInfo() + + with pytest.raises(ValidationError): + nodes_data["wrong key"] = {"known": True} + + with pytest.raises(ValidationError): + nodes_data["swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112"] = "wrong value" + + +def test_get_directory_data(source_tree, nodes_data): + root = Path(source_tree.data["path"].decode()) + dirs_data = get_directory_data(root, source_tree, nodes_data) + + assert len(dirs_data) == 5 + + +def test_directory_content(source_tree, nodes_data): + foo_dir = source_tree[b"foo"] + foo_content = directory_content(foo_dir, nodes_data) + assert foo_content[0] == 3 + assert foo_content[1] == 3 + + +def test_has_dirs(source_tree): + assert has_dirs(source_tree) diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py deleted file mode 100644 index 8266cc7..0000000 --- a/swh/scanner/tests/test_model.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (C) 2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - -from swh.scanner.exceptions import InvalidDirectoryPath - - -def test_tree_add_node(example_tree, temp_folder): - avail_paths = temp_folder["paths"].keys() - - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, False) - - for path, node in example_tree.children.items(): - assert path in avail_paths - if node.children: - for subpath, subnode in node.children.items(): - assert subpath in avail_paths - - -def test_to_json_no_one_present(example_tree, temp_folder): - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, False) - - result = example_tree.to_dict() - - assert len(result) == 6 - - for _, node_info in result.items(): - assert node_info["known"] is False - - -def test_get_json_tree_all_present(example_tree, temp_folder): - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, True) - - result = example_tree.to_dict() - - assert len(result) == 6 - - for _, node_info in result.items(): - assert node_info["known"] is True - - -def test_get_json_tree_only_one_present(example_tree, temp_folder): - filesample_path = temp_folder["filesample"] - - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, True if path == filesample_path else False) - - result = example_tree.to_dict() - - assert len(result) == 6 - - for path, node_attr in result.items(): - if path == "subdir/filesample.txt": - assert node_attr["known"] is True - else: - assert node_attr["known"] is False - - -def test_get_directories_info(example_tree, temp_folder): - root_path = temp_folder["root"] - filesample_path = temp_folder["filesample"] - filesample2_path = temp_folder["filesample2"] - subdir_path = temp_folder["subdir"].relative_to(root_path) - subsubdir_path = temp_folder["subsubdir"].relative_to(root_path) - - for path, swhid in temp_folder["paths"].items(): - if path == filesample_path or path == filesample2_path: - example_tree.add_node(path, swhid, True) - else: - example_tree.add_node(path, swhid, False) - - directories = 
example_tree.get_directories_info(example_tree.path) - - assert subsubdir_path not in directories - assert directories[subdir_path] == (2, 2) - - -def test_get_files_from_dir(example_tree, temp_folder): - subdir_path = temp_folder["subdir"] - - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, True) - - files = example_tree.get_files_from_dir(subdir_path) - assert len(files) == 2 - - -def test_get_files_source_path(example_tree, temp_folder): - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, True) - - files = example_tree.get_files_from_dir(example_tree.path) - assert len(files) == 1 - - -def test_get_files_from_dir_raise_exception(example_tree, temp_folder): - for path, swhid in temp_folder["paths"].items(): - example_tree.add_node(path, swhid, True) - - with pytest.raises(InvalidDirectoryPath): - example_tree.get_files_from_dir("test/") diff --git a/swh/scanner/tests/test_plot.py b/swh/scanner/tests/test_plot.py index cec801b..ce91708 100644 --- a/swh/scanner/tests/test_plot.py +++ b/swh/scanner/tests/test_plot.py @@ -1,57 +1,68 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from pathlib import Path + +from swh.scanner.data import get_directory_data from swh.scanner.plot import ( build_hierarchical_df, compute_max_depth, generate_df_from_dirs, ) -def test_max_depth(temp_folder, example_dirs): - root = temp_folder["root"] - max_depth = compute_max_depth(example_dirs, root) +def test_max_depth(source_tree, source_tree_dirs): + dirs = [Path(dir_path) for dir_path in source_tree_dirs] + max_depth = compute_max_depth(dirs) assert max_depth == 2 -def test_generate_df_from_dirs(temp_folder, example_dirs): - root = temp_folder["root"] - max_depth = compute_max_depth(example_dirs, root) +def test_generate_df_from_dirs(source_tree, source_tree_dirs, nodes_data): + root = Path(source_tree.data["path"].decode()) + dirs = [Path(dir_path) for dir_path in source_tree_dirs] + dirs_data = get_directory_data(root, source_tree, nodes_data) + max_depth = compute_max_depth(dirs) metrics_columns = ["contents", "known"] levels_columns = ["lev" + str(i) for i in range(max_depth)] df_columns = levels_columns + metrics_columns - actual_df = generate_df_from_dirs(example_dirs, df_columns, root, max_depth) + actual_df = generate_df_from_dirs(dirs_data, df_columns, max_depth) + + expected_lev0_path = ["bar", "foo", "toexclude"] + expected_lev1_path = ["bar/barfoo", "bar/barfoo2"] + + df_lev0 = actual_df["lev0"].tolist() + df_lev1 = actual_df["lev1"].tolist() - # assert root is empty - assert actual_df["lev0"][0] == "" - assert actual_df["lev1"][0] == "" + for path in expected_lev0_path: + assert path in df_lev0 - # assert subdir has correct contents information - assert actual_df["contents"][1] == 2 - assert actual_df["known"][1] == 2 + for path in expected_lev1_path: + assert path in df_lev1 - # assert subsubdir has correct level information - assert actual_df["lev0"][2] == "subdir" - assert actual_df["lev1"][2] == "subdir/subsubdir" + assert actual_df["contents"].sum() == 6 + assert actual_df["known"].sum() == 6 -def test_build_hierarchical_df(temp_folder, example_dirs): - root = temp_folder["root"] - max_depth = compute_max_depth(example_dirs, root) +def 
diff --git a/swh/scanner/tests/test_plot.py b/swh/scanner/tests/test_plot.py
index cec801b..ce91708 100644
--- a/swh/scanner/tests/test_plot.py
+++ b/swh/scanner/tests/test_plot.py
@@ -1,57 +1,68 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from pathlib import Path
+
+from swh.scanner.data import get_directory_data
 from swh.scanner.plot import (
     build_hierarchical_df,
     compute_max_depth,
     generate_df_from_dirs,
 )
 
 
-def test_max_depth(temp_folder, example_dirs):
-    root = temp_folder["root"]
-    max_depth = compute_max_depth(example_dirs, root)
+def test_max_depth(source_tree, source_tree_dirs):
+    dirs = [Path(dir_path) for dir_path in source_tree_dirs]
+    max_depth = compute_max_depth(dirs)
     assert max_depth == 2
 
 
-def test_generate_df_from_dirs(temp_folder, example_dirs):
-    root = temp_folder["root"]
-    max_depth = compute_max_depth(example_dirs, root)
+def test_generate_df_from_dirs(source_tree, source_tree_dirs, nodes_data):
+    root = Path(source_tree.data["path"].decode())
+    dirs = [Path(dir_path) for dir_path in source_tree_dirs]
+    dirs_data = get_directory_data(root, source_tree, nodes_data)
+    max_depth = compute_max_depth(dirs)
     metrics_columns = ["contents", "known"]
     levels_columns = ["lev" + str(i) for i in range(max_depth)]
     df_columns = levels_columns + metrics_columns
 
-    actual_df = generate_df_from_dirs(example_dirs, df_columns, root, max_depth)
+    actual_df = generate_df_from_dirs(dirs_data, df_columns, max_depth)
+
+    expected_lev0_path = ["bar", "foo", "toexclude"]
+    expected_lev1_path = ["bar/barfoo", "bar/barfoo2"]
+
+    df_lev0 = actual_df["lev0"].tolist()
+    df_lev1 = actual_df["lev1"].tolist()
 
-    # assert root is empty
-    assert actual_df["lev0"][0] == ""
-    assert actual_df["lev1"][0] == ""
+    for path in expected_lev0_path:
+        assert path in df_lev0
 
-    # assert subdir has correct contents information
-    assert actual_df["contents"][1] == 2
-    assert actual_df["known"][1] == 2
+    for path in expected_lev1_path:
+        assert path in df_lev1
 
-    # assert subsubdir has correct level information
-    assert actual_df["lev0"][2] == "subdir"
-    assert actual_df["lev1"][2] == "subdir/subsubdir"
+    assert actual_df["contents"].sum() == 6
+    assert actual_df["known"].sum() == 6
 
 
-def test_build_hierarchical_df(temp_folder, example_dirs):
-    root = temp_folder["root"]
-    max_depth = compute_max_depth(example_dirs, root)
+def test_build_hierarchical_df(source_tree, source_tree_dirs, nodes_data):
+    root = Path(source_tree.data["path"].decode())
+    dirs = [Path(dir_path) for dir_path in source_tree_dirs]
+    dirs_data = get_directory_data(root, source_tree, nodes_data)
+    max_depth = compute_max_depth(dirs)
     metrics_columns = ["contents", "known"]
     levels_columns = ["lev" + str(i) for i in range(max_depth)]
     df_columns = levels_columns + metrics_columns
 
-    actual_df = generate_df_from_dirs(example_dirs, df_columns, root, max_depth)
+    actual_df = generate_df_from_dirs(dirs_data, df_columns, max_depth)
 
     actual_result = build_hierarchical_df(
         actual_df, levels_columns, metrics_columns, root
     )
 
-    assert actual_result["parent"][1] == "subdir"
-    assert actual_result["contents"][1] == 2
-    assert actual_result["id"][5] == root
-    assert actual_result["known"][5] == 75
+    assert actual_result["parent"][0] == "bar"
+    assert actual_result["parent"][1] == "foo"
+    assert actual_result["contents"][1] == 3
+    assert actual_result["id"][8] == root
+    assert actual_result["known"][8] == 100
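The plot assertions now check membership and column totals rather than fixed row positions, which reads as a guard against unstable row ordering in the generated dataframe. A hedged sketch of the construction these tests drive, assuming dirs_data maps directory paths to their (contents, known) counts as in test_data.py:

    from pathlib import Path

    from swh.scanner.plot import compute_max_depth, generate_df_from_dirs

    dirs = [Path(p) for p in dirs_data]    # directory paths, relative to root
    max_depth = compute_max_depth(dirs)    # deepest level; 2 in the fixture
    columns = ["lev" + str(i) for i in range(max_depth)] + ["contents", "known"]

    # One row per directory: its ancestry spread over lev0..levN plus the two
    # metric columns, e.g. "bar/barfoo" yields lev0="bar", lev1="bar/barfoo".
    df = generate_df_from_dirs(dirs_data, columns, max_depth)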
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
index f757033..8f0beeb 100644
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -1,121 +1,70 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 
 from flask import url_for
 import pytest
 
-from swh.model.exceptions import InvalidDirectoryPath
+from swh.scanner.data import MerkleNodeInfo
 from swh.scanner.exceptions import APIError
-from swh.scanner.model import Tree
-from swh.scanner.scanner import extract_regex_objs, get_subpaths, run, swhids_discovery
+from swh.scanner.scanner import run, swhids_discovery
 
-from .data import correct_api_response, present_swhids, to_exclude_swhid
+from .data import correct_api_response, unknown_swhids
 
 aio_url = "http://example.org/api/known/"
 
 
-def test_extract_regex_objs(temp_folder):
-    root_path = bytes(temp_folder["root"])
-
-    patterns = (bytes(temp_folder["subdir"]), b"/none")
-
-    sre_patterns = [reg_obj for reg_obj in extract_regex_objs(root_path, patterns)]
-    assert len(sre_patterns) == 2
-
-    patterns = (*patterns, b"/tmp")
-    with pytest.raises(InvalidDirectoryPath):
-        sre_patterns = [reg_obj for reg_obj in extract_regex_objs(root_path, patterns)]
-
-
 def test_scanner_correct_api_request(mock_aioresponse, event_loop, aiosession):
     mock_aioresponse.post(
         aio_url,
         status=200,
         content_type="application/json",
         body=json.dumps(correct_api_response),
     )
 
     actual_result = event_loop.run_until_complete(
         swhids_discovery([], aiosession, "http://example.org/api/")
     )
 
     assert correct_api_response == actual_result
 
 
 def test_scanner_raise_apierror(mock_aioresponse, event_loop, aiosession):
     mock_aioresponse.post(aio_url, content_type="application/json", status=413)
 
     with pytest.raises(APIError):
         event_loop.run_until_complete(
             swhids_discovery([], aiosession, "http://example.org/api/")
         )
 
 
 def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_server):
 
     api_url = url_for("index", _external=True)
     request = [
         "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901)
     ]  # /known/ is limited at 900
 
     with pytest.raises(APIError):
         event_loop.run_until_complete(swhids_discovery(request, aiosession, api_url))
 
 
-def test_scanner_get_subpaths(temp_folder):
-    root = temp_folder["root"]
-
-    actual_result = []
-    for subpath, swhid in get_subpaths(root, tuple()):
-        # also check if it's a symlink since pytest tmp_dir fixture create
-        # also a symlink to each directory inside the tmp_dir path
-        if subpath.is_dir() and not subpath.is_symlink():
-            actual_result.append((subpath, swhid))
-
-    assert len(actual_result) == 2
-
-
 @pytest.mark.options(debug=False)
 def test_app(app):
     assert not app.debug
 
 
-def test_scanner_result(live_server, event_loop, test_sample_folder):
+def test_scanner_result(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
-    source_tree = Tree(test_sample_folder)
-    event_loop.run_until_complete(run(config, test_sample_folder, source_tree, set()))
-
-    for child_node in source_tree.iterate():
-        node_info = list(child_node.attributes.values())[0]
-        if node_info["swhid"] in present_swhids:
-            assert node_info["known"] is True
+    nodes_data = MerkleNodeInfo()
+    event_loop.run_until_complete(run(config, source_tree, nodes_data))
+    for node in source_tree.iter_tree():
+        if str(node.swhid()) in unknown_swhids:
+            assert nodes_data[node.swhid()]["known"] is False
         else:
-            assert node_info["known"] is False
-
-
-def test_scanner_result_with_exclude_patterns(
-    live_server, event_loop, test_sample_folder
-):
-    api_url = url_for("index", _external=True)
-    config = {"web-api": {"url": api_url, "auth-token": None}}
-    to_exclude_dir = str(test_sample_folder) + "/toexclude"
-
-    patterns = (to_exclude_dir.encode(),)
-    exclude_pattern = {
-        reg_obj for reg_obj in extract_regex_objs(bytes(test_sample_folder), patterns)
-    }
-
-    source_tree = Tree(test_sample_folder)
-    event_loop.run_until_complete(
-        run(config, test_sample_folder, source_tree, exclude_pattern)
-    )
-
-    for child_node in source_tree.iterate():
-        node_info = list(child_node.attributes.values())[0]
-        assert node_info["swhid"] != to_exclude_swhid
+            assert nodes_data[node.swhid()]["known"] is True
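Taken together, the scanner entry point shrinks from run(config, root_path, source_tree, exclude_patterns) to run(config, source_tree, nodes_data), with results read back off the shared mapping instead of off tree nodes; the exclude-patterns test is dropped here along with the extract_regex_objs import. A minimal end-to-end sketch, assuming a hypothetical checkout path and the public archive URL; everything else is taken from the signatures exercised above:

    import asyncio

    from swh.model.from_disk import Directory
    from swh.scanner.data import MerkleNodeInfo
    from swh.scanner.scanner import run

    config = {
        "web-api": {
            "url": "https://archive.softwareheritage.org/api/1/",
            "auth-token": None,
        }
    }

    source_tree = Directory.from_disk(path=b"/tmp/example-project")  # made-up path
    nodes_data = MerkleNodeInfo()

    # run() queries the /known endpoint and fills nodes_data in place.
    asyncio.run(run(config, source_tree, nodes_data))

    for node in source_tree.iter_tree():
        status = nodes_data[node.swhid()]["known"]
        print(node.swhid(), "known" if status else "unknown")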