diff --git a/requirements-swh.txt b/requirements-swh.txt index 904e7ab..08665ea 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ # Add here internal Software Heritage dependencies, one per line. swh.core -swh.model >= 0.0.64 +swh.model >= 0.3.8 diff --git a/swh/scanner/dashboard/dashboard.py b/swh/scanner/dashboard/dashboard.py index 3c75d23..27db913 100644 --- a/swh/scanner/dashboard/dashboard.py +++ b/swh/scanner/dashboard/dashboard.py @@ -1,102 +1,101 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import PosixPath from ..model import Tree import plotly.graph_objects as go import dash import dash_core_components as dcc import dash_html_components as html import dash_bootstrap_components as dbc from dash.dependencies import Input, Output def generate_table_body(dir_path: PosixPath, source: Tree): """ Generate the data_table from the path taken from the chart. For each file builds the html table rows showing the known status, a local link to - the file and the relative Software Heritage Persistent Identifier. - + the file and the relative SoftWare Heritage persistent IDentifier (SWHID). """ data = [] for file_info in source.getFilesFromDir(dir_path): for file_path, attr in file_info.items(): file_path = PosixPath(file_path) file_name = file_path.parts[len(file_path.parts) - 1] data.append( html.Tr( [ html.Td("✔" if attr["known"] else ""), html.Td( html.A(file_name, href="file://" + str(file_path.resolve())) ), html.Td(attr["swhid"]), ] ) ) return [html.Tbody(data)] def run_app(graph_obj: go, source: Tree): app = dash.Dash(__name__) fig = go.Figure().add_trace(graph_obj) fig.update_layout(height=800,) table_header = [ html.Thead(html.Tr([html.Th("KNOWN"), html.Th("FILE NAME"), html.Th("SWHID")])) ] app.layout = html.Div( [ html.Div( [ html.Div( [dcc.Graph(id="sunburst_chart", figure=fig),], className="col", ), html.Div( [ html.H3(id="directory_title"), dbc.Table( id="files_table", hover=True, responsive=True, striped=True, ), ], className="col", ), ], className="row", ), ] ) @app.callback( [Output("files_table", "children"), Output("directory_title", "children")], [Input("sunburst_chart", "clickData")], ) def update_files_table(click_data): """ Callback that takes the input (directory path) from the chart and update the `files_table` children with the relative files. """ if click_data is not None: raw_path = click_data["points"][0]["label"] full_path = ( source.path.joinpath(raw_path) if raw_path != str(source.path) else PosixPath(raw_path) ) return table_header + generate_table_body(full_path, source), str(full_path) else: return "", "" app.run_server(debug=True, use_reloader=True) diff --git a/swh/scanner/model.py b/swh/scanner/model.py index 435ec3e..108e9e2 100644 --- a/swh/scanner/model.py +++ b/swh/scanner/model.py @@ -1,265 +1,265 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations import sys import json from pathlib import PosixPath from typing import Any, Dict, Tuple, Iterable, List from enum import Enum import ndjson from .plot import generate_sunburst, offline_plot from .exceptions import InvalidObjectType, InvalidDirectoryPath from swh.model.identifiers import DIRECTORY, CONTENT class Color(Enum): blue = "\033[94m" green = "\033[92m" red = "\033[91m" end = "\033[0m" def colorize(text: str, color: Color): return color.value + text + Color.end.value class Tree: """Representation of a file system structure """ def __init__(self, path: PosixPath, father: Tree = None): self.father = father self.path = path self.otype = DIRECTORY if path.is_dir() else CONTENT self.swhid = "" self.known = False self.children: Dict[PosixPath, Tree] = {} def addNode(self, path: PosixPath, swhid: str, known: bool) -> None: """Recursively add a new path. """ relative_path = path.relative_to(self.path) if relative_path == PosixPath("."): self.swhid = swhid self.known = known return new_path = self.path.joinpath(relative_path.parts[0]) if new_path not in self.children: self.children[new_path] = Tree(new_path, self) self.children[new_path].addNode(path, swhid, known) def show(self, format) -> None: """Show tree in different formats""" if format == "json": print(json.dumps(self.toDict(), indent=4, sort_keys=True)) if format == "ndjson": print(ndjson.dumps(dict_path for dict_path in self.__iterNodesAttr())) elif format == "text": isatty = sys.stdout.isatty() print(colorize(str(self.path), Color.blue) if isatty else str(self.path)) self.printChildren(isatty) elif format == "sunburst": root = self.path directories = self.getDirectoriesInfo(root) sunburst = generate_sunburst(directories, root) offline_plot(sunburst) def printChildren(self, isatty: bool, inc: int = 1) -> None: for path, node in self.children.items(): self.printNode(node, isatty, inc) if node.children: node.printChildren(isatty, inc + 1) def printNode(self, node: Any, isatty: bool, inc: int) -> None: rel_path = str(node.path.relative_to(self.path)) begin = "│ " * inc end = "/" if node.otype == DIRECTORY else "" if isatty: if not node.known: rel_path = colorize(rel_path, Color.red) elif node.otype == DIRECTORY: rel_path = colorize(rel_path, Color.blue) elif node.otype == CONTENT: rel_path = colorize(rel_path, Color.green) print(f"{begin}{rel_path}{end}") @property def attributes(self): """ Get the attributes of the current node grouped by the relative path. Returns: a dictionary containing a path as key and its known/unknown status and the - Software Heritage persistent identifier as values. + SWHID as values. """ return {str(self.path): {"swhid": self.swhid, "known": self.known,}} def toDict(self, dict_nodes={}) -> Dict[str, Dict[str, Dict]]: """ Recursively groups the current child nodes inside a dictionary. For example, if you have the following structure: .. code-block:: none root { subdir: { file.txt } } The generated dictionary will be: .. code-block:: none { "root": { "swhid": "...", "known": True/False } "root/subdir": { "swhid": "...", "known": True/False } "root/subdir/file.txt": { "swhid": "...", "known": True/False } } """ for node_dict in self.__iterNodesAttr(): dict_nodes.update(node_dict) return dict_nodes def iterate(self) -> Iterable[Tree]: """ Recursively iterate through the children of the current node """ for _, child_node in self.children.items(): yield child_node if child_node.otype == DIRECTORY: yield from child_node.iterate() def __iterNodesAttr(self) -> Iterable[Dict[str, Dict]]: """ Recursively iterate through the children of the current node returning an iterable of the children nodes attributes Yields: a dictionary containing a path with its known/unknown status and the - Software Heritage persistent identifier + SWHID """ for child_node in self.iterate(): yield child_node.attributes if child_node.otype == DIRECTORY: yield from child_node.__iterNodesAttr() def getFilesFromDir(self, dir_path: PosixPath) -> List: """ Retrieve files information about a specific directory path Returns: A list containing the files attributes present inside the directory given in input """ def getFiles(node): files = [] for _, node in node.children.items(): if node.otype == CONTENT: files.append(node.attributes) return files if dir_path == self.path: return getFiles(self) else: for node in self.iterate(): if node.path == dir_path: return getFiles(node) raise InvalidDirectoryPath( "The directory provided doesn't match any stored directory" ) def __getSubDirsInfo(self, root, directories): """Fills the directories given in input with the contents information stored inside the directory child, only if they have contents. """ for path, child_node in self.children.items(): if child_node.otype == DIRECTORY: rel_path = path.relative_to(root) contents_info = child_node.count_contents() # checks the first element of the tuple # (the number of contents in a directory) # if it is equal to zero it means that there are no contents # in that directory. if not contents_info[0] == 0: directories[rel_path] = contents_info if child_node.has_dirs(): child_node.__getSubDirsInfo(root, directories) def getDirectoriesInfo(self, root: PosixPath) -> Dict[PosixPath, Tuple[int, int]]: """Get information about all directories under the given root. Returns: A dictionary with a directory path as key and the relative contents information (the result of count_contents) as values. """ directories = {root: self.count_contents()} self.__getSubDirsInfo(root, directories) return directories def count_contents(self) -> Tuple[int, int]: """Count how many contents are present inside a directory. - If a directory has a pid returns as it has all the contents. + If a directory has a SWHID returns as it has all the contents. Returns: A tuple with the total number of the contents and the number of contents known (the ones that have a persistent identifier). """ contents = 0 discovered = 0 if not self.otype == DIRECTORY: raise InvalidObjectType( "Can't calculate contents of the " "object type: %s" % self.otype ) if self.known: # to identify a directory with all files/directories present return (1, 1) else: for _, child_node in self.children.items(): if child_node.otype == CONTENT: contents += 1 if child_node.known: discovered += 1 return (contents, discovered) def has_dirs(self) -> bool: """Checks if node has directories """ for _, child_node in self.children.items(): if child_node.otype == DIRECTORY: return True return False diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py index e1ff2bc..da316b4 100644 --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -1,180 +1,182 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import itertools import asyncio import aiohttp from typing import List, Dict, Tuple, Iterator, Union, Set, Any from pathlib import PosixPath from .exceptions import error_response from .model import Tree from swh.model.from_disk import Directory, Content, accept_all_directories from swh.model.identifiers import ( - persistent_identifier, - parse_persistent_identifier, + swhid, + parse_swhid, DIRECTORY, CONTENT, ) -async def pids_discovery( - pids: List[str], session: aiohttp.ClientSession, api_url: str, +async def swhids_discovery( + swhids: List[str], session: aiohttp.ClientSession, api_url: str, ) -> Dict[str, Dict[str, bool]]: - """API Request to get information about the persistent identifiers given in - input. + """API Request to get information about the SoftWare Heritage persistent + IDentifiers (SWHIDs) given in input. Args: - pids: a list of persistent identifier + swhids: a list of SWHIDS api_url: url for the API request Returns: A dictionary with: - key: persistent identifier searched + key: SWHID searched value: - value['known'] = True if the pid is found - value['known'] = False if the pid is not found + value['known'] = True if the SWHID is found + value['known'] = False if the SWHID is not found """ endpoint = api_url + "known/" chunk_size = 1000 requests = [] - def get_chunk(pids): - for i in range(0, len(pids), chunk_size): - yield pids[i : i + chunk_size] + def get_chunk(swhids): + for i in range(0, len(swhids), chunk_size): + yield swhids[i : i + chunk_size] - async def make_request(pids): - async with session.post(endpoint, json=pids) as resp: + async def make_request(swhids): + async with session.post(endpoint, json=swhids) as resp: if resp.status != 200: error_response(resp.reason, resp.status, endpoint) return await resp.json() - if len(pids) > chunk_size: - for pids_chunk in get_chunk(pids): - requests.append(asyncio.create_task(make_request(pids_chunk))) + if len(swhids) > chunk_size: + for swhids_chunk in get_chunk(swhids): + requests.append(asyncio.create_task(make_request(swhids_chunk))) res = await asyncio.gather(*requests) # concatenate list of dictionaries return dict(itertools.chain.from_iterable(e.items() for e in res)) else: - return await make_request(pids) + return await make_request(swhids) def directory_filter(path_name: Union[str, bytes], exclude_patterns: Set[Any]) -> bool: """It checks if the path_name is matching with the patterns given in input. It is also used as a `dir_filter` function when generating the directory object from `swh.model.from_disk` Returns: False if the directory has to be ignored, True otherwise """ path = PosixPath(path_name.decode() if isinstance(path_name, bytes) else path_name) for sre_pattern in exclude_patterns: if sre_pattern.match(str(path)): return False return True def get_subpaths( path: PosixPath, exclude_patterns: Set[Any] ) -> Iterator[Tuple[PosixPath, str]]: - """Find the persistent identifier of the directories and files under a - given path. + """Find the SoftWare Heritage persistent IDentifier (SWHID) of + the directories and files under a given path. Args: path: the root path Yields: - pairs of: path, the relative persistent identifier + pairs of: path, the relative SWHID """ - def pid_of(path): + def swhid_of(path): if path.is_dir(): if exclude_patterns: def dir_filter(dirpath, *args): return directory_filter(dirpath, exclude_patterns) else: dir_filter = accept_all_directories obj = Directory.from_disk( path=bytes(path), dir_filter=dir_filter ).get_data() - return persistent_identifier(DIRECTORY, obj) + return swhid(DIRECTORY, obj) else: obj = Content.from_file(path=bytes(path)).get_data() - return persistent_identifier(CONTENT, obj) + return swhid(CONTENT, obj) dirpath, dnames, fnames = next(os.walk(path)) for node in itertools.chain(dnames, fnames): sub_path = PosixPath(dirpath).joinpath(node) - yield (sub_path, pid_of(sub_path)) + yield (sub_path, swhid_of(sub_path)) async def parse_path( path: PosixPath, session: aiohttp.ClientSession, api_url: str, exclude_patterns: Set[Any], ) -> Iterator[Tuple[str, str, bool]]: """Check if the sub paths of the given path are present in the archive or not. Args: path: the source path api_url: url for the API request Returns: a map containing tuples with: a subpath of the given path, - the pid of the subpath and the result of the api call + the SWHID of the subpath and the result of the api call """ parsed_paths = dict(get_subpaths(path, exclude_patterns)) - parsed_pids = await pids_discovery(list(parsed_paths.values()), session, api_url) + parsed_swhids = await swhids_discovery( + list(parsed_paths.values()), session, api_url + ) def unpack(tup): - subpath, pid = tup - return (subpath, pid, parsed_pids[pid]["known"]) + subpath, swhid = tup + return (subpath, swhid, parsed_swhids[swhid]["known"]) return map(unpack, parsed_paths.items()) async def run( root: PosixPath, api_url: str, source_tree: Tree, exclude_patterns: Set[Any] ) -> None: """Start scanning from the given root. It fills the source tree with the path discovered. Args: root: the root path to scan api_url: url for the API request """ async def _scan(root, session, api_url, source_tree, exclude_patterns): - for path, pid, known in await parse_path( + for path, obj_swhid, known in await parse_path( root, session, api_url, exclude_patterns ): - obj_type = parse_persistent_identifier(pid).object_type + obj_type = parse_swhid(obj_swhid).object_type if obj_type == CONTENT: - source_tree.addNode(path, pid, known) + source_tree.addNode(path, obj_swhid, known) elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns): - source_tree.addNode(path, pid, known) + source_tree.addNode(path, obj_swhid, known) if not known: await _scan(path, session, api_url, source_tree, exclude_patterns) async with aiohttp.ClientSession() as session: await _scan(root, session, api_url, source_tree, exclude_patterns) diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py index 1b1493c..91decd6 100644 --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -1,136 +1,136 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest import asyncio import aiohttp import os from pathlib import PosixPath from aioresponses import aioresponses # type: ignore -from swh.model.cli import pid_of_file, pid_of_dir +from swh.model.cli import swhid_of_file, swhid_of_dir from swh.scanner.model import Tree from .flask_api import create_app @pytest.fixture def mock_aioresponse(): with aioresponses() as m: yield m @pytest.fixture def event_loop(): """Fixture that generate an asyncio event loop.""" loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) yield loop loop.close() @pytest.fixture async def aiosession(): """Fixture that generate an aiohttp Client Session.""" session = aiohttp.ClientSession() yield session session.detach() @pytest.fixture(scope="session") def temp_folder(tmp_path_factory): """Fixture that generates a temporary folder with the following structure: .. code-block:: python root = { subdir: { subsubdir filesample.txt filesample2.txt } subdir2 subfile.txt } """ root = tmp_path_factory.getbasetemp() subdir = tmp_path_factory.mktemp("subdir") subsubdir = subdir.joinpath("subsubdir") subsubdir.mkdir() subdir2 = tmp_path_factory.mktemp("subdir2") subfile = root / "subfile.txt" subfile.touch() filesample = subdir / "filesample.txt" filesample.touch() filesample2 = subdir / "filesample2.txt" filesample2.touch() avail_path = { - subdir: pid_of_dir(bytes(subdir)), - subsubdir: pid_of_dir(bytes(subsubdir)), - subdir2: pid_of_dir(bytes(subdir2)), - subfile: pid_of_file(bytes(subfile)), - filesample: pid_of_file(bytes(filesample)), - filesample2: pid_of_file(bytes(filesample2)), + subdir: swhid_of_dir(bytes(subdir)), + subsubdir: swhid_of_dir(bytes(subsubdir)), + subdir2: swhid_of_dir(bytes(subdir2)), + subfile: swhid_of_file(bytes(subfile)), + filesample: swhid_of_file(bytes(filesample)), + filesample2: swhid_of_file(bytes(filesample2)), } return { "root": root, "paths": avail_path, "filesample": filesample, "filesample2": filesample2, "subsubdir": subsubdir, "subdir": subdir, } @pytest.fixture(scope="function") def example_tree(temp_folder): """Fixture that generate a Tree with the root present in the session fixture "temp_folder". """ example_tree = Tree(temp_folder["root"]) assert example_tree.path == temp_folder["root"] return example_tree @pytest.fixture(scope="function") def example_dirs(example_tree, temp_folder): """ Fixture that fill the fixture example_tree with the values contained in the fixture temp_folder and returns the directories information of the filled example_tree. """ root = temp_folder["root"] filesample_path = temp_folder["filesample"] filesample2_path = temp_folder["filesample2"] subsubdir_path = temp_folder["subsubdir"] known_paths = [filesample_path, filesample2_path, subsubdir_path] - for path, pid in temp_folder["paths"].items(): + for path, swhid in temp_folder["paths"].items(): if path in known_paths: - example_tree.addNode(path, pid, True) + example_tree.addNode(path, swhid, True) else: - example_tree.addNode(path, pid, False) + example_tree.addNode(path, swhid, False) return example_tree.getDirectoriesInfo(root) @pytest.fixture def test_folder(): """Location of the "data" folder """ tests_path = PosixPath(os.path.abspath(__file__)).parent tests_data_folder = tests_path.joinpath("data") assert tests_data_folder.exists() return tests_data_folder @pytest.fixture(scope="session") def app(): """Flask backend API (used by live_server).""" app = create_app() return app diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py index 1235b94..541a16a 100644 --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -1,21 +1,21 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information correct_api_response = { "swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112": {"known": False}, "swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140": {"known": True}, "swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904": {"known": True}, } -# present pids inside /data/sample-folder +# present SWHIDs inside /data/sample-folder present_swhids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326" diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py index 7fb0b1c..ffed42a 100644 --- a/swh/scanner/tests/flask_api.py +++ b/swh/scanner/tests/flask_api.py @@ -1,32 +1,32 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from flask import Flask, request from .data import present_swhids from swh.web.common.exc import LargePayloadExc def create_app(): app = Flask(__name__) @app.route("/known/", methods=["POST"]) def known(): swhids = request.get_json() if len(swhids) > 900: raise LargePayloadExc( - "The maximum number of PIDs this endpoint " "can receive is 900" + "The maximum number of SWHIDs this endpoint can receive is 900" ) res = {swhid: {"known": False} for swhid in swhids} for swhid in swhids: if swhid in present_swhids: res[swhid]["known"] = True return res return app diff --git a/swh/scanner/tests/test_dashboard.py b/swh/scanner/tests/test_dashboard.py index ef0d57d..ab89396 100644 --- a/swh/scanner/tests/test_dashboard.py +++ b/swh/scanner/tests/test_dashboard.py @@ -1,51 +1,51 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scanner.dashboard.dashboard import generate_table_body import dash_html_components as html def test_generate_table_body(example_tree, temp_folder): subdir_path = temp_folder["subdir"] - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) generated_body = generate_table_body(subdir_path, example_tree) expected_body = [ html.Tbody( [ html.Tr( [ html.Td("✔"), html.Td( html.A( children="filesample.txt", href=f"file://{subdir_path}/filesample.txt", ) ), html.Td("swh:1:cnt:e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"), ] ), html.Tr( [ html.Td("✔"), html.Td( html.A( children="filesample2.txt", href=f"file://{subdir_path}/filesample2.txt", ) ), html.Td("swh:1:cnt:e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"), ] ), ] ) ] # workaround: dash_html_component.__eq__ checks for object identity only assert str(generated_body) == str(expected_body) diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py index 49b725b..15192d9 100644 --- a/swh/scanner/tests/test_model.py +++ b/swh/scanner/tests/test_model.py @@ -1,108 +1,108 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.scanner.exceptions import InvalidDirectoryPath def test_tree_add_node(example_tree, temp_folder): avail_paths = temp_folder["paths"].keys() - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, False) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, False) for path, node in example_tree.children.items(): assert path in avail_paths if node.children: for subpath, subnode in node.children.items(): assert subpath in avail_paths def test_to_json_no_one_present(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, False) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, False) result = example_tree.toDict() assert len(result) == 6 for _, node_info in result.items(): assert node_info["known"] is False def test_get_json_tree_all_present(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) result = example_tree.toDict() assert len(result) == 6 for _, node_info in result.items(): assert node_info["known"] is True def test_get_json_tree_only_one_present(example_tree, temp_folder): root = temp_folder["root"] filesample_path = temp_folder["filesample"] - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True if path == filesample_path else False) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True if path == filesample_path else False) result = example_tree.toDict() assert len(result) == 6 for path, node_attr in result.items(): if path == str(root) + "/subdir0/filesample.txt": assert node_attr["known"] is True else: assert node_attr["known"] is False def test_get_directories_info(example_tree, temp_folder): root_path = temp_folder["root"] filesample_path = temp_folder["filesample"] filesample2_path = temp_folder["filesample2"] subdir_path = temp_folder["subdir"].relative_to(root_path) subsubdir_path = temp_folder["subsubdir"].relative_to(root_path) - for path, pid in temp_folder["paths"].items(): + for path, swhid in temp_folder["paths"].items(): if path == filesample_path or path == filesample2_path: - example_tree.addNode(path, pid, True) + example_tree.addNode(path, swhid, True) else: - example_tree.addNode(path, pid, False) + example_tree.addNode(path, swhid, False) directories = example_tree.getDirectoriesInfo(example_tree.path) assert subsubdir_path not in directories assert directories[subdir_path] == (2, 2) def test_get_files_from_dir(example_tree, temp_folder): subdir_path = temp_folder["subdir"] - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) files = example_tree.getFilesFromDir(subdir_path) assert len(files) == 2 def test_get_files_source_path(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) files = example_tree.getFilesFromDir(example_tree.path) assert len(files) == 1 def test_get_files_from_dir_raise_exception(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) with pytest.raises(InvalidDirectoryPath): example_tree.getFilesFromDir("test/") diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py index 712e28d..f580ef7 100644 --- a/swh/scanner/tests/test_scanner.py +++ b/swh/scanner/tests/test_scanner.py @@ -1,106 +1,106 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest import json from pathlib import PosixPath from .data import correct_api_response, present_swhids, to_exclude_swhid -from swh.scanner.scanner import pids_discovery, get_subpaths, run +from swh.scanner.scanner import swhids_discovery, get_subpaths, run from swh.scanner.model import Tree from swh.scanner.cli import extract_regex_objs from swh.scanner.exceptions import APIError aio_url = "http://example.org/api/known/" def test_scanner_correct_api_request(mock_aioresponse, event_loop, aiosession): mock_aioresponse.post( aio_url, status=200, content_type="application/json", body=json.dumps(correct_api_response), ) actual_result = event_loop.run_until_complete( - pids_discovery([], aiosession, "http://example.org/api/") + swhids_discovery([], aiosession, "http://example.org/api/") ) assert correct_api_response == actual_result def test_scanner_raise_apierror(mock_aioresponse, event_loop, aiosession): mock_aioresponse.post(aio_url, content_type="application/json", status=413) with pytest.raises(APIError): event_loop.run_until_complete( - pids_discovery([], aiosession, "http://example.org/api/") + swhids_discovery([], aiosession, "http://example.org/api/") ) def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_server): api_url = live_server.url() + "/" request = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901) ] # /known/ is limited at 900 with pytest.raises(APIError): - event_loop.run_until_complete(pids_discovery(request, aiosession, api_url)) + event_loop.run_until_complete(swhids_discovery(request, aiosession, api_url)) def test_scanner_get_subpaths(temp_folder): root = temp_folder["root"] actual_result = [] - for subpath, pid in get_subpaths(root, tuple()): + for subpath, swhid in get_subpaths(root, tuple()): # also check if it's a symlink since pytest tmp_dir fixture create # also a symlink to each directory inside the tmp_dir path if subpath.is_dir() and not subpath.is_symlink(): - actual_result.append((subpath, pid)) + actual_result.append((subpath, swhid)) assert len(actual_result) == 2 @pytest.mark.options(debug=False) def test_app(app): assert not app.debug def test_scanner_result(live_server, event_loop, test_folder): api_url = live_server.url() + "/" sample_folder = test_folder.joinpath(PosixPath("sample-folder")) source_tree = Tree(sample_folder) event_loop.run_until_complete(run(sample_folder, api_url, source_tree, set())) for child_node in source_tree.iterate(): node_info = list(child_node.attributes.values())[0] if node_info["swhid"] in present_swhids: assert node_info["known"] is True else: assert node_info["known"] is False def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder): api_url = live_server.url() + "/" sample_folder = test_folder.joinpath(PosixPath("sample-folder")) patterns = (str(sample_folder) + "/toexclude",) exclude_pattern = { reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns) } source_tree = Tree(sample_folder) event_loop.run_until_complete( run(sample_folder, api_url, source_tree, exclude_pattern) ) for child_node in source_tree.iterate(): node_info = list(child_node.attributes.values())[0] assert node_info["swhid"] != to_exclude_swhid