diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py index eb22201..4316a1c 100644 --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -1,62 +1,102 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import asyncio +import glob +import re +import fnmatch from pathlib import PosixPath +from typing import Tuple from .scanner import run from .model import Tree +from .exceptions import InvalidDirectoryPath from swh.core.cli import CONTEXT_SETTINGS @click.group(name="scanner", context_settings=CONTEXT_SETTINGS) @click.pass_context def scanner(ctx): """Software Heritage Scanner tools.""" pass def parse_url(url): if not url.startswith("https://"): url = "https://" + url if not url.endswith("/"): url += "/" return url +def extract_regex_objs(root_path: PosixPath, patterns: Tuple[str]) -> object: + """Generates a regex object for each pattern given in input and checks if + the path is a subdirectory or relative to the root path. + + Yields: + an SRE_Pattern object + """ + for pattern in patterns: + for path in glob.glob(pattern): + dirpath = PosixPath(path) + if root_path not in dirpath.parents: + error_msg = ( + f'The path "{dirpath}" is not a subdirectory or relative ' + f'to the root directory path: "{root_path}"' + ) + raise InvalidDirectoryPath(error_msg) + + if glob.glob(pattern): + regex = fnmatch.translate(str(PosixPath(pattern))) + yield re.compile(regex) + + @scanner.command(name="scan") -@click.argument("path", required=True, type=click.Path(exists=True)) +@click.argument("root_path", required=True, type=click.Path(exists=True)) @click.option( "-u", "--api-url", default="https://archive.softwareheritage.org/api/1", metavar="API_URL", show_default=True, help="url for the api request", ) +@click.option( + "--exclude", + "-x", + "patterns", + metavar="PATTERN", + multiple=True, + help="recursively exclude a specific pattern", +) @click.option( "-f", "--format", type=click.Choice(["text", "json", "sunburst"], case_sensitive=False), default="text", help="select the output format", ) @click.pass_context -def scan(ctx, path, api_url, format): +def scan(ctx, root_path, api_url, patterns, format): """Scan a source code project to discover files and directories already present in the archive""" + sre_patterns = set() + if patterns: + sre_patterns = { + reg_obj for reg_obj in extract_regex_objs(PosixPath(root_path), patterns) + } api_url = parse_url(api_url) - source_tree = Tree(PosixPath(path)) + source_tree = Tree(PosixPath(root_path)) loop = asyncio.get_event_loop() - loop.run_until_complete(run(path, api_url, source_tree)) + loop.run_until_complete(run(root_path, api_url, source_tree, sre_patterns)) source_tree.show(format) if __name__ == "__main__": scan() diff --git a/swh/scanner/exceptions.py b/swh/scanner/exceptions.py index 2618950..d5ad445 100644 --- a/swh/scanner/exceptions.py +++ b/swh/scanner/exceptions.py @@ -1,18 +1,22 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information class InvalidObjectType(TypeError): pass +class InvalidDirectoryPath(Exception): + pass + + class APIError(Exception): def __str__(self): return '"%s"' % self.args def error_response(reason: str, status_code: int, api_url: str): error_msg = f"{status_code} {reason}: '{api_url}'" raise APIError(error_msg) diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py index bbc4e87..9759c4c 100644 --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -1,138 +1,182 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import itertools import asyncio import aiohttp -from typing import List, Dict, Tuple, Iterator +from typing import List, Dict, Tuple, Iterator, Union, Set, Any from pathlib import PosixPath from .exceptions import error_response from .model import Tree -from swh.model.cli import pid_of_file, pid_of_dir -from swh.model.identifiers import parse_persistent_identifier, DIRECTORY, CONTENT +from swh.model.from_disk import Directory, Content, accept_all_directories +from swh.model.identifiers import ( + persistent_identifier, + parse_persistent_identifier, + DIRECTORY, + CONTENT, +) async def pids_discovery( pids: List[str], session: aiohttp.ClientSession, api_url: str, ) -> Dict[str, Dict[str, bool]]: """API Request to get information about the persistent identifiers given in input. Args: pids: a list of persistent identifier api_url: url for the API request Returns: A dictionary with: key: persistent identifier searched value: value['known'] = True if the pid is found value['known'] = False if the pid is not found """ endpoint = api_url + "known/" chunk_size = 1000 requests = [] def get_chunk(pids): for i in range(0, len(pids), chunk_size): yield pids[i : i + chunk_size] async def make_request(pids): async with session.post(endpoint, json=pids) as resp: if resp.status != 200: error_response(resp.reason, resp.status, endpoint) return await resp.json() if len(pids) > chunk_size: for pids_chunk in get_chunk(pids): requests.append(asyncio.create_task(make_request(pids_chunk))) res = await asyncio.gather(*requests) # concatenate list of dictionaries return dict(itertools.chain.from_iterable(e.items() for e in res)) else: return await make_request(pids) -def get_subpaths(path: PosixPath) -> Iterator[Tuple[PosixPath, str]]: +def directory_filter(path_name: Union[str, bytes], exclude_patterns: Set[Any]) -> bool: + """It checks if the path_name is matching with the patterns given in input. + + It is also used as a `dir_filter` function when generating the directory + object from `swh.model.from_disk` + + Returns: + False if the directory has to be ignored, True otherwise + + """ + path = PosixPath(path_name.decode() if isinstance(path_name, bytes) else path_name) + for sre_pattern in exclude_patterns: + if sre_pattern.match(str(path)): + return False + return True + + +def get_subpaths( + path: PosixPath, exclude_patterns: Set[Any] +) -> Iterator[Tuple[PosixPath, str]]: """Find the persistent identifier of the directories and files under a given path. Args: path: the root path Yields: pairs of: path, the relative persistent identifier """ def pid_of(path): if path.is_dir(): - return pid_of_dir(bytes(path)) - elif path.is_file() or path.is_symlink(): - return pid_of_file(bytes(path)) + if exclude_patterns: + + def dir_filter(dirpath, *args): + return directory_filter(dirpath, exclude_patterns) + + else: + dir_filter = accept_all_directories + + obj = Directory.from_disk( + path=bytes(path), dir_filter=dir_filter + ).get_data() + + return persistent_identifier(DIRECTORY, obj) + else: + obj = Content.from_file(path=bytes(path)).get_data() + return persistent_identifier(CONTENT, obj) dirpath, dnames, fnames = next(os.walk(path)) for node in itertools.chain(dnames, fnames): sub_path = PosixPath(dirpath).joinpath(node) yield (sub_path, pid_of(sub_path)) async def parse_path( - path: PosixPath, session: aiohttp.ClientSession, api_url: str + path: PosixPath, + session: aiohttp.ClientSession, + api_url: str, + exclude_patterns: Set[Any], ) -> Iterator[Tuple[str, str, bool]]: """Check if the sub paths of the given path are present in the archive or not. Args: path: the source path api_url: url for the API request Returns: a map containing tuples with: a subpath of the given path, the pid of the subpath and the result of the api call """ - parsed_paths = dict(get_subpaths(path)) + parsed_paths = dict(get_subpaths(path, exclude_patterns)) parsed_pids = await pids_discovery(list(parsed_paths.values()), session, api_url) def unpack(tup): subpath, pid = tup return (subpath, pid, parsed_pids[pid]["known"]) return map(unpack, parsed_paths.items()) -async def run(root: PosixPath, api_url: str, source_tree: Tree) -> None: +async def run( + root: PosixPath, api_url: str, source_tree: Tree, exclude_patterns: Set[Any] +) -> None: """Start scanning from the given root. It fills the source tree with the path discovered. Args: root: the root path to scan api_url: url for the API request """ - async def _scan(root, session, api_url, source_tree): - for path, pid, found in await parse_path(root, session, api_url): + async def _scan(root, session, api_url, source_tree, exclude_patterns): + for path, pid, found in await parse_path( + root, session, api_url, exclude_patterns + ): obj_type = parse_persistent_identifier(pid).object_type if obj_type == CONTENT: source_tree.addNode(path, pid if found else None) - elif obj_type == DIRECTORY: + elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns): if found: source_tree.addNode(path, pid) else: source_tree.addNode(path) - await _scan(path, session, api_url, source_tree) + await _scan(path, session, api_url, source_tree, exclude_patterns) async with aiohttp.ClientSession() as session: - await _scan(root, session, api_url, source_tree) + await _scan(root, session, api_url, source_tree, exclude_patterns) diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py index 4648b3e..8cd289b 100644 --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -1,17 +1,18 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information correct_api_response = { "swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112": {"known": False}, "swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140": {"known": True}, "swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904": {"known": True}, } # present pids inside /data/sample-folder present_pids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ + "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] diff --git a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json new file mode 100644 index 0000000..d16a6f7 --- /dev/null +++ b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json @@ -0,0 +1,12 @@ +{ + "foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "bar": { + "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93" + }, + "link-to-foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb" +} diff --git a/swh/scanner/tests/data/sample-folder-result.json b/swh/scanner/tests/data/sample-folder-result.json index cefe27f..78350c0 100644 --- a/swh/scanner/tests/data/sample-folder-result.json +++ b/swh/scanner/tests/data/sample-folder-result.json @@ -1 +1,13 @@ -{"foo": {"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"}, "bar": {"barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"}, "link-to-foo": {"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"}, "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"} +{ + "foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "bar": { + "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93" + }, + "link-to-foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", + "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb" +} diff --git a/swh/scanner/tests/data/sample-folder/toexclude/example.txt b/swh/scanner/tests/data/sample-folder/toexclude/example.txt new file mode 100644 index 0000000..5f1cfce --- /dev/null +++ b/swh/scanner/tests/data/sample-folder/toexclude/example.txt @@ -0,0 +1 @@ +example file diff --git a/swh/scanner/tests/test_cli.py b/swh/scanner/tests/test_cli.py new file mode 100644 index 0000000..3ded158 --- /dev/null +++ b/swh/scanner/tests/test_cli.py @@ -0,0 +1,16 @@ +import pytest + +from swh.scanner.cli import extract_regex_objs +from swh.scanner.exceptions import InvalidDirectoryPath + + +def test_extract_regex_objs(temp_folder): + root_path = temp_folder["root"] + + patterns = (str(temp_folder["subdir"]), "/none") + sre_patterns = [reg_obj for reg_obj in extract_regex_objs(root_path, patterns)] + assert len(sre_patterns) == 1 + + patterns = (*patterns, "/tmp") + with pytest.raises(InvalidDirectoryPath): + sre_patterns = [reg_obj for reg_obj in extract_regex_objs(root_path, patterns)] diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py index 8404a71..f5c6b70 100644 --- a/swh/scanner/tests/test_scanner.py +++ b/swh/scanner/tests/test_scanner.py @@ -1,82 +1,112 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest import json from pathlib import PosixPath from .data import correct_api_response from swh.scanner.scanner import pids_discovery, get_subpaths, run from swh.scanner.model import Tree +from swh.scanner.cli import extract_regex_objs from swh.scanner.exceptions import APIError aio_url = "http://example.org/api/known/" def test_scanner_correct_api_request(mock_aioresponse, event_loop, aiosession): mock_aioresponse.post( aio_url, status=200, content_type="application/json", body=json.dumps(correct_api_response), ) actual_result = event_loop.run_until_complete( pids_discovery([], aiosession, "http://example.org/api/") ) assert correct_api_response == actual_result def test_scanner_raise_apierror(mock_aioresponse, event_loop, aiosession): mock_aioresponse.post(aio_url, content_type="application/json", status=413) with pytest.raises(APIError): event_loop.run_until_complete( pids_discovery([], aiosession, "http://example.org/api/") ) def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_server): api_url = live_server.url() + "/" request = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901) ] # /known/ is limited at 900 with pytest.raises(APIError): event_loop.run_until_complete(pids_discovery(request, aiosession, api_url)) -def test_scanner_get_subpaths(temp_folder, tmp_path): - paths = temp_folder["paths"].keys() - pids = temp_folder["paths"].values() +def test_scanner_get_subpaths(temp_folder): + root = temp_folder["root"] - for subpath, pid in get_subpaths(tmp_path): - assert subpath in paths - assert pid in pids + actual_result = [] + for subpath, pid in get_subpaths(root, tuple()): + # also check if it's a symlink since pytest tmp_dir fixture create + # also a symlink to each directory inside the tmp_dir path + if subpath.is_dir() and not subpath.is_symlink(): + actual_result.append((subpath, pid)) + + assert len(actual_result) == 2 @pytest.mark.options(debug=False) def test_app(app): assert not app.debug def test_scanner_result(live_server, event_loop, test_folder): api_url = live_server.url() + "/" result_path = test_folder.joinpath(PosixPath("sample-folder-result.json")) with open(result_path, "r") as json_file: expected_result = json.loads(json_file.read()) sample_folder = test_folder.joinpath(PosixPath("sample-folder")) source_tree = Tree(sample_folder) - event_loop.run_until_complete(run(sample_folder, api_url, source_tree)) + event_loop.run_until_complete(run(sample_folder, api_url, source_tree, tuple())) + + actual_result = source_tree.getTree() + + assert actual_result == expected_result + + +def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder): + api_url = live_server.url() + "/" + + result_path = test_folder.joinpath( + PosixPath("sample-folder-result-no-toexclude.json") + ) + with open(result_path, "r") as json_file: + expected_result = json.loads(json_file.read()) + + sample_folder = test_folder.joinpath(PosixPath("sample-folder")) + patterns = (str(sample_folder) + "/toexclude",) + exclude_pattern = { + reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns) + } + + source_tree = Tree(sample_folder) + event_loop.run_until_complete( + run(sample_folder, api_url, source_tree, exclude_pattern) + ) actual_result = source_tree.getTree() assert actual_result == expected_result