def extract_regex_objs(root_path: PosixPath, patterns: Tuple[str, ...]) -> object:
    """Yield a compiled regex for each exclusion pattern that matches on disk.

    Every pattern is expanded with ``glob.glob``. Each path it expands to must
    have ``root_path`` among its parents, otherwise the whole extraction is
    aborted. A pattern that expands to nothing is skipped.

    Args:
        root_path: root directory of the scan
        patterns: glob patterns of the paths to exclude

    Yields:
        a compiled regular expression object, built with
        ``fnmatch.translate`` from the normalized pattern, for each pattern
        that matches at least one existing path

    Raises:
        InvalidDirectoryPath: if a matched path is not located under
            ``root_path``
    """
    for pattern in patterns:
        # Expand the pattern once: the same result drives both the validation
        # of the matched paths and the decision to yield a regex.
        matched_paths = glob.glob(pattern)
        if not matched_paths:
            continue

        for path in matched_paths:
            dirpath = PosixPath(path)
            if root_path not in dirpath.parents:
                error_msg = (
                    f'The path "{dirpath}" is not a subdirectory or relative '
                    f'to the root directory path: "{root_path}"'
                )
                raise InvalidDirectoryPath(error_msg)

        # PosixPath normalizes the pattern spelling (e.g. trailing slash)
        # before it is translated to a regular expression.
        yield re.compile(fnmatch.translate(str(PosixPath(pattern))))
def directory_filter(path_name: Union[str, bytes], exclude_patterns: Set[Any]) -> bool:
    """Check whether a path is matched by any of the exclusion patterns.

    It is also used as a `dir_filter` function when generating the directory
    object from `swh.model.from_disk`

    Args:
        path_name: path to check, given as text or as raw filesystem bytes
        exclude_patterns: compiled regular expression objects; a path is
            excluded as soon as one of them matches it

    Returns:
        False if the directory has to be ignored, True otherwise

    """
    # os.fsdecode uses the filesystem encoding with the surrogateescape error
    # handler, so a file name that is not valid UTF-8 no longer raises
    # UnicodeDecodeError as a bare bytes.decode() would.
    # PosixPath normalizes the spelling (e.g. drops a trailing slash); the
    # string is computed once instead of on every loop iteration.
    path = str(PosixPath(os.fsdecode(path_name)))
    return not any(sre_pattern.match(path) for sre_pattern in exclude_patterns)
@@ -100,7 +140,7 @@ the pid of the subpath and the result of the api call """ - parsed_paths = dict(get_subpaths(path)) + parsed_paths = dict(get_subpaths(path, exclude_patterns)) parsed_pids = await pids_discovery(list(parsed_paths.values()), session, api_url) def unpack(tup): @@ -110,7 +150,9 @@ return map(unpack, parsed_paths.items()) -async def run(root: PosixPath, api_url: str, source_tree: Tree) -> None: +async def run( + root: PosixPath, api_url: str, source_tree: Tree, exclude_patterns: Set[Any] +) -> None: """Start scanning from the given root. It fills the source tree with the path discovered. @@ -121,18 +163,20 @@ """ - async def _scan(root, session, api_url, source_tree): - for path, pid, found in await parse_path(root, session, api_url): + async def _scan(root, session, api_url, source_tree, exclude_patterns): + for path, pid, found in await parse_path( + root, session, api_url, exclude_patterns + ): obj_type = parse_persistent_identifier(pid).object_type if obj_type == CONTENT: source_tree.addNode(path, pid if found else None) - elif obj_type == DIRECTORY: + elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns): if found: source_tree.addNode(path, pid) else: source_tree.addNode(path) - await _scan(path, session, api_url, source_tree) + await _scan(path, session, api_url, source_tree, exclude_patterns) async with aiohttp.ClientSession() as session: - await _scan(root, session, api_url, source_tree) + await _scan(root, session, api_url, source_tree, exclude_patterns) diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -14,4 +14,5 @@ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ + "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] diff --git 
a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json new file mode 100644 --- /dev/null +++ b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json @@ -0,0 +1,12 @@ +{ + "foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "bar": { + "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93" + }, + "link-to-foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb" +} diff --git a/swh/scanner/tests/data/sample-folder-result.json b/swh/scanner/tests/data/sample-folder-result.json --- a/swh/scanner/tests/data/sample-folder-result.json +++ b/swh/scanner/tests/data/sample-folder-result.json @@ -1 +1,13 @@ -{"foo": {"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"}, "bar": {"barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"}, "link-to-foo": {"quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"}, "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"} +{ + "foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "bar": { + "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93" + }, + "link-to-foo": { + "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" + }, + "toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", + "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb" +} diff --git a/swh/scanner/tests/data/sample-folder/toexclude/example.txt b/swh/scanner/tests/data/sample-folder/toexclude/example.txt new file mode 100644 --- /dev/null +++ b/swh/scanner/tests/data/sample-folder/toexclude/example.txt @@ -0,0 +1 @@ +example file diff --git a/swh/scanner/tests/test_cli.py b/swh/scanner/tests/test_cli.py new file mode 100644 --- /dev/null +++ b/swh/scanner/tests/test_cli.py @@ -0,0 +1,16 @@ +import pytest + +from swh.scanner.cli import 
def test_extract_regex_objs(temp_folder):
    """Check pattern extraction against the temporary folder fixture.

    A pattern pointing to a sub-directory of the root must produce a regex,
    a pattern that matches nothing must be skipped, and a pattern matching a
    path outside of the root must raise InvalidDirectoryPath.
    """
    root_path = temp_folder["root"]

    # "/none" is presumably a path that does not exist on the test machine,
    # so only the sub-directory pattern is expected to yield a regex.
    patterns = (str(temp_folder["subdir"]), "/none")
    sre_patterns = list(extract_regex_objs(root_path, patterns))
    assert len(sre_patterns) == 1

    # "/tmp" is expected to exist but not to be located under the root path.
    patterns = (*patterns, "/tmp")
    with pytest.raises(InvalidDirectoryPath):
        # The generator has to be consumed for the validation to run.
        list(extract_regex_objs(root_path, patterns))
== expected_result + + +def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder): + api_url = live_server.url() + "/" + + result_path = test_folder.joinpath( + PosixPath("sample-folder-result-no-toexclude.json") + ) + with open(result_path, "r") as json_file: + expected_result = json.loads(json_file.read()) + + sample_folder = test_folder.joinpath(PosixPath("sample-folder")) + patterns = (str(sample_folder) + "/toexclude",) + exclude_pattern = { + reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns) + } + + source_tree = Tree(sample_folder) + event_loop.run_until_complete( + run(sample_folder, api_url, source_tree, exclude_pattern) + ) actual_result = source_tree.getTree()