diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -5,10 +5,13 @@

 import click
 import asyncio
+import glob
 from pathlib import PosixPath
+from typing import Iterable

 from .scanner import run
 from .model import Tree
+from .exceptions import InvalidDirectoryPath

 from swh.core.cli import CONTEXT_SETTINGS

@@ -28,8 +31,42 @@
     return url


+def extract_paths(
+    root_path: PosixPath, patterns: Iterable[str]
+) -> Iterable[PosixPath]:
+    """Yield the directories matched by the given glob patterns.
+
+    Matches that are not directories are silently skipped.
+
+    Args:
+        root_path: root directory of the project being scanned
+        patterns: glob patterns (``**`` is expanded recursively)
+
+    Yields:
+        the path of every matched directory
+
+    Raises:
+        InvalidDirectoryPath: if a matched directory is not located under
+            ``root_path``
+
+    """
+    for pattern in patterns:
+        for path in glob.glob(pattern, recursive=True):
+            dirpath = PosixPath(path)
+            # don't consider the path if it's not a directory
+            if not dirpath.is_dir():
+                continue
+            # ``parents`` excludes the path itself: matching the root raises too
+            if root_path not in dirpath.parents:
+                raise InvalidDirectoryPath(
+                    f'The path "{dirpath}" is not a subdirectory '
+                    f'of the root directory path: "{root_path}"'
+                )
+            yield dirpath
+
+
 @scanner.command(name="scan")
-@click.argument("path", required=True, type=click.Path(exists=True))
+@click.argument("root_path", required=True, type=click.Path(exists=True))
 @click.option(
     "-u",
     "--api-url",
@@ -38,6 +75,14 @@
     show_default=True,
     help="url for the api request",
 )
+@click.option(
+    "--exclude",
+    "-x",
+    "patterns",
+    metavar="PATTERN",
+    multiple=True,
+    help="recursively exclude a specific pattern",
+)
 @click.option(
     "-f",
     "--format",
@@ -46,14 +91,17 @@
     help="select the output format",
 )
 @click.pass_context
-def scan(ctx, path, api_url, format):
+def scan(ctx, root_path, api_url, patterns, format):
     """Scan a source code project to discover files and directories already
     present in the archive"""

+    # ``patterns`` is an empty tuple when no --exclude option is given, so
+    # ``exclude_paths`` is always bound (to an empty set in that case)
+    exclude_paths = set(extract_paths(PosixPath(root_path), patterns))
     api_url = parse_url(api_url)
-    source_tree = Tree(PosixPath(path))
+    source_tree = Tree(PosixPath(root_path))
     loop = asyncio.get_event_loop()
-    loop.run_until_complete(run(path, api_url, source_tree))
+    loop.run_until_complete(run(root_path, api_url, source_tree, exclude_paths))
     source_tree.show(format)


diff --git a/swh/scanner/exceptions.py b/swh/scanner/exceptions.py
--- a/swh/scanner/exceptions.py
+++ b/swh/scanner/exceptions.py
@@ -8,6 +8,10 @@
     pass


+class InvalidDirectoryPath(Exception):
+    pass
+
+
 class APIError(Exception):
     def __str__(self):
         return '"%s"' % self.args
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -7,14 +7,19 @@
 import itertools
 import asyncio
 import aiohttp
-from typing import List, Dict, Tuple, Iterator
+from typing import List, Dict, Tuple, Iterator, Optional, Set, Union
 from pathlib import PosixPath

 from .exceptions import error_response
 from .model import Tree

-from swh.model.cli import pid_of_file, pid_of_dir
-from swh.model.identifiers import parse_persistent_identifier, DIRECTORY, CONTENT
+from swh.model.from_disk import Directory, Content
+from swh.model.identifiers import (
+    persistent_identifier,
+    parse_persistent_identifier,
+    DIRECTORY,
+    CONTENT,
+)


 async def pids_discovery(
@@ -61,7 +66,28 @@
     return await make_request(pids)


-def get_subpaths(path: PosixPath) -> Iterator[Tuple[PosixPath, str]]:
+def directory_filter(
+    path_name: Union[str, bytes, PosixPath], exclude_paths: Set[PosixPath]
+) -> bool:
+    """Check that neither the given path nor any of its parents is present in
+    the exclude_paths set. It is also used as a `dir_filter` function when
+    generating the directory object from `swh.model.from_disk`
+
+    Args:
+        path_name: path of the directory to check (bytes are decoded first)
+        exclude_paths: directories to ignore
+
+    Returns:
+        False if the directory has to be ignored, True otherwise
+
+    """
+    path = PosixPath(path_name.decode() if isinstance(path_name, bytes) else path_name)
+    return path not in exclude_paths and exclude_paths.isdisjoint(path.parents)
+
+
+def get_subpaths(
+    path: PosixPath, exclude_paths: Set[PosixPath]
+) -> Iterator[Tuple[PosixPath, str]]:
     """Find the persistent identifier of the directories and files under a
     given path.

@@ -73,11 +99,21 @@

     """

+    def dir_filter(dirpath, *args):
+        return directory_filter(dirpath, exclude_paths)
+
     def pid_of(path):
         if path.is_dir():
-            return pid_of_dir(bytes(path))
+            if exclude_paths:
+                obj = Directory.from_disk(
+                    path=bytes(path), dir_filter=dir_filter
+                ).get_data()
+            else:
+                obj = Directory.from_disk(path=bytes(path)).get_data()
+            return persistent_identifier(DIRECTORY, obj)
         elif path.is_file() or path.is_symlink():
-            return pid_of_file(bytes(path))
+            obj = Content.from_file(path=bytes(path)).get_data()
+            return persistent_identifier(CONTENT, obj)

     dirpath, dnames, fnames = next(os.walk(path))
     for node in itertools.chain(dnames, fnames):
@@ -86,7 +122,10 @@


 async def parse_path(
-    path: PosixPath, session: aiohttp.ClientSession, api_url: str
+    path: PosixPath,
+    session: aiohttp.ClientSession,
+    api_url: str,
+    exclude_paths: Set[PosixPath],
 ) -> Iterator[Tuple[str, str, bool]]:
     """Check if the sub paths of the given path are present in the archive
     or not.
@@ -100,7 +139,7 @@
         the pid of the subpath and the result of the api call

     """
-    parsed_paths = dict(get_subpaths(path))
+    parsed_paths = dict(get_subpaths(path, exclude_paths))
     parsed_pids = await pids_discovery(list(parsed_paths.values()), session, api_url)

     def unpack(tup):
@@ -110,7 +149,12 @@
     return map(unpack, parsed_paths.items())


-async def run(root: PosixPath, api_url: str, source_tree: Tree) -> None:
+async def run(
+    root: PosixPath,
+    api_url: str,
+    source_tree: Tree,
+    exclude_paths: Optional[Set[PosixPath]] = None,
+) -> None:
     """Start scanning from the given root.

     It fills the source tree with the path discovered.
@@ -121,18 +165,22 @@

     """

+    # normalize the default here rather than sharing a mutable default set
+    if exclude_paths is None:
+        exclude_paths = set()
+
-    async def _scan(root, session, api_url, source_tree):
-        for path, pid, found in await parse_path(root, session, api_url):
+    async def _scan(root, session, api_url, source_tree, exclude_paths):
+        for path, pid, found in await parse_path(root, session, api_url, exclude_paths):
             obj_type = parse_persistent_identifier(pid).object_type

             if obj_type == CONTENT:
                 source_tree.addNode(path, pid if found else None)
-            elif obj_type == DIRECTORY:
+            elif obj_type == DIRECTORY and directory_filter(path, exclude_paths):
                 if found:
                     source_tree.addNode(path, pid)
                 else:
                     source_tree.addNode(path)
-                    await _scan(path, session, api_url, source_tree)
+                    await _scan(path, session, api_url, source_tree, exclude_paths)

     async with aiohttp.ClientSession() as session:
-        await _scan(root, session, api_url, source_tree)
+        await _scan(root, session, api_url, source_tree, exclude_paths)
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -9,7 +9,7 @@

 from .data import correct_api_response

-from swh.scanner.scanner import pids_discovery, get_subpaths, run
+from swh.scanner.scanner import pids_discovery, get_subpaths, run, directory_filter
 from swh.scanner.model import Tree
 from swh.scanner.exceptions import APIError

@@ -55,11 +55,22 @@
     paths = temp_folder["paths"].keys()
     pids = temp_folder["paths"].values()

-    for subpath, pid in get_subpaths(tmp_path):
+    for subpath, pid in get_subpaths(tmp_path, set()):
         assert subpath in paths
         assert pid in pids


+def test_scanner_directory_filter(temp_folder, tmp_path):
+    exclude_paths = {temp_folder["subdir"]}
+
+    actual_result = []
+    for path, _ in temp_folder["paths"].items():
+        if path.is_dir() and directory_filter(path, exclude_paths):
+            actual_result.append(path)
+
+    assert len(actual_result) == 1
+
+
 @pytest.mark.options(debug=False)
 def test_app(app):
     assert not app.debug