Index: swh/model/cli.py
===================================================================
--- swh/model/cli.py
+++ swh/model/cli.py
@@ -52,11 +52,27 @@
     return swhid(CONTENT, object)
 
 
-def swhid_of_dir(path):
-    from swh.model.from_disk import Directory
+def swhid_of_dir(path, exclude_patterns=None):
+    """Compute the SWHID of the directory at ``path`` (bytes), skipping the
+    directories matching ``exclude_patterns`` (glob patterns) when given."""
+    import os
+
+    from swh.model.from_disk import (
+        Directory,
+        accept_all_directories,
+        ignore_directories_patterns,
+    )
     from swh.model.identifiers import DIRECTORY, swhid
 
-    object = Directory.from_disk(path=path).get_data()
+    # os.fsdecode rather than path.decode(): directory names are not
+    # guaranteed to be valid UTF-8.
+    dir_filter = (
+        ignore_directories_patterns(os.fsdecode(path), exclude_patterns)
+        if exclude_patterns
+        else accept_all_directories
+    )
+
+    object = Directory.from_disk(path=path, dir_filter=dir_filter).get_data()
     return swhid(DIRECTORY, object)
 
 
@@ -96,7 +112,7 @@
     return str(SWHID(object_type="snapshot", object_id=snapshot_identifier(snapshot)))
 
 
-def identify_object(obj_type, follow_symlinks, obj):
+def identify_object(obj_type, follow_symlinks, exclude_patterns, obj):
     from urllib.parse import urlparse
 
     if obj_type == "auto":
@@ -125,7 +141,7 @@
     if obj_type == "content":
         swhid = swhid_of_file(path)
     elif obj_type == "directory":
-        swhid = swhid_of_dir(path)
+        swhid = swhid_of_dir(path, exclude_patterns)
     elif obj_type == "origin":
         swhid = swhid_of_origin(obj)
     elif obj_type == "snapshot":
@@ -160,6 +176,15 @@
     type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]),
     help="type of object to identify (default: auto)",
 )
+@click.option(
+    "--exclude",
+    "-x",
+    "exclude_patterns",
+    metavar="PATTERN",
+    multiple=True,
+    help="Exclude directories using glob patterns "
+    "(e.g., '*.git' to exclude all .git directories)",
+)
 @click.option(
     "--verify",
     "-v",
@@ -168,7 +193,9 @@
     help="reference identifier to be compared with computed one",
 )
 @click.argument("objects", nargs=-1, required=True)
-def identify(obj_type, verify, show_filename, follow_symlinks, objects):
+def identify(
+    obj_type, verify, show_filename, follow_symlinks, objects, exclude_patterns,
+):
     """Compute the Software Heritage persistent identifier (SWHID) for the given
     source code object(s).
 
@@ -203,7 +230,9 @@
     if verify and len(objects) != 1:
         raise click.BadParameter("verification requires a single object")
 
-    results = map(partial(identify_object, obj_type, follow_symlinks), objects)
+    results = map(
+        partial(identify_object, obj_type, follow_symlinks, exclude_patterns), objects,
+    )
 
     if verify:
         swhid = next(results)[1]
Index: swh/model/exceptions.py
===================================================================
--- swh/model/exceptions.py
+++ swh/model/exceptions.py
@@ -129,3 +129,7 @@
 
     def __repr__(self):
         return "ValidationError(%s)" % self
+
+
+class InvalidDirectoryPath(Exception):
+    """Raised when an exclude pattern matches a path outside the root."""
Index: swh/model/from_disk.py
===================================================================
--- swh/model/from_disk.py
+++ swh/model/from_disk.py
@@ -5,15 +5,20 @@
 
 import datetime
 import enum
+import fnmatch
+import glob
 import os
+from pathlib import Path
+import re
 import stat
-from typing import Any, Iterable, List, Optional, Tuple
+from typing import Any, Iterable, Iterator, List, Optional, Pattern, Tuple
 
 import attr
 from attrs_strict import type_validator
 from typing_extensions import Final
 
 from . import model
+from .exceptions import InvalidDirectoryPath
 from .hashutil import MultiHash
 from .identifiers import directory_entry_sort_key, directory_identifier
 from .identifiers import identifier_to_bytes as id_to_bytes
@@ -276,6 +281,67 @@
     return named_filter
 
 
+def extract_regex_objs(
+    root_path: Path, patterns: Iterable[str]
+) -> Iterator[Pattern[str]]:
+    """Generates a regex object for each pattern given in input and checks if
+    the path is a subdirectory or relative to the root path.
+
+    Args:
+        root_path: directory that every path matched on disk must live under
+        patterns: glob patterns; each is expanded with :func:`glob.glob` for
+            the check and translated with :func:`fnmatch.translate`
+
+    Yields:
+        an SRE_Pattern object
+
+    Raises:
+        InvalidDirectoryPath: if a path matched by a pattern does not have
+            ``root_path`` among its parents
+    """
+    for pattern in patterns:
+        for path in glob.glob(pattern):
+            dirpath = Path(path)
+            if root_path not in dirpath.parents:
+                error_msg = (
+                    f'The path "{dirpath}" is not a subdirectory or relative '
+                    f'to the root directory path: "{root_path}"'
+                )
+                raise InvalidDirectoryPath(error_msg)
+
+        regex = fnmatch.translate(pattern)
+        yield re.compile(regex)
+
+
+def ignore_directories_patterns(root_path: str, patterns: Iterable[str]):
+    """Filter for :func:`directory_to_objects` to ignore directories
+    matching certain patterns.
+
+    Args:
+        root_path (str): directory the patterns are anchored to; each pattern
+            is joined onto its absolute form before matching
+        patterns (list of str): pattern to ignore
+    Returns:
+        a directory filter for :func:`directory_to_objects`
+    """
+    full_root_path = Path(root_path).absolute()
+    patterns = [str(full_root_path / pattern) for pattern in patterns]
+    sre_patterns = set(extract_regex_objs(full_root_path, patterns))
+
+    def pattern_filter(
+        dirpath: bytes,
+        dirname: bytes,
+        entries: Iterable[Any],
+        patterns: Iterable[Any] = sre_patterns,
+    ):
+        # dirpath is a filesystem path in bytes: os.fsdecode round-trips names
+        # that are not valid UTF-8, where a bare .decode() would raise.
+        candidate = str(Path(os.fsdecode(dirpath)).absolute())
+        return not any(pattern.match(candidate) for pattern in patterns)
+
+    return pattern_filter
+
+
 def iter_directory(
     directory,
 ) -> Tuple[List[model.Content], List[model.SkippedContent], List[model.Directory]]: