def ignore_path(path, exclude_paths) -> bool:
    """Tell whether ``path`` must be skipped because of ``exclude_paths``.

    A path is ignored when it is itself listed in ``exclude_paths`` or when
    any of its parent directories is. Paths are compared as given; nothing
    is resolved against the filesystem.

    Args:
        path (bytes or str): path to check.
        exclude_paths (Optional[Iterable[PosixPath]]): paths to ignore;
            ``None`` or an empty collection excludes nothing.

    Returns:
        bool: True if ``path`` or one of its parents is in
        ``exclude_paths``, False otherwise.
    """
    if not exclude_paths:
        return False

    # os.fsdecode, unlike bytes.decode(), does not raise on file names that
    # are not valid UTF-8, and it also accepts a path that is already a str.
    candidate = PosixPath(os.fsdecode(path))
    return not {candidate, *candidate.parents}.isdisjoint(exclude_paths)
def test_directory_to_objects_ignore_paths(self):
    """Check that paths given in ``exclude_paths`` are left out of the tree.

    Loads the fixture directory while excluding its ``contents`` and
    ``specials`` sub-directories, then checks that neither shows up as an
    entry and that the entry and collected-directory counts match.
    """
    base = self.tmpdir_name.decode()
    excluded = {
        PosixPath(base + '/' + dirname)
        for dirname in ('contents', 'specials')
    }

    directory = Directory.from_disk(
        path=self.tmpdir_name,
        exclude_paths=excluded,
    )

    # Neither excluded directory may appear among the top-level entries.
    for absent in (b'specials', b'contents'):
        self.assertNotIn(absent, directory)
    self.assertEqual(len(directory), 2)

    collected = directory.collect()
    self.assertEqual(len(collected['directory']), 4)