diff --git a/swh/storage/fixer.py b/swh/storage/fixer.py
index 4b478edb..14b21c5e 100644
--- a/swh/storage/fixer.py
+++ b/swh/storage/fixer.py
@@ -1,332 +1,331 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import copy
 import datetime
 import logging
 from typing import Any, Dict, List, Optional

-from swh.model.identifiers import normalize_timestamp
-from swh.model.model import Origin
+from swh.model.model import Origin, TimestampWithTimezone

 logger = logging.getLogger(__name__)


 def _fix_content(content: Dict[str, Any]) -> Dict[str, Any]:
     """Filters out invalid 'perms' key that leaked from swh.model.from_disk
     to the journal.

     >>> _fix_content({'perms': 0o100644, 'sha1_git': b'foo'})
     {'sha1_git': b'foo'}

     >>> _fix_content({'sha1_git': b'bar'})
     {'sha1_git': b'bar'}

     """
     content = content.copy()
     content.pop("perms", None)
     return content


 def _fix_revision_pypi_empty_string(rev):
     """PyPI loader failed to encode empty strings as bytes, see:
     swh:1:rev:8f0095ee0664867055d03de9bcc8f95b91d8a2b9
     or https://forge.softwareheritage.org/D1772
     """
     rev = {
         **rev,
         "author": rev["author"].copy(),
         "committer": rev["committer"].copy(),
     }
     if rev["author"].get("email") == "":
         rev["author"]["email"] = b""
     if rev["author"].get("name") == "":
         rev["author"]["name"] = b""
     if rev["committer"].get("email") == "":
         rev["committer"]["email"] = b""
     if rev["committer"].get("name") == "":
         rev["committer"]["name"] = b""
     return rev


 def _fix_revision_transplant_source(rev):
     if rev.get("metadata") and rev["metadata"].get("extra_headers"):
         rev = copy.deepcopy(rev)
         rev["metadata"]["extra_headers"] = [
             [key, value.encode("ascii")]
             if key == "transplant_source" and isinstance(value, str)
             else [key, value]
             for (key, value) in rev["metadata"]["extra_headers"]
         ]
     return rev


 def _check_date(date):
     """Returns whether the date can be represented in backends with sane
     limits on timestamps and timezones (resp. signed 64-bits and
     signed 16 bits), and that microseconds is valid (i.e. between 0 and 10^6).
     """
     if date is None:
         return True
-    date = normalize_timestamp(date)
-    return (
-        (-(2 ** 63) <= date["timestamp"]["seconds"] < 2 ** 63)
-        and (0 <= date["timestamp"]["microseconds"] < 10 ** 6)
-        and (-(2 ** 15) <= date["offset"] < 2 ** 15)
-    )
+    try:
+        TimestampWithTimezone.from_dict(date)
+    except ValueError:
+        return False
+    else:
+        return True


 def _check_revision_date(rev):
     """Exclude revisions with invalid dates.
     See https://forge.softwareheritage.org/T1339"""
     return _check_date(rev["date"]) and _check_date(rev["committer_date"])


 def _fix_revision(revision: Dict[str, Any]) -> Optional[Dict]:
     """Fix various legacy revision issues.

     Fix author/committer person:

     >>> from pprint import pprint
     >>> date = {
     ...     'timestamp': {
     ...         'seconds': 1565096932,
     ...         'microseconds': 0,
     ...     },
     ...     'offset': 0,
     ... }
     >>> rev0 = _fix_revision({
     ...     'id': b'rev-id',
     ...     'author': {'fullname': b'', 'name': '', 'email': ''},
     ...     'committer': {'fullname': b'', 'name': '', 'email': ''},
     ...     'date': date,
     ...     'committer_date': date,
     ...     'type': 'git',
     ...     'message': '',
     ...     'directory': b'dir-id',
     ...     'synthetic': False,
     ... })
     >>> rev0['author']
     {'fullname': b'', 'name': b'', 'email': b''}
     >>> rev0['committer']
     {'fullname': b'', 'name': b'', 'email': b''}

     Fix type of 'transplant_source' extra headers:

     >>> rev1 = _fix_revision({
     ...     'id': b'rev-id',
     ...     'author': {'fullname': b'', 'name': '', 'email': ''},
     ...     'committer': {'fullname': b'', 'name': '', 'email': ''},
     ...     'date': date,
     ...     'committer_date': date,
     ...     'metadata': {
     ...         'extra_headers': [
     ...             ['time_offset_seconds', b'-3600'],
     ...             ['transplant_source', '29c154a012a70f49df983625090434587622b39e']
     ...     ]},
     ...     'type': 'git',
     ...     'message': '',
     ...     'directory': b'dir-id',
     ...     'synthetic': False,
     ... })
     >>> pprint(rev1['metadata']['extra_headers'])
     [['time_offset_seconds', b'-3600'],
      ['transplant_source', b'29c154a012a70f49df983625090434587622b39e']]

     Revisions with invalid dates are filtered:

     >>> from copy import deepcopy
     >>> invalid_date1 = deepcopy(date)
     >>> invalid_date1['timestamp']['microseconds'] = 1000000000  # > 10^6
     >>> rev = _fix_revision({
     ...     'author': {'fullname': b'', 'name': '', 'email': ''},
     ...     'committer': {'fullname': b'', 'name': '', 'email': ''},
     ...     'date': invalid_date1,
     ...     'committer_date': date,
     ... })
     >>> rev is None
     True

     >>> invalid_date2 = deepcopy(date)
     >>> invalid_date2['timestamp']['seconds'] = 2**70  # > 2^63
     >>> rev = _fix_revision({
     ...     'author': {'fullname': b'', 'name': '', 'email': ''},
     ...     'committer': {'fullname': b'', 'name': '', 'email': ''},
     ...     'date': invalid_date2,
     ...     'committer_date': date,
     ... })
     >>> rev is None
     True

     >>> invalid_date3 = deepcopy(date)
     >>> invalid_date3['offset'] = 2**20  # > 2^15
     >>> rev = _fix_revision({
     ...     'author': {'fullname': b'', 'name': '', 'email': ''},
     ...     'committer': {'fullname': b'', 'name': '', 'email': ''},
     ...     'date': date,
     ...     'committer_date': invalid_date3,
     ... })
     >>> rev is None
     True

     """  # noqa
     rev = _fix_revision_pypi_empty_string(revision)
     rev = _fix_revision_transplant_source(rev)
     if not _check_revision_date(rev):
         logger.warning(
             "Invalid revision date detected: %(revision)s", {"revision": rev}
         )
         return None
     return rev


 def _fix_origin(origin: Dict) -> Dict:
     """Fix legacy origin with type which is no longer part of the model.

     >>> from pprint import pprint
     >>> pprint(_fix_origin({
     ...     'url': 'http://foo',
     ... }))
     {'url': 'http://foo'}
     >>> pprint(_fix_origin({
     ...     'url': 'http://bar',
     ...     'type': 'foo',
     ... }))
     {'url': 'http://bar'}

     """
     o = origin.copy()
     o.pop("type", None)
     return o


 def _fix_origin_visit(visit: Dict) -> Dict:
     """Fix various legacy origin visit issues.

     `visit['origin']` is a dict instead of a URL:

     >>> from datetime import datetime, timezone
     >>> from pprint import pprint
     >>> date = datetime(2020, 2, 27, 14, 39, 19, tzinfo=timezone.utc)
     >>> pprint(_fix_origin_visit({
     ...     'origin': {'url': 'http://foo'},
     ...     'date': date,
     ...     'type': 'git',
     ...     'status': 'ongoing',
     ...     'snapshot': None,
     ... }))
     {'date': datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=datetime.timezone.utc),
      'origin': 'http://foo',
      'type': 'git'}

     `visit['type']` is missing, but `visit['origin']['type']` exists:

     >>> pprint(_fix_origin_visit(
     ...     {'origin': {'type': 'hg', 'url': 'http://foo'},
     ...      'date': date,
     ...      'status': 'ongoing',
     ...      'snapshot': None,
     ... }))
     {'date': datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=datetime.timezone.utc),
      'origin': 'http://foo',
      'type': 'hg'}

     >>> pprint(_fix_origin_visit(
     ...     {'origin': {'type': 'hg', 'url': 'http://foo'},
     ...      'date': '2020-02-27 14:39:19+00:00',
     ...      'status': 'ongoing',
     ...      'snapshot': None,
     ... }))
     {'date': datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=datetime.timezone.utc),
      'origin': 'http://foo',
      'type': 'hg'}

     Old visit format (origin_visit with no type) raises:

     >>> _fix_origin_visit({
     ...     'origin': {'url': 'http://foo'},
     ...     'date': date,
     ...     'status': 'ongoing',
     ...     'snapshot': None
     ... })
     Traceback (most recent call last):
     ...
     ValueError: Old origin visit format detected...

     >>> _fix_origin_visit({
     ...     'origin': 'http://foo',
     ...     'date': date,
     ...     'status': 'ongoing',
     ...     'snapshot': None
     ... })
     Traceback (most recent call last):
     ...
     ValueError: Old origin visit format detected...

     """  # noqa
     visit = visit.copy()
     if "type" not in visit:
         if isinstance(visit["origin"], dict) and "type" in visit["origin"]:
             # Very old version of the schema: visits did not have a type,
             # but their 'origin' field was a dict with a 'type' key.
             visit["type"] = visit["origin"]["type"]
         else:
             # Very old schema version: 'type' is missing, stop early
             # We expect the journal's origin_visit topic to no longer reference
             # such visits. If it does, the replayer must crash so we can fix
             # the journal's topic.
             raise ValueError(f"Old origin visit format detected: {visit}")
     if isinstance(visit["origin"], dict):
         # Old version of the schema: visit['origin'] was a dict.
         visit["origin"] = visit["origin"]["url"]
     date = visit["date"]
     if isinstance(date, str):
         visit["date"] = datetime.datetime.fromisoformat(date)
     # Those are no longer part of the model
     for key in ["status", "snapshot", "metadata"]:
         visit.pop(key, None)
     return visit


 def _fix_raw_extrinsic_metadata(obj_dict: Dict) -> Dict:
     """Fix legacy RawExtrinsicMetadata with type which is no longer part of
     the model.

     >>> _fix_raw_extrinsic_metadata({
     ...     'type': 'directory',
     ...     'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243',
     ... })
     {'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243'}
     >>> _fix_raw_extrinsic_metadata({
     ...     'type': 'origin',
     ...     'target': 'https://inria.halpreprod.archives-ouvertes.fr/hal-01667309',
     ... })
     {'target': 'swh:1:ori:155291d5b9ada4570672510509f93fcfd9809882'}

     """
     o = obj_dict.copy()
     if o.pop("type", None) == "origin":
         o["target"] = str(Origin(o["target"]).swhid())
     return o


 def fix_objects(object_type: str, objects: List[Dict]) -> List[Dict]:
     """
     Fix legacy objects from the journal to bring them up to date with the
     latest storage schema.
""" if object_type == "content": return [_fix_content(v) for v in objects] elif object_type == "revision": revisions = [_fix_revision(v) for v in objects] return [rev for rev in revisions if rev is not None] elif object_type == "origin": return [_fix_origin(v) for v in objects] elif object_type == "origin_visit": return [_fix_origin_visit(v) for v in objects] elif object_type == "raw_extrinsic_metadata": return [_fix_raw_extrinsic_metadata(v) for v in objects] else: return objects diff --git a/swh/storage/tests/algos/test_diff.py b/swh/storage/tests/algos/test_diff.py index e19e26d8..e5355864 100644 --- a/swh/storage/tests/algos/test_diff.py +++ b/swh/storage/tests/algos/test_diff.py @@ -1,389 +1,388 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # flake8: noqa import unittest from unittest.mock import patch import pytest from swh.model.hashutil import hash_to_bytes -from swh.model.identifiers import directory_identifier from swh.storage.algos import diff from .test_dir_iterator import DirectoryModel def test__get_rev(swh_storage, sample_data): revision = sample_data.revision # does not exist then raises with pytest.raises(AssertionError): diff._get_rev(swh_storage, revision.id) # otherwise, we retrieve its dict representation swh_storage.revision_add([revision]) actual_revision = diff._get_rev(swh_storage, revision.id) assert actual_revision == revision.to_dict() @patch("swh.storage.algos.diff._get_rev") @patch("swh.storage.algos.dir_iterators._get_dir") class TestDiffRevisions(unittest.TestCase): def diff_revisions( self, rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ): rev_from_bytes = hash_to_bytes(rev_from) rev_to_bytes = hash_to_bytes(rev_to) def _get_rev(*args, **kwargs): if args[1] == rev_from_bytes: return {"directory": from_dir_model["target"]} else: return {"directory": to_dir_model["target"]} def _get_dir(*args, **kwargs): from_dir = from_dir_model.get_hash_data(args[1]) to_dir = to_dir_model.get_hash_data(args[1]) return from_dir if from_dir != None else to_dir mock_get_rev.side_effect = _get_rev mock_get_dir.side_effect = _get_dir changes = diff.diff_revisions( None, rev_from_bytes, rev_to_bytes, track_renaming=True ) self.assertEqual(changes, expected_changes) def test_insert_delete(self, mock_get_dir, mock_get_rev): rev_from = "898ff03e1e7925ecde3da66327d3cdc7e07625ba" rev_to = "647c3d381e67490e82cdbbe6c96e46d5e1628ce2" from_dir_model = DirectoryModel() to_dir_model = DirectoryModel() to_dir_model.add_file(b"file1", "ea15f54ca215e7920c60f564315ebb7f911a5204") to_dir_model.add_file(b"file2", "3e5faecb3836ffcadf82cc160787e35d4e2bec6a") to_dir_model.add_file(b"file3", "2ae33b2984974d35eababe4890d37fbf4bce6b2c") expected_changes = [ { "type": "insert", "from": None, "from_path": None, "to": to_dir_model.get_path_data(b"file1"), "to_path": b"file1", }, { "type": "insert", "from": None, "from_path": None, "to": to_dir_model.get_path_data(b"file2"), "to_path": b"file2", }, { "type": "insert", "from": None, "from_path": None, "to": to_dir_model.get_path_data(b"file3"), "to_path": b"file3", }, ] self.diff_revisions( rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ) from_dir_model = DirectoryModel() from_dir_model.add_file(b"file1", "ea15f54ca215e7920c60f564315ebb7f911a5204") 
from_dir_model.add_file(b"file2", "3e5faecb3836ffcadf82cc160787e35d4e2bec6a") from_dir_model.add_file(b"file3", "2ae33b2984974d35eababe4890d37fbf4bce6b2c") to_dir_model = DirectoryModel() expected_changes = [ { "type": "delete", "from": from_dir_model.get_path_data(b"file1"), "from_path": b"file1", "to": None, "to_path": None, }, { "type": "delete", "from": from_dir_model.get_path_data(b"file2"), "from_path": b"file2", "to": None, "to_path": None, }, { "type": "delete", "from": from_dir_model.get_path_data(b"file3"), "from_path": b"file3", "to": None, "to_path": None, }, ] self.diff_revisions( rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ) def test_onelevel_diff(self, mock_get_dir, mock_get_rev): rev_from = "898ff03e1e7925ecde3da66327d3cdc7e07625ba" rev_to = "647c3d381e67490e82cdbbe6c96e46d5e1628ce2" from_dir_model = DirectoryModel() from_dir_model.add_file(b"file1", "ea15f54ca215e7920c60f564315ebb7f911a5204") from_dir_model.add_file(b"file2", "f4a96b2000be83b61254d107046fa9777b17eb34") from_dir_model.add_file(b"file3", "d3c00f9396c6d0277727cec522ff6ad1ea0bc2da") to_dir_model = DirectoryModel() to_dir_model.add_file(b"file2", "3ee0f38ee0ea23cc2c8c0b9d66b27be4596b002b") to_dir_model.add_file(b"file3", "d3c00f9396c6d0277727cec522ff6ad1ea0bc2da") to_dir_model.add_file(b"file4", "40460b9653b1dc507e1b6eb333bd4500634bdffc") expected_changes = [ { "type": "delete", "from": from_dir_model.get_path_data(b"file1"), "from_path": b"file1", "to": None, "to_path": None, }, { "type": "modify", "from": from_dir_model.get_path_data(b"file2"), "from_path": b"file2", "to": to_dir_model.get_path_data(b"file2"), "to_path": b"file2", }, { "type": "insert", "from": None, "from_path": None, "to": to_dir_model.get_path_data(b"file4"), "to_path": b"file4", }, ] self.diff_revisions( rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ) def test_twolevels_diff(self, mock_get_dir, mock_get_rev): rev_from = "898ff03e1e7925ecde3da66327d3cdc7e07625ba" rev_to = "647c3d381e67490e82cdbbe6c96e46d5e1628ce2" from_dir_model = DirectoryModel() from_dir_model.add_file(b"file1", "ea15f54ca215e7920c60f564315ebb7f911a5204") from_dir_model.add_file( b"dir1/file1", "8335fca266811bac7ae5c8e1621476b4cf4156b6" ) from_dir_model.add_file( b"dir1/file2", "a6127d909e79f1fcb28bbf220faf86e7be7831e5" ) from_dir_model.add_file( b"dir1/file3", "18049b8d067ce1194a7e1cce26cfa3ae4242a43d" ) from_dir_model.add_file(b"file2", "d3c00f9396c6d0277727cec522ff6ad1ea0bc2da") to_dir_model = DirectoryModel() to_dir_model.add_file(b"file1", "3ee0f38ee0ea23cc2c8c0b9d66b27be4596b002b") to_dir_model.add_file(b"dir1/file2", "de3548b32a8669801daa02143a66dae21fe852fd") to_dir_model.add_file(b"dir1/file3", "18049b8d067ce1194a7e1cce26cfa3ae4242a43d") to_dir_model.add_file(b"dir1/file4", "f5c3f42aec5fe7b92276196c350cbadaf4c51f87") to_dir_model.add_file(b"file2", "d3c00f9396c6d0277727cec522ff6ad1ea0bc2da") expected_changes = [ { "type": "delete", "from": from_dir_model.get_path_data(b"dir1/file1"), "from_path": b"dir1/file1", "to": None, "to_path": None, }, { "type": "modify", "from": from_dir_model.get_path_data(b"dir1/file2"), "from_path": b"dir1/file2", "to": to_dir_model.get_path_data(b"dir1/file2"), "to_path": b"dir1/file2", }, { "type": "insert", "from": None, "from_path": None, "to": to_dir_model.get_path_data(b"dir1/file4"), "to_path": b"dir1/file4", }, { "type": "modify", "from": from_dir_model.get_path_data(b"file1"), "from_path": b"file1", "to": 
to_dir_model.get_path_data(b"file1"), "to_path": b"file1", }, ] self.diff_revisions( rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ) def test_insert_delete_empty_dirs(self, mock_get_dir, mock_get_rev): rev_from = "898ff03e1e7925ecde3da66327d3cdc7e07625ba" rev_to = "647c3d381e67490e82cdbbe6c96e46d5e1628ce2" from_dir_model = DirectoryModel() from_dir_model.add_file( b"dir3/file1", "ea15f54ca215e7920c60f564315ebb7f911a5204" ) to_dir_model = DirectoryModel() to_dir_model.add_file(b"dir3/file1", "ea15f54ca215e7920c60f564315ebb7f911a5204") to_dir_model.add_file(b"dir3/dir1/") expected_changes = [ { "type": "insert", "from": None, "from_path": None, "to": to_dir_model.get_path_data(b"dir3/dir1"), "to_path": b"dir3/dir1", } ] self.diff_revisions( rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ) from_dir_model = DirectoryModel() from_dir_model.add_file(b"dir1/dir2/") from_dir_model.add_file( b"dir1/file1", "ea15f54ca215e7920c60f564315ebb7f911a5204" ) to_dir_model = DirectoryModel() to_dir_model.add_file(b"dir1/file1", "ea15f54ca215e7920c60f564315ebb7f911a5204") expected_changes = [ { "type": "delete", "from": from_dir_model.get_path_data(b"dir1/dir2"), "from_path": b"dir1/dir2", "to": None, "to_path": None, } ] self.diff_revisions( rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ) def test_track_renaming(self, mock_get_dir, mock_get_rev): rev_from = "898ff03e1e7925ecde3da66327d3cdc7e07625ba" rev_to = "647c3d381e67490e82cdbbe6c96e46d5e1628ce2" from_dir_model = DirectoryModel() from_dir_model.add_file( b"file1_oldname", "ea15f54ca215e7920c60f564315ebb7f911a5204" ) from_dir_model.add_file( b"dir1/file1_oldname", "ea15f54ca215e7920c60f564315ebb7f911a5204" ) from_dir_model.add_file( b"file2_oldname", "d3c00f9396c6d0277727cec522ff6ad1ea0bc2da" ) to_dir_model = DirectoryModel() to_dir_model.add_file( b"dir1/file1_newname", "ea15f54ca215e7920c60f564315ebb7f911a5204" ) to_dir_model.add_file( b"dir2/file1_newname", "ea15f54ca215e7920c60f564315ebb7f911a5204" ) to_dir_model.add_file( b"file2_newname", "d3c00f9396c6d0277727cec522ff6ad1ea0bc2da" ) expected_changes = [ { "type": "rename", "from": from_dir_model.get_path_data(b"dir1/file1_oldname"), "from_path": b"dir1/file1_oldname", "to": to_dir_model.get_path_data(b"dir1/file1_newname"), "to_path": b"dir1/file1_newname", }, { "type": "rename", "from": from_dir_model.get_path_data(b"file1_oldname"), "from_path": b"file1_oldname", "to": to_dir_model.get_path_data(b"dir2/file1_newname"), "to_path": b"dir2/file1_newname", }, { "type": "rename", "from": from_dir_model.get_path_data(b"file2_oldname"), "from_path": b"file2_oldname", "to": to_dir_model.get_path_data(b"file2_newname"), "to_path": b"file2_newname", }, ] self.diff_revisions( rev_from, rev_to, from_dir_model, to_dir_model, expected_changes, mock_get_dir, mock_get_rev, ) diff --git a/swh/storage/tests/algos/test_dir_iterator.py b/swh/storage/tests/algos/test_dir_iterator.py index 4b3c4488..9ff95788 100644 --- a/swh/storage/tests/algos/test_dir_iterator.py +++ b/swh/storage/tests/algos/test_dir_iterator.py @@ -1,153 +1,165 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from unittest.mock import patch from swh.model.from_disk import DentryPerms from 
 from swh.model.hashutil import MultiHash, hash_to_bytes
-from swh.model.identifiers import directory_identifier
+from swh.model.model import Directory
 from swh.storage.algos.dir_iterators import dir_iterator

 # flake8: noqa


 class DirectoryModel(object):
     """
     Quick and dirty directory model to ease the writing of directory iterators
     and revision trees differential tests.
     """

     def __init__(self, name=""):
         self.data = {}
         self.data["name"] = name
         self.data["perms"] = DentryPerms.directory
         self.data["type"] = "dir"
         self.data["entries"] = []
         self.data["entry_idx"] = {}

     def __getitem__(self, item):
         if item == "target":
-            return hash_to_bytes(directory_identifier(self))
+            return Directory.from_dict(
+                {
+                    "entries": [
+                        {
+                            "name": entry["name"],
+                            "target": entry["target"],
+                            "type": entry["type"],
+                            "perms": entry["perms"],
+                        }
+                        for entry in self.data["entries"]
+                    ]
+                }
+            ).id
         else:
             return self.data[item]

     def add_file(self, path, sha1=None):
         path_parts = path.split(b"/")
         sha1 = (
             hash_to_bytes(sha1) if sha1 else MultiHash.from_data(path).digest()["sha1"]
         )
         if len(path_parts) == 1:
             self["entry_idx"][path] = len(self["entries"])
             self["entries"].append(
                 {
                     "target": sha1,
                     "name": path,
                     "perms": DentryPerms.content,
                     "type": "file",
                 }
             )
         else:
             if not path_parts[0] in self["entry_idx"]:
                 self["entry_idx"][path_parts[0]] = len(self["entries"])
                 self["entries"].append(DirectoryModel(path_parts[0]))
             if path_parts[1]:
                 dir_idx = self["entry_idx"][path_parts[0]]
                 self["entries"][dir_idx].add_file(b"/".join(path_parts[1:]), sha1)

     def get_hash_data(self, entry_hash):
         if self["target"] == entry_hash:
             ret = []
             for e in self["entries"]:
                 ret.append(
                     {
                         "target": e["target"],
                         "name": e["name"],
                         "perms": e["perms"],
                         "type": e["type"],
                     }
                 )
             return ret
         else:
             for e in self["entries"]:
                 if e["type"] == "file" and e["target"] == entry_hash:
                     return e
                 elif e["type"] == "dir":
                     data = e.get_hash_data(entry_hash)
                     if data:
                         return data
             return None

     def get_path_data(self, path):
         path_parts = path.split(b"/")
         entry_idx = self["entry_idx"][path_parts[0]]
         entry = self["entries"][entry_idx]
         if len(path_parts) == 1:
             return {
                 "target": entry["target"],
                 "name": entry["name"],
                 "perms": entry["perms"],
                 "type": entry["type"],
             }
         else:
             return entry.get_path_data(b"/".join(path_parts[1:]))


 @patch("swh.storage.algos.dir_iterators._get_dir")
 class TestDirectoryIterator(unittest.TestCase):
     def check_iterated_paths(self, dir_model, expected_paths_order, mock_get_dir):
         def _get_dir(*args, **kwargs):
             return dir_model.get_hash_data(args[1])

         mock_get_dir.side_effect = _get_dir  # noqa
         paths_order = [e["path"] for e in dir_iterator(None, dir_model["target"])]
         self.assertEqual(paths_order, expected_paths_order)

     def test_dir_iterator_empty_dir(self, mock_get_dir):
         dir_model = DirectoryModel()
         expected_paths_order = []
         self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)

     def test_dir_iterator_no_empty_dirs(self, mock_get_dir):
         dir_model = DirectoryModel()
         dir_model.add_file(b"xyz/gtr/uhb")
         dir_model.add_file(b"bca/ef")
         dir_model.add_file(b"abc/ab")
         dir_model.add_file(b"abc/bc")
         dir_model.add_file(b"xyz/ouy/poi")
         expected_paths_order = [
             b"abc",
             b"abc/ab",
             b"abc/bc",
             b"bca",
             b"bca/ef",
             b"xyz",
             b"xyz/gtr",
             b"xyz/gtr/uhb",
             b"xyz/ouy",
             b"xyz/ouy/poi",
         ]
         self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)

     def test_dir_iterator_with_empty_dirs(self, mock_get_dir):
         dir_model = DirectoryModel()
         dir_model.add_file(b"xyz/gtr/")
         dir_model.add_file(b"bca/ef")
         dir_model.add_file(b"abc/")
         dir_model.add_file(b"xyz/ouy/poi")
         expected_paths_order = [
             b"abc",
             b"bca",
             b"bca/ef",
             b"xyz",
             b"xyz/gtr",
             b"xyz/ouy",
             b"xyz/ouy/poi",
         ]
         self.check_iterated_paths(dir_model, expected_paths_order, mock_get_dir)
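Note on the `directory_identifier` removal: both test modules now derive directory identifiers from `swh.model.model.Directory`. A minimal sketch of the substitution, using a single-entry directory (the entry hash is taken from the tests above and is illustrative only):

    from swh.model.from_disk import DentryPerms
    from swh.model.hashutil import hash_to_bytes
    from swh.model.model import Directory

    directory = Directory.from_dict(
        {
            "entries": [
                {
                    "name": b"file1",
                    "target": hash_to_bytes("ea15f54ca215e7920c60f564315ebb7f911a5204"),
                    "type": "file",
                    "perms": DentryPerms.content,
                }
            ]
        }
    )

    # `.id` is the sha1_git intrinsic identifier, as bytes: the same value
    # hash_to_bytes(directory_identifier(...)) used to return.
    print(directory.id.hex())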