diff --git a/swh/provenance/algos/directory.py b/swh/provenance/algos/directory.py --- a/swh/provenance/algos/directory.py +++ b/swh/provenance/algos/directory.py @@ -50,7 +50,7 @@ """Flatten the known directories from ``start_id`` to ``end_id``.""" current = start_id while current < end_id: - dirs = provenance.storage.directory_iter_not_flattenned( + dirs = provenance.storage.directory_iter_not_flattened( limit=100, start_id=current ) if not dirs: @@ -71,9 +71,9 @@ ) -> None: for directory in directories: # Only flatten directories that are present in the provenance model, but not - # flattenned yet. - flattenned = provenance.directory_already_flattenned(directory) - if flattenned is not None and not flattenned: + # flattened yet. + flattened = provenance.directory_already_flattened(directory) + if flattened is not None and not flattened: directory_flatten( provenance, archive, @@ -104,4 +104,4 @@ for d_child in current.dirs: # Recursively walk the child directory. stack.append((d_child, os.path.join(prefix, d_child.name))) - provenance.directory_flag_as_flattenned(directory) + provenance.directory_flag_as_flattened(directory) diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py --- a/swh/provenance/interface.py +++ b/swh/provenance/interface.py @@ -99,14 +99,14 @@ """ ... - def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]: - """Check if the directory is already flattenned in the provenance model. If the + def directory_already_flattened(self, directory: DirectoryEntry) -> Optional[bool]: + """Check if the directory is already flattened in the provenance model. If the directory is unknown for the model, the methods returns None. """ ... - def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None: - """Mark the directory as flattenned in the provenance model. If the + def directory_flag_as_flattened(self, directory: DirectoryEntry) -> None: + """Mark the directory as flattened in the provenance model. If the directory is unknown for the model, this method has no effect. """ ... diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from datetime import datetime +import hashlib import logging import os from types import TracebackType @@ -252,7 +253,7 @@ ) revs = { - sha1 + sha1: RevisionData(date=None, origin=None) for sha1, date in self.cache["revision"]["data"].items() if sha1 in self.cache["revision"]["added"] and date is not None } @@ -267,7 +268,7 @@ ) paths = { - path + hashlib.sha1(path).digest(): path for _, _, path in self.cache["content_in_revision"] | self.cache["content_in_directory"] | self.cache["directory_in_revision"] @@ -409,7 +410,7 @@ (directory.id, revision.id, path_normalize(path)) ) - def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]: + def directory_already_flattened(self, directory: DirectoryEntry) -> Optional[bool]: cache = self.cache["directory_flatten"] if directory.id not in cache: cache.setdefault(directory.id, None) @@ -421,7 +422,7 @@ self.cache["directory"]["data"][directory.id] = dir.date return cache.get(directory.id) - def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None: + def directory_flag_as_flattened(self, directory: DirectoryEntry) -> None: self.cache["directory_flatten"][directory.id] = True def directory_get_date_in_isochrone_frontier( diff --git a/swh/provenance/storage/interface.py b/swh/provenance/storage/interface.py --- a/swh/provenance/storage/interface.py +++ b/swh/provenance/storage/interface.py @@ -9,7 +9,7 @@ from datetime import datetime import enum from types import TracebackType -from typing import Dict, Generator, Iterable, List, Optional, Set, Type, Union +from typing import Dict, Generator, Iterable, List, Optional, Set, Type from typing_extensions import Protocol, runtime_checkable @@ -136,11 +136,11 @@ """ ... - @remote_api_endpoint("directory_iter_not_flattenned") - def directory_iter_not_flattenned( + @remote_api_endpoint("directory_iter_not_flattened") + def directory_iter_not_flattened( self, limit: int, start_id: Sha1Git ) -> List[Sha1Git]: - """Retrieve the unflattenned directories after ``start_id`` up to ``limit`` entries.""" + """Retrieve the unflattened directories after ``start_id`` up to ``limit`` entries.""" ... @remote_api_endpoint("entity_get_all") @@ -151,12 +151,12 @@ ... @remote_api_endpoint("location_add") - def location_add(self, paths: Iterable[bytes]) -> bool: + def location_add(self, paths: Dict[Sha1Git, bytes]) -> bool: """Register the given `paths` in the storage.""" ... @remote_api_endpoint("location_get_all") - def location_get_all(self) -> Set[bytes]: + def location_get_all(self) -> Dict[Sha1Git, bytes]: """Retrieve all paths present in the provenance model. This method is used only in tests.""" ... @@ -180,9 +180,7 @@ ... @remote_api_endpoint("revision_add") - def revision_add( - self, revs: Union[Iterable[Sha1Git], Dict[Sha1Git, RevisionData]] - ) -> bool: + def revision_add(self, revs: Dict[Sha1Git, RevisionData]) -> bool: """Add revisions identified by sha1 ids, with optional associated date or origin (as paired in `revs`) to the provenance storage. Return a boolean stating if the information was successfully stored. diff --git a/swh/provenance/storage/postgresql.py b/swh/provenance/storage/postgresql.py --- a/swh/provenance/storage/postgresql.py +++ b/swh/provenance/storage/postgresql.py @@ -8,10 +8,11 @@ from contextlib import contextmanager from datetime import datetime from functools import wraps +from hashlib import sha1 import itertools import logging from types import TracebackType -from typing import Dict, Generator, Iterable, List, Optional, Set, Type, Union +from typing import Dict, Generator, Iterable, List, Optional, Set, Type import psycopg2.extensions import psycopg2.extras @@ -198,9 +199,9 @@ return result @statsd.timed( - metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattenned"} + metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattened"} ) - def directory_iter_not_flattenned( + def directory_iter_not_flattened( self, limit: int, start_id: Sha1Git ) -> List[Sha1Git]: sql = """ @@ -219,9 +220,9 @@ @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "location_add"}) @handle_raise_on_commit - def location_add(self, paths: Iterable[bytes]) -> bool: + def location_add(self, paths: Dict[Sha1Git, bytes]) -> bool: if self.with_path(): - values = [(path,) for path in paths] + values = [(path,) for path in paths.values()] if values: sql = """ INSERT INTO location(path) VALUES %s @@ -235,10 +236,10 @@ return True @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "location_get_all"}) - def location_get_all(self) -> Set[bytes]: + def location_get_all(self) -> Dict[Sha1Git, bytes]: with self.transaction(readonly=True) as cursor: cursor.execute("SELECT location.path AS path FROM location") - return {row["path"] for row in cursor} + return {sha1(row["path"]).digest(): row["path"] for row in cursor} @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "origin_add"}) @handle_raise_on_commit @@ -284,14 +285,9 @@ @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "revision_add"}) @handle_raise_on_commit - def revision_add( - self, revs: Union[Iterable[Sha1Git], Dict[Sha1Git, RevisionData]] - ) -> bool: - if isinstance(revs, dict): + def revision_add(self, revs: Dict[Sha1Git, RevisionData]) -> bool: + if revs: data = [(sha1, rev.date, rev.origin) for sha1, rev in revs.items()] - else: - data = [(sha1, None, None) for sha1 in revs] - if data: sql = """ INSERT INTO revision(sha1, date, origin) (SELECT V.rev AS sha1, V.date::timestamptz AS date, O.id AS origin diff --git a/swh/provenance/storage/rabbitmq/client.py b/swh/provenance/storage/rabbitmq/client.py --- a/swh/provenance/storage/rabbitmq/client.py +++ b/swh/provenance/storage/rabbitmq/client.py @@ -71,6 +71,7 @@ if isinstance(data, dict): items = set(data.items()) else: + # TODO this is probably not used any more items = {(item,) for item in data} for id, *rest in items: key = ProvenanceStorageRabbitMQServer.get_routing_key(id, meth_name) diff --git a/swh/provenance/storage/rabbitmq/server.py b/swh/provenance/storage/rabbitmq/server.py --- a/swh/provenance/storage/rabbitmq/server.py +++ b/swh/provenance/storage/rabbitmq/server.py @@ -480,7 +480,7 @@ elif meth_name == "directory_add": return resolve_directory elif meth_name == "location_add": - return lambda data: set(data) # just remove duplicates + return lambda data: dict(data) elif meth_name == "origin_add": return lambda data: dict(data) # last processed value is good enough elif meth_name == "revision_add": diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py --- a/swh/provenance/tests/test_directory_flatten.py +++ b/swh/provenance/tests/test_directory_flatten.py @@ -52,8 +52,8 @@ # this query forces the directory date to be retrieved from the storage and cached # (otherwise, the flush below won't update the directory flatten flag) - flattenned = provenance.directory_already_flattenned(directory) - assert flattenned is not None and not flattenned + flattened = provenance.directory_already_flattened(directory) + assert flattened is not None and not flattened return date, directory, content1, content2 diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py --- a/swh/provenance/tests/test_provenance_storage.py +++ b/swh/provenance/tests/test_provenance_storage.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from datetime import datetime, timezone +import hashlib import inspect import os from typing import Any, Dict, Iterable, Optional, Set, Tuple @@ -99,13 +100,17 @@ # Add all names of entries present in the directories of the current repo as paths # to the storage. Then check that the returned results when querying are the same. - paths = {entry["name"] for dir in data["directory"] for entry in dir["entries"]} + paths = { + hashlib.sha1(entry["name"]).digest(): entry["name"] + for dir in data["directory"] + for entry in dir["entries"] + } assert provenance_storage.location_add(paths) if provenance_storage.with_path(): assert provenance_storage.location_get_all() == paths else: - assert provenance_storage.location_get_all() == set() + assert not provenance_storage.location_get_all() @pytest.mark.origin_layer def test_provenance_storage_origin( @@ -143,22 +148,22 @@ # Origin must be inserted in advance. assert provenance_storage.origin_add({origin.id: origin.url}) - revs = {rev["id"] for idx, rev in enumerate(data["revision"]) if idx % 6 == 0} + revs = {rev["id"] for idx, rev in enumerate(data["revision"])} rev_data = { rev["id"]: RevisionData( date=ts2dt(rev["date"]) if idx % 2 != 0 else None, origin=origin.id if idx % 3 != 0 else None, ) for idx, rev in enumerate(data["revision"]) - if idx % 6 != 0 } assert revs - assert provenance_storage.revision_add(revs) assert provenance_storage.revision_add(rev_data) - assert provenance_storage.revision_get(set(rev_data.keys())) == rev_data - assert provenance_storage.entity_get_all(EntityType.REVISION) == revs | set( - rev_data.keys() - ) + assert provenance_storage.revision_get(set(rev_data.keys())) == { + k: v + for (k, v) in rev_data.items() + if v.date is not None or v.origin is not None + } + assert provenance_storage.entity_get_all(EntityType.REVISION) == set(rev_data) def test_provenance_storage_relation_revision_layer( self, @@ -476,7 +481,12 @@ assert entity_add(storage, EntityType(dst), dsts) if storage.with_path(): assert storage.location_add( - {rel.path for rels in data.values() for rel in rels if rel.path is not None} + { + hashlib.sha1(rel.path).digest(): rel.path + for rels in data.values() + for rel in rels + if rel.path is not None + } ) assert data diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py --- a/swh/provenance/tests/test_revision_content_layer.py +++ b/swh/provenance/tests/test_revision_content_layer.py @@ -223,7 +223,7 @@ ).difference(prev_directories) ] for directory in directories: - assert not provenance.directory_already_flattenned(directory) + assert not provenance.directory_already_flattened(directory) directory_add(provenance, archive, directories) # each "entry" in the synth file is one new revision @@ -317,9 +317,9 @@ rows["location"] |= set(x["path"].encode() for x in synth_rev["R_C"]) rows["location"] |= set(x["path"].encode() for x in synth_rev["D_C"]) rows["location"] |= set(x["path"].encode() for x in synth_rev["R_D"]) - assert rows["location"] == provenance.storage.location_get_all(), synth_rev[ - "msg" - ] + assert rows["location"] == set( + provenance.storage.location_get_all().values() + ), synth_rev["msg"] @pytest.mark.parametrize(