diff --git a/sql/upgrades/002.sql b/sql/upgrades/002.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/002.sql
@@ -0,0 +1,17 @@
+-- SWH DB schema upgrade
+-- from_version: 1
+-- to_version: 2
+-- description: add flag to acknowledge directories' flattening
+
+
+insert into dbversion(version, release, description)
+    values(2, now(), 'Work In Progress');
+
+alter table content
+    alter column date set not null;
+
+alter table directory
+    add column flat boolean not null default false;
+
+alter table directory
+    alter column date set not null;
diff --git a/swh/provenance/api/serializers.py b/swh/provenance/api/serializers.py
--- a/swh/provenance/api/serializers.py
+++ b/swh/provenance/api/serializers.py
@@ -33,11 +33,12 @@

 ENCODERS: List[Tuple[type, str, Callable]] = [
+    (interface.DirectoryData, "dataclass", _encode_dataclass),
+    (interface.EntityType, "enum", _encode_enum),
     (interface.ProvenanceResult, "dataclass", _encode_dataclass),
     (interface.RelationData, "dataclass", _encode_dataclass),
-    (interface.RevisionData, "dataclass", _encode_dataclass),
-    (interface.EntityType, "enum", _encode_enum),
     (interface.RelationType, "enum", _encode_enum),
+    (interface.RevisionData, "dataclass", _encode_dataclass),
     (set, "set", list),
 ]
diff --git a/swh/provenance/api/server.py b/swh/provenance/api/server.py
--- a/swh/provenance/api/server.py
+++ b/swh/provenance/api/server.py
@@ -31,7 +31,13 @@
 from swh.model.model import Sha1Git

 from .. import get_provenance_storage
-from ..interface import EntityType, RelationData, RelationType, RevisionData
+from ..interface import (
+    DirectoryData,
+    EntityType,
+    RelationData,
+    RelationType,
+    RevisionData,
+)
 from ..util import path_id
 from .serializers import DECODERS, ENCODERS
@@ -53,21 +59,31 @@
     pass


-def resolve_dates(
-    dates: Iterable[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]]
-) -> Dict[Sha1Git, Optional[datetime]]:
-    result: Dict[Sha1Git, Optional[datetime]] = {}
-    for row in dates:
-        sha1 = row[0]
-        date = (
-            cast(Tuple[Sha1Git, Optional[datetime]], row)[1] if len(row) > 1 else None
-        )
-        known = result.setdefault(sha1, None)
-        if date is not None and (known is None or date < known):
+def resolve_dates(dates: Iterable[Tuple[Sha1Git, datetime]]) -> Dict[Sha1Git, datetime]:
+    result: Dict[Sha1Git, datetime] = {}
+    for sha1, date in dates:
+        known = result.setdefault(sha1, date)
+        if date < known:
             result[sha1] = date
     return result


+def resolve_directory(
+    data: Iterable[Tuple[Sha1Git, DirectoryData]]
+) -> Dict[Sha1Git, DirectoryData]:
+    result: Dict[Sha1Git, DirectoryData] = {}
+    for sha1, dir in data:
+        known = result.setdefault(sha1, dir)
+        value = known
+        if dir.date < known.date:
+            value = DirectoryData(date=dir.date, flat=value.flat)
+        if dir.flat:
+            value = DirectoryData(date=value.date, flat=dir.flat)
+        if value != known:
+            result[sha1] = value
+    return result
+
+
 def resolve_revision(
     data: Iterable[Union[Tuple[Sha1Git, RevisionData], Tuple[Sha1Git]]]
 ) -> Dict[Sha1Git, RevisionData]:
@@ -458,8 +474,10 @@

     @staticmethod
     def get_conflicts_func(meth_name: str) -> Callable[[Iterable[Any]], Any]:
-        if meth_name in ["content_add", "directory_add"]:
+        if meth_name == "content_add":
             return resolve_dates
+        elif meth_name == "directory_add":
+            return resolve_directory
         elif meth_name == "location_add":
             return lambda data: set(data)  # just remove duplicates
         elif meth_name == "origin_add":
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -43,6 +43,18 @@
     path: bytes


+@dataclass(eq=True, frozen=True)
+class DirectoryData:
+    """Object representing the data associated to a directory in the provenance model,
+    where `date` is the date of the directory in the isochrone frontier, and `flat` is
+    a flag acknowledging that a flat model for the elements outside the frontier has
+    already been created.
+    """
+
+    date: datetime
+    flat: bool
+
+
 @dataclass(eq=True, frozen=True)
 class RevisionData:
     """Object representing the data associated to a revision in the provenance model,
@@ -85,11 +97,9 @@
         ...

     @remote_api_endpoint("content_add")
-    def content_add(
-        self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
-    ) -> bool:
-        """Add blobs identified by sha1 ids, with an optional associated date (as paired
-        in `cnts`) to the provenance storage. Return a boolean stating whether the
+    def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
+        """Add blobs identified by sha1 ids, with an associated date (as paired in
+        `cnts`) to the provenance storage. Return a boolean stating whether the
         information was successfully stored.
         """
         ...
@@ -108,25 +118,23 @@

     @remote_api_endpoint("content_get")
     def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
-        """Retrieve the associated date for each blob sha1 in `ids`. If some blob has
-        no associated date, it is not present in the resulting dictionary.
-        """
+        """Retrieve the associated date for each blob sha1 in `ids`."""
         ...

     @remote_api_endpoint("directory_add")
-    def directory_add(
-        self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
-    ) -> bool:
-        """Add directories identified by sha1 ids, with an optional associated date (as
-        paired in `dirs`) to the provenance storage. Return a boolean stating if the
-        information was successfully stored.
+    def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
+        """Add directories identified by sha1 ids, with associated date and flatten
+        flag (as paired in `dirs`) to the provenance storage. A False flag never
+        overrides a True value already present (flags are merged with a logical OR).
+        Return a boolean stating if the information was successfully stored.
         """
         ...

     @remote_api_endpoint("directory_get")
-    def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
-        """Retrieve the associated date for each directory sha1 in `ids`. If some
-        directory has no associated date, it is not present in the resulting dictionary.
+    def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
+        """Retrieve the associated date and flatten flag for each directory sha1 in
+        `ids`. If some directory has no associated date, it is not present in the
+        resulting dictionary.
         """
         ...
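Reviewer note (not part of the patch): the interface above replaces the Optional-date unions with a required DirectoryData record, and the backends below merge conflicting writes by keeping the earliest date and OR-ing the flat flags. A minimal usage sketch of the new API, assuming only types from this patch; upsert_directory is a hypothetical helper, not code from the patch:

    from datetime import datetime

    from swh.model.model import Sha1Git
    from swh.provenance.interface import DirectoryData, ProvenanceStorageInterface

    def upsert_directory(
        storage: ProvenanceStorageInterface, sha1: Sha1Git, date: datetime
    ) -> DirectoryData:
        # Register (or merge) the directory date; flat is left False here, so
        # any True value already present in the storage is preserved.
        storage.directory_add({sha1: DirectoryData(date=date, flat=False)})
        # Read back the stored record: earliest date wins, flags are OR-ed.
        return storage.directory_get([sha1])[sha1]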
diff --git a/swh/provenance/mongo/backend.py b/swh/provenance/mongo/backend.py
--- a/swh/provenance/mongo/backend.py
+++ b/swh/provenance/mongo/backend.py
@@ -18,6 +18,7 @@
 from swh.model.model import Sha1Git

 from ..interface import (
+    DirectoryData,
     EntityType,
     ProvenanceResult,
     ProvenanceStorageInterface,
@@ -52,21 +53,18 @@
         self.db.client.close()

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_add"})
-    def content_add(
-        self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
-    ) -> bool:
-        data = cnts if isinstance(cnts, dict) else dict.fromkeys(cnts)
+    def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
         existing = {
             x["sha1"]: x
             for x in self.db.content.find(
-                {"sha1": {"$in": list(data)}}, {"sha1": 1, "ts": 1, "_id": 1}
+                {"sha1": {"$in": list(cnts)}}, {"sha1": 1, "ts": 1, "_id": 1}
             )
         }
-        for sha1, date in data.items():
-            ts = datetime.timestamp(date) if date is not None else None
+        for sha1, date in cnts.items():
+            ts = datetime.timestamp(date)
             if sha1 in existing:
                 cnt = existing[sha1]
-                if ts is not None and (cnt["ts"] is None or ts < cnt["ts"]):
+                if ts < cnt["ts"]:
                     self.db.content.update_one(
                         {"_id": cnt["_id"]}, {"$set": {"ts": ts}}
                     )
@@ -173,41 +171,43 @@
         return {
             x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
             for x in self.db.content.find(
-                {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
-                {"sha1": 1, "ts": 1, "_id": 0},
+                {"sha1": {"$in": list(ids)}}, {"sha1": 1, "ts": 1, "_id": 0}
             )
         }

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_add"})
-    def directory_add(
-        self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
-    ) -> bool:
-        data = dirs if isinstance(dirs, dict) else dict.fromkeys(dirs)
+    def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
         existing = {
             x["sha1"]: x
             for x in self.db.directory.find(
-                {"sha1": {"$in": list(data)}}, {"sha1": 1, "ts": 1, "_id": 1}
+                {"sha1": {"$in": list(dirs)}}, {"sha1": 1, "ts": 1, "flat": 1, "_id": 1}
             )
         }
-        for sha1, date in data.items():
-            ts = datetime.timestamp(date) if date is not None else None
+        for sha1, info in dirs.items():
+            ts = datetime.timestamp(info.date)
             if sha1 in existing:
                 dir = existing[sha1]
-                if ts is not None and (dir["ts"] is None or ts < dir["ts"]):
+                if ts >= dir["ts"]:
+                    ts = dir["ts"]
+                flat = info.flat or dir["flat"]
+                if ts != dir["ts"] or flat != dir["flat"]:
                     self.db.directory.update_one(
-                        {"_id": dir["_id"]}, {"$set": {"ts": ts}}
+                        {"_id": dir["_id"]}, {"$set": {"ts": ts, "flat": flat}}
                     )
             else:
-                self.db.directory.insert_one({"sha1": sha1, "ts": ts, "revision": {}})
+                self.db.directory.insert_one(
+                    {"sha1": sha1, "ts": ts, "revision": {}, "flat": info.flat}
+                )
         return True

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_get"})
-    def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
+    def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
         return {
-            x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
+            x["sha1"]: DirectoryData(
+                date=datetime.fromtimestamp(x["ts"], timezone.utc), flat=x["flat"]
+            )
             for x in self.db.directory.find(
-                {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
-                {"sha1": 1, "ts": 1, "_id": 0},
+                {"sha1": {"$in": list(ids)}}, {"sha1": 1, "ts": 1, "flat": 1, "_id": 0}
            )
         }
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -14,13 +14,13 @@
 import psycopg2.extensions
 import psycopg2.extras
-from typing_extensions import Literal

 from swh.core.db import BaseDb
 from swh.core.statsd import statsd
 from swh.model.model import Sha1Git

 from ..interface import (
+    DirectoryData,
     EntityType,
     ProvenanceResult,
     ProvenanceStorageInterface,
@@ -82,10 +82,26 @@
         self.conn.close()

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_add"})
-    def content_add(
-        self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
-    ) -> bool:
-        return self._entity_set_date("content", cnts)
+    def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
+        try:
+            if cnts:
+                sql = """
+                    INSERT INTO content(sha1, date) VALUES %s
+                    ON CONFLICT (sha1) DO
+                    UPDATE SET date=LEAST(EXCLUDED.date,content.date)
+                    """
+                page_size = self.page_size or len(cnts)
+                with self.transaction() as cursor:
+                    psycopg2.extras.execute_values(
+                        cursor, sql, argslist=cnts.items(), page_size=page_size
+                    )
+            return True
+        except:  # noqa: E722
+            # Unexpected error occurred, rollback all changes and log message
+            LOGGER.exception("Unexpected error")
+            if self.raise_on_commit:
+                raise
+        return False

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_find_first"})
     def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
@@ -106,17 +122,67 @@

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_get"})
     def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
-        return self._entity_get_date("content", ids)
+        dates: Dict[Sha1Git, datetime] = {}
+        sha1s = tuple(ids)
+        if sha1s:
+            # TODO: consider splitting this query in several ones if sha1s is too big!
+            values = ", ".join(itertools.repeat("%s", len(sha1s)))
+            sql = f"""
+                SELECT sha1, date
+                FROM content
+                WHERE sha1 IN ({values})
+                    AND date IS NOT NULL
+                """
+            with self.transaction(readonly=True) as cursor:
+                cursor.execute(query=sql, vars=sha1s)
+                dates.update((row["sha1"], row["date"]) for row in cursor)
+        return dates

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_add"})
-    def directory_add(
-        self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
-    ) -> bool:
-        return self._entity_set_date("directory", dirs)
+    def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
+        data = [(sha1, rev.date, rev.flat) for sha1, rev in dirs.items()]
+        try:
+            if data:
+                sql = """
+                    INSERT INTO directory(sha1, date, flat) VALUES %s
+                    ON CONFLICT (sha1) DO
+                    UPDATE SET
+                        date=LEAST(EXCLUDED.date, directory.date),
+                        flat=(EXCLUDED.flat OR directory.flat)
+                    """
+                page_size = self.page_size or len(data)
+                with self.transaction() as cursor:
+                    psycopg2.extras.execute_values(
+                        cur=cursor, sql=sql, argslist=data, page_size=page_size
+                    )
+            return True
+        except:  # noqa: E722
+            # Unexpected error occurred, rollback all changes and log message
+            LOGGER.exception("Unexpected error")
+            if self.raise_on_commit:
+                raise
+        return False

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_get"})
-    def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
-        return self._entity_get_date("directory", ids)
+    def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
+        result: Dict[Sha1Git, DirectoryData] = {}
+        sha1s = tuple(ids)
+        if sha1s:
+            # TODO: consider splitting this query in several ones if sha1s is too big!
+            values = ", ".join(itertools.repeat("%s", len(sha1s)))
+            sql = f"""
+                SELECT sha1, date, flat
+                FROM directory
+                WHERE sha1 IN ({values})
+                    AND date IS NOT NULL
+                """
+            with self.transaction(readonly=True) as cursor:
+                cursor.execute(query=sql, vars=sha1s)
+                result.update(
+                    (row["sha1"], DirectoryData(date=row["date"], flat=row["flat"]))
+                    for row in cursor
+                )
+        return result

     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
@@ -299,53 +365,6 @@
     ) -> Dict[Sha1Git, Set[RelationData]]:
         return self._relation_get(relation, None)

-    def _entity_get_date(
-        self,
-        entity: Literal["content", "directory", "revision"],
-        ids: Iterable[Sha1Git],
-    ) -> Dict[Sha1Git, datetime]:
-        dates: Dict[Sha1Git, datetime] = {}
-        sha1s = tuple(ids)
-        if sha1s:
-            # TODO: consider splitting this query in several ones if sha1s is too big!
-            values = ", ".join(itertools.repeat("%s", len(sha1s)))
-            sql = f"""
-                SELECT sha1, date
-                FROM {entity}
-                WHERE sha1 IN ({values})
-                    AND date IS NOT NULL
-                """
-            with self.transaction(readonly=True) as cursor:
-                cursor.execute(query=sql, vars=sha1s)
-                dates.update((row["sha1"], row["date"]) for row in cursor)
-        return dates
-
-    def _entity_set_date(
-        self,
-        entity: Literal["content", "directory"],
-        dates: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]],
-    ) -> bool:
-        data = dates if isinstance(dates, dict) else dict.fromkeys(dates)
-        try:
-            if data:
-                sql = f"""
-                    INSERT INTO {entity}(sha1, date) VALUES %s
-                    ON CONFLICT (sha1) DO
-                    UPDATE SET date=LEAST(EXCLUDED.date,{entity}.date)
-                    """
-                page_size = self.page_size or len(data)
-                with self.transaction() as cursor:
-                    psycopg2.extras.execute_values(
-                        cursor, sql, argslist=data.items(), page_size=page_size
-                    )
-            return True
-        except:  # noqa: E722
-            # Unexpected error occurred, rollback all changes and log message
-            LOGGER.exception("Unexpected error")
-            if self.raise_on_commit:
-                raise
-        return False
-
     def _relation_get(
         self,
         relation: RelationType,
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -15,6 +15,7 @@
 from swh.model.model import Sha1Git

 from .interface import (
+    DirectoryData,
     ProvenanceInterface,
     ProvenanceResult,
     ProvenanceStorageInterface,
@@ -49,6 +50,7 @@
 class ProvenanceCache(TypedDict):
     content: DatetimeCache
     directory: DatetimeCache
+    directory_flatten: Dict[Sha1Git, Optional[bool]]
     revision: DatetimeCache
     # below are insertion caches only
     content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
@@ -65,6 +67,7 @@
     return ProvenanceCache(
         content=DatetimeCache(data={}, added=set()),
         directory=DatetimeCache(data={}, added=set()),
+        directory_flatten={},
        revision=DatetimeCache(data={}, added=set()),
         content_in_revision=set(),
         content_in_directory=set(),
@@ -194,7 +197,8 @@
         # properly resolve any internal reference if needed. Content and directory
         # entries may safely be registered with their associated dates. In contrast,
         # revision entries should be registered without date, as it is used to
-        # acknowledge that the flushing was successful.
+        # acknowledge that the flushing was successful. Also, directories are
+        # registered with their flatten flag not set.
         cnt_dates = {
             sha1: date
             for sha1, date in self.cache["content"]["data"].items()
@@ -211,7 +215,7 @@
             )

         dir_dates = {
-            sha1: date
+            sha1: DirectoryData(date=date, flat=False)
             for sha1, date in self.cache["directory"]["data"].items()
             if sha1 in self.cache["directory"]["added"] and date is not None
         }
@@ -306,8 +310,27 @@
             RelationType.DIR_IN_REV,
         )

-        # After relations, dates for the revisions can be safely set, acknowledging that
-        # these revisions won't need to be reprocessed in case of failure.
+        # After relations, flatten flags for directories can be safely set (if
+        # applicable), acknowledging those directories that have already been
+        # flattened. Similarly, dates for the revisions are set to acknowledge that
+        # these revisions won't need to be reprocessed in case of failure.
+        dir_acks = {
+            sha1: DirectoryData(
+                date=date, flat=self.cache["directory_flatten"].get(sha1) or False
+            )
+            for sha1, date in self.cache["directory"]["data"].items()
+            if sha1 in self.cache["directory"]["added"] and date is not None
+        }
+        if dir_acks:
+            while not self.storage.directory_add(dir_acks):
+                statsd.increment(
+                    metric=BACKEND_OPERATIONS_METRIC,
+                    tags={"method": "flush_revision_content_retry_directory_ack"},
+                )
+                LOGGER.warning(
+                    "Unable to write directory dates to the storage. Retrying..."
+                )
+
         rev_dates = {
             sha1: RevisionData(date=date, origin=None)
             for sha1, date in self.cache["revision"]["data"].items()
@@ -388,14 +411,22 @@
         cache = self.cache[entity]
         missing_ids = set(id for id in ids if id not in cache)
         if missing_ids:
-            if entity == "revision":
-                updated = {
-                    id: rev.date
-                    for id, rev in self.storage.revision_get(missing_ids).items()
-                }
-            else:
-                updated = getattr(self.storage, f"{entity}_get")(missing_ids)
-            cache["data"].update(updated)
+            if entity == "content":
+                cache["data"].update(self.storage.content_get(missing_ids))
+            elif entity == "directory":
+                cache["data"].update(
+                    {
+                        id: dir.date
+                        for id, dir in self.storage.directory_get(missing_ids).items()
+                    }
+                )
+            elif entity == "revision":
+                cache["data"].update(
+                    {
+                        id: rev.date
+                        for id, rev in self.storage.revision_get(missing_ids).items()
+                    }
+                )
         dates: Dict[Sha1Git, datetime] = {}
         for sha1 in ids:
             date = cache["data"].setdefault(sha1, None)
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -16,7 +16,7 @@

 -- latest schema version
 insert into dbversion(version, release, description)
-    values(1, now(), 'Work In Progress');
+    values(2, now(), 'Work In Progress');

 -- a Git object ID, i.e., a Git-style salted SHA1 checksum
 create domain sha1_git as bytea check (length(value) = 20);
@@ -35,9 +35,9 @@
 -- entity tables
 create table content
 (
-    id      bigserial primary key,      -- internal identifier of the content blob
-    sha1    sha1_git unique not null,   -- intrinsic identifier of the content blob
-    date    timestamptz                 -- timestamp of the revision where the blob appears early
+    id      bigserial primary key,     -- internal identifier of the content blob
+    sha1    sha1_git unique not null,  -- intrinsic identifier of the content blob
+    date    timestamptz not null       -- timestamp of the revision where the blob appears early
 );
 comment on column content.id is 'Content internal identifier';
 comment on column content.sha1 is 'Content intrinsic identifier';
@@ -45,9 +45,10 @@

 create table directory
 (
-    id      bigserial primary key,      -- internal identifier of the directory appearing in an isochrone inner frontier
-    sha1    sha1_git unique not null,   -- intrinsic identifier of the directory
-    date    timestamptz                 -- max timestamp among those of the directory children's
+    id      bigserial primary key,     -- internal identifier of the directory appearing in an isochrone inner frontier
+    sha1    sha1_git unique not null,  -- intrinsic identifier of the directory
+    date    timestamptz not null,      -- max timestamp among those of the directory's children
+    flat    boolean not null default false  -- flag acknowledging if the directory is flattened in the model
 );
 comment on column directory.id is 'Directory internal identifier';
 comment on column directory.sha1 is 'Directory intrinsic identifier';
@@ -55,10 +56,10 @@

 create table revision
 (
-    id      bigserial primary key,      -- internal identifier of the revision
-    sha1    sha1_git unique not null,   -- intrinsic identifier of the revision
-    date    timestamptz,                -- timestamp of the revision
-    origin  bigint                      -- id of the preferred origin
+    id      bigserial primary key,     -- internal identifier of the revision
+    sha1    sha1_git unique not null,  -- intrinsic identifier of the revision
+    date    timestamptz,               -- timestamp of the revision
+    origin  bigint                     -- id of the preferred origin
     -- foreign key (origin) references origin (id)
 );
 comment on column revision.id is 'Revision internal identifier';
@@ -68,17 +69,17 @@

 create table location
 (
-    id      bigserial primary key,      -- internal identifier of the location
-    path    unix_path unique not null   -- path to the location
+    id      bigserial primary key,     -- internal identifier of the location
+    path    unix_path unique not null  -- path to the location
 );
 comment on column location.id is 'Location internal identifier';
 comment on column location.path is 'Path to the location';

 create table origin
 (
-    id      bigserial primary key,      -- internal identifier of the origin
-    sha1    sha1_git unique not null,   -- intrinsic identifier of the origin
-    url     text unique not null        -- url of the origin
+    id      bigserial primary key,     -- internal identifier of the origin
+    sha1    sha1_git unique not null,  -- intrinsic identifier of the origin
+    url     text unique not null       -- url of the origin
 );
 comment on column origin.id is 'Origin internal identifier';
 comment on column origin.sha1 is 'Origin intrinsic identifier';
@@ -87,13 +88,13 @@

 -- relation tables
 create table content_in_revision
 (
-    content  bigint not null,           -- internal identifier of the content blob
+    content  bigint not null,          -- internal identifier of the content blob
 \if :dbflavor_norm
-    revision bigint not null,           -- internal identifier of the revision where the blob appears for the first time
-    location bigint                     -- location of the content relative to the revision's root directory
+    revision bigint not null,          -- internal identifier of the revision where the blob appears for the first time
+    location bigint                    -- location of the content relative to the revision's root directory
 \else
-    revision bigint[],                  -- internal identifiers of the revisions where the blob appears for the first time
-    location bigint[]                   -- locations of the content relative to the revisions' root directory
+    revision bigint[],                 -- internal identifiers of the revisions where the blob appears for the first time
+    location bigint[]                  -- locations of the content relative to the revisions' root directory
 \endif
     -- foreign key (content) references content (id),
     -- foreign key (revision) references revision (id),
@@ -109,13 +110,13 @@

 create table content_in_directory
 (
-    content   bigint not null,          -- internal identifier of the content blob
+    content   bigint not null,         -- internal identifier of the content blob
 \if :dbflavor_norm
-    directory bigint not null,          -- internal identifier of the directory containing the blob
-    location  bigint                    -- location of the content relative to its parent directory in the isochrone frontier
+    directory bigint not null,         -- internal identifier of the directory containing the blob
+    location  bigint                   -- location of the content relative to its parent directory in the isochrone frontier
 \else
-    directory bigint[],                 -- internal reference of the directories containing the blob
-    location  bigint[]                  -- locations of the content relative to its parent directories in the isochrone frontier
+    directory bigint[],                -- internal reference of the directories containing the blob
+    location  bigint[]                 -- locations of the content relative to its parent directories in the isochrone frontier
 \endif
     -- foreign key (content) references content (id),
     -- foreign key (directory) references directory (id),
@@ -131,13 +132,13 @@

 create table directory_in_revision
 (
-    directory bigint not null,          -- internal identifier of the directory appearing in the revision
+    directory bigint not null,         -- internal identifier of the directory appearing in the revision
 \if :dbflavor_norm
-    revision bigint not null,           -- internal identifier of the revision containing the directory
-    location bigint                     -- location of the directory relative to the revision's root directory
+    revision bigint not null,          -- internal identifier of the revision containing the directory
+    location bigint                    -- location of the directory relative to the revision's root directory
 \else
-    revision bigint[],                  -- internal identifiers of the revisions containing the directory
-    location bigint[]                   -- locations of the directory relative to the revisions' root directory
+    revision bigint[],                 -- internal identifiers of the revisions containing the directory
+    location bigint[]                  -- locations of the directory relative to the revisions' root directory
 \endif
     -- foreign key (directory) references directory (id),
     -- foreign key (revision) references revision (id),
@@ -153,8 +154,8 @@

 create table revision_in_origin
 (
-    revision bigint not null,           -- internal identifier of the revision poined by the origin
-    origin   bigint not null            -- internal identifier of the origin that points to the revision
+    revision bigint not null,          -- internal identifier of the revision pointed to by the origin
+    origin   bigint not null           -- internal identifier of the origin that points to the revision
     -- foreign key (revision) references revision (id),
     -- foreign key (origin) references origin (id)
 );
@@ -163,8 +164,8 @@

 create table revision_before_revision
 (
-    prev    bigserial not null,         -- internal identifier of the source revision
-    next    bigserial not null          -- internal identifier of the destination revision
+    prev    bigserial not null,        -- internal identifier of the source revision
+    next    bigserial not null         -- internal identifier of the destination revision
     -- foreign key (prev) references revision (id),
     -- foreign key (next) references revision (id)
 );
diff --git a/swh/provenance/tests/test_conflict_resolution.py b/swh/provenance/tests/test_conflict_resolution.py
--- a/swh/provenance/tests/test_conflict_resolution.py
+++ b/swh/provenance/tests/test_conflict_resolution.py
@@ -4,22 +4,29 @@
 # See top-level LICENSE file for more information

 from datetime import datetime
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union

 from swh.model.hashutil import hash_to_bytes
 from swh.model.model import Sha1Git
-from swh.provenance.api.server import resolve_dates, resolve_relation, resolve_revision
-from swh.provenance.interface import RelationData, RevisionData
+from swh.provenance.api.server import (
+    resolve_dates,
+    resolve_directory,
+    resolve_relation,
+    resolve_revision,
+)
+from swh.provenance.interface import DirectoryData, RelationData, RevisionData


 def test_resolve_dates() -> None:
-    items: List[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]] = [
-        (hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),),
+    items: List[Tuple[Sha1Git, datetime]] = [
+        (
+            hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+            datetime.fromtimestamp(1000000001),
+        ),
         (
             hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
             datetime.fromtimestamp(1000000000),
         ),
-        (hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), None),
     ]
     assert resolve_dates(items) == {
         hash_to_bytes(
@@ -28,21 +35,25 @@
     }


-def test_resolve_dates_keep_min() -> None:
-    items: List[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]] = [
+def test_resolve_directory() -> None:
+    items: List[Tuple[Sha1Git, DirectoryData]] = [
         (
-            hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
-            datetime.fromtimestamp(1000000001),
+            hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+            DirectoryData(date=datetime.fromtimestamp(1000000002), flat=False),
         ),
         (
-            hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
-            datetime.fromtimestamp(1000000000),
+            hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+            DirectoryData(date=datetime.fromtimestamp(1000000001), flat=True),
+        ),
+        (
+            hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+            DirectoryData(date=datetime.fromtimestamp(1000000000), flat=False),
         ),
     ]
-    assert resolve_dates(items) == {
-        hash_to_bytes(
-            "20329687bb9c1231a7e05afe86160343ad49b494"
-        ): datetime.fromtimestamp(1000000000)
+    assert resolve_directory(items) == {
+        hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"): DirectoryData(
+            date=datetime.fromtimestamp(1000000000), flat=True
+        )
     }
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -12,6 +12,7 @@
 from swh.model.model import Origin, Sha1Git
 from swh.provenance.archive import ArchiveInterface
 from swh.provenance.interface import (
+    DirectoryData,
     EntityType,
     ProvenanceInterface,
     ProvenanceResult,
@@ -38,17 +39,12 @@

     # Add all content present in the current repo to the storage, just assigning their
     # creation dates. Then check that the returned results when querying are the same.
-    cnts = {cnt["sha1_git"] for idx, cnt in enumerate(data["content"]) if idx % 2 == 0}
     cnt_dates = {
-        cnt["sha1_git"]: cnt["ctime"]
-        for idx, cnt in enumerate(data["content"])
-        if idx % 2 == 1
+        cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
     }
-    assert cnts or cnt_dates
-    assert provenance_storage.content_add(cnts)
     assert provenance_storage.content_add(cnt_dates)
     assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
-    assert provenance_storage.entity_get_all(EntityType.CONTENT) == cnts | set(
+    assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
         cnt_dates.keys()
     )
@@ -75,21 +71,15 @@
         ]
         return max(dates) if dates else None

-    dirs = {
-        dir["id"]
-        for dir in data["directory"]
-        if getmaxdate(dir, data["content"]) is None
-    }
-    dir_dates = {
-        dir["id"]: getmaxdate(dir, data["content"])
-        for dir in data["directory"]
-        if getmaxdate(dir, data["content"]) is not None
-    }
-    assert dirs
-    assert provenance_storage.directory_add(dirs)
+    flat_values = (False, True)
+    dir_dates = {}
+    for idx, dir in enumerate(data["directory"]):
+        date = getmaxdate(dir, data["content"])
+        if date is not None:
+            dir_dates[dir["id"]] = DirectoryData(date=date, flat=flat_values[idx % 2])
     assert provenance_storage.directory_add(dir_dates)
     assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
-    assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == dirs | set(
+    assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
         dir_dates.keys()
     )
@@ -199,10 +189,13 @@
 def entity_add(
     storage: ProvenanceStorageInterface, entity: EntityType, ids: Set[Sha1Git]
 ) -> bool:
+    now = datetime.now(tz=timezone.utc)
     if entity == EntityType.CONTENT:
-        return storage.content_add({sha1: None for sha1 in ids})
+        return storage.content_add({sha1: now for sha1 in ids})
     elif entity == EntityType.DIRECTORY:
-        return storage.directory_add({sha1: None for sha1 in ids})
+        return storage.directory_add(
+            {sha1: DirectoryData(date=now, flat=False) for sha1 in ids}
+        )
     else:  # entity == EntityType.REVISION:
         return storage.revision_add(
             {sha1: RevisionData(date=None, origin=None) for sha1 in ids}
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -261,7 +261,9 @@
     for rd in synth_rev["R_D"]:
         assert (
             rev_ts + rd["rel_ts"]
-            == provenance.storage.directory_get([rd["dst"]])[rd["dst"]].timestamp()
+            == provenance.storage.directory_get([rd["dst"]])[
+                rd["dst"]
+            ].date.timestamp()
         ), synth_rev["msg"]

     # ... + a number of rows in the "content_in_dir" table
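Reviewer note (not part of the patch): the conflict-resolution rule exercised by test_resolve_directory above — earliest date wins, flat flags are OR-ed across duplicate rows — can be checked in isolation. A sketch assuming the module paths introduced by this patch:

    from datetime import datetime

    from swh.model.hashutil import hash_to_bytes
    from swh.provenance.api.server import resolve_directory
    from swh.provenance.interface import DirectoryData

    sha1 = hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4")
    merged = resolve_directory(
        [
            # Three conflicting rows for the same directory: the merged entry
            # keeps the minimum date and the OR of the flat flags.
            (sha1, DirectoryData(date=datetime.fromtimestamp(1000000002), flat=False)),
            (sha1, DirectoryData(date=datetime.fromtimestamp(1000000001), flat=True)),
            (sha1, DirectoryData(date=datetime.fromtimestamp(1000000000), flat=False)),
        ]
    )
    assert merged == {
        sha1: DirectoryData(date=datetime.fromtimestamp(1000000000), flat=True)
    }

Keeping the minimum date preserves the earliest-occurrence semantics of the isochrone frontier, while OR-ing the flags ensures a directory is never marked as un-flattened once its flattening has been acknowledged.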