D6712.diff

diff --git a/sql/upgrades/002.sql b/sql/upgrades/002.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/002.sql
@@ -0,0 +1,17 @@
+-- SWH DB schema upgrade
+-- from_version: 1
+-- to_version: 2
+-- description: add flag to acknowledge directories' flattening
+
+
+insert into dbversion(version, release, description)
+ values(2, now(), 'Work In Progress');
+
+alter table content
+ alter column date set not null;
+
+alter table directory
+ add column flat boolean not null default false;
+
+alter table directory
+ alter column date set not null;
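
Note: `alter column date set not null` is rejected by PostgreSQL if any existing row still carries a NULL `date`, so such rows have to be fixed or removed before running this upgrade. A minimal pre-flight check, sketched with psycopg2 (the DSN is a placeholder, not part of this patch):

import psycopg2

dsn = "dbname=provenance"  # placeholder connection string, adjust to your setup
with psycopg2.connect(dsn) as conn:
    with conn.cursor() as cur:
        for table in ("content", "directory"):
            # SET NOT NULL fails while NULL dates remain in these tables
            cur.execute(f"SELECT count(*) FROM {table} WHERE date IS NULL")
            print(table, cur.fetchone()[0], "rows with NULL date (must be 0)")
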
diff --git a/swh/provenance/api/serializers.py b/swh/provenance/api/serializers.py
--- a/swh/provenance/api/serializers.py
+++ b/swh/provenance/api/serializers.py
@@ -33,11 +33,12 @@
ENCODERS: List[Tuple[type, str, Callable]] = [
+ (interface.DirectoryData, "dataclass", _encode_dataclass),
+ (interface.EntityType, "enum", _encode_enum),
(interface.ProvenanceResult, "dataclass", _encode_dataclass),
(interface.RelationData, "dataclass", _encode_dataclass),
- (interface.RevisionData, "dataclass", _encode_dataclass),
- (interface.EntityType, "enum", _encode_enum),
(interface.RelationType, "enum", _encode_enum),
+ (interface.RevisionData, "dataclass", _encode_dataclass),
(set, "set", list),
]
diff --git a/swh/provenance/api/server.py b/swh/provenance/api/server.py
--- a/swh/provenance/api/server.py
+++ b/swh/provenance/api/server.py
@@ -31,7 +31,13 @@
from swh.model.model import Sha1Git
from .. import get_provenance_storage
-from ..interface import EntityType, RelationData, RelationType, RevisionData
+from ..interface import (
+ DirectoryData,
+ EntityType,
+ RelationData,
+ RelationType,
+ RevisionData,
+)
from ..util import path_id
from .serializers import DECODERS, ENCODERS
@@ -53,21 +59,31 @@
pass
-def resolve_dates(
- dates: Iterable[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]]
-) -> Dict[Sha1Git, Optional[datetime]]:
- result: Dict[Sha1Git, Optional[datetime]] = {}
- for row in dates:
- sha1 = row[0]
- date = (
- cast(Tuple[Sha1Git, Optional[datetime]], row)[1] if len(row) > 1 else None
- )
- known = result.setdefault(sha1, None)
- if date is not None and (known is None or date < known):
+def resolve_dates(dates: Iterable[Tuple[Sha1Git, datetime]]) -> Dict[Sha1Git, datetime]:
+ result: Dict[Sha1Git, datetime] = {}
+ for sha1, date in dates:
+ known = result.setdefault(sha1, date)
+ if date < known:
result[sha1] = date
return result
+def resolve_directory(
+ data: Iterable[Tuple[Sha1Git, DirectoryData]]
+) -> Dict[Sha1Git, DirectoryData]:
+ result: Dict[Sha1Git, DirectoryData] = {}
+ for sha1, dir in data:
+ known = result.setdefault(sha1, dir)
+ value = known
+ if dir.date < known.date:
+ value = DirectoryData(date=dir.date, flat=value.flat)
+ if dir.flat:
+ value = DirectoryData(date=value.date, flat=dir.flat)
+ if value != known:
+ result[sha1] = value
+ return result
+
+
def resolve_revision(
data: Iterable[Union[Tuple[Sha1Git, RevisionData], Tuple[Sha1Git]]]
) -> Dict[Sha1Git, RevisionData]:
@@ -458,8 +474,10 @@
@staticmethod
def get_conflicts_func(meth_name: str) -> Callable[[Iterable[Any]], Any]:
- if meth_name in ["content_add", "directory_add"]:
+ if meth_name == "content_add":
return resolve_dates
+ elif meth_name == "directory_add":
+ return resolve_directory
elif meth_name == "location_add":
return lambda data: set(data) # just remove duplicates
elif meth_name == "origin_add":
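
For reference, a small usage sketch of the new `resolve_directory` helper (mirroring the test added further down): when a sha1 appears several times in a batch, the earliest date wins and `flat` is OR-ed across entries. The sha1 value below is illustrative only.

from datetime import datetime

from swh.model.hashutil import hash_to_bytes
from swh.provenance.api.server import resolve_directory
from swh.provenance.interface import DirectoryData

sha1 = hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4")  # example id
merged = resolve_directory(
    [
        (sha1, DirectoryData(date=datetime.fromtimestamp(1000000002), flat=False)),
        (sha1, DirectoryData(date=datetime.fromtimestamp(1000000001), flat=True)),
    ]
)
# earliest date wins; flat is True because one of the entries set it
assert merged == {
    sha1: DirectoryData(date=datetime.fromtimestamp(1000000001), flat=True)
}
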
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -43,6 +43,18 @@
path: bytes
+@dataclass(eq=True, frozen=True)
+class DirectoryData:
+ """Object representing the data associated to a directory in the provenance model,
+ where `date` is the date of the directory in the isochrone frontier, and `flat` is a
+ flag acknowledging that a flat model for the elements outside the frontier has
+ already been created.
+ """
+
+ date: datetime
+ flat: bool
+
+
@dataclass(eq=True, frozen=True)
class RevisionData:
"""Object representing the data associated to a revision in the provenance model,
@@ -85,11 +97,9 @@
...
@remote_api_endpoint("content_add")
- def content_add(
- self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- """Add blobs identified by sha1 ids, with an optional associated date (as paired
- in `cnts`) to the provenance storage. Return a boolean stating whether the
+ def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
+ """Add blobs identified by sha1 ids, with an associated date (as paired in
+ `cnts`) to the provenance storage. Return a boolean stating whether the
information was successfully stored.
"""
...
@@ -108,25 +118,23 @@
@remote_api_endpoint("content_get")
def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- """Retrieve the associated date for each blob sha1 in `ids`. If some blob has
- no associated date, it is not present in the resulting dictionary.
- """
+ """Retrieve the associated date for each blob sha1 in `ids`."""
...
@remote_api_endpoint("directory_add")
- def directory_add(
- self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- """Add directories identified by sha1 ids, with an optional associated date (as
- paired in `dirs`) to the provenance storage. Return a boolean stating if the
- information was successfully stored.
+ def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
+ """Add directories identified by sha1 ids, with associated date and (optional)
+ flatten flag (as paired in `dirs`) to the provenance storage. If the flatten
+ flag is set to None, the previous value present in the storage is preserved.
+ Return a boolean stating if the information was successfully stored.
"""
...
@remote_api_endpoint("directory_get")
- def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- """Retrieve the associated date for each directory sha1 in `ids`. If some
- directory has no associated date, it is not present in the resulting dictionary.
+ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
+ """Retrieve the associated date and (optional) flatten flag for each directory
+ sha1 in `ids`. If some directories has no associated date, it is not present in
+ the resulting dictionary.
"""
...
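
The merge contract behind `directory_add` (dates only move backwards, `flat` only moves from False to True) can be illustrated with a standalone sketch; `merge_directory_rows` is a hypothetical helper for illustration, not part of the interface:

from typing import Dict

from swh.model.model import Sha1Git
from swh.provenance.interface import DirectoryData

def merge_directory_rows(
    store: Dict[Sha1Git, DirectoryData], incoming: Dict[Sha1Git, DirectoryData]
) -> None:
    # Same rule the backends implement below: LEAST(date) and OR(flat).
    for sha1, new in incoming.items():
        old = store.get(sha1)
        if old is None:
            store[sha1] = new
        else:
            store[sha1] = DirectoryData(
                date=min(old.date, new.date), flat=old.flat or new.flat
            )
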
diff --git a/swh/provenance/mongo/backend.py b/swh/provenance/mongo/backend.py
--- a/swh/provenance/mongo/backend.py
+++ b/swh/provenance/mongo/backend.py
@@ -18,6 +18,7 @@
from swh.model.model import Sha1Git
from ..interface import (
+ DirectoryData,
EntityType,
ProvenanceResult,
ProvenanceStorageInterface,
@@ -52,21 +53,18 @@
self.db.client.close()
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_add"})
- def content_add(
- self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- data = cnts if isinstance(cnts, dict) else dict.fromkeys(cnts)
+ def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
existing = {
x["sha1"]: x
for x in self.db.content.find(
- {"sha1": {"$in": list(data)}}, {"sha1": 1, "ts": 1, "_id": 1}
+ {"sha1": {"$in": list(cnts)}}, {"sha1": 1, "ts": 1, "_id": 1}
)
}
- for sha1, date in data.items():
- ts = datetime.timestamp(date) if date is not None else None
+ for sha1, date in cnts.items():
+ ts = datetime.timestamp(date)
if sha1 in existing:
cnt = existing[sha1]
- if ts is not None and (cnt["ts"] is None or ts < cnt["ts"]):
+ if ts < cnt["ts"]:
self.db.content.update_one(
{"_id": cnt["_id"]}, {"$set": {"ts": ts}}
)
@@ -173,41 +171,43 @@
return {
x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
for x in self.db.content.find(
- {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
- {"sha1": 1, "ts": 1, "_id": 0},
+ {"sha1": {"$in": list(ids)}}, {"sha1": 1, "ts": 1, "_id": 0}
)
}
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_add"})
- def directory_add(
- self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- data = dirs if isinstance(dirs, dict) else dict.fromkeys(dirs)
+ def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
existing = {
x["sha1"]: x
for x in self.db.directory.find(
- {"sha1": {"$in": list(data)}}, {"sha1": 1, "ts": 1, "_id": 1}
+ {"sha1": {"$in": list(dirs)}}, {"sha1": 1, "ts": 1, "flat": 1, "_id": 1}
)
}
- for sha1, date in data.items():
- ts = datetime.timestamp(date) if date is not None else None
+ for sha1, info in dirs.items():
+ ts = datetime.timestamp(info.date)
if sha1 in existing:
dir = existing[sha1]
- if ts is not None and (dir["ts"] is None or ts < dir["ts"]):
+ if ts >= dir["ts"]:
+ ts = dir["ts"]
+ flat = info.flat or dir["flat"]
+ if ts != dir["ts"] or flat != dir["flat"]:
self.db.directory.update_one(
- {"_id": dir["_id"]}, {"$set": {"ts": ts}}
+ {"_id": dir["_id"]}, {"$set": {"ts": ts, "flat": flat}}
)
else:
- self.db.directory.insert_one({"sha1": sha1, "ts": ts, "revision": {}})
+ self.db.directory.insert_one(
+ {"sha1": sha1, "ts": ts, "revision": {}, "flat": info.flat}
+ )
return True
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_get"})
- def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
+ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
return {
- x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
+ x["sha1"]: DirectoryData(
+ date=datetime.fromtimestamp(x["ts"], timezone.utc), flat=x["flat"]
+ )
for x in self.db.directory.find(
- {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
- {"sha1": 1, "ts": 1, "_id": 0},
+ {"sha1": {"$in": list(ids)}}, {"sha1": 1, "ts": 1, "flat": 1, "_id": 0}
)
}
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -14,13 +14,13 @@
import psycopg2.extensions
import psycopg2.extras
-from typing_extensions import Literal
from swh.core.db import BaseDb
from swh.core.statsd import statsd
from swh.model.model import Sha1Git
from ..interface import (
+ DirectoryData,
EntityType,
ProvenanceResult,
ProvenanceStorageInterface,
@@ -82,10 +82,26 @@
self.conn.close()
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_add"})
- def content_add(
- self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- return self._entity_set_date("content", cnts)
+ def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
+ try:
+ if cnts:
+ sql = """
+ INSERT INTO content(sha1, date) VALUES %s
+ ON CONFLICT (sha1) DO
+ UPDATE SET date=LEAST(EXCLUDED.date,content.date)
+ """
+ page_size = self.page_size or len(cnts)
+ with self.transaction() as cursor:
+ psycopg2.extras.execute_values(
+ cursor, sql, argslist=cnts.items(), page_size=page_size
+ )
+ return True
+ except: # noqa: E722
+ # Unexpected error occurred, rollback all changes and log message
+ LOGGER.exception("Unexpected error")
+ if self.raise_on_commit:
+ raise
+ return False
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_find_first"})
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
@@ -106,17 +122,67 @@
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_get"})
def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- return self._entity_get_date("content", ids)
+ dates: Dict[Sha1Git, datetime] = {}
+ sha1s = tuple(ids)
+ if sha1s:
+ # TODO: consider splitting this query in several ones if sha1s is too big!
+ values = ", ".join(itertools.repeat("%s", len(sha1s)))
+ sql = f"""
+ SELECT sha1, date
+ FROM content
+ WHERE sha1 IN ({values})
+ AND date IS NOT NULL
+ """
+ with self.transaction(readonly=True) as cursor:
+ cursor.execute(query=sql, vars=sha1s)
+ dates.update((row["sha1"], row["date"]) for row in cursor)
+ return dates
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_add"})
- def directory_add(
- self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- return self._entity_set_date("directory", dirs)
+ def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
+ data = [(sha1, rev.date, rev.flat) for sha1, rev in dirs.items()]
+ try:
+ if data:
+ sql = """
+ INSERT INTO directory(sha1, date, flat) VALUES %s
+ ON CONFLICT (sha1) DO
+ UPDATE SET
+ date=LEAST(EXCLUDED.date, directory.date),
+ flat=(EXCLUDED.flat OR directory.flat)
+ """
+ page_size = self.page_size or len(data)
+ with self.transaction() as cursor:
+ psycopg2.extras.execute_values(
+ cur=cursor, sql=sql, argslist=data, page_size=page_size
+ )
+ return True
+ except: # noqa: E722
+ # Unexpected error occurred, rollback all changes and log message
+ LOGGER.exception("Unexpected error")
+ if self.raise_on_commit:
+ raise
+ return False
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_get"})
- def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- return self._entity_get_date("directory", ids)
+ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
+ result: Dict[Sha1Git, DirectoryData] = {}
+ sha1s = tuple(ids)
+ if sha1s:
+ # TODO: consider splitting this query in several ones if sha1s is too big!
+ values = ", ".join(itertools.repeat("%s", len(sha1s)))
+ sql = f"""
+ SELECT sha1, date, flat
+ FROM directory
+ WHERE sha1 IN ({values})
+ AND date IS NOT NULL
+ """
+ with self.transaction(readonly=True) as cursor:
+ cursor.execute(query=sql, vars=sha1s)
+ result.update(
+ (row["sha1"], DirectoryData(date=row["date"], flat=row["flat"]))
+ for row in cursor
+ )
+ return result
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
@@ -299,53 +365,6 @@
) -> Dict[Sha1Git, Set[RelationData]]:
return self._relation_get(relation, None)
- def _entity_get_date(
- self,
- entity: Literal["content", "directory", "revision"],
- ids: Iterable[Sha1Git],
- ) -> Dict[Sha1Git, datetime]:
- dates: Dict[Sha1Git, datetime] = {}
- sha1s = tuple(ids)
- if sha1s:
- # TODO: consider splitting this query in several ones if sha1s is too big!
- values = ", ".join(itertools.repeat("%s", len(sha1s)))
- sql = f"""
- SELECT sha1, date
- FROM {entity}
- WHERE sha1 IN ({values})
- AND date IS NOT NULL
- """
- with self.transaction(readonly=True) as cursor:
- cursor.execute(query=sql, vars=sha1s)
- dates.update((row["sha1"], row["date"]) for row in cursor)
- return dates
-
- def _entity_set_date(
- self,
- entity: Literal["content", "directory"],
- dates: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]],
- ) -> bool:
- data = dates if isinstance(dates, dict) else dict.fromkeys(dates)
- try:
- if data:
- sql = f"""
- INSERT INTO {entity}(sha1, date) VALUES %s
- ON CONFLICT (sha1) DO
- UPDATE SET date=LEAST(EXCLUDED.date,{entity}.date)
- """
- page_size = self.page_size or len(data)
- with self.transaction() as cursor:
- psycopg2.extras.execute_values(
- cursor, sql, argslist=data.items(), page_size=page_size
- )
- return True
- except: # noqa: E722
- # Unexpected error occurred, rollback all changes and log message
- LOGGER.exception("Unexpected error")
- if self.raise_on_commit:
- raise
- return False
-
def _relation_get(
self,
relation: RelationType,
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -15,6 +15,7 @@
from swh.model.model import Sha1Git
from .interface import (
+ DirectoryData,
ProvenanceInterface,
ProvenanceResult,
ProvenanceStorageInterface,
@@ -32,7 +33,7 @@
class DatetimeCache(TypedDict):
- data: Dict[Sha1Git, Optional[datetime]]
+ data: Dict[Sha1Git, Optional[datetime]] # None means unknown
added: Set[Sha1Git]
@@ -49,6 +50,7 @@
class ProvenanceCache(TypedDict):
content: DatetimeCache
directory: DatetimeCache
+ directory_flatten: Dict[Sha1Git, Optional[bool]] # None means unknown
revision: DatetimeCache
# below are insertion caches only
content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
@@ -65,6 +67,7 @@
return ProvenanceCache(
content=DatetimeCache(data={}, added=set()),
directory=DatetimeCache(data={}, added=set()),
+ directory_flatten={},
revision=DatetimeCache(data={}, added=set()),
content_in_revision=set(),
content_in_directory=set(),
@@ -194,7 +197,8 @@
# properly resolve any internal reference if needed. Content and directory
# entries may safely be registered with their associated dates. In contrast,
# revision entries should be registered without date, as it is used to
- # acknowledge that the flushing was successful.
+ # acknowledge that the flushing was successful. Also, directories are
+ # registered with their flatten flag not set.
cnt_dates = {
sha1: date
for sha1, date in self.cache["content"]["data"].items()
@@ -211,7 +215,7 @@
)
dir_dates = {
- sha1: date
+ sha1: DirectoryData(date=date, flat=False)
for sha1, date in self.cache["directory"]["data"].items()
if sha1 in self.cache["directory"]["added"] and date is not None
}
@@ -306,8 +310,27 @@
RelationType.DIR_IN_REV,
)
- # After relations, dates for the revisions can be safely set, acknowledging that
- # these revisions won't need to be reprocessed in case of failure.
+ # After relations, flatten flags for directories can be safely set (if
+ # applicable), acknowledging those directories that have already been flattened.
+ # Similarly, dates for the revisions are set to acknowledge that these revisions
+ # won't need to be reprocessed in case of failure.
+ dir_acks = {
+ sha1: DirectoryData(
+ date=date, flat=self.cache["directory_flatten"].get(sha1) or False
+ )
+ for sha1, date in self.cache["directory"]["data"].items()
+ if sha1 in self.cache["directory"]["added"] and date is not None
+ }
+ if dir_acks:
+ while not self.storage.directory_add(dir_acks):
+ statsd.increment(
+ metric=BACKEND_OPERATIONS_METRIC,
+ tags={"method": "flush_revision_content_retry_directory_ack"},
+ )
+ LOGGER.warning(
+ "Unable to write directory dates to the storage. Retrying..."
+ )
+
rev_dates = {
sha1: RevisionData(date=date, origin=None)
for sha1, date in self.cache["revision"]["data"].items()
@@ -388,14 +411,22 @@
cache = self.cache[entity]
missing_ids = set(id for id in ids if id not in cache)
if missing_ids:
- if entity == "revision":
- updated = {
- id: rev.date
- for id, rev in self.storage.revision_get(missing_ids).items()
- }
- else:
- updated = getattr(self.storage, f"{entity}_get")(missing_ids)
- cache["data"].update(updated)
+ if entity == "content":
+ cache["data"].update(self.storage.content_get(missing_ids))
+ elif entity == "directory":
+ cache["data"].update(
+ {
+ id: dir.date
+ for id, dir in self.storage.directory_get(missing_ids).items()
+ }
+ )
+ elif entity == "revision":
+ cache["data"].update(
+ {
+ id: rev.date
+ for id, rev in self.storage.revision_get(missing_ids).items()
+ }
+ )
dates: Dict[Sha1Git, datetime] = {}
for sha1 in ids:
date = cache["data"].setdefault(sha1, None)
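
A condensed, self-contained view of the directory acknowledgement added above: the `directory_flatten` cache records, per sha1, whether the flat model was actually written, and `dir_acks` re-submits those directories so that storage can raise the `flat` flag. The byte ids and cache contents below are toy values.

from datetime import datetime, timezone

from swh.provenance.interface import DirectoryData

# Toy stand-ins for the caches built during the flush step above:
directory_dates = {b"\x01" * 20: datetime.now(tz=timezone.utc), b"\x02" * 20: None}
directory_added = {b"\x01" * 20, b"\x02" * 20}
directory_flatten = {b"\x01" * 20: True}  # None/missing means "unknown"

dir_acks = {
    sha1: DirectoryData(date=date, flat=directory_flatten.get(sha1) or False)
    for sha1, date in directory_dates.items()
    if sha1 in directory_added and date is not None
}
# Only the first id qualifies (it has a date), and it is acknowledged as flat=True.
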
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -16,7 +16,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(1, now(), 'Work In Progress');
+ values(2, now(), 'Work In Progress');
-- a Git object ID, i.e., a Git-style salted SHA1 checksum
create domain sha1_git as bytea check (length(value) = 20);
@@ -35,9 +35,9 @@
-- entity tables
create table content
(
- id bigserial primary key, -- internal identifier of the content blob
- sha1 sha1_git unique not null, -- intrinsic identifier of the content blob
- date timestamptz -- timestamp of the revision where the blob appears early
+ id bigserial primary key, -- internal identifier of the content blob
+ sha1 sha1_git unique not null, -- intrinsic identifier of the content blob
+ date timestamptz not null -- timestamp of the revision where the blob appears early
);
comment on column content.id is 'Content internal identifier';
comment on column content.sha1 is 'Content intrinsic identifier';
@@ -45,9 +45,10 @@
create table directory
(
- id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier
- sha1 sha1_git unique not null, -- intrinsic identifier of the directory
- date timestamptz -- max timestamp among those of the directory children's
+ id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier
+ sha1 sha1_git unique not null, -- intrinsic identifier of the directory
+ date timestamptz not null, -- max timestamp among those of the directory's children
+ flat boolean not null default false -- flag acknowledging whether the directory is flattened in the model
);
comment on column directory.id is 'Directory internal identifier';
comment on column directory.sha1 is 'Directory intrinsic identifier';
@@ -55,10 +56,10 @@
create table revision
(
- id bigserial primary key, -- internal identifier of the revision
- sha1 sha1_git unique not null, -- intrinsic identifier of the revision
- date timestamptz, -- timestamp of the revision
- origin bigint -- id of the preferred origin
+ id bigserial primary key, -- internal identifier of the revision
+ sha1 sha1_git unique not null, -- intrinsic identifier of the revision
+ date timestamptz, -- timestamp of the revision
+ origin bigint -- id of the preferred origin
-- foreign key (origin) references origin (id)
);
comment on column revision.id is 'Revision internal identifier';
@@ -68,17 +69,17 @@
create table location
(
- id bigserial primary key, -- internal identifier of the location
- path unix_path unique not null -- path to the location
+ id bigserial primary key, -- internal identifier of the location
+ path unix_path unique not null -- path to the location
);
comment on column location.id is 'Location internal identifier';
comment on column location.path is 'Path to the location';
create table origin
(
- id bigserial primary key, -- internal identifier of the origin
- sha1 sha1_git unique not null, -- intrinsic identifier of the origin
- url text unique not null -- url of the origin
+ id bigserial primary key, -- internal identifier of the origin
+ sha1 sha1_git unique not null, -- intrinsic identifier of the origin
+ url text unique not null -- url of the origin
);
comment on column origin.id is 'Origin internal identifier';
comment on column origin.sha1 is 'Origin intrinsic identifier';
@@ -87,13 +88,13 @@
-- relation tables
create table content_in_revision
(
- content bigint not null, -- internal identifier of the content blob
+ content bigint not null, -- internal identifier of the content blob
\if :dbflavor_norm
- revision bigint not null, -- internal identifier of the revision where the blob appears for the first time
- location bigint -- location of the content relative to the revision's root directory
+ revision bigint not null, -- internal identifier of the revision where the blob appears for the first time
+ location bigint -- location of the content relative to the revision's root directory
\else
- revision bigint[], -- internal identifiers of the revisions where the blob appears for the first time
- location bigint[] -- locations of the content relative to the revisions' root directory
+ revision bigint[], -- internal identifiers of the revisions where the blob appears for the first time
+ location bigint[] -- locations of the content relative to the revisions' root directory
\endif
-- foreign key (content) references content (id),
-- foreign key (revision) references revision (id),
@@ -109,13 +110,13 @@
create table content_in_directory
(
- content bigint not null, -- internal identifier of the content blob
+ content bigint not null, -- internal identifier of the content blob
\if :dbflavor_norm
- directory bigint not null, -- internal identifier of the directory containing the blob
- location bigint -- location of the content relative to its parent directory in the isochrone frontier
+ directory bigint not null, -- internal identifier of the directory containing the blob
+ location bigint -- location of the content relative to its parent directory in the isochrone frontier
\else
- directory bigint[], -- internal reference of the directories containing the blob
- location bigint[] -- locations of the content relative to its parent directories in the isochrone frontier
+ directory bigint[], -- internal reference of the directories containing the blob
+ location bigint[] -- locations of the content relative to its parent directories in the isochrone frontier
\endif
-- foreign key (content) references content (id),
-- foreign key (directory) references directory (id),
@@ -131,13 +132,13 @@
create table directory_in_revision
(
- directory bigint not null, -- internal identifier of the directory appearing in the revision
+ directory bigint not null, -- internal identifier of the directory appearing in the revision
\if :dbflavor_norm
- revision bigint not null, -- internal identifier of the revision containing the directory
- location bigint -- location of the directory relative to the revision's root directory
+ revision bigint not null, -- internal identifier of the revision containing the directory
+ location bigint -- location of the directory relative to the revision's root directory
\else
- revision bigint[], -- internal identifiers of the revisions containing the directory
- location bigint[] -- locations of the directory relative to the revisions' root directory
+ revision bigint[], -- internal identifiers of the revisions containing the directory
+ location bigint[] -- locations of the directory relative to the revisions' root directory
\endif
-- foreign key (directory) references directory (id),
-- foreign key (revision) references revision (id),
@@ -153,8 +154,8 @@
create table revision_in_origin
(
- revision bigint not null, -- internal identifier of the revision poined by the origin
- origin bigint not null -- internal identifier of the origin that points to the revision
+ revision bigint not null, -- internal identifier of the revision pointed to by the origin
+ origin bigint not null -- internal identifier of the origin that points to the revision
-- foreign key (revision) references revision (id),
-- foreign key (origin) references origin (id)
);
@@ -163,8 +164,8 @@
create table revision_before_revision
(
- prev bigserial not null, -- internal identifier of the source revision
- next bigserial not null -- internal identifier of the destination revision
+ prev bigserial not null, -- internal identifier of the source revision
+ next bigserial not null -- internal identifier of the destination revision
-- foreign key (prev) references revision (id),
-- foreign key (next) references revision (id)
);
diff --git a/swh/provenance/tests/test_conflict_resolution.py b/swh/provenance/tests/test_conflict_resolution.py
--- a/swh/provenance/tests/test_conflict_resolution.py
+++ b/swh/provenance/tests/test_conflict_resolution.py
@@ -4,22 +4,29 @@
# See top-level LICENSE file for more information
from datetime import datetime
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Sha1Git
-from swh.provenance.api.server import resolve_dates, resolve_relation, resolve_revision
-from swh.provenance.interface import RelationData, RevisionData
+from swh.provenance.api.server import (
+ resolve_dates,
+ resolve_directory,
+ resolve_relation,
+ resolve_revision,
+)
+from swh.provenance.interface import DirectoryData, RelationData, RevisionData
def test_resolve_dates() -> None:
- items: List[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]] = [
- (hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),),
+ items: List[Tuple[Sha1Git, datetime]] = [
+ (
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ datetime.fromtimestamp(1000000001),
+ ),
(
hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
datetime.fromtimestamp(1000000000),
),
- (hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), None),
]
assert resolve_dates(items) == {
hash_to_bytes(
@@ -28,21 +35,25 @@
}
-def test_resolve_dates_keep_min() -> None:
- items: List[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]] = [
+def test_resolve_directory() -> None:
+ items: List[Tuple[Sha1Git, DirectoryData]] = [
(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- datetime.fromtimestamp(1000000001),
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ DirectoryData(date=datetime.fromtimestamp(1000000002), flat=False),
),
(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- datetime.fromtimestamp(1000000000),
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ DirectoryData(date=datetime.fromtimestamp(1000000001), flat=True),
+ ),
+ (
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ DirectoryData(date=datetime.fromtimestamp(1000000000), flat=False),
),
]
- assert resolve_dates(items) == {
- hash_to_bytes(
- "20329687bb9c1231a7e05afe86160343ad49b494"
- ): datetime.fromtimestamp(1000000000)
+ assert resolve_directory(items) == {
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"): DirectoryData(
+ date=datetime.fromtimestamp(1000000000), flat=True
+ )
}
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -12,6 +12,7 @@
from swh.model.model import Origin, Sha1Git
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import (
+ DirectoryData,
EntityType,
ProvenanceInterface,
ProvenanceResult,
@@ -38,17 +39,12 @@
# Add all content present in the current repo to the storage, just assigning their
# creation dates. Then check that the returned results when querying are the same.
- cnts = {cnt["sha1_git"] for idx, cnt in enumerate(data["content"]) if idx % 2 == 0}
cnt_dates = {
- cnt["sha1_git"]: cnt["ctime"]
- for idx, cnt in enumerate(data["content"])
- if idx % 2 == 1
+ cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
}
- assert cnts or cnt_dates
- assert provenance_storage.content_add(cnts)
assert provenance_storage.content_add(cnt_dates)
assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
- assert provenance_storage.entity_get_all(EntityType.CONTENT) == cnts | set(
+ assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
cnt_dates.keys()
)
@@ -75,21 +71,15 @@
]
return max(dates) if dates else None
- dirs = {
- dir["id"]
- for dir in data["directory"]
- if getmaxdate(dir, data["content"]) is None
- }
- dir_dates = {
- dir["id"]: getmaxdate(dir, data["content"])
- for dir in data["directory"]
- if getmaxdate(dir, data["content"]) is not None
- }
- assert dirs
- assert provenance_storage.directory_add(dirs)
+ flat_values = (False, True)
+ dir_dates = {}
+ for idx, dir in enumerate(data["directory"]):
+ date = getmaxdate(dir, data["content"])
+ if date is not None:
+ dir_dates[dir["id"]] = DirectoryData(date=date, flat=flat_values[idx % 2])
assert provenance_storage.directory_add(dir_dates)
assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
- assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == dirs | set(
+ assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
dir_dates.keys()
)
@@ -199,10 +189,13 @@
def entity_add(
storage: ProvenanceStorageInterface, entity: EntityType, ids: Set[Sha1Git]
) -> bool:
+ now = datetime.now(tz=timezone.utc)
if entity == EntityType.CONTENT:
- return storage.content_add({sha1: None for sha1 in ids})
+ return storage.content_add({sha1: now for sha1 in ids})
elif entity == EntityType.DIRECTORY:
- return storage.directory_add({sha1: None for sha1 in ids})
+ return storage.directory_add(
+ {sha1: DirectoryData(date=now, flat=False) for sha1 in ids}
+ )
else: # entity == EntityType.REVISION:
return storage.revision_add(
{sha1: RevisionData(date=None, origin=None) for sha1 in ids}
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -261,7 +261,9 @@
for rd in synth_rev["R_D"]:
assert (
rev_ts + rd["rel_ts"]
- == provenance.storage.directory_get([rd["dst"]])[rd["dst"]].timestamp()
+ == provenance.storage.directory_get([rd["dst"]])[
+ rd["dst"]
+ ].date.timestamp()
), synth_rev["msg"]
# ... + a number of rows in the "content_in_dir" table
