D6712.diff
diff --git a/sql/upgrades/002.sql b/sql/upgrades/002.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/002.sql
@@ -0,0 +1,17 @@
+-- SWH DB schema upgrade
+-- from_version: 1
+-- to_version: 2
+-- description: add flag to acknowledge directories' flattening
+
+
+insert into dbversion(version, release, description)
+ values(2, now(), 'Work In Progress');
+
+alter table content
+ alter column date set not null;
+
+alter table directory
+ add column flat boolean not null default false;
+
+alter table directory
+ alter column date set not null;
diff --git a/swh/provenance/api/serializers.py b/swh/provenance/api/serializers.py
--- a/swh/provenance/api/serializers.py
+++ b/swh/provenance/api/serializers.py
@@ -33,11 +33,12 @@
ENCODERS: List[Tuple[type, str, Callable]] = [
+ (interface.DirectoryData, "dataclass", _encode_dataclass),
+ (interface.EntityType, "enum", _encode_enum),
(interface.ProvenanceResult, "dataclass", _encode_dataclass),
(interface.RelationData, "dataclass", _encode_dataclass),
- (interface.RevisionData, "dataclass", _encode_dataclass),
- (interface.EntityType, "enum", _encode_enum),
(interface.RelationType, "enum", _encode_enum),
+ (interface.RevisionData, "dataclass", _encode_dataclass),
(set, "set", list),
]
diff --git a/swh/provenance/api/server.py b/swh/provenance/api/server.py
--- a/swh/provenance/api/server.py
+++ b/swh/provenance/api/server.py
@@ -31,7 +31,13 @@
from swh.model.model import Sha1Git
from .. import get_provenance_storage
-from ..interface import EntityType, RelationData, RelationType, RevisionData
+from ..interface import (
+ DirectoryData,
+ EntityType,
+ RelationData,
+ RelationType,
+ RevisionData,
+)
from ..util import path_id
from .serializers import DECODERS, ENCODERS
@@ -53,21 +59,31 @@
pass
-def resolve_dates(
- dates: Iterable[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]]
-) -> Dict[Sha1Git, Optional[datetime]]:
- result: Dict[Sha1Git, Optional[datetime]] = {}
- for row in dates:
- sha1 = row[0]
- date = (
- cast(Tuple[Sha1Git, Optional[datetime]], row)[1] if len(row) > 1 else None
- )
- known = result.setdefault(sha1, None)
- if date is not None and (known is None or date < known):
+def resolve_dates(dates: Iterable[Tuple[Sha1Git, datetime]]) -> Dict[Sha1Git, datetime]:
+ result: Dict[Sha1Git, datetime] = {}
+ for sha1, date in dates:
+ known = result.setdefault(sha1, date)
+ if date < known:
result[sha1] = date
return result
+def resolve_directory(
+ data: Iterable[Tuple[Sha1Git, DirectoryData]]
+) -> Dict[Sha1Git, DirectoryData]:
+ result: Dict[Sha1Git, DirectoryData] = {}
+ for sha1, dir in data:
+ known = result.setdefault(sha1, dir)
+ value = known
+ if dir.date < known.date:
+ value = DirectoryData(date=dir.date, flat=value.flat)
+ if dir.flat:
+ value = DirectoryData(date=value.date, flat=dir.flat)
+ if value != known:
+ result[sha1] = value
+ return result
+
+
def resolve_revision(
data: Iterable[Union[Tuple[Sha1Git, RevisionData], Tuple[Sha1Git]]]
) -> Dict[Sha1Git, RevisionData]:
@@ -458,8 +474,10 @@
@staticmethod
def get_conflicts_func(meth_name: str) -> Callable[[Iterable[Any]], Any]:
- if meth_name in ["content_add", "directory_add"]:
+ if meth_name == "content_add":
return resolve_dates
+ elif meth_name == "directory_add":
+ return resolve_directory
elif meth_name == "location_add":
return lambda data: set(data) # just remove duplicates
elif meth_name == "origin_add":
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -43,6 +43,18 @@
path: bytes
+@dataclass(eq=True, frozen=True)
+class DirectoryData:
+ """Object representing the data associated to a directory in the provenance model,
+ where `date` is the date of the directory in the isochrone frontier, and `flat` is a
+ flag acknowledging that a flat model for the elements outside the frontier has
+ already been created.
+ """
+
+ date: datetime
+ flat: bool
+
+
@dataclass(eq=True, frozen=True)
class RevisionData:
"""Object representing the data associated to a revision in the provenance model,
@@ -85,11 +97,9 @@
...
@remote_api_endpoint("content_add")
- def content_add(
- self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- """Add blobs identified by sha1 ids, with an optional associated date (as paired
- in `cnts`) to the provenance storage. Return a boolean stating whether the
+ def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
+ """Add blobs identified by sha1 ids, with an associated date (as paired in
+ `cnts`) to the provenance storage. Return a boolean stating whether the
information was successfully stored.
"""
...
@@ -108,25 +118,23 @@
@remote_api_endpoint("content_get")
def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- """Retrieve the associated date for each blob sha1 in `ids`. If some blob has
- no associated date, it is not present in the resulting dictionary.
- """
+ """Retrieve the associated date for each blob sha1 in `ids`."""
...
@remote_api_endpoint("directory_add")
- def directory_add(
- self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- """Add directories identified by sha1 ids, with an optional associated date (as
- paired in `dirs`) to the provenance storage. Return a boolean stating if the
- information was successfully stored.
+ def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
+ """Add directories identified by sha1 ids, with associated date and (optional)
+ flatten flag (as paired in `dirs`) to the provenance storage. If the flatten
+ flag is set to None, the previous value present in the storage is preserved.
+ Return a boolean stating if the information was successfully stored.
"""
...
@remote_api_endpoint("directory_get")
- def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- """Retrieve the associated date for each directory sha1 in `ids`. If some
- directory has no associated date, it is not present in the resulting dictionary.
+ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
+ """Retrieve the associated date and (optional) flatten flag for each directory
+ sha1 in `ids`. If some directory has no associated date, it is not present in
+ the resulting dictionary.
"""
...
diff --git a/swh/provenance/mongo/backend.py b/swh/provenance/mongo/backend.py
--- a/swh/provenance/mongo/backend.py
+++ b/swh/provenance/mongo/backend.py
@@ -18,6 +18,7 @@
from swh.model.model import Sha1Git
from ..interface import (
+ DirectoryData,
EntityType,
ProvenanceResult,
ProvenanceStorageInterface,
@@ -52,21 +53,18 @@
self.db.client.close()
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_add"})
- def content_add(
- self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- data = cnts if isinstance(cnts, dict) else dict.fromkeys(cnts)
+ def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
existing = {
x["sha1"]: x
for x in self.db.content.find(
- {"sha1": {"$in": list(data)}}, {"sha1": 1, "ts": 1, "_id": 1}
+ {"sha1": {"$in": list(cnts)}}, {"sha1": 1, "ts": 1, "_id": 1}
)
}
- for sha1, date in data.items():
- ts = datetime.timestamp(date) if date is not None else None
+ for sha1, date in cnts.items():
+ ts = datetime.timestamp(date)
if sha1 in existing:
cnt = existing[sha1]
- if ts is not None and (cnt["ts"] is None or ts < cnt["ts"]):
+ if ts < cnt["ts"]:
self.db.content.update_one(
{"_id": cnt["_id"]}, {"$set": {"ts": ts}}
)
@@ -173,41 +171,43 @@
return {
x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
for x in self.db.content.find(
- {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
- {"sha1": 1, "ts": 1, "_id": 0},
+ {"sha1": {"$in": list(ids)}}, {"sha1": 1, "ts": 1, "_id": 0}
)
}
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_add"})
- def directory_add(
- self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- data = dirs if isinstance(dirs, dict) else dict.fromkeys(dirs)
+ def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
existing = {
x["sha1"]: x
for x in self.db.directory.find(
- {"sha1": {"$in": list(data)}}, {"sha1": 1, "ts": 1, "_id": 1}
+ {"sha1": {"$in": list(dirs)}}, {"sha1": 1, "ts": 1, "flat": 1, "_id": 1}
)
}
- for sha1, date in data.items():
- ts = datetime.timestamp(date) if date is not None else None
+ for sha1, info in dirs.items():
+ ts = datetime.timestamp(info.date)
if sha1 in existing:
dir = existing[sha1]
- if ts is not None and (dir["ts"] is None or ts < dir["ts"]):
+ if ts >= dir["ts"]:
+ ts = dir["ts"]
+ flat = info.flat or dir["flat"]
+ if ts != dir["ts"] or flat != dir["flat"]:
self.db.directory.update_one(
- {"_id": dir["_id"]}, {"$set": {"ts": ts}}
+ {"_id": dir["_id"]}, {"$set": {"ts": ts, "flat": flat}}
)
else:
- self.db.directory.insert_one({"sha1": sha1, "ts": ts, "revision": {}})
+ self.db.directory.insert_one(
+ {"sha1": sha1, "ts": ts, "revision": {}, "flat": info.flat}
+ )
return True
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_get"})
- def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
+ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
return {
- x["sha1"]: datetime.fromtimestamp(x["ts"], timezone.utc)
+ x["sha1"]: DirectoryData(
+ date=datetime.fromtimestamp(x["ts"], timezone.utc), flat=x["flat"]
+ )
for x in self.db.directory.find(
- {"sha1": {"$in": list(ids)}, "ts": {"$ne": None}},
- {"sha1": 1, "ts": 1, "_id": 0},
+ {"sha1": {"$in": list(ids)}}, {"sha1": 1, "ts": 1, "flat": 1, "_id": 0}
)
}
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -14,13 +14,13 @@
import psycopg2.extensions
import psycopg2.extras
-from typing_extensions import Literal
from swh.core.db import BaseDb
from swh.core.statsd import statsd
from swh.model.model import Sha1Git
from ..interface import (
+ DirectoryData,
EntityType,
ProvenanceResult,
ProvenanceStorageInterface,
@@ -82,10 +82,26 @@
self.conn.close()
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_add"})
- def content_add(
- self, cnts: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- return self._entity_set_date("content", cnts)
+ def content_add(self, cnts: Dict[Sha1Git, datetime]) -> bool:
+ try:
+ if cnts:
+ sql = """
+ INSERT INTO content(sha1, date) VALUES %s
+ ON CONFLICT (sha1) DO
+ UPDATE SET date=LEAST(EXCLUDED.date,content.date)
+ """
+ page_size = self.page_size or len(cnts)
+ with self.transaction() as cursor:
+ psycopg2.extras.execute_values(
+ cursor, sql, argslist=cnts.items(), page_size=page_size
+ )
+ return True
+ except: # noqa: E722
+ # Unexpected error occurred, rollback all changes and log message
+ LOGGER.exception("Unexpected error")
+ if self.raise_on_commit:
+ raise
+ return False
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_find_first"})
def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
@@ -106,17 +122,67 @@
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "content_get"})
def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- return self._entity_get_date("content", ids)
+ dates: Dict[Sha1Git, datetime] = {}
+ sha1s = tuple(ids)
+ if sha1s:
+ # TODO: consider splitting this query in several ones if sha1s is too big!
+ values = ", ".join(itertools.repeat("%s", len(sha1s)))
+ sql = f"""
+ SELECT sha1, date
+ FROM content
+ WHERE sha1 IN ({values})
+ AND date IS NOT NULL
+ """
+ with self.transaction(readonly=True) as cursor:
+ cursor.execute(query=sql, vars=sha1s)
+ dates.update((row["sha1"], row["date"]) for row in cursor)
+ return dates
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_add"})
- def directory_add(
- self, dirs: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]]
- ) -> bool:
- return self._entity_set_date("directory", dirs)
+ def directory_add(self, dirs: Dict[Sha1Git, DirectoryData]) -> bool:
+ data = [(sha1, rev.date, rev.flat) for sha1, rev in dirs.items()]
+ try:
+ if data:
+ sql = """
+ INSERT INTO directory(sha1, date, flat) VALUES %s
+ ON CONFLICT (sha1) DO
+ UPDATE SET
+ date=LEAST(EXCLUDED.date, directory.date),
+ flat=(EXCLUDED.flat OR directory.flat)
+ """
+ page_size = self.page_size or len(data)
+ with self.transaction() as cursor:
+ psycopg2.extras.execute_values(
+ cur=cursor, sql=sql, argslist=data, page_size=page_size
+ )
+ return True
+ except: # noqa: E722
+ # Unexpected error occurred, rollback all changes and log message
+ LOGGER.exception("Unexpected error")
+ if self.raise_on_commit:
+ raise
+ return False
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "directory_get"})
- def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]:
- return self._entity_get_date("directory", ids)
+ def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, DirectoryData]:
+ result: Dict[Sha1Git, DirectoryData] = {}
+ sha1s = tuple(ids)
+ if sha1s:
+ # TODO: consider splitting this query in several ones if sha1s is too big!
+ values = ", ".join(itertools.repeat("%s", len(sha1s)))
+ sql = f"""
+ SELECT sha1, date, flat
+ FROM directory
+ WHERE sha1 IN ({values})
+ AND date IS NOT NULL
+ """
+ with self.transaction(readonly=True) as cursor:
+ cursor.execute(query=sql, vars=sha1s)
+ result.update(
+ (row["sha1"], DirectoryData(date=row["date"], flat=row["flat"]))
+ for row in cursor
+ )
+ return result
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
@@ -299,53 +365,6 @@
) -> Dict[Sha1Git, Set[RelationData]]:
return self._relation_get(relation, None)
- def _entity_get_date(
- self,
- entity: Literal["content", "directory", "revision"],
- ids: Iterable[Sha1Git],
- ) -> Dict[Sha1Git, datetime]:
- dates: Dict[Sha1Git, datetime] = {}
- sha1s = tuple(ids)
- if sha1s:
- # TODO: consider splitting this query in several ones if sha1s is too big!
- values = ", ".join(itertools.repeat("%s", len(sha1s)))
- sql = f"""
- SELECT sha1, date
- FROM {entity}
- WHERE sha1 IN ({values})
- AND date IS NOT NULL
- """
- with self.transaction(readonly=True) as cursor:
- cursor.execute(query=sql, vars=sha1s)
- dates.update((row["sha1"], row["date"]) for row in cursor)
- return dates
-
- def _entity_set_date(
- self,
- entity: Literal["content", "directory"],
- dates: Union[Iterable[Sha1Git], Dict[Sha1Git, Optional[datetime]]],
- ) -> bool:
- data = dates if isinstance(dates, dict) else dict.fromkeys(dates)
- try:
- if data:
- sql = f"""
- INSERT INTO {entity}(sha1, date) VALUES %s
- ON CONFLICT (sha1) DO
- UPDATE SET date=LEAST(EXCLUDED.date,{entity}.date)
- """
- page_size = self.page_size or len(data)
- with self.transaction() as cursor:
- psycopg2.extras.execute_values(
- cursor, sql, argslist=data.items(), page_size=page_size
- )
- return True
- except: # noqa: E722
- # Unexpected error occurred, rollback all changes and log message
- LOGGER.exception("Unexpected error")
- if self.raise_on_commit:
- raise
- return False
-
def _relation_get(
self,
relation: RelationType,
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -15,6 +15,7 @@
from swh.model.model import Sha1Git
from .interface import (
+ DirectoryData,
ProvenanceInterface,
ProvenanceResult,
ProvenanceStorageInterface,
@@ -32,7 +33,7 @@
class DatetimeCache(TypedDict):
- data: Dict[Sha1Git, Optional[datetime]]
+ data: Dict[Sha1Git, Optional[datetime]] # None means unknown
added: Set[Sha1Git]
@@ -49,6 +50,7 @@
class ProvenanceCache(TypedDict):
content: DatetimeCache
directory: DatetimeCache
+ directory_flatten: Dict[Sha1Git, Optional[bool]] # None means unknown
revision: DatetimeCache
# below are insertion caches only
content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
@@ -65,6 +67,7 @@
return ProvenanceCache(
content=DatetimeCache(data={}, added=set()),
directory=DatetimeCache(data={}, added=set()),
+ directory_flatten={},
revision=DatetimeCache(data={}, added=set()),
content_in_revision=set(),
content_in_directory=set(),
@@ -194,7 +197,8 @@
# properly resolve any internal reference if needed. Content and directory
# entries may safely be registered with their associated dates. In contrast,
# revision entries should be registered without date, as it is used to
- # acknowledge that the flushing was successful.
+ # acknowledge that the flushing was successful. Also, directories are
+ # registered with their flatten flag not set.
cnt_dates = {
sha1: date
for sha1, date in self.cache["content"]["data"].items()
@@ -211,7 +215,7 @@
)
dir_dates = {
- sha1: date
+ sha1: DirectoryData(date=date, flat=False)
for sha1, date in self.cache["directory"]["data"].items()
if sha1 in self.cache["directory"]["added"] and date is not None
}
@@ -306,8 +310,27 @@
RelationType.DIR_IN_REV,
)
- # After relations, dates for the revisions can be safely set, acknowledging that
- # these revisions won't need to be reprocessed in case of failure.
+ # After relations, flatten flags for directories can be safely set (if
+ applicable), acknowledging those directories that have already been flattened.
+ # Similarly, dates for the revisions are set to acknowledge that these revisions
+ # won't need to be reprocessed in case of failure.
+ dir_acks = {
+ sha1: DirectoryData(
+ date=date, flat=self.cache["directory_flatten"].get(sha1) or False
+ )
+ for sha1, date in self.cache["directory"]["data"].items()
+ if sha1 in self.cache["directory"]["added"] and date is not None
+ }
+ if dir_acks:
+ while not self.storage.directory_add(dir_acks):
+ statsd.increment(
+ metric=BACKEND_OPERATIONS_METRIC,
+ tags={"method": "flush_revision_content_retry_directory_ack"},
+ )
+ LOGGER.warning(
+ "Unable to write directory dates to the storage. Retrying..."
+ )
+
rev_dates = {
sha1: RevisionData(date=date, origin=None)
for sha1, date in self.cache["revision"]["data"].items()
@@ -388,14 +411,22 @@
cache = self.cache[entity]
missing_ids = set(id for id in ids if id not in cache)
if missing_ids:
- if entity == "revision":
- updated = {
- id: rev.date
- for id, rev in self.storage.revision_get(missing_ids).items()
- }
- else:
- updated = getattr(self.storage, f"{entity}_get")(missing_ids)
- cache["data"].update(updated)
+ if entity == "content":
+ cache["data"].update(self.storage.content_get(missing_ids))
+ elif entity == "directory":
+ cache["data"].update(
+ {
+ id: dir.date
+ for id, dir in self.storage.directory_get(missing_ids).items()
+ }
+ )
+ elif entity == "revision":
+ cache["data"].update(
+ {
+ id: rev.date
+ for id, rev in self.storage.revision_get(missing_ids).items()
+ }
+ )
dates: Dict[Sha1Git, datetime] = {}
for sha1 in ids:
date = cache["data"].setdefault(sha1, None)
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -16,7 +16,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(1, now(), 'Work In Progress');
+ values(2, now(), 'Work In Progress');
-- a Git object ID, i.e., a Git-style salted SHA1 checksum
create domain sha1_git as bytea check (length(value) = 20);
@@ -35,9 +35,9 @@
-- entity tables
create table content
(
- id bigserial primary key, -- internal identifier of the content blob
- sha1 sha1_git unique not null, -- intrinsic identifier of the content blob
- date timestamptz -- timestamp of the revision where the blob appears early
+ id bigserial primary key, -- internal identifier of the content blob
+ sha1 sha1_git unique not null, -- intrinsic identifier of the content blob
+ date timestamptz not null -- timestamp of the revision where the blob appears early
);
comment on column content.id is 'Content internal identifier';
comment on column content.sha1 is 'Content intrinsic identifier';
@@ -45,9 +45,10 @@
create table directory
(
- id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier
- sha1 sha1_git unique not null, -- intrinsic identifier of the directory
- date timestamptz -- max timestamp among those of the directory children's
+ id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier
+ sha1 sha1_git unique not null, -- intrinsic identifier of the directory
+ date timestamptz not null, -- max timestamp among those of the directory children's
+ flat boolean not null default false -- flag acknowledging if the directory is flattened in the model
);
comment on column directory.id is 'Directory internal identifier';
comment on column directory.sha1 is 'Directory intrinsic identifier';
@@ -55,10 +56,10 @@
create table revision
(
- id bigserial primary key, -- internal identifier of the revision
- sha1 sha1_git unique not null, -- intrinsic identifier of the revision
- date timestamptz, -- timestamp of the revision
- origin bigint -- id of the preferred origin
+ id bigserial primary key, -- internal identifier of the revision
+ sha1 sha1_git unique not null, -- intrinsic identifier of the revision
+ date timestamptz, -- timestamp of the revision
+ origin bigint -- id of the preferred origin
-- foreign key (origin) references origin (id)
);
comment on column revision.id is 'Revision internal identifier';
@@ -68,17 +69,17 @@
create table location
(
- id bigserial primary key, -- internal identifier of the location
- path unix_path unique not null -- path to the location
+ id bigserial primary key, -- internal identifier of the location
+ path unix_path unique not null -- path to the location
);
comment on column location.id is 'Location internal identifier';
comment on column location.path is 'Path to the location';
create table origin
(
- id bigserial primary key, -- internal identifier of the origin
- sha1 sha1_git unique not null, -- intrinsic identifier of the origin
- url text unique not null -- url of the origin
+ id bigserial primary key, -- internal identifier of the origin
+ sha1 sha1_git unique not null, -- intrinsic identifier of the origin
+ url text unique not null -- url of the origin
);
comment on column origin.id is 'Origin internal identifier';
comment on column origin.sha1 is 'Origin intrinsic identifier';
@@ -87,13 +88,13 @@
-- relation tables
create table content_in_revision
(
- content bigint not null, -- internal identifier of the content blob
+ content bigint not null, -- internal identifier of the content blob
\if :dbflavor_norm
- revision bigint not null, -- internal identifier of the revision where the blob appears for the first time
- location bigint -- location of the content relative to the revision's root directory
+ revision bigint not null, -- internal identifier of the revision where the blob appears for the first time
+ location bigint -- location of the content relative to the revision's root directory
\else
- revision bigint[], -- internal identifiers of the revisions where the blob appears for the first time
- location bigint[] -- locations of the content relative to the revisions' root directory
+ revision bigint[], -- internal identifiers of the revisions where the blob appears for the first time
+ location bigint[] -- locations of the content relative to the revisions' root directory
\endif
-- foreign key (content) references content (id),
-- foreign key (revision) references revision (id),
@@ -109,13 +110,13 @@
create table content_in_directory
(
- content bigint not null, -- internal identifier of the content blob
+ content bigint not null, -- internal identifier of the content blob
\if :dbflavor_norm
- directory bigint not null, -- internal identifier of the directory containing the blob
- location bigint -- location of the content relative to its parent directory in the isochrone frontier
+ directory bigint not null, -- internal identifier of the directory containing the blob
+ location bigint -- location of the content relative to its parent directory in the isochrone frontier
\else
- directory bigint[], -- internal reference of the directories containing the blob
- location bigint[] -- locations of the content relative to its parent directories in the isochrone frontier
+ directory bigint[], -- internal reference of the directories containing the blob
+ location bigint[] -- locations of the content relative to its parent directories in the isochrone frontier
\endif
-- foreign key (content) references content (id),
-- foreign key (directory) references directory (id),
@@ -131,13 +132,13 @@
create table directory_in_revision
(
- directory bigint not null, -- internal identifier of the directory appearing in the revision
+ directory bigint not null, -- internal identifier of the directory appearing in the revision
\if :dbflavor_norm
- revision bigint not null, -- internal identifier of the revision containing the directory
- location bigint -- location of the directory relative to the revision's root directory
+ revision bigint not null, -- internal identifier of the revision containing the directory
+ location bigint -- location of the directory relative to the revision's root directory
\else
- revision bigint[], -- internal identifiers of the revisions containing the directory
- location bigint[] -- locations of the directory relative to the revisions' root directory
+ revision bigint[], -- internal identifiers of the revisions containing the directory
+ location bigint[] -- locations of the directory relative to the revisions' root directory
\endif
-- foreign key (directory) references directory (id),
-- foreign key (revision) references revision (id),
@@ -153,8 +154,8 @@
create table revision_in_origin
(
- revision bigint not null, -- internal identifier of the revision poined by the origin
- origin bigint not null -- internal identifier of the origin that points to the revision
+ revision bigint not null, -- internal identifier of the revision pointed to by the origin
+ origin bigint not null -- internal identifier of the origin that points to the revision
-- foreign key (revision) references revision (id),
-- foreign key (origin) references origin (id)
);
@@ -163,8 +164,8 @@
create table revision_before_revision
(
- prev bigserial not null, -- internal identifier of the source revision
- next bigserial not null -- internal identifier of the destination revision
+ prev bigserial not null, -- internal identifier of the source revision
+ next bigserial not null -- internal identifier of the destination revision
-- foreign key (prev) references revision (id),
-- foreign key (next) references revision (id)
);
diff --git a/swh/provenance/tests/test_conflict_resolution.py b/swh/provenance/tests/test_conflict_resolution.py
--- a/swh/provenance/tests/test_conflict_resolution.py
+++ b/swh/provenance/tests/test_conflict_resolution.py
@@ -4,22 +4,29 @@
# See top-level LICENSE file for more information
from datetime import datetime
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Sha1Git
-from swh.provenance.api.server import resolve_dates, resolve_relation, resolve_revision
-from swh.provenance.interface import RelationData, RevisionData
+from swh.provenance.api.server import (
+ resolve_dates,
+ resolve_directory,
+ resolve_relation,
+ resolve_revision,
+)
+from swh.provenance.interface import DirectoryData, RelationData, RevisionData
def test_resolve_dates() -> None:
- items: List[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]] = [
- (hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),),
+ items: List[Tuple[Sha1Git, datetime]] = [
+ (
+ hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
+ datetime.fromtimestamp(1000000001),
+ ),
(
hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
datetime.fromtimestamp(1000000000),
),
- (hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), None),
]
assert resolve_dates(items) == {
hash_to_bytes(
@@ -28,21 +35,25 @@
}
-def test_resolve_dates_keep_min() -> None:
- items: List[Union[Tuple[Sha1Git, Optional[datetime]], Tuple[Sha1Git]]] = [
+def test_resolve_directory() -> None:
+ items: List[Tuple[Sha1Git, DirectoryData]] = [
(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- datetime.fromtimestamp(1000000001),
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ DirectoryData(date=datetime.fromtimestamp(1000000002), flat=False),
),
(
- hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"),
- datetime.fromtimestamp(1000000000),
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ DirectoryData(date=datetime.fromtimestamp(1000000001), flat=True),
+ ),
+ (
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"),
+ DirectoryData(date=datetime.fromtimestamp(1000000000), flat=False),
),
]
- assert resolve_dates(items) == {
- hash_to_bytes(
- "20329687bb9c1231a7e05afe86160343ad49b494"
- ): datetime.fromtimestamp(1000000000)
+ assert resolve_directory(items) == {
+ hash_to_bytes("c0d8929936631ecbcf9147be6b8aa13b13b014e4"): DirectoryData(
+ date=datetime.fromtimestamp(1000000000), flat=True
+ )
}
diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py
--- a/swh/provenance/tests/test_provenance_storage.py
+++ b/swh/provenance/tests/test_provenance_storage.py
@@ -12,6 +12,7 @@
from swh.model.model import Origin, Sha1Git
from swh.provenance.archive import ArchiveInterface
from swh.provenance.interface import (
+ DirectoryData,
EntityType,
ProvenanceInterface,
ProvenanceResult,
@@ -38,17 +39,12 @@
# Add all content present in the current repo to the storage, just assigning their
# creation dates. Then check that the returned results when querying are the same.
- cnts = {cnt["sha1_git"] for idx, cnt in enumerate(data["content"]) if idx % 2 == 0}
cnt_dates = {
- cnt["sha1_git"]: cnt["ctime"]
- for idx, cnt in enumerate(data["content"])
- if idx % 2 == 1
+ cnt["sha1_git"]: cnt["ctime"] for idx, cnt in enumerate(data["content"])
}
- assert cnts or cnt_dates
- assert provenance_storage.content_add(cnts)
assert provenance_storage.content_add(cnt_dates)
assert provenance_storage.content_get(set(cnt_dates.keys())) == cnt_dates
- assert provenance_storage.entity_get_all(EntityType.CONTENT) == cnts | set(
+ assert provenance_storage.entity_get_all(EntityType.CONTENT) == set(
cnt_dates.keys()
)
@@ -75,21 +71,15 @@
]
return max(dates) if dates else None
- dirs = {
- dir["id"]
- for dir in data["directory"]
- if getmaxdate(dir, data["content"]) is None
- }
- dir_dates = {
- dir["id"]: getmaxdate(dir, data["content"])
- for dir in data["directory"]
- if getmaxdate(dir, data["content"]) is not None
- }
- assert dirs
- assert provenance_storage.directory_add(dirs)
+ flat_values = (False, True)
+ dir_dates = {}
+ for idx, dir in enumerate(data["directory"]):
+ date = getmaxdate(dir, data["content"])
+ if date is not None:
+ dir_dates[dir["id"]] = DirectoryData(date=date, flat=flat_values[idx % 2])
assert provenance_storage.directory_add(dir_dates)
assert provenance_storage.directory_get(set(dir_dates.keys())) == dir_dates
- assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == dirs | set(
+ assert provenance_storage.entity_get_all(EntityType.DIRECTORY) == set(
dir_dates.keys()
)
@@ -199,10 +189,13 @@
def entity_add(
storage: ProvenanceStorageInterface, entity: EntityType, ids: Set[Sha1Git]
) -> bool:
+ now = datetime.now(tz=timezone.utc)
if entity == EntityType.CONTENT:
- return storage.content_add({sha1: None for sha1 in ids})
+ return storage.content_add({sha1: now for sha1 in ids})
elif entity == EntityType.DIRECTORY:
- return storage.directory_add({sha1: None for sha1 in ids})
+ return storage.directory_add(
+ {sha1: DirectoryData(date=now, flat=False) for sha1 in ids}
+ )
else: # entity == EntityType.REVISION:
return storage.revision_add(
{sha1: RevisionData(date=None, origin=None) for sha1 in ids}
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -261,7 +261,9 @@
for rd in synth_rev["R_D"]:
assert (
rev_ts + rd["rel_ts"]
- == provenance.storage.directory_get([rd["dst"]])[rd["dst"]].timestamp()
+ == provenance.storage.directory_get([rd["dst"]])[
+ rd["dst"]
+ ].date.timestamp()
), synth_rev["msg"]
# ... + a number of rows in the "content_in_dir" table
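For readers skimming the diff, here is a minimal, self-contained Python sketch of the merge rule that the new `resolve_directory` helper and the `ON CONFLICT` clauses above implement: for a given directory sha1, the earliest date wins and the flat flag is sticky once set to True. The `DirectoryData` class below is a local stand-in for `swh.provenance.interface.DirectoryData`, and `merge_directory_data` is an illustrative name, not part of the API.

from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Iterable, Tuple


@dataclass(eq=True, frozen=True)
class DirectoryData:
    # Local stand-in for swh.provenance.interface.DirectoryData
    date: datetime
    flat: bool


def merge_directory_data(
    data: Iterable[Tuple[bytes, DirectoryData]]
) -> Dict[bytes, DirectoryData]:
    # Keep the earliest date per sha1 and OR the flat flags, mirroring
    # resolve_directory and the SQL "ON CONFLICT ... DO UPDATE" clause.
    result: Dict[bytes, DirectoryData] = {}
    for sha1, entry in data:
        known = result.get(sha1)
        if known is None:
            result[sha1] = entry
        else:
            result[sha1] = DirectoryData(
                date=min(entry.date, known.date),
                flat=entry.flat or known.flat,
            )
    return result


if __name__ == "__main__":
    sha1 = bytes.fromhex("c0d8929936631ecbcf9147be6b8aa13b13b014e4")
    items = [
        (sha1, DirectoryData(date=datetime.fromtimestamp(1000000002), flat=False)),
        (sha1, DirectoryData(date=datetime.fromtimestamp(1000000001), flat=True)),
        (sha1, DirectoryData(date=datetime.fromtimestamp(1000000000), flat=False)),
    ]
    # Expected: a single entry with the earliest date and flat=True,
    # matching the expectation in test_resolve_directory above.
    print(merge_directory_data(items))

The same rule appears in three places in the diff: the RPC conflict resolver (resolve_directory), the PostgreSQL upsert, and the MongoDB update path.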
Attached to D6712: Add explicit flag for flattened directories to `ProvenanceStorageInterface`