Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066670
D5946.id21396.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
43 KB
Subscribers
None
D5946.id21396.diff
View Options
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -33,7 +33,7 @@
)
with_path = kwargs.get("with_path")
- from swh.provenance.provenance import ProvenanceBackend
+ from swh.provenance.backend import ProvenanceBackend
prov = ProvenanceBackend(conn)
if with_path is not None:
diff --git a/swh/provenance/provenance.py b/swh/provenance/backend.py
copy from swh/provenance/provenance.py
copy to swh/provenance/backend.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/backend.py
@@ -3,109 +3,13 @@
import os
from typing import Dict, Generator, Iterable, List, Optional, Set, Tuple
-import psycopg2
-from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable
+import psycopg2 # TODO: remove this dependency
+from typing_extensions import Literal, TypedDict
from swh.model.model import Sha1Git
from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry
-
-
-# XXX: this protocol doesn't make much sense now that flavours have been delegated to
-# another class, lower in the callstack.
-@runtime_checkable
-class ProvenanceInterface(Protocol):
- raise_on_commit: bool = False
-
- def commit(self):
- """Commit currently ongoing transactions in the backend DB"""
- ...
-
- def content_add_to_directory(
- self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes
- ) -> None:
- ...
-
- def content_add_to_revision(
- self, revision: RevisionEntry, blob: FileEntry, prefix: bytes
- ) -> None:
- ...
-
- def content_find_first(
- self, id: Sha1Git
- ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
- ...
-
- def content_find_all(
- self, id: Sha1Git, limit: Optional[int] = None
- ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
- ...
-
- def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
- ...
-
- def content_get_early_dates(
- self, blobs: Iterable[FileEntry]
- ) -> Dict[Sha1Git, datetime]:
- ...
-
- def content_set_early_date(self, blob: FileEntry, date: datetime) -> None:
- ...
-
- def directory_add_to_revision(
- self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes
- ) -> None:
- ...
-
- def directory_get_date_in_isochrone_frontier(
- self, directory: DirectoryEntry
- ) -> Optional[datetime]:
- ...
-
- def directory_get_dates_in_isochrone_frontier(
- self, dirs: Iterable[DirectoryEntry]
- ) -> Dict[Sha1Git, datetime]:
- ...
-
- def directory_set_date_in_isochrone_frontier(
- self, directory: DirectoryEntry, date: datetime
- ) -> None:
- ...
-
- def origin_add(self, origin: OriginEntry) -> None:
- ...
-
- def revision_add(self, revision: RevisionEntry) -> None:
- ...
-
- def revision_add_before_revision(
- self, relative: RevisionEntry, revision: RevisionEntry
- ) -> None:
- ...
-
- def revision_add_to_origin(
- self, origin: OriginEntry, revision: RevisionEntry
- ) -> None:
- ...
-
- def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
- ...
-
- def revision_get_preferred_origin(
- self, revision: RevisionEntry
- ) -> Optional[Sha1Git]:
- ...
-
- def revision_in_history(self, revision: RevisionEntry) -> bool:
- ...
-
- def revision_set_preferred_origin(
- self, origin: OriginEntry, revision: RevisionEntry
- ) -> None:
- ...
-
- def revision_visited(self, revision: RevisionEntry) -> bool:
- ...
+from .provenance import ProvenanceResult
class DatetimeCache(TypedDict):
@@ -138,7 +42,7 @@
revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]]
-def new_cache():
+def new_cache() -> ProvenanceCache:
return ProvenanceCache(
content=DatetimeCache(data={}, added=set()),
directory=DatetimeCache(data={}, added=set()),
@@ -153,7 +57,6 @@
)
-# TODO: maybe move this to a separate file
class ProvenanceBackend:
raise_on_commit: bool = False
@@ -173,39 +76,37 @@
self.storage = ProvenanceWithoutPathDB(conn)
self.cache: ProvenanceCache = new_cache()
- def clear_caches(self):
+ def clear_caches(self) -> None:
self.cache = new_cache()
- def commit(self):
+ def flush(self) -> None:
# TODO: for now we just forward the cache. This should be improved!
while not self.storage.commit(self.cache, raise_on_commit=self.raise_on_commit):
logging.warning(
- f"Unable to commit cached information {self.write_cache}. Retrying..."
+ f"Unable to commit cached information {self.cache}. Retrying..."
)
self.clear_caches()
def content_add_to_directory(
self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes
- ):
+ ) -> None:
self.cache["content_in_directory"].add(
(blob.id, directory.id, normalize(os.path.join(prefix, blob.name)))
)
def content_add_to_revision(
self, revision: RevisionEntry, blob: FileEntry, prefix: bytes
- ):
+ ) -> None:
self.cache["content_in_revision"].add(
(blob.id, revision.id, normalize(os.path.join(prefix, blob.name)))
)
- def content_find_first(
- self, id: Sha1Git
- ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
+ def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
return self.storage.content_find_first(id)
def content_find_all(
self, id: Sha1Git, limit: Optional[int] = None
- ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+ ) -> Generator[ProvenanceResult, None, None]:
yield from self.storage.content_find_all(id, limit=limit)
def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
@@ -216,13 +117,13 @@
) -> Dict[Sha1Git, datetime]:
return self.get_dates("content", [blob.id for blob in blobs])
- def content_set_early_date(self, blob: FileEntry, date: datetime):
+ def content_set_early_date(self, blob: FileEntry, date: datetime) -> None:
self.cache["content"]["data"][blob.id] = date
self.cache["content"]["added"].add(blob.id)
def directory_add_to_revision(
self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes
- ):
+ ) -> None:
self.cache["directory_in_revision"].add(
(directory.id, revision.id, normalize(path))
)
@@ -239,7 +140,7 @@
def directory_set_date_in_isochrone_frontier(
self, directory: DirectoryEntry, date: datetime
- ):
+ ) -> None:
self.cache["directory"]["data"][directory.id] = date
self.cache["directory"]["added"].add(directory.id)
@@ -260,21 +161,23 @@
self.cache["origin"]["data"][origin.id] = origin.url
self.cache["origin"]["added"].add(origin.id)
- def revision_add(self, revision: RevisionEntry):
+ def revision_add(self, revision: RevisionEntry) -> None:
self.cache["revision"]["data"][revision.id] = revision.date
self.cache["revision"]["added"].add(revision.id)
def revision_add_before_revision(
- self, relative: RevisionEntry, revision: RevisionEntry
- ):
+ self, head: RevisionEntry, revision: RevisionEntry
+ ) -> None:
self.cache["revision_before_revision"].setdefault(revision.id, set()).add(
- relative.id
+ head.id
)
- def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
+ def revision_add_to_origin(
+ self, origin: OriginEntry, revision: RevisionEntry
+ ) -> None:
self.cache["revision_in_origin"].add((revision.id, origin.id))
- def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
+ def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]:
return self.get_dates("revision", [revision.id]).get(revision.id, None)
def revision_get_preferred_origin(
@@ -294,7 +197,7 @@
def revision_set_preferred_origin(
self, origin: OriginEntry, revision: RevisionEntry
- ):
+ ) -> None:
self.cache["revision_origin"]["data"][revision.id] = origin.id
self.cache["revision_origin"]["added"].add(revision.id)
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -165,15 +165,14 @@
provenance = get_provenance(**ctx.obj["config"]["provenance"])
# TODO: return a dictionary with proper keys for each field
- row = provenance.content_find_first(hash_to_bytes(swhid))
- if row is not None:
+ occur = provenance.content_find_first(hash_to_bytes(swhid))
+ if occur is not None:
print(
- "swh:1:cnt:{cnt}, swh:1:rev:{rev}, {date}, {path}".format(
- cnt=hash_to_hex(row[0]),
- rev=hash_to_hex(row[1]),
- date=row[2],
- path=os.fsdecode(row[3]),
- )
+ f"swh:1:cnt:{hash_to_hex(occur.content)}, "
+ f"swh:1:rev:{hash_to_hex(occur.revision)}, "
+ f"{occur.date}, "
+ f"{occur.origin}, "
+ f"{os.fsdecode(occur.path)}"
)
else:
print(f"Cannot find a content with the id {swhid}")
@@ -189,12 +188,11 @@
provenance = get_provenance(**ctx.obj["config"]["provenance"])
# TODO: return a dictionary with proper keys for each field
- for row in provenance.content_find_all(hash_to_bytes(swhid), limit=limit):
+ for occur in provenance.content_find_all(hash_to_bytes(swhid), limit=limit):
print(
- "swh:1:cnt:{cnt}, swh:1:rev:{rev}, {date}, {path}".format(
- cnt=hash_to_hex(row[0]),
- rev=hash_to_hex(row[1]),
- date=row[2],
- path=os.fsdecode(row[3]),
- )
+ f"swh:1:cnt:{hash_to_hex(occur.content)}, "
+ f"swh:1:rev:{hash_to_hex(occur.revision)}, "
+ f"{occur.date}, "
+ f"{occur.origin}, "
+ f"{os.fsdecode(occur.path)}"
)
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -52,7 +52,7 @@
graph = build_history_graph(archive, provenance, revision)
origin_add_revision(provenance, origin, graph)
done = time.time()
- provenance.commit()
+ provenance.flush()
stop = time.time()
logging.debug(
"Origins "
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -1,20 +1,22 @@
from datetime import datetime
import itertools
import logging
-from typing import Any, Dict, Generator, List, Optional, Set, Tuple
+from typing import Any, Dict, Generator, List, Mapping, Optional, Set, Tuple
import psycopg2
import psycopg2.extras
from swh.model.model import Sha1Git
+from ..provenance import ProvenanceResult
+
class ProvenanceDBBase:
def __init__(self, conn: psycopg2.extensions.connection):
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
conn.set_session(autocommit=True)
self.conn = conn
- self.cursor = self.conn.cursor()
+ self.cursor = self.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
# XXX: not sure this is the best place to do it!
self.cursor.execute("SET timezone TO 'UTC'")
self._flavor: Optional[str] = None
@@ -22,8 +24,8 @@
@property
def flavor(self) -> str:
if self._flavor is None:
- self.cursor.execute("select swh_get_dbflavor()")
- self._flavor = self.cursor.fetchone()[0]
+ self.cursor.execute("SELECT swh_get_dbflavor() AS flavor")
+ self._flavor = self.cursor.fetchone()["flavor"]
assert self._flavor is not None
return self._flavor
@@ -31,7 +33,7 @@
def with_path(self) -> bool:
return self.flavor == "with-path"
- def commit(self, data: Dict[str, Any], raise_on_commit: bool = False) -> bool:
+ def commit(self, data: Mapping[str, Any], raise_on_commit: bool = False) -> bool:
try:
# First insert entities
for entity in ("content", "directory", "revision"):
@@ -87,25 +89,23 @@
return False
- def content_find_first(
- self, id: Sha1Git
- ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
+ def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
...
def content_find_all(
self, id: Sha1Git, limit: Optional[int] = None
- ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+ ) -> Generator[ProvenanceResult, None, None]:
...
def get_dates(self, entity: str, ids: List[Sha1Git]) -> Dict[Sha1Git, datetime]:
- dates = {}
+ dates: Dict[Sha1Git, datetime] = {}
if ids:
values = ", ".join(itertools.repeat("%s", len(ids)))
self.cursor.execute(
f"""SELECT sha1, date FROM {entity} WHERE sha1 IN ({values})""",
tuple(ids),
)
- dates.update(self.cursor.fetchall())
+ dates.update(((row["sha1"], row["date"]) for row in self.cursor.fetchall()))
return dates
def insert_entity(self, entity: str, data: Dict[Sha1Git, datetime]):
@@ -217,7 +217,7 @@
(revision,),
)
row = self.cursor.fetchone()
- return row[0] if row is not None else None
+ return row["sha1"] if row is not None else None
def revision_in_history(self, revision: Sha1Git) -> bool:
self.cursor.execute(
diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py
--- a/swh/provenance/postgresql/provenancedb_with_path.py
+++ b/swh/provenance/postgresql/provenancedb_with_path.py
@@ -1,4 +1,3 @@
-from datetime import datetime
from typing import Generator, Optional, Set, Tuple
import psycopg2
@@ -6,66 +5,68 @@
from swh.model.model import Sha1Git
+from ..provenance import ProvenanceResult
from .provenancedb_base import ProvenanceDBBase
class ProvenanceWithPathDB(ProvenanceDBBase):
- def content_find_first(
- self, id: Sha1Git
- ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
- self.cursor.execute(
- """
- SELECT C.sha1 AS blob,
- R.sha1 AS rev,
+ def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
+ sql = """
+ SELECT C.sha1 AS content,
+ R.sha1 AS revision,
R.date AS date,
+ O.url AS origin,
L.path AS path
FROM content AS C
- INNER JOIN content_in_revision AS CR ON (CR.content = C.id)
- INNER JOIN location as L ON (CR.location = L.id)
- INNER JOIN revision as R ON (CR.revision = R.id)
+ INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
+ INNER JOIN location as L ON (CR.location=L.id)
+ INNER JOIN revision as R ON (CR.revision=R.id)
+ LEFT JOIN origin as O ON (R.origin=O.id)
WHERE C.sha1=%s
- ORDER BY date, rev, path ASC LIMIT 1
- """,
- (id,),
- )
- return self.cursor.fetchone()
+ ORDER BY date, revision, origin, path ASC LIMIT 1
+ """
+ self.cursor.execute(sql, (id,))
+ row = self.cursor.fetchone()
+ return ProvenanceResult(**row) if row is not None else None
def content_find_all(
self, id: Sha1Git, limit: Optional[int] = None
- ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+ ) -> Generator[ProvenanceResult, None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
- self.cursor.execute(
- f"""
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
+ sql = f"""
+ (SELECT C.sha1 AS content,
+ R.sha1 AS revision,
R.date AS date,
+ O.url AS origin,
L.path AS path
FROM content AS C
- INNER JOIN content_in_revision AS CR ON (CR.content = C.id)
- INNER JOIN location AS L ON (CR.location = L.id)
- INNER JOIN revision AS R ON (CR.revision = R.id)
+ INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
+ INNER JOIN location AS L ON (CR.location=L.id)
+ INNER JOIN revision AS R ON (CR.revision=R.id)
+ LEFT JOIN origin as O ON (R.origin=O.id)
WHERE C.sha1=%s)
UNION
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
+ O.url AS origin,
CASE DL.path
WHEN '' THEN CL.path
WHEN '.' THEN CL.path
ELSE (DL.path || '/' || CL.path)::unix_path
END AS path
FROM content AS C
- INNER JOIN content_in_directory AS CD ON (C.id = CD.content)
- INNER JOIN directory_in_revision AS DR ON (CD.directory = DR.directory)
- INNER JOIN revision AS R ON (DR.revision = R.id)
- INNER JOIN location AS CL ON (CD.location = CL.id)
- INNER JOIN location AS DL ON (DR.location = DL.id)
+ INNER JOIN content_in_directory AS CD ON (C.id=CD.content)
+ INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory)
+ INNER JOIN revision AS R ON (DR.revision=R.id)
+ INNER JOIN location AS CL ON (CD.location=CL.id)
+ INNER JOIN location AS DL ON (DR.location=DL.id)
+ LEFT JOIN origin AS O ON (R.origin=O.id)
WHERE C.sha1=%s)
- ORDER BY date, rev, path {early_cut}
- """,
- (id, id),
- )
- yield from self.cursor.fetchall()
+ ORDER BY date, revision, origin, path {early_cut}
+ """
+ self.cursor.execute(sql, (id, id))
+ yield from (ProvenanceResult(**row) for row in self.cursor.fetchall())
def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
"""Insert entries in `relation` from `data`
diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py
--- a/swh/provenance/postgresql/provenancedb_without_path.py
+++ b/swh/provenance/postgresql/provenancedb_without_path.py
@@ -1,4 +1,3 @@
-from datetime import datetime
from typing import Generator, Optional, Set, Tuple
import psycopg2
@@ -6,58 +5,60 @@
from swh.model.model import Sha1Git
+from ..provenance import ProvenanceResult
from .provenancedb_base import ProvenanceDBBase
class ProvenanceWithoutPathDB(ProvenanceDBBase):
- def content_find_first(
- self, id: Sha1Git
- ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
- self.cursor.execute(
- """
- SELECT C.sha1 AS blob,
- R.sha1 AS rev,
+ def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
+ sql = """
+ SELECT C.sha1 AS content,
+ R.sha1 AS revision,
R.date AS date,
+ O.url AS origin,
'\\x'::bytea as path
FROM content AS C
- INNER JOIN content_in_revision AS CR ON (CR.content = C.id)
- INNER JOIN revision as R ON (CR.revision = R.id)
+ INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
+ INNER JOIN revision as R ON (CR.revision=R.id)
+ LEFT JOIN origin as O ON (R.origin=O.id)
WHERE C.sha1=%s
- ORDER BY date, rev ASC LIMIT 1
- """,
- (id,),
- )
- return self.cursor.fetchone()
+ ORDER BY date, revision, origin ASC LIMIT 1
+ """
+ self.cursor.execute(sql, (id,))
+ row = self.cursor.fetchone()
+ return ProvenanceResult(**row) if row is not None else None
def content_find_all(
self, id: Sha1Git, limit: Optional[int] = None
- ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+ ) -> Generator[ProvenanceResult, None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
- self.cursor.execute(
- f"""
- (SELECT C.sha1 AS blob,
- R.sha1 AS rev,
+ sql = f"""
+ (SELECT C.sha1 AS content,
+ R.sha1 AS revision,
R.date AS date,
+ O.url AS origin,
'\\x'::bytea as path
FROM content AS C
- INNER JOIN content_in_revision AS CR ON (CR.content = C.id)
- INNER JOIN revision AS R ON (CR.revision = R.id)
+ INNER JOIN content_in_revision AS CR ON (CR.content=C.id)
+ INNER JOIN revision AS R ON (CR.revision=R.id)
+ LEFT JOIN origin as O ON (R.origin=O.id)
WHERE C.sha1=%s)
UNION
(SELECT C.sha1 AS content,
R.sha1 AS revision,
R.date AS date,
+ O.url AS origin,
'\\x'::bytea as path
FROM content AS C
- INNER JOIN content_in_directory AS CD ON (C.id = CD.content)
- INNER JOIN directory_in_revision AS DR ON (CD.directory = DR.directory)
- INNER JOIN revision AS R ON (DR.revision = R.id)
+ INNER JOIN content_in_directory AS CD ON (C.id=CD.content)
+ INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory)
+ INNER JOIN revision AS R ON (DR.revision=R.id)
+ LEFT JOIN origin as O ON (R.origin=O.id)
WHERE C.sha1=%s)
- ORDER BY date, rev, path {early_cut}
- """,
- (id, id),
- )
- yield from self.cursor.fetchall()
+ ORDER BY date, revision, origin {early_cut}
+ """
+ self.cursor.execute(sql, (id, id))
+ yield from (ProvenanceResult(**row) for row in self.cursor.fetchall())
def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
if data:
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -1,308 +1,161 @@
from datetime import datetime
-import logging
-import os
-from typing import Dict, Generator, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Generator, Iterable, Optional
-import psycopg2
-from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable
+from typing_extensions import Protocol, runtime_checkable
from swh.model.model import Sha1Git
from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry
-# XXX: this protocol doesn't make much sense now that flavours have been delegated to
-# another class, lower in the callstack.
+class ProvenanceResult:
+ def __init__(
+ self,
+ content: Sha1Git,
+ revision: Sha1Git,
+ date: datetime,
+ origin: Optional[str],
+ path: bytes,
+ ) -> None:
+ self.content = content
+ self.revision = revision
+ self.date = date
+ self.origin = origin
+ self.path = path
+
+
@runtime_checkable
class ProvenanceInterface(Protocol):
raise_on_commit: bool = False
- def commit(self):
- """Commit currently ongoing transactions in the backend DB"""
+ def flush(self) -> None:
+ """Flush internal cache to the underlying `storage`."""
...
def content_add_to_directory(
self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes
) -> None:
+ """Associate `blob` with `directory` in the provenance model. `prefix` is the
+ relative path from `directory` to `blob` (excluding `blob`'s name).
+ """
...
def content_add_to_revision(
self, revision: RevisionEntry, blob: FileEntry, prefix: bytes
) -> None:
+ """Associate `blob` with `revision` in the provenance model. `prefix` is the
+ absolute path from `revision`'s root directory to `blob` (excluding `blob`'s
+ name).
+ """
...
- def content_find_first(
- self, id: Sha1Git
- ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
+ def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]:
+ """Retrieve the first occurrence of the blob identified by `id`."""
...
def content_find_all(
self, id: Sha1Git, limit: Optional[int] = None
- ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+ ) -> Generator[ProvenanceResult, None, None]:
+ """Retrieve all the occurrences of the blob identified by `id`."""
...
def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
+ """Retrieve the earliest known date of `blob`."""
...
def content_get_early_dates(
self, blobs: Iterable[FileEntry]
) -> Dict[Sha1Git, datetime]:
+ """Retrieve the earliest known date for each blob in `blobs`. If some blob has
+ no associated date, it is not present in the resulting dictionary.
+ """
...
def content_set_early_date(self, blob: FileEntry, date: datetime) -> None:
+ """Associate `date` to `blob` as it's earliest known date."""
...
def directory_add_to_revision(
self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes
) -> None:
+ """Associate `directory` with `revision` in the provenance model. `path` is the
+ absolute path from `revision`'s root directory to `directory` (including
+ `directory`'s name).
+ """
...
def directory_get_date_in_isochrone_frontier(
self, directory: DirectoryEntry
) -> Optional[datetime]:
+ """Retrieve the earliest known date of `directory` as an isochrone frontier in
+ the provenance model.
+ """
...
def directory_get_dates_in_isochrone_frontier(
self, dirs: Iterable[DirectoryEntry]
) -> Dict[Sha1Git, datetime]:
+ """Retrieve the earliest known date for each directory in `dirs` as isochrone
+        frontiers in the provenance model. If some directory has no associated date, it is not
+ present in the resulting dictionary.
+ """
...
def directory_set_date_in_isochrone_frontier(
self, directory: DirectoryEntry, date: datetime
) -> None:
+ """Associate `date` to `directory` as it's earliest known date as an isochrone
+ frontier in the provenance model.
+ """
...
def origin_add(self, origin: OriginEntry) -> None:
+ """Add `origin` to the provenance model."""
...
def revision_add(self, revision: RevisionEntry) -> None:
+ """Add `revision` to the provenance model. This implies storing `revision`'s
+ date in the model, thus `revision.date` must be a valid date.
+ """
...
def revision_add_before_revision(
- self, relative: RevisionEntry, revision: RevisionEntry
+ self, head: RevisionEntry, revision: RevisionEntry
) -> None:
+ """Associate `revision` to `head` as an ancestor of the latter."""
...
def revision_add_to_origin(
self, origin: OriginEntry, revision: RevisionEntry
) -> None:
+ """Associate `revision` to `origin` as a head revision of the latter (ie. the
+ target of an snapshot for `origin` in the archive)."""
...
- def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
+ def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]:
+ """Retrieve the date associated to `revision`."""
...
def revision_get_preferred_origin(
self, revision: RevisionEntry
) -> Optional[Sha1Git]:
+ """Retrieve the preferred origin associated to `revision`."""
...
def revision_in_history(self, revision: RevisionEntry) -> bool:
+ """Check if `revision` is known to be an ancestor of some head revision in the
+ provenance model.
+ """
...
def revision_set_preferred_origin(
self, origin: OriginEntry, revision: RevisionEntry
) -> None:
+ """Associate `origin` as the preferred origin for `revision`."""
...
def revision_visited(self, revision: RevisionEntry) -> bool:
+ """Check if `revision` is known to be a head revision for some origin in the
+ provenance model.
+ """
...
-
-
-class DatetimeCache(TypedDict):
- data: Dict[Sha1Git, Optional[datetime]]
- added: Set[Sha1Git]
-
-
-class OriginCache(TypedDict):
- data: Dict[Sha1Git, str]
- added: Set[Sha1Git]
-
-
-class RevisionCache(TypedDict):
- data: Dict[Sha1Git, Sha1Git]
- added: Set[Sha1Git]
-
-
-class ProvenanceCache(TypedDict):
- content: DatetimeCache
- directory: DatetimeCache
- revision: DatetimeCache
- # below are insertion caches only
- content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
- content_in_directory: Set[Tuple[Sha1Git, Sha1Git, bytes]]
- directory_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
- # these two are for the origin layer
- origin: OriginCache
- revision_origin: RevisionCache
- revision_before_revision: Dict[Sha1Git, Set[Sha1Git]]
- revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]]
-
-
-def new_cache():
- return ProvenanceCache(
- content=DatetimeCache(data={}, added=set()),
- directory=DatetimeCache(data={}, added=set()),
- revision=DatetimeCache(data={}, added=set()),
- content_in_revision=set(),
- content_in_directory=set(),
- directory_in_revision=set(),
- origin=OriginCache(data={}, added=set()),
- revision_origin=RevisionCache(data={}, added=set()),
- revision_before_revision={},
- revision_in_origin=set(),
- )
-
-
-# TODO: maybe move this to a separate file
-class ProvenanceBackend:
- raise_on_commit: bool = False
-
- def __init__(self, conn: psycopg2.extensions.connection):
- from .postgresql.provenancedb_base import ProvenanceDBBase
-
- # TODO: this class should not know what the actual used DB is.
- self.storage: ProvenanceDBBase
- flavor = ProvenanceDBBase(conn).flavor
- if flavor == "with-path":
- from .postgresql.provenancedb_with_path import ProvenanceWithPathDB
-
- self.storage = ProvenanceWithPathDB(conn)
- else:
- from .postgresql.provenancedb_without_path import ProvenanceWithoutPathDB
-
- self.storage = ProvenanceWithoutPathDB(conn)
- self.cache: ProvenanceCache = new_cache()
-
- def clear_caches(self):
- self.cache = new_cache()
-
- def commit(self):
- # TODO: for now we just forward the cache. This should be improved!
- while not self.storage.commit(self.cache, raise_on_commit=self.raise_on_commit):
- logging.warning(
- f"Unable to commit cached information {self.write_cache}. Retrying..."
- )
- self.clear_caches()
-
- def content_add_to_directory(
- self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes
- ):
- self.cache["content_in_directory"].add(
- (blob.id, directory.id, normalize(os.path.join(prefix, blob.name)))
- )
-
- def content_add_to_revision(
- self, revision: RevisionEntry, blob: FileEntry, prefix: bytes
- ):
- self.cache["content_in_revision"].add(
- (blob.id, revision.id, normalize(os.path.join(prefix, blob.name)))
- )
-
- def content_find_first(
- self, id: Sha1Git
- ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
- return self.storage.content_find_first(id)
-
- def content_find_all(
- self, id: Sha1Git, limit: Optional[int] = None
- ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
- yield from self.storage.content_find_all(id, limit=limit)
-
- def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
- return self.get_dates("content", [blob.id]).get(blob.id, None)
-
- def content_get_early_dates(
- self, blobs: Iterable[FileEntry]
- ) -> Dict[Sha1Git, datetime]:
- return self.get_dates("content", [blob.id for blob in blobs])
-
- def content_set_early_date(self, blob: FileEntry, date: datetime):
- self.cache["content"]["data"][blob.id] = date
- self.cache["content"]["added"].add(blob.id)
-
- def directory_add_to_revision(
- self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes
- ):
- self.cache["directory_in_revision"].add(
- (directory.id, revision.id, normalize(path))
- )
-
- def directory_get_date_in_isochrone_frontier(
- self, directory: DirectoryEntry
- ) -> Optional[datetime]:
- return self.get_dates("directory", [directory.id]).get(directory.id, None)
-
- def directory_get_dates_in_isochrone_frontier(
- self, dirs: Iterable[DirectoryEntry]
- ) -> Dict[Sha1Git, datetime]:
- return self.get_dates("directory", [directory.id for directory in dirs])
-
- def directory_set_date_in_isochrone_frontier(
- self, directory: DirectoryEntry, date: datetime
- ):
- self.cache["directory"]["data"][directory.id] = date
- self.cache["directory"]["added"].add(directory.id)
-
- def get_dates(
- self, entity: Literal["content", "revision", "directory"], ids: List[Sha1Git]
- ) -> Dict[Sha1Git, datetime]:
- cache = self.cache[entity]
- missing_ids = set(id for id in ids if id not in cache)
- if missing_ids:
- cache["data"].update(self.storage.get_dates(entity, list(missing_ids)))
- return {
- sha1: date
- for sha1, date in cache["data"].items()
- if sha1 in ids and date is not None
- }
-
- def origin_add(self, origin: OriginEntry) -> None:
- self.cache["origin"]["data"][origin.id] = origin.url
- self.cache["origin"]["added"].add(origin.id)
-
- def revision_add(self, revision: RevisionEntry):
- self.cache["revision"]["data"][revision.id] = revision.date
- self.cache["revision"]["added"].add(revision.id)
-
- def revision_add_before_revision(
- self, relative: RevisionEntry, revision: RevisionEntry
- ):
- self.cache["revision_before_revision"].setdefault(revision.id, set()).add(
- relative.id
- )
-
- def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
- self.cache["revision_in_origin"].add((revision.id, origin.id))
-
- def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
- return self.get_dates("revision", [revision.id]).get(revision.id, None)
-
- def revision_get_preferred_origin(
- self, revision: RevisionEntry
- ) -> Optional[Sha1Git]:
- cache = self.cache["revision_origin"]
- if revision.id not in cache:
- origin = self.storage.revision_get_preferred_origin(revision.id)
- if origin is not None:
- cache["data"][revision.id] = origin
- return cache["data"].get(revision.id)
-
- def revision_in_history(self, revision: RevisionEntry) -> bool:
- return revision.id in self.cache[
- "revision_before_revision"
- ] or self.storage.revision_in_history(revision.id)
-
- def revision_set_preferred_origin(
- self, origin: OriginEntry, revision: RevisionEntry
- ):
- self.cache["revision_origin"]["data"][revision.id] = origin.id
- self.cache["revision_origin"]["added"].add(revision.id)
-
- def revision_visited(self, revision: RevisionEntry) -> bool:
- return revision.id in dict(
- self.cache["revision_in_origin"]
- ) or self.storage.revision_visited(revision.id)
-
-
-def normalize(path: bytes) -> bytes:
- return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -69,7 +69,7 @@
assert revision.date is not None
assert revision.root is not None
# Processed content starting from the revision's root directory.
- date = provenance.revision_get_early_date(revision)
+ date = provenance.revision_get_date(revision)
if date is None or revision.date < date:
logging.debug(
f"Processing revisions {revision.id.hex()}"
@@ -93,16 +93,12 @@
)
done = time.time()
if commit:
- provenance.commit()
+ provenance.flush()
stop = time.time()
logging.debug(
f"Revisions {';'.join([revision.id.hex() for revision in revisions])} "
f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
)
- # logging.critical(
- # ";".join([revision.id.hex() for revision in revisions])
- # + f",{stop - start},{stop - done}"
- # )
def revision_process_content(
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -29,7 +29,7 @@
flavor = request.param
populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor)
- from swh.provenance.provenance import ProvenanceBackend
+ from swh.provenance.backend import ProvenanceBackend
BaseDb.adapt_conn(postgresql)
prov = ProvenanceBackend(postgresql)
diff --git a/swh/provenance/tests/test_conftest.py b/swh/provenance/tests/test_conftest.py
--- a/swh/provenance/tests/test_conftest.py
+++ b/swh/provenance/tests/test_conftest.py
@@ -7,7 +7,7 @@
def test_provenance_fixture(provenance):
"""Check the 'provenance' fixture produce a working ProvenanceDB object"""
assert provenance
- provenance.commit() # should be a noop
+ provenance.flush() # should be a noop
def test_storage(swh_storage_with_objects):
diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py
--- a/swh/provenance/tests/test_history_graph.py
+++ b/swh/provenance/tests/test_history_graph.py
@@ -59,4 +59,4 @@
origin_add_revision(provenance, entry, computed_graph)
if not batch:
- provenance.commit()
+ provenance.flush()
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -25,7 +25,7 @@
'cur' is a cursor to the provenance index DB.
"""
cur.execute(f"SELECT sha1 FROM {table}")
- return set(sha1.hex() for (sha1,) in cur.fetchall())
+ return set(row["sha1"].hex() for row in cur.fetchall())
def locations(cur):
@@ -33,8 +33,8 @@
'cur' is a cursor to the provenance index DB.
"""
- cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location")
- return set(x for (x,) in cur.fetchall())
+ cur.execute("SELECT encode(location.path::bytea, 'escape') AS path FROM location")
+ return set(row["path"] for row in cur.fetchall())
def relations(cur, src, dst):
@@ -46,17 +46,17 @@
'cur' is a cursor to the provenance index DB.
"""
relation = f"{src}_in_{dst}"
- cur.execute("select swh_get_dbflavor()")
- with_path = cur.fetchone()[0] == "with-path"
+ cur.execute("SELECT swh_get_dbflavor() AS flavor")
+ with_path = cur.fetchone()["flavor"] == "with-path"
# note that the columns have the same name as the relations they refer to,
# so we can write things like "rel.{dst}=src.id" in the query below
if with_path:
cur.execute(
f"""
- SELECT encode(src.sha1::bytea, 'hex'),
- encode(dst.sha1::bytea, 'hex'),
- encode(location.path::bytea, 'escape')
+ SELECT encode(src.sha1::bytea, 'hex') AS src,
+ encode(dst.sha1::bytea, 'hex') AS dst,
+ encode(location.path::bytea, 'escape') AS path
FROM {relation} as relation
INNER JOIN {src} AS src ON (relation.{src} = src.id)
INNER JOIN {dst} AS dst ON (relation.{dst} = dst.id)
@@ -66,15 +66,15 @@
else:
cur.execute(
f"""
- SELECT encode(src.sha1::bytea, 'hex'),
- encode(dst.sha1::bytea, 'hex'),
- ''
+ SELECT encode(src.sha1::bytea, 'hex') AS src,
+ encode(dst.sha1::bytea, 'hex') AS dst,
+ '' AS path
FROM {relation} as relation
INNER JOIN {src} AS src ON (src.id = relation.{src})
INNER JOIN {dst} AS dst ON (dst.id = relation.{dst})
"""
)
- return set(cur.fetchall())
+ return set((row["src"], row["dst"], row["path"]) for row in cur.fetchall())
def get_timestamp(cur, table, sha1):
@@ -85,7 +85,7 @@
if isinstance(sha1, str):
sha1 = hash_to_bytes(sha1)
cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
- return [date.timestamp() for (date,) in cur.fetchall()]
+ return [row["date"].timestamp() for row in cur.fetchall()]
@pytest.mark.parametrize(
@@ -253,21 +253,25 @@
for rc in synth_rev["R_C"]:
expected_occurrences.setdefault(rc["dst"].hex(), []).append(
- (rev_id, rev_ts, maybe_path(rc["path"]))
+ (rev_id, rev_ts, None, maybe_path(rc["path"]))
)
for dc in synth_rev["D_C"]:
assert dc["prefix"] is not None # to please mypy
expected_occurrences.setdefault(dc["dst"].hex(), []).append(
- (rev_id, rev_ts, maybe_path(dc["prefix"] + "/" + dc["path"]))
+ (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"]))
)
for content_id, results in expected_occurrences.items():
expected = [(content_id, *result) for result in results]
db_occurrences = [
- (blob.hex(), rev.hex(), date.timestamp(), path.decode())
- for blob, rev, date, path in provenance.content_find_all(
- hash_to_bytes(content_id)
+ (
+ occur.content.hex(),
+ occur.revision.hex(),
+ occur.date.timestamp(),
+ occur.origin,
+ occur.path.decode(),
)
+ for occur in provenance.content_find_all(hash_to_bytes(content_id))
]
if provenance.storage.with_path:
# this is not true if the db stores no path, because a same content
@@ -337,11 +341,10 @@
# nothing to do there, this content cannot be a "first seen file"
for content_id, (rev_id, ts, paths) in expected_first.items():
- (r_sha1, r_rev_id, r_ts, r_path) = provenance.content_find_first(
- hash_to_bytes(content_id)
- )
- assert r_sha1.hex() == content_id
- assert r_rev_id.hex() == rev_id
- assert r_ts.timestamp() == ts
+ occur = provenance.content_find_first(hash_to_bytes(content_id))
+ assert occur.content.hex() == content_id
+ assert occur.revision.hex() == rev_id
+ assert occur.date.timestamp() == ts
+ assert occur.origin is None
if provenance.storage.with_path:
- assert r_path.decode() in paths
+ assert occur.path.decode() in paths
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 5:56 PM (11 w, 14 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228507
Attached To
D5946: Rework `ProvenanceInterface` as discussed during backend design
Event Timeline
Log In to Comment