diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -33,7 +33,7 @@ ) with_path = kwargs.get("with_path") - from swh.provenance.provenance import ProvenanceBackend + from swh.provenance.backend import ProvenanceBackend prov = ProvenanceBackend(conn) if with_path is not None: diff --git a/swh/provenance/provenance.py b/swh/provenance/backend.py copy from swh/provenance/provenance.py copy to swh/provenance/backend.py --- a/swh/provenance/provenance.py +++ b/swh/provenance/backend.py @@ -3,109 +3,13 @@ import os from typing import Dict, Generator, Iterable, List, Optional, Set, Tuple -import psycopg2 -from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable +import psycopg2 # TODO: remove this dependency +from typing_extensions import Literal, TypedDict from swh.model.model import Sha1Git from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry - - -# XXX: this protocol doesn't make much sense now that flavours have been delegated to -# another class, lower in the callstack. -@runtime_checkable -class ProvenanceInterface(Protocol): - raise_on_commit: bool = False - - def commit(self): - """Commit currently ongoing transactions in the backend DB""" - ... - - def content_add_to_directory( - self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes - ) -> None: - ... - - def content_add_to_revision( - self, revision: RevisionEntry, blob: FileEntry, prefix: bytes - ) -> None: - ... - - def content_find_first( - self, id: Sha1Git - ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]: - ... - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]: - ... - - def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: - ... - - def content_get_early_dates( - self, blobs: Iterable[FileEntry] - ) -> Dict[Sha1Git, datetime]: - ... - - def content_set_early_date(self, blob: FileEntry, date: datetime) -> None: - ... - - def directory_add_to_revision( - self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes - ) -> None: - ... - - def directory_get_date_in_isochrone_frontier( - self, directory: DirectoryEntry - ) -> Optional[datetime]: - ... - - def directory_get_dates_in_isochrone_frontier( - self, dirs: Iterable[DirectoryEntry] - ) -> Dict[Sha1Git, datetime]: - ... - - def directory_set_date_in_isochrone_frontier( - self, directory: DirectoryEntry, date: datetime - ) -> None: - ... - - def origin_add(self, origin: OriginEntry) -> None: - ... - - def revision_add(self, revision: RevisionEntry) -> None: - ... - - def revision_add_before_revision( - self, relative: RevisionEntry, revision: RevisionEntry - ) -> None: - ... - - def revision_add_to_origin( - self, origin: OriginEntry, revision: RevisionEntry - ) -> None: - ... - - def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: - ... - - def revision_get_preferred_origin( - self, revision: RevisionEntry - ) -> Optional[Sha1Git]: - ... - - def revision_in_history(self, revision: RevisionEntry) -> bool: - ... - - def revision_set_preferred_origin( - self, origin: OriginEntry, revision: RevisionEntry - ) -> None: - ... - - def revision_visited(self, revision: RevisionEntry) -> bool: - ... +from .provenance import ProvenanceResult class DatetimeCache(TypedDict): @@ -138,7 +42,7 @@ revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]] -def new_cache(): +def new_cache() -> ProvenanceCache: return ProvenanceCache( content=DatetimeCache(data={}, added=set()), directory=DatetimeCache(data={}, added=set()), @@ -153,7 +57,6 @@ ) -# TODO: maybe move this to a separate file class ProvenanceBackend: raise_on_commit: bool = False @@ -173,39 +76,37 @@ self.storage = ProvenanceWithoutPathDB(conn) self.cache: ProvenanceCache = new_cache() - def clear_caches(self): + def clear_caches(self) -> None: self.cache = new_cache() - def commit(self): + def flush(self) -> None: # TODO: for now we just forward the cache. This should be improved! while not self.storage.commit(self.cache, raise_on_commit=self.raise_on_commit): logging.warning( - f"Unable to commit cached information {self.write_cache}. Retrying..." + f"Unable to commit cached information {self.cache}. Retrying..." ) self.clear_caches() def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes - ): + ) -> None: self.cache["content_in_directory"].add( (blob.id, directory.id, normalize(os.path.join(prefix, blob.name))) ) def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes - ): + ) -> None: self.cache["content_in_revision"].add( (blob.id, revision.id, normalize(os.path.join(prefix, blob.name))) ) - def content_find_first( - self, id: Sha1Git - ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]: + def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: return self.storage.content_find_first(id) def content_find_all( self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]: + ) -> Generator[ProvenanceResult, None, None]: yield from self.storage.content_find_all(id, limit=limit) def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: @@ -216,13 +117,13 @@ ) -> Dict[Sha1Git, datetime]: return self.get_dates("content", [blob.id for blob in blobs]) - def content_set_early_date(self, blob: FileEntry, date: datetime): + def content_set_early_date(self, blob: FileEntry, date: datetime) -> None: self.cache["content"]["data"][blob.id] = date self.cache["content"]["added"].add(blob.id) def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes - ): + ) -> None: self.cache["directory_in_revision"].add( (directory.id, revision.id, normalize(path)) ) @@ -239,7 +140,7 @@ def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime - ): + ) -> None: self.cache["directory"]["data"][directory.id] = date self.cache["directory"]["added"].add(directory.id) @@ -260,21 +161,23 @@ self.cache["origin"]["data"][origin.id] = origin.url self.cache["origin"]["added"].add(origin.id) - def revision_add(self, revision: RevisionEntry): + def revision_add(self, revision: RevisionEntry) -> None: self.cache["revision"]["data"][revision.id] = revision.date self.cache["revision"]["added"].add(revision.id) def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry - ): + ) -> None: self.cache["revision_before_revision"].setdefault(revision.id, set()).add( relative.id ) - def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): + def revision_add_to_origin( + self, origin: OriginEntry, revision: RevisionEntry + ) -> None: self.cache["revision_in_origin"].add((revision.id, origin.id)) - def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: + def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]: return self.get_dates("revision", [revision.id]).get(revision.id, None) def revision_get_preferred_origin( @@ -294,7 +197,7 @@ def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry - ): + ) -> None: self.cache["revision_origin"]["data"][revision.id] = origin.id self.cache["revision_origin"]["added"].add(revision.id) diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -52,7 +52,7 @@ graph = build_history_graph(archive, provenance, revision) origin_add_revision(provenance, origin, graph) done = time.time() - provenance.commit() + provenance.flush() stop = time.time() logging.debug( "Origins " diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -1,13 +1,15 @@ from datetime import datetime import itertools import logging -from typing import Any, Dict, Generator, List, Optional, Set, Tuple +from typing import Any, Dict, Generator, List, Mapping, Optional, Set, Tuple import psycopg2 import psycopg2.extras from swh.model.model import Sha1Git +from ..provenance import ProvenanceResult + class ProvenanceDBBase: def __init__(self, conn: psycopg2.extensions.connection): @@ -31,7 +33,7 @@ def with_path(self) -> bool: return self.flavor == "with-path" - def commit(self, data: Dict[str, Any], raise_on_commit: bool = False) -> bool: + def commit(self, data: Mapping[str, Any], raise_on_commit: bool = False) -> bool: try: # First insert entities for entity in ("content", "directory", "revision"): @@ -87,14 +89,12 @@ return False - def content_find_first( - self, id: Sha1Git - ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]: + def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: ... def content_find_all( self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]: + ) -> Generator[ProvenanceResult, None, None]: ... def get_dates(self, entity: str, ids: List[Sha1Git]) -> Dict[Sha1Git, datetime]: diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py --- a/swh/provenance/postgresql/provenancedb_with_path.py +++ b/swh/provenance/postgresql/provenancedb_with_path.py @@ -1,4 +1,3 @@ -from datetime import datetime from typing import Generator, Optional, Set, Tuple import psycopg2 @@ -6,66 +5,80 @@ from swh.model.model import Sha1Git +from ..provenance import ProvenanceResult from .provenancedb_base import ProvenanceDBBase class ProvenanceWithPathDB(ProvenanceDBBase): - def content_find_first( - self, id: Sha1Git - ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]: + def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: self.cursor.execute( """ SELECT C.sha1 AS blob, R.sha1 AS rev, R.date AS date, + O.url AS url, L.path AS path FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content = C.id) - INNER JOIN location as L ON (CR.location = L.id) - INNER JOIN revision as R ON (CR.revision = R.id) + INNER JOIN content_in_revision AS CR ON (CR.content=C.id) + INNER JOIN location as L ON (CR.location=L.id) + INNER JOIN revision as R ON (CR.revision=R.id) + LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s - ORDER BY date, rev, path ASC LIMIT 1 + ORDER BY date, rev, url, path ASC LIMIT 1 """, (id,), ) - return self.cursor.fetchone() + row = self.cursor.fetchone() + if row: + return ProvenanceResult( + content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] + ) + else: + return None def content_find_all( self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]: + ) -> Generator[ProvenanceResult, None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" self.cursor.execute( f""" (SELECT C.sha1 AS blob, R.sha1 AS rev, R.date AS date, + O.url AS url, L.path AS path FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content = C.id) - INNER JOIN location AS L ON (CR.location = L.id) - INNER JOIN revision AS R ON (CR.revision = R.id) + INNER JOIN content_in_revision AS CR ON (CR.content=C.id) + INNER JOIN location AS L ON (CR.location=L.id) + INNER JOIN revision AS R ON (CR.revision=R.id) + LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s) UNION (SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, + O.url AS url, CASE DL.path WHEN '' THEN CL.path WHEN '.' THEN CL.path ELSE (DL.path || '/' || CL.path)::unix_path END AS path FROM content AS C - INNER JOIN content_in_directory AS CD ON (C.id = CD.content) - INNER JOIN directory_in_revision AS DR ON (CD.directory = DR.directory) - INNER JOIN revision AS R ON (DR.revision = R.id) - INNER JOIN location AS CL ON (CD.location = CL.id) - INNER JOIN location AS DL ON (DR.location = DL.id) + INNER JOIN content_in_directory AS CD ON (C.id=CD.content) + INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) + INNER JOIN revision AS R ON (DR.revision=R.id) + INNER JOIN location AS CL ON (CD.location=CL.id) + INNER JOIN location AS DL ON (DR.location=DL.id) + LEFT JOIN origin AS O ON (R.origin=O.id) WHERE C.sha1=%s) - ORDER BY date, rev, path {early_cut} + ORDER BY date, rev, url, path {early_cut} """, (id, id), ) - yield from self.cursor.fetchall() + for row in self.cursor.fetchall(): + yield ProvenanceResult( + content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] + ) def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]): """Insert entries in `relation` from `data` diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py --- a/swh/provenance/postgresql/provenancedb_without_path.py +++ b/swh/provenance/postgresql/provenancedb_without_path.py @@ -1,4 +1,3 @@ -from datetime import datetime from typing import Generator, Optional, Set, Tuple import psycopg2 @@ -6,58 +5,72 @@ from swh.model.model import Sha1Git +from ..provenance import ProvenanceResult from .provenancedb_base import ProvenanceDBBase class ProvenanceWithoutPathDB(ProvenanceDBBase): - def content_find_first( - self, id: Sha1Git - ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]: + def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: self.cursor.execute( """ SELECT C.sha1 AS blob, R.sha1 AS rev, R.date AS date, + O.url AS url, '\\x'::bytea as path FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content = C.id) - INNER JOIN revision as R ON (CR.revision = R.id) + INNER JOIN content_in_revision AS CR ON (CR.content=C.id) + INNER JOIN revision as R ON (CR.revision=R.id) + LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s - ORDER BY date, rev ASC LIMIT 1 + ORDER BY date, rev, url ASC LIMIT 1 """, (id,), ) - return self.cursor.fetchone() + row = self.cursor.fetchone() + if row: + return ProvenanceResult( + content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] + ) + else: + return None def content_find_all( self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]: + ) -> Generator[ProvenanceResult, None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" self.cursor.execute( f""" (SELECT C.sha1 AS blob, R.sha1 AS rev, R.date AS date, + O.url AS url, '\\x'::bytea as path FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content = C.id) - INNER JOIN revision AS R ON (CR.revision = R.id) + INNER JOIN content_in_revision AS CR ON (CR.content=C.id) + INNER JOIN revision AS R ON (CR.revision=R.id) + LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s) UNION (SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, + O.url AS url, '\\x'::bytea as path FROM content AS C - INNER JOIN content_in_directory AS CD ON (C.id = CD.content) - INNER JOIN directory_in_revision AS DR ON (CD.directory = DR.directory) - INNER JOIN revision AS R ON (DR.revision = R.id) + INNER JOIN content_in_directory AS CD ON (C.id=CD.content) + INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) + INNER JOIN revision AS R ON (DR.revision=R.id) + LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s) - ORDER BY date, rev, path {early_cut} + ORDER BY date, rev, url {early_cut} """, (id, id), ) - yield from self.cursor.fetchall() + for row in self.cursor.fetchall(): + yield ProvenanceResult( + content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] + ) def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]): if data: diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -1,24 +1,27 @@ from datetime import datetime -import logging -import os -from typing import Dict, Generator, Iterable, List, Optional, Set, Tuple +from typing import Dict, Generator, Iterable, Optional -import psycopg2 -from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable +from typing_extensions import Protocol, TypedDict, runtime_checkable from swh.model.model import Sha1Git from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry -# XXX: this protocol doesn't make much sense now that flavours have been delegated to -# another class, lower in the callstack. +class ProvenanceResult(TypedDict): + content: Sha1Git + revision: Sha1Git + date: datetime + origin: Optional[str] + path: bytes + + @runtime_checkable class ProvenanceInterface(Protocol): raise_on_commit: bool = False - def commit(self): - """Commit currently ongoing transactions in the backend DB""" + def flush(self) -> None: + """Flush internal cache to the underlying `ProvenanceStorageInterface`""" ... def content_add_to_directory( @@ -31,14 +34,12 @@ ) -> None: ... - def content_find_first( - self, id: Sha1Git - ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]: + def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: ... def content_find_all( self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]: + ) -> Generator[ProvenanceResult, None, None]: ... def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: @@ -79,7 +80,7 @@ ... def revision_add_before_revision( - self, relative: RevisionEntry, revision: RevisionEntry + self, after: RevisionEntry, before: RevisionEntry ) -> None: ... @@ -88,7 +89,7 @@ ) -> None: ... - def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: + def revision_get_date(self, revision: RevisionEntry) -> Optional[datetime]: ... def revision_get_preferred_origin( @@ -106,203 +107,3 @@ def revision_visited(self, revision: RevisionEntry) -> bool: ... - - -class DatetimeCache(TypedDict): - data: Dict[Sha1Git, Optional[datetime]] - added: Set[Sha1Git] - - -class OriginCache(TypedDict): - data: Dict[Sha1Git, str] - added: Set[Sha1Git] - - -class RevisionCache(TypedDict): - data: Dict[Sha1Git, Sha1Git] - added: Set[Sha1Git] - - -class ProvenanceCache(TypedDict): - content: DatetimeCache - directory: DatetimeCache - revision: DatetimeCache - # below are insertion caches only - content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]] - content_in_directory: Set[Tuple[Sha1Git, Sha1Git, bytes]] - directory_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]] - # these two are for the origin layer - origin: OriginCache - revision_origin: RevisionCache - revision_before_revision: Dict[Sha1Git, Set[Sha1Git]] - revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]] - - -def new_cache(): - return ProvenanceCache( - content=DatetimeCache(data={}, added=set()), - directory=DatetimeCache(data={}, added=set()), - revision=DatetimeCache(data={}, added=set()), - content_in_revision=set(), - content_in_directory=set(), - directory_in_revision=set(), - origin=OriginCache(data={}, added=set()), - revision_origin=RevisionCache(data={}, added=set()), - revision_before_revision={}, - revision_in_origin=set(), - ) - - -# TODO: maybe move this to a separate file -class ProvenanceBackend: - raise_on_commit: bool = False - - def __init__(self, conn: psycopg2.extensions.connection): - from .postgresql.provenancedb_base import ProvenanceDBBase - - # TODO: this class should not know what the actual used DB is. - self.storage: ProvenanceDBBase - flavor = ProvenanceDBBase(conn).flavor - if flavor == "with-path": - from .postgresql.provenancedb_with_path import ProvenanceWithPathDB - - self.storage = ProvenanceWithPathDB(conn) - else: - from .postgresql.provenancedb_without_path import ProvenanceWithoutPathDB - - self.storage = ProvenanceWithoutPathDB(conn) - self.cache: ProvenanceCache = new_cache() - - def clear_caches(self): - self.cache = new_cache() - - def commit(self): - # TODO: for now we just forward the cache. This should be improved! - while not self.storage.commit(self.cache, raise_on_commit=self.raise_on_commit): - logging.warning( - f"Unable to commit cached information {self.write_cache}. Retrying..." - ) - self.clear_caches() - - def content_add_to_directory( - self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes - ): - self.cache["content_in_directory"].add( - (blob.id, directory.id, normalize(os.path.join(prefix, blob.name))) - ) - - def content_add_to_revision( - self, revision: RevisionEntry, blob: FileEntry, prefix: bytes - ): - self.cache["content_in_revision"].add( - (blob.id, revision.id, normalize(os.path.join(prefix, blob.name))) - ) - - def content_find_first( - self, id: Sha1Git - ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]: - return self.storage.content_find_first(id) - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]: - yield from self.storage.content_find_all(id, limit=limit) - - def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: - return self.get_dates("content", [blob.id]).get(blob.id, None) - - def content_get_early_dates( - self, blobs: Iterable[FileEntry] - ) -> Dict[Sha1Git, datetime]: - return self.get_dates("content", [blob.id for blob in blobs]) - - def content_set_early_date(self, blob: FileEntry, date: datetime): - self.cache["content"]["data"][blob.id] = date - self.cache["content"]["added"].add(blob.id) - - def directory_add_to_revision( - self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes - ): - self.cache["directory_in_revision"].add( - (directory.id, revision.id, normalize(path)) - ) - - def directory_get_date_in_isochrone_frontier( - self, directory: DirectoryEntry - ) -> Optional[datetime]: - return self.get_dates("directory", [directory.id]).get(directory.id, None) - - def directory_get_dates_in_isochrone_frontier( - self, dirs: Iterable[DirectoryEntry] - ) -> Dict[Sha1Git, datetime]: - return self.get_dates("directory", [directory.id for directory in dirs]) - - def directory_set_date_in_isochrone_frontier( - self, directory: DirectoryEntry, date: datetime - ): - self.cache["directory"]["data"][directory.id] = date - self.cache["directory"]["added"].add(directory.id) - - def get_dates( - self, entity: Literal["content", "revision", "directory"], ids: List[Sha1Git] - ) -> Dict[Sha1Git, datetime]: - cache = self.cache[entity] - missing_ids = set(id for id in ids if id not in cache) - if missing_ids: - cache["data"].update(self.storage.get_dates(entity, list(missing_ids))) - return { - sha1: date - for sha1, date in cache["data"].items() - if sha1 in ids and date is not None - } - - def origin_add(self, origin: OriginEntry) -> None: - self.cache["origin"]["data"][origin.id] = origin.url - self.cache["origin"]["added"].add(origin.id) - - def revision_add(self, revision: RevisionEntry): - self.cache["revision"]["data"][revision.id] = revision.date - self.cache["revision"]["added"].add(revision.id) - - def revision_add_before_revision( - self, relative: RevisionEntry, revision: RevisionEntry - ): - self.cache["revision_before_revision"].setdefault(revision.id, set()).add( - relative.id - ) - - def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): - self.cache["revision_in_origin"].add((revision.id, origin.id)) - - def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: - return self.get_dates("revision", [revision.id]).get(revision.id, None) - - def revision_get_preferred_origin( - self, revision: RevisionEntry - ) -> Optional[Sha1Git]: - cache = self.cache["revision_origin"] - if revision.id not in cache: - origin = self.storage.revision_get_preferred_origin(revision.id) - if origin is not None: - cache["data"][revision.id] = origin - return cache["data"].get(revision.id) - - def revision_in_history(self, revision: RevisionEntry) -> bool: - return revision.id in self.cache[ - "revision_before_revision" - ] or self.storage.revision_in_history(revision.id) - - def revision_set_preferred_origin( - self, origin: OriginEntry, revision: RevisionEntry - ): - self.cache["revision_origin"]["data"][revision.id] = origin.id - self.cache["revision_origin"]["added"].add(revision.id) - - def revision_visited(self, revision: RevisionEntry) -> bool: - return revision.id in dict( - self.cache["revision_in_origin"] - ) or self.storage.revision_visited(revision.id) - - -def normalize(path: bytes) -> bytes: - return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -69,7 +69,7 @@ assert revision.date is not None assert revision.root is not None # Processed content starting from the revision's root directory. - date = provenance.revision_get_early_date(revision) + date = provenance.revision_get_date(revision) if date is None or revision.date < date: logging.debug( f"Processing revisions {revision.id.hex()}" @@ -93,16 +93,12 @@ ) done = time.time() if commit: - provenance.commit() + provenance.flush() stop = time.time() logging.debug( f"Revisions {';'.join([revision.id.hex() for revision in revisions])} " f" were processed in {stop - start} secs (commit took {stop - done} secs)!" ) - # logging.critical( - # ";".join([revision.id.hex() for revision in revisions]) - # + f",{stop - start},{stop - done}" - # ) def revision_process_content( diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -29,7 +29,7 @@ flavor = request.param populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor) - from swh.provenance.provenance import ProvenanceBackend + from swh.provenance.backend import ProvenanceBackend BaseDb.adapt_conn(postgresql) prov = ProvenanceBackend(postgresql) diff --git a/swh/provenance/tests/test_conftest.py b/swh/provenance/tests/test_conftest.py --- a/swh/provenance/tests/test_conftest.py +++ b/swh/provenance/tests/test_conftest.py @@ -7,7 +7,7 @@ def test_provenance_fixture(provenance): """Check the 'provenance' fixture produce a working ProvenanceDB object""" assert provenance - provenance.commit() # should be a noop + provenance.flush() # should be a noop def test_storage(swh_storage_with_objects): diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py --- a/swh/provenance/tests/test_history_graph.py +++ b/swh/provenance/tests/test_history_graph.py @@ -59,4 +59,4 @@ origin_add_revision(provenance, entry, computed_graph) if not batch: - provenance.commit() + provenance.flush() diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -253,21 +253,25 @@ for rc in synth_rev["R_C"]: expected_occurrences.setdefault(rc["dst"].hex(), []).append( - (rev_id, rev_ts, maybe_path(rc["path"])) + (rev_id, rev_ts, None, maybe_path(rc["path"])) ) for dc in synth_rev["D_C"]: assert dc["prefix"] is not None # to please mypy expected_occurrences.setdefault(dc["dst"].hex(), []).append( - (rev_id, rev_ts, maybe_path(dc["prefix"] + "/" + dc["path"])) + (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) ) for content_id, results in expected_occurrences.items(): expected = [(content_id, *result) for result in results] db_occurrences = [ - (blob.hex(), rev.hex(), date.timestamp(), path.decode()) - for blob, rev, date, path in provenance.content_find_all( - hash_to_bytes(content_id) + ( + res["content"].hex(), + res["revision"].hex(), + res["date"].timestamp(), + res["origin"], + res["path"].decode(), ) + for res in provenance.content_find_all(hash_to_bytes(content_id)) ] if provenance.storage.with_path: # this is not true if the db stores no path, because a same content @@ -337,11 +341,10 @@ # nothing to do there, this content cannot be a "first seen file" for content_id, (rev_id, ts, paths) in expected_first.items(): - (r_sha1, r_rev_id, r_ts, r_path) = provenance.content_find_first( - hash_to_bytes(content_id) - ) - assert r_sha1.hex() == content_id - assert r_rev_id.hex() == rev_id - assert r_ts.timestamp() == ts + result = provenance.content_find_first(hash_to_bytes(content_id)) + assert result["content"].hex() == content_id + assert result["revision"].hex() == rev_id + assert result["date"].timestamp() == ts + assert result["origin"] is None if provenance.storage.with_path: - assert r_path.decode() in paths + assert result["path"].decode() in paths