Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/provenance.py
from datetime import datetime | from datetime import datetime | ||||
import logging | import logging | ||||
import os | import os | ||||
from typing import Dict, Generator, Iterable, List, Optional, Set, Tuple | from typing import Dict, Generator, Iterable, List, Optional, Set, Tuple | ||||
import psycopg2 | import psycopg2 | ||||
from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable | from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable | ||||
from swh.model.model import Sha1Git | |||||
from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry | from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry | ||||
# XXX: this protocol doesn't make much sense now that flavours have been delegated to | # XXX: this protocol doesn't make much sense now that flavours have been delegated to | ||||
# another class, lower in the callstack. | # another class, lower in the callstack. | ||||
@runtime_checkable | @runtime_checkable | ||||
class ProvenanceInterface(Protocol): | class ProvenanceInterface(Protocol): | ||||
raise_on_commit: bool = False | raise_on_commit: bool = False | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | class ProvenanceInterface(Protocol): | ||||
) -> Dict[bytes, datetime]: | ) -> Dict[bytes, datetime]: | ||||
... | ... | ||||
def directory_set_date_in_isochrone_frontier(
    self, directory: DirectoryEntry, date: datetime
) -> None:
    """Associate `date` with `directory`.

    NOTE(review): going by the name, this dates the directory as a member of
    the isochrone frontier -- confirm against an implementation.
    """
    ...
def origin_add(self, origin: OriginEntry) -> None:
    """Register `origin` with the provenance store."""
    ...
def revision_add(self, revision: RevisionEntry) -> None:
    """Register `revision` with the provenance store."""
    ...
def revision_add_before_revision(
    self, relative: RevisionEntry, revision: RevisionEntry
) -> None:
    """Record a "before" relation between `relative` and `revision`.

    NOTE(review): the direction of the relation is not visible from this
    stub -- confirm against an implementation.
    """
    ...
def revision_add_to_origin(
    self, origin: OriginEntry, revision: RevisionEntry
) -> None:
    """Record that `revision` belongs to `origin`."""
    ...
def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
    """Return the date known for `revision`, or None when there is none."""
    ...
def revision_get_preferred_origin(
    self, revision: RevisionEntry
) -> Optional[Sha1Git]:
    """Return the id of the preferred origin of `revision`, or None if unset."""
    ...
def revision_in_history(self, revision: RevisionEntry) -> bool:
    """Tell whether `revision` is already recorded in some history.

    NOTE(review): exact meaning of "history" is not visible from this stub.
    """
    ...
def revision_set_preferred_origin(
    self, origin: OriginEntry, revision: RevisionEntry
) -> None:
    """Make `origin` the preferred origin of `revision`."""
    ...
def revision_visited(self, revision: RevisionEntry) -> bool:
    """Tell whether `revision` has already been visited.

    NOTE(review): presumably "visited" means already attached to an origin --
    confirm against an implementation.
    """
    ...
class DatetimeCache(TypedDict):
    """In-memory cache of dates keyed by object id."""

    # object id -> date currently held for that object
    data: Dict[bytes, datetime]
    # ids flagged as added locally, as opposed to merely loaded into `data`
    added: Set[bytes]
class OriginCache(TypedDict):
    """In-memory cache of origins keyed by origin id."""

    # origin id -> origin url
    data: Dict[Sha1Git, str]
    # ids of the origins flagged as added locally
    added: Set[Sha1Git]
class RevisionCache(TypedDict):
    """In-memory cache mapping a revision to its preferred origin."""

    # revision id -> id of the preferred origin
    data: Dict[Sha1Git, Sha1Git]
    # ids of the revisions whose preferred origin was set locally
    added: Set[Sha1Git]
class ProvenanceCache(TypedDict):
    """Every in-memory cache kept by the provenance backend, grouped by layer."""

    # date caches, one per entity kind
    content: DatetimeCache
    directory: DatetimeCache
    revision: DatetimeCache

    # below are insertion caches only
    content_in_revision: Set[Tuple[bytes, bytes, bytes]]
    content_in_directory: Set[Tuple[bytes, bytes, bytes]]
    directory_in_revision: Set[Tuple[bytes, bytes, bytes]]

    # the remaining entries are for the origin layer
    origin: OriginCache
    revision_origin: RevisionCache
    revision_before_revision: Dict[bytes, Set[bytes]]
    # (revision id, origin id) pairs
    revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]]
def new_cache() -> ProvenanceCache:
    """Return a fresh ProvenanceCache in which every sub-cache is empty."""
    return ProvenanceCache(
        # date caches
        content=DatetimeCache(data={}, added=set()),
        directory=DatetimeCache(data={}, added=set()),
        revision=DatetimeCache(data={}, added=set()),
        # insertion-only caches
        content_in_revision=set(),
        content_in_directory=set(),
        directory_in_revision=set(),
        # origin layer
        origin=OriginCache(data={}, added=set()),
        revision_origin=RevisionCache(data={}, added=set()),
        revision_before_revision={},
        revision_in_origin=set(),
    )
# TODO: maybe move this to a separate file | # TODO: maybe move this to a separate file | ||||
class ProvenanceBackend: | class ProvenanceBackend: | ||||
raise_on_commit: bool = False | raise_on_commit: bool = False | ||||
def __init__(self, conn: psycopg2.extensions.connection): | def __init__(self, conn: psycopg2.extensions.connection): | ||||
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines | def get_dates( | ||||
self, entity: Literal["content", "revision", "directory"], ids: List[bytes] | self, entity: Literal["content", "revision", "directory"], ids: List[bytes] | ||||
) -> Dict[bytes, datetime]: | ) -> Dict[bytes, datetime]: | ||||
cache = self.cache[entity] | cache = self.cache[entity] | ||||
missing_ids = set(id for id in ids if id not in cache) | missing_ids = set(id for id in ids if id not in cache) | ||||
if missing_ids: | if missing_ids: | ||||
cache["data"].update(self.storage.get_dates(entity, list(missing_ids))) | cache["data"].update(self.storage.get_dates(entity, list(missing_ids))) | ||||
return {sha1: cache["data"][sha1] for sha1 in ids if sha1 in cache["data"]} | return {sha1: cache["data"][sha1] for sha1 in ids if sha1 in cache["data"]} | ||||
def origin_add(self, origin: OriginEntry) -> None:
    """Cache the url of `origin` under its id and flag the origin as added."""
    origins = self.cache["origin"]
    origins["data"][origin.id] = origin.url
    origins["added"].add(origin.id)
def revision_add(self, revision: RevisionEntry):
    """Cache the date of `revision` under its id and flag the revision as added.

    `revision.date` must not be None.
    """
    assert revision.date is not None
    revisions = self.cache["revision"]
    revisions["data"][revision.id] = revision.date
    revisions["added"].add(revision.id)
def revision_add_before_revision(
    self, relative: RevisionEntry, revision: RevisionEntry
):
    """Add the id of `relative` to the set cached under the id of `revision`."""
    relations = self.cache["revision_before_revision"]
    # Create the set on first use, then extend it.
    relatives = relations.setdefault(revision.id, set())
    relatives.add(relative.id)
def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
    """Cache the (revision id, origin id) pair in "revision_in_origin"."""
    pair = (revision.id, origin.id)
    self.cache["revision_in_origin"].add(pair)
def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
    """Return the date `get_dates` reports for `revision`, or None if absent."""
    known_dates = self.get_dates("revision", [revision.id])
    return known_dates.get(revision.id)
def revision_get_preferred_origin(
    self, revision: RevisionEntry
) -> Optional[Sha1Git]:
    """Return the id of the preferred origin of `revision`, or None if unknown.

    The in-memory cache is consulted first. Storage is queried only on a
    cache miss, and a non-None answer is kept in the cache for later calls.
    """
    # Membership has to be tested on the "data" mapping. The cache entry is a
    # {"data": ..., "added": ...} dict, so testing `revision.id` against the
    # entry itself never matches: storage would be queried on every call and
    # its answer would overwrite a value set by revision_set_preferred_origin.
    known_origins = self.cache["revision_origin"]["data"]
    if revision.id not in known_origins:
        origin = self.storage.revision_get_preferred_origin(revision.id)
        if origin is not None:
            known_origins[revision.id] = origin
    return known_origins.get(revision.id)
def revision_in_history(self, revision: RevisionEntry) -> bool:
    """Tell whether `revision` is in history, per the cache or else storage.

    The revision counts as cached when its id is a key of
    "revision_before_revision"; storage is asked only otherwise.
    """
    if revision.id in self.cache["revision_before_revision"]:
        return True
    return self.storage.revision_in_history(revision.id)
def revision_set_preferred_origin(
    self, origin: OriginEntry, revision: RevisionEntry
):
    """Cache `origin` as the preferred origin of `revision` and flag it as added."""
    preferred = self.cache["revision_origin"]
    preferred["data"][revision.id] = origin.id
    preferred["added"].add(revision.id)
def revision_visited(self, revision: RevisionEntry) -> bool:
    """Tell whether `revision` was visited, per the cache or else storage.

    The revision counts as cached when it is the first member of some pair
    held in "revision_in_origin"; storage is asked only otherwise.
    """
    wanted = revision.id
    cached = any(rev_id == wanted for rev_id, _ in self.cache["revision_in_origin"])
    return cached or self.storage.revision_visited(wanted)
def normalize(path: bytes) -> bytes:
    """Return `path` without one leading "." + path-separator prefix, if present.

    Only a single occurrence of the prefix is removed; any other path is
    returned unchanged.
    """
    prefix = bytes("." + os.path.sep, "utf-8")
    if path.startswith(prefix):
        return path[len(prefix):]
    return path