diff --git a/swh/provenance/archive.py b/swh/provenance/archive.py
--- a/swh/provenance/archive.py
+++ b/swh/provenance/archive.py
@@ -2,12 +2,12 @@
 from typing_extensions import Protocol, runtime_checkable
 
-from swh.model.model import Revision, Sha1
+from swh.model.model import Revision, Sha1Git
 
 
 @runtime_checkable
 class ArchiveInterface(Protocol):
-    def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+    def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
        """List entries for one directory.
 
        Args:
@@ -19,7 +19,7 @@
        """
        ...
 
-    def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+    def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
        """Given a list of sha1, return the revisions' information
 
        Args:
@@ -32,11 +32,11 @@
        """
        ...
 
-    def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+    def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
        """List all revisions pointed by one snapshot.
 
        Args:
-            snapshot: the snapshot's identifier
+            id: sha1 id of the snapshot.
 
        Yields:
            sha1 ids of found revisions.
diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py
--- a/swh/provenance/graph.py
+++ b/swh/provenance/graph.py
@@ -3,6 +3,8 @@
 import os
 from typing import Dict, Optional, Set
 
+from swh.model.model import Sha1Git
+
 from .archive import ArchiveInterface
 from .model import DirectoryEntry, RevisionEntry
 from .provenance import ProvenanceInterface
@@ -166,7 +168,7 @@
     logging.debug(
         f"Recursively creating isochrone graph for revision {revision.id.hex()}..."
     )
-    fdates: Dict[bytes, datetime] = {}  # map {file_id: date}
+    fdates: Dict[Sha1Git, datetime] = {}  # map {file_id: date}
     while stack:
         current = stack.pop()
         if current.dbdate is None or current.dbdate > revision.date:
diff --git a/swh/provenance/model.py b/swh/provenance/model.py
--- a/swh/provenance/model.py
+++ b/swh/provenance/model.py
@@ -6,14 +6,17 @@
 from datetime import datetime
 from typing import Iterable, Iterator, List, Optional
 
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import origin_identifier
+from swh.model.model import Sha1Git
+
 from .archive import ArchiveInterface
 
 
 class OriginEntry:
-    def __init__(self, url: str, date: datetime, snapshot: bytes):
+    def __init__(self, url: str, snapshot: Sha1Git):
         self.url = url
-        # TODO: this is probably not needed and will be removed!
-        # self.date = date
+        self.id: Sha1Git = hash_to_bytes(origin_identifier({"url": self.url}))
         self.snapshot = snapshot
         self._revisions: Optional[List[RevisionEntry]] = None
@@ -33,19 +36,16 @@
         return (x for x in self._revisions)
 
     def __str__(self):
-        return (
-            f"<OriginEntry url={self.url}, snapshot={self.snapshot.hex()}>"
-        )
+        return f"<OriginEntry url={self.url}, snapshot={self.snapshot.hex()}>"
 
 
 class RevisionEntry:
     def __init__(
         self,
-        id: bytes,
+        id: Sha1Git,
         date: Optional[datetime] = None,
-        root: Optional[bytes] = None,
-        parents: Optional[Iterable[bytes]] = None,
+        root: Optional[Sha1Git] = None,
+        parents: Optional[Iterable[Sha1Git]] = None,
     ):
         self.id = id
         self.date = date
@@ -91,7 +91,7 @@
 
 
 class DirectoryEntry:
-    def __init__(self, id: bytes, name: bytes = b""):
+    def __init__(self, id: Sha1Git, name: bytes = b""):
         self.id = id
         self.name = name
         self._files: Optional[List[FileEntry]] = None
@@ -141,7 +141,7 @@
 
 
 class FileEntry:
-    def __init__(self, id: bytes, name: bytes):
+    def __init__(self, id: Sha1Git, name: bytes):
         self.id = id
         self.name = name
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -1,10 +1,9 @@
-from datetime import datetime, timezone
 from itertools import islice
 import logging
 import time
 from typing import Iterable, Iterator, List, Optional, Tuple
 
-import iso8601
+from swh.model.model import Sha1Git
 
 from .archive import ArchiveInterface
 from .graph import HistoryNode, build_history_graph
@@ -16,31 +15,28 @@
    """Iterator over origin visit statuses typically present in the given CSV
    file.
 
-    The input is an iterator that produces 3 elements per row:
+    The input is an iterator that produces 2 elements per row:
 
-        (url, date, snap)
+        (url, snap)
 
    where:
    - url: is the origin url of the visit
-    - date: is the date of the visit
    - snap: sha1_git of the snapshot pointed by the visit status
    """
 
    def __init__(
        self,
-        statuses: Iterable[Tuple[str, datetime, bytes]],
+        statuses: Iterable[Tuple[str, Sha1Git]],
        limit: Optional[int] = None,
    ):
-        self.statuses: Iterator[Tuple[str, datetime, bytes]]
+        self.statuses: Iterator[Tuple[str, Sha1Git]]
        if limit is not None:
            self.statuses = islice(statuses, limit)
        else:
            self.statuses = iter(statuses)
 
    def __iter__(self):
-        for url, date, snap in self.statuses:
-            date = iso8601.parse_date(date, default_timezone=timezone.utc)
-            yield OriginEntry(url, date, snap)
+        return (OriginEntry(url, snapshot) for url, snapshot in self.statuses)
 
 
 def origin_add(
@@ -50,6 +46,7 @@
 ):
     start = time.time()
     for origin in origins:
+        provenance.origin_add(origin)
         origin.retrieve_revisions(archive)
         for revision in origin.revisions:
             graph = build_history_graph(archive, provenance, revision)
@@ -59,7 +56,7 @@
     stop = time.time()
     logging.debug(
         "Origins "
-        ";".join([origin.url + ":" + origin.snapshot.hex() for origin in origins])
+        + ";".join([origin.id.hex() + ":" + origin.snapshot.hex() for origin in origins])
         + f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
     )
diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py
--- a/swh/provenance/postgresql/archive.py
+++ b/swh/provenance/postgresql/archive.py
@@ -3,7 +3,7 @@
 from methodtools import lru_cache
 import psycopg2
 
-from swh.model.model import ObjectType, Revision, Sha1, TargetType
+from swh.model.model import ObjectType, Revision, Sha1Git, TargetType
 from swh.storage.postgresql.storage import Storage
 
 
@@ -12,14 +12,14 @@
         self.conn = conn
         self.storage = Storage(conn, objstorage={"cls": "memory"})
 
-    def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+    def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
         # TODO: only call directory_ls_internal if the id is not being queried by
         # someone else. Otherwise wait until results get properly cached.
         entries = self.directory_ls_internal(id)
         yield from entries
 
     @lru_cache(maxsize=100000)
-    def directory_ls_internal(self, id: Sha1) -> List[Dict[str, Any]]:
+    def directory_ls_internal(self, id: Sha1Git) -> List[Dict[str, Any]]:
         # TODO: add file size filtering
         with self.conn.cursor() as cursor:
             cursor.execute(
@@ -62,7 +62,7 @@
             for row in cursor.fetchall()
         ]
 
-    def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+    def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
         with self.conn.cursor() as cursor:
             psycopg2.extras.execute_values(
                 cursor,
@@ -96,7 +96,7 @@
                 }
             )
 
-    def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+    def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
         # TODO: this code is duplicated here (same as in swh.provenance.storage.archive)
         # but it's just temporary. This method should actually perform a direct query to
         # the SQL db of the archive.
@@ -123,7 +123,7 @@
             if release is not None and release.target_type == ObjectType.REVISION
         )
 
-        revisions: Set[Sha1] = set()
+        revisions: Set[Sha1Git] = set()
         for targets in grouper(targets_set, batchsize):
             revisions.update(
                 revision.id
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -6,6 +6,8 @@
 import psycopg2
 import psycopg2.extras
 
+from swh.model.model import Sha1Git
+
 
 class ProvenanceDBBase:
     def __init__(self, conn: psycopg2.extensions.connection):
@@ -51,6 +53,16 @@
         ):
             self.insert_relation(relation, data[relation])
 
+        # Insert origins
+        self.insert_origin(
+            {
+                sha1: data["origin"]["data"][sha1]
+                for sha1 in data["origin"]["added"]
+            },
+        )
+        data["origin"]["data"].clear()
+        data["origin"]["added"].clear()
+
         # Insert relations from the origin-revision layer
         self.insert_origin_head(data["revision_in_origin"])
         self.insert_revision_history(data["revision_before_revision"])
@@ -58,12 +70,12 @@
         # Update preferred origins
         self.update_preferred_origin(
             {
-                sha1: data["revision_preferred_origin"]["data"][sha1]
-                for sha1 in data["revision_preferred_origin"]["added"]
+                sha1: data["revision_origin"]["data"][sha1]
+                for sha1 in data["revision_origin"]["added"]
             }
         )
-        data["revision_preferred_origin"]["data"].clear()
-        data["revision_preferred_origin"]["added"].clear()
+        data["revision_origin"]["data"].clear()
+        data["revision_origin"]["added"].clear()
 
         return True
 
@@ -76,16 +88,16 @@
         return False
 
     def content_find_first(
-        self, blob: bytes
-    ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+        self, id: Sha1Git
+    ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
         ...
 
     def content_find_all(
-        self, blob: bytes, limit: Optional[int] = None
-    ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+        self, id: Sha1Git, limit: Optional[int] = None
+    ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
         ...
 
-    def get_dates(self, entity: str, ids: List[bytes]) -> Dict[bytes, datetime]:
+    def get_dates(self, entity: str, ids: List[Sha1Git]) -> Dict[Sha1Git, datetime]:
         dates = {}
         if ids:
             values = ", ".join(itertools.repeat("%s", len(ids)))
@@ -96,7 +108,7 @@
             dates.update(self.cursor.fetchall())
         return dates
 
-    def insert_entity(self, entity: str, data: Dict[bytes, datetime]):
+    def insert_entity(self, entity: str, data: Dict[Sha1Git, datetime]):
         if data:
             psycopg2.extras.execute_values(
                 self.cursor,
@@ -112,7 +124,22 @@
         # This might be useless!
         data.clear()
 
-    def insert_origin_head(self, data: Set[Tuple[bytes, str]]):
+    def insert_origin(self, data: Dict[Sha1Git, str]):
+        if data:
+            psycopg2.extras.execute_values(
+                self.cursor,
+                """
+                LOCK TABLE ONLY origin;
+                INSERT INTO origin(sha1, url) VALUES %s
+                ON CONFLICT DO NOTHING
+                """,
+                data.items(),
+            )
+            # XXX: not sure if Python takes a reference or a copy.
+            # This might be useless!
+            data.clear()
+
+    def insert_origin_head(self, data: Set[Tuple[Sha1Git, Sha1Git]]):
         if data:
             psycopg2.extras.execute_values(
                 self.cursor,
@@ -123,16 +150,16 @@
                 SELECT R.id, O.id
                 FROM (VALUES %s) AS V(rev, org)
                 INNER JOIN revision AS R on (R.sha1=V.rev)
-                INNER JOIN origin AS O on (O.url=V.org::unix_path)
+                INNER JOIN origin AS O on (O.sha1=V.org)
                """,
                data,
            )
            data.clear()
 
-    def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+    def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
        ...
 
-    def insert_revision_history(self, data: Dict[bytes, bytes]):
+    def insert_revision_history(self, data: Dict[Sha1Git, Sha1Git]):
        if data:
            values = [[(prev, next) for next in data[prev]] for prev in data]
            psycopg2.extras.execute_values(
@@ -150,10 +177,10 @@
            )
            data.clear()
 
-    def revision_get_preferred_origin(self, revision: bytes) -> Optional[str]:
+    def revision_get_preferred_origin(self, revision: Sha1Git) -> Optional[Sha1Git]:
        self.cursor.execute(
            """
-            SELECT O.url
+            SELECT O.sha1
            FROM revision AS R
            JOIN origin as O
              ON R.origin=O.id
@@ -161,9 +188,9 @@
            (revision,),
        )
        row = self.cursor.fetchone()
-        return str(row[0], encoding="utf-8") if row is not None else None
+        return row[0] if row is not None else None
 
-    def revision_in_history(self, revision: bytes) -> bool:
+    def revision_in_history(self, revision: Sha1Git) -> bool:
        self.cursor.execute(
            """
            SELECT 1
@@ -176,7 +203,7 @@
        )
        return self.cursor.fetchone() is not None
 
-    def revision_visited(self, revision: bytes) -> bool:
+    def revision_visited(self, revision: Sha1Git) -> bool:
        self.cursor.execute(
            """
            SELECT 1
@@ -189,18 +216,18 @@
        )
        return self.cursor.fetchone() is not None
 
-    def update_preferred_origin(self, data: Dict[bytes, str]):
+    def update_preferred_origin(self, data: Dict[Sha1Git, Sha1Git]):
        if data:
            # XXX: this is assuming the revision already exists in the db! It should
            # be improved by allowing null dates in the revision table.
            psycopg2.extras.execute_values(
                self.cursor,
                """
-                UPDATE revision
+                UPDATE revision R
                SET origin=O.id
                FROM (VALUES %s) AS V(rev, org)
-                INNER JOIN origin AS O on (O.url=V.org::unix_path)
-                WHERE sha1=V.rev
+                INNER JOIN origin AS O on (O.sha1=V.org)
+                WHERE R.sha1=V.rev
                """,
                data.items(),
            )
diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py
--- a/swh/provenance/postgresql/provenancedb_with_path.py
+++ b/swh/provenance/postgresql/provenancedb_with_path.py
@@ -4,13 +4,15 @@
 import psycopg2
 import psycopg2.extras
 
+from swh.model.model import Sha1Git
+
 from .provenancedb_base import ProvenanceDBBase
 
 
 class ProvenanceWithPathDB(ProvenanceDBBase):
     def content_find_first(
-        self, blob: bytes
-    ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+        self, id: Sha1Git
+    ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
         self.cursor.execute(
             """
             SELECT C.sha1 AS blob,
@@ -24,13 +26,13 @@
             WHERE C.sha1=%s
             ORDER BY date, rev, path ASC LIMIT 1
             """,
-            (blob,),
+            (id,),
         )
         return self.cursor.fetchone()
 
     def content_find_all(
-        self, blob: bytes, limit: Optional[int] = None
-    ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+        self, id: Sha1Git, limit: Optional[int] = None
+    ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
         early_cut = f"LIMIT {limit}" if limit is not None else ""
         self.cursor.execute(
             f"""
@@ -61,11 +63,11 @@
             WHERE C.sha1=%s)
             ORDER BY date, rev, path {early_cut}
             """,
-            (blob, blob),
+            (id, id),
         )
         yield from self.cursor.fetchall()
 
-    def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+    def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
        """Insert entries in `relation` from `data`
 
        Also insert missing location entries in the 'location' table.
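Note on the identifier switch above: an origin's sha1 is now intrinsic, derived from its URL alone, so it stays stable across visits and can replace the url in the relation tables. A minimal sketch of the computation, mirroring OriginEntry.__init__ from this diff (the URL below is a made-up example):

    from swh.model.hashutil import hash_to_bytes, hash_to_hex
    from swh.model.identifiers import origin_identifier

    url = "https://example.com/repo.git"  # hypothetical origin URL
    # Same expression as OriginEntry.__init__ above: the id depends only on
    # the URL, never on a visit date, so repeated visits map to one origin row.
    origin_sha1 = hash_to_bytes(origin_identifier({"url": url}))
    print(hash_to_hex(origin_sha1))  # hex form, as stored in origin.sha1

This value is what insert_origin writes to the new origin.sha1 column, and what insert_origin_head and update_preferred_origin now join on instead of origin.url.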
diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py
--- a/swh/provenance/postgresql/provenancedb_without_path.py
+++ b/swh/provenance/postgresql/provenancedb_without_path.py
@@ -4,17 +4,15 @@
 import psycopg2
 import psycopg2.extras
 
-from .provenancedb_base import ProvenanceDBBase
+from swh.model.model import Sha1Git
 
-########################################################################################
-########################################################################################
-########################################################################################
+from .provenancedb_base import ProvenanceDBBase
 
 
 class ProvenanceWithoutPathDB(ProvenanceDBBase):
     def content_find_first(
-        self, blob: bytes
-    ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+        self, id: Sha1Git
+    ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
         self.cursor.execute(
             """
             SELECT C.sha1 AS blob,
@@ -27,13 +25,13 @@
             WHERE C.sha1=%s
             ORDER BY date, rev ASC LIMIT 1
             """,
-            (blob,),
+            (id,),
         )
         return self.cursor.fetchone()
 
     def content_find_all(
-        self, blob: bytes, limit: Optional[int] = None
-    ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+        self, id: Sha1Git, limit: Optional[int] = None
+    ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
         early_cut = f"LIMIT {limit}" if limit is not None else ""
         self.cursor.execute(
             f"""
@@ -57,11 +55,11 @@
             WHERE C.sha1=%s)
             ORDER BY date, rev, path {early_cut}
             """,
-            (blob, blob),
+            (id, id),
         )
         yield from self.cursor.fetchall()
 
-    def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+    def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
         if data:
             assert relation in (
                 "content_in_revision",
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -6,6 +6,8 @@
 import psycopg2
 from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable
 
+from swh.model.model import Sha1Git
+
 from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry
 
 
@@ -30,13 +32,13 @@
         ...
 
     def content_find_first(
-        self, blob: bytes
-    ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+        self, id: Sha1Git
+    ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
         ...
 
     def content_find_all(
-        self, blob: bytes, limit: Optional[int] = None
-    ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+        self, id: Sha1Git, limit: Optional[int] = None
+    ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
         ...
 
     def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
@@ -44,7 +46,7 @@
 
     def content_get_early_dates(
         self, blobs: Iterable[FileEntry]
-    ) -> Dict[bytes, datetime]:
+    ) -> Dict[Sha1Git, datetime]:
         ...
 
     def content_set_early_date(self, blob: FileEntry, date: datetime) -> None:
@@ -62,7 +64,7 @@
 
     def directory_get_dates_in_isochrone_frontier(
         self, dirs: Iterable[DirectoryEntry]
-    ) -> Dict[bytes, datetime]:
+    ) -> Dict[Sha1Git, datetime]:
         ...
 
     def directory_set_date_in_isochrone_frontier(
@@ -70,6 +72,9 @@
     ) -> None:
         ...
 
+    def origin_add(self, origin: OriginEntry) -> None:
+        ...
+
     def revision_add(self, revision: RevisionEntry) -> None:
         ...
 
@@ -86,7 +91,9 @@
     def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
         ...
 
-    def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]:
+    def revision_get_preferred_origin(
+        self, revision: RevisionEntry
+    ) -> Optional[Sha1Git]:
         ...
 
     def revision_in_history(self, revision: RevisionEntry) -> bool:
@@ -102,13 +109,18 @@
 
 
 class DatetimeCache(TypedDict):
-    data: Dict[bytes, datetime]
-    added: Set[bytes]
+    data: Dict[Sha1Git, datetime]
+    added: Set[Sha1Git]
 
 
 class OriginCache(TypedDict):
-    data: Dict[bytes, str]
-    added: Set[bytes]
+    data: Dict[Sha1Git, str]
+    added: Set[Sha1Git]
+
+
+class RevisionCache(TypedDict):
+    data: Dict[Sha1Git, Sha1Git]
+    added: Set[Sha1Git]
 
 
 class ProvenanceCache(TypedDict):
@@ -116,13 +128,14 @@
     directory: DatetimeCache
     revision: DatetimeCache
     # below are insertion caches only
-    content_in_revision: Set[Tuple[bytes, bytes, bytes]]
-    content_in_directory: Set[Tuple[bytes, bytes, bytes]]
-    directory_in_revision: Set[Tuple[bytes, bytes, bytes]]
+    content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
+    content_in_directory: Set[Tuple[Sha1Git, Sha1Git, bytes]]
+    directory_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
     # these two are for the origin layer
-    revision_before_revision: Dict[bytes, Set[bytes]]
-    revision_in_origin: Set[Tuple[bytes, str]]
-    revision_preferred_origin: OriginCache
+    origin: OriginCache
+    revision_origin: RevisionCache
+    revision_before_revision: Dict[Sha1Git, Set[Sha1Git]]
+    revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]]
 
 
 def new_cache():
@@ -133,9 +146,10 @@
         content_in_revision=set(),
         content_in_directory=set(),
         directory_in_revision=set(),
+        origin=OriginCache(data={}, added=set()),
+        revision_origin=RevisionCache(data={}, added=set()),
         revision_before_revision={},
         revision_in_origin=set(),
-        revision_preferred_origin=OriginCache(data={}, added=set()),
     )
 
 
@@ -185,21 +199,21 @@
     )
 
     def content_find_first(
-        self, blob: bytes
-    ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
-        return self.storage.content_find_first(blob)
+        self, id: Sha1Git
+    ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
+        return self.storage.content_find_first(id)
 
     def content_find_all(
-        self, blob: bytes, limit: Optional[int] = None
-    ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
-        yield from self.storage.content_find_all(blob, limit=limit)
+        self, id: Sha1Git, limit: Optional[int] = None
+    ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+        yield from self.storage.content_find_all(id, limit=limit)
 
     def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
         return self.get_dates("content", [blob.id]).get(blob.id, None)
 
     def content_get_early_dates(
         self, blobs: Iterable[FileEntry]
-    ) -> Dict[bytes, datetime]:
+    ) -> Dict[Sha1Git, datetime]:
         return self.get_dates("content", [blob.id for blob in blobs])
 
     def content_set_early_date(self, blob: FileEntry, date: datetime):
@@ -220,7 +234,7 @@
 
     def directory_get_dates_in_isochrone_frontier(
         self, dirs: Iterable[DirectoryEntry]
-    ) -> Dict[bytes, datetime]:
+    ) -> Dict[Sha1Git, datetime]:
         return self.get_dates("directory", [directory.id for directory in dirs])
 
     def directory_set_date_in_isochrone_frontier(
@@ -230,14 +244,18 @@
         self.cache["directory"]["added"].add(directory.id)
 
     def get_dates(
-        self, entity: Literal["content", "revision", "directory"], ids: List[bytes]
-    ) -> Dict[bytes, datetime]:
+        self, entity: Literal["content", "revision", "directory"], ids: List[Sha1Git]
+    ) -> Dict[Sha1Git, datetime]:
         cache = self.cache[entity]
         missing_ids = set(id for id in ids if id not in cache["data"])
         if missing_ids:
             cache["data"].update(self.storage.get_dates(entity, list(missing_ids)))
         return {sha1: cache["data"][sha1] for sha1 in ids if sha1 in cache["data"]}
 
+    def origin_add(self, origin: OriginEntry) -> None:
+        self.cache["origin"]["data"][origin.id] = origin.url
+        self.cache["origin"]["added"].add(origin.id)
+
     def revision_add(self, revision: RevisionEntry):
         # Add current revision to the compact DB
         assert revision.date is not None
@@ -252,17 +270,20 @@
         )
 
     def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
-        self.cache["revision_in_origin"].add((revision.id, origin.url))
+        self.cache["revision_in_origin"].add((revision.id, origin.id))
 
     def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
         return self.get_dates("revision", [revision.id]).get(revision.id, None)
 
-    def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]:
-        if revision.id not in self.cache["revision_preferred_origin"]["data"]:
-            url = self.storage.revision_get_preferred_origin(revision.id)
-            if url is not None:
-                self.cache["revision_preferred_origin"]["data"][revision.id] = url
-        return self.cache["revision_preferred_origin"]["data"].get(revision.id)
+    def revision_get_preferred_origin(
+        self, revision: RevisionEntry
+    ) -> Optional[Sha1Git]:
+        cache = self.cache["revision_origin"]
+        if revision.id not in cache["data"]:
+            origin = self.storage.revision_get_preferred_origin(revision.id)
+            if origin is not None:
+                cache["data"][revision.id] = origin
+        return cache["data"].get(revision.id)
 
     def revision_in_history(self, revision: RevisionEntry) -> bool:
         return revision.id in self.cache[
@@ -272,8 +293,8 @@
     def revision_set_preferred_origin(
         self, origin: OriginEntry, revision: RevisionEntry
     ):
-        self.cache["revision_preferred_origin"]["data"][revision.id] = origin.url
-        self.cache["revision_preferred_origin"]["added"].add(revision.id)
+        self.cache["revision_origin"]["data"][revision.id] = origin.id
+        self.cache["revision_origin"]["added"].add(revision.id)
 
     def revision_visited(self, revision: RevisionEntry) -> bool:
         return revision.id in dict(
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -8,6 +8,7 @@
 import iso8601
 
 from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Sha1Git
 
 from .archive import ArchiveInterface
 from .graph import IsochroneNode, build_isochrone_graph
@@ -30,10 +31,10 @@
 
     def __init__(
         self,
-        revisions: Iterable[Tuple[bytes, datetime, bytes]],
+        revisions: Iterable[Tuple[Sha1Git, datetime, Sha1Git]],
         limit: Optional[int] = None,
     ):
-        self.revisions: Iterator[Tuple[bytes, datetime, bytes]]
+        self.revisions: Iterator[Tuple[Sha1Git, datetime, Sha1Git]]
         if limit is not None:
             self.revisions = islice(revisions, limit)
         else:
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -67,9 +67,11 @@
 create table origin
 (
     id      bigserial primary key,      -- internal identifier of the origin
+    sha1    sha1_git unique not null,   -- intrinsic identifier of the origin
     url     unix_path unique not null   -- url of the origin
 );
 comment on column origin.id is 'Origin internal identifier';
+comment on column origin.sha1 is 'Origin intrinsic identifier';
 comment on column origin.url is 'URL of the origin';
 
 -- relation tables
diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py
--- a/swh/provenance/storage/archive.py
+++ b/swh/provenance/storage/archive.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, Iterable, Set
 
-from swh.model.model import ObjectType, Revision, Sha1, TargetType
+from swh.model.model import ObjectType, Revision, Sha1Git, TargetType
 from swh.storage.interface import StorageInterface
 
 
@@ -8,17 +8,17 @@
     def __init__(self, storage: StorageInterface):
         self.storage = storage
 
-    def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+    def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
         # TODO: filter unused fields
         yield from self.storage.directory_ls(id)
 
-    def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+    def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
         # TODO: filter unused fields
         yield from (
             rev for rev in self.storage.revision_get(list(ids)) if rev is not None
         )
 
-    def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+    def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
         from swh.core.utils import grouper
         from swh.storage.algos.snapshot import snapshot_get_all_branches
 
@@ -42,7 +42,7 @@
             if release is not None and release.target_type == ObjectType.REVISION
         )
 
-        revisions: Set[Sha1] = set()
+        revisions: Set[Sha1Git] = set()
         for targets in grouper(targets_set, batchsize):
             revisions.update(
                 revision.id
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -13,6 +13,8 @@
 
 from swh.core.db import BaseDb
 from swh.journal.serializers import msgpack_ext_hook
+from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Sha1Git
 from swh.model.tests.swh_model_data import TEST_OBJECTS
 from swh.provenance.postgresql.archive import ArchivePostgreSQL
 from swh.provenance.storage.archive import ArchiveStorage
@@ -114,13 +116,13 @@
 class SynthRelation(TypedDict):
     prefix: Optional[str]
     path: str
-    src: bytes
-    dst: bytes
+    src: Sha1Git
+    dst: Sha1Git
     rel_ts: float
 
 
 class SynthRevision(TypedDict):
-    sha1: bytes
+    sha1: Sha1Git
     date: float
     msg: str
     R_C: List[SynthRelation]
@@ -134,7 +136,7 @@
 
     Generated SynthRevision (typed dict) with the following elements:
 
-        "sha1": (bytes) sha1 of the revision,
+        "sha1": (Sha1Git) sha1 of the revision,
        "date": (float) timestamp of the revision,
        "msg": (str) commit message of the revision,
        "R_C": (list) new R---C relations added by this revision
@@ -144,8 +146,8 @@
    Each relation above is a SynthRelation typed dict with:
 
        "path": (str) location
-        "src": (bytes) sha1 of the source of the relation
-        "dst": (bytes) sha1 of the destination of the relation
+        "src": (Sha1Git) sha1 of the source of the relation
+        "dst": (Sha1Git) sha1 of the destination of the relation
        "rel_ts": (float) timestamp of the target of the relation
            (related to the timestamp of the revision)
 
@@ -183,7 +185,7 @@
 def _mk_synth_rev(synth_rev) -> SynthRevision:
     assert synth_rev[0]["type"] == "R"
     rev = SynthRevision(
-        sha1=bytes.fromhex(synth_rev[0]["sha1"]),
+        sha1=hash_to_bytes(synth_rev[0]["sha1"]),
         date=float(synth_rev[0]["ts"]),
         msg=synth_rev[0]["revname"],
         R_C=[],
@@ -202,7 +204,7 @@
                 prefix=None,
                 path=row["path"],
                 src=rev["sha1"],
-                dst=bytes.fromhex(row["sha1"]),
+                dst=hash_to_bytes(row["sha1"]),
                 rel_ts=float(row["ts"]),
             )
         )
@@ -214,7 +216,7 @@
                 prefix=None,
                 path=row["path"],
                 src=rev["sha1"],
-                dst=bytes.fromhex(row["sha1"]),
+                dst=hash_to_bytes(row["sha1"]),
                 rel_ts=float(row["ts"]),
             )
         )
@@ -226,7 +228,7 @@
                 prefix=current_path,
                 path=row["path"],
                 src=rev["R_D"][-1]["dst"],
-                dst=bytes.fromhex(row["sha1"]),
+                dst=hash_to_bytes(row["sha1"]),
                 rel_ts=float(row["ts"]),
             )
         )
diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py
--- a/swh/provenance/tests/data/generate_storage_from_git.py
+++ b/swh/provenance/tests/data/generate_storage_from_git.py
@@ -11,6 +11,7 @@
 import yaml
 
 from swh.loader.git.from_disk import GitLoaderFromDisk
+from swh.model.hashutil import hash_to_bytes
 from swh.model.model import (
     Origin,
     OriginVisit,
@@ -90,7 +91,7 @@
         # add a snapshot with branches from the input file
         branches = {
             f"refs/heads/{name}".encode(): SnapshotBranch(
-                target=bytes.fromhex(all_branches[f"refs/heads/{name}"]),
+                target=hash_to_bytes(all_branches[f"refs/heads/{name}"]),
                 target_type=TargetType.REVISION,
             )
             for name in visit["branches"]
diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py
--- a/swh/provenance/tests/test_origin_iterator.py
+++ b/swh/provenance/tests/test_origin_iterator.py
@@ -6,9 +6,9 @@
 from swh.model.tests.swh_model_data import TEST_OBJECTS
 from swh.provenance.origin import CSVOriginIterator
 from swh.storage.algos.origin import (
-    iter_origins,
-    iter_origin_visits,
     iter_origin_visit_statuses,
+    iter_origin_visits,
+    iter_origins,
 )
 
 
@@ -21,9 +21,7 @@
             swh_storage_with_objects, origin.url, visit.visit
         ):
             if status.snapshot is not None:
-                origins_csv.append(
-                    (status.origin, status.date.isoformat(), status.snapshot)
-                )
+                origins_csv.append((status.origin, status.snapshot))
     origins = list(CSVOriginIterator(origins_csv))
     assert origins
     assert len(origins) == len(
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -26,9 +26,7 @@
     archive = ArchiveStorage(swh_storage_with_objects)
     for status in TEST_OBJECTS["origin_visit_status"]:
         if status.snapshot is not None:
-            entry = OriginEntry(
-                url=status.origin, date=status.date, snapshot=status.snapshot
-            )
+            entry = OriginEntry(url=status.origin, snapshot=status.snapshot)
             origin_add(provenance, archive, [entry])
     # TODO: check some facts here
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -7,6 +7,7 @@
 
 import pytest
 
+from swh.model.hashutil import hash_to_bytes
 from swh.provenance.model import RevisionEntry
 from swh.provenance.revision import revision_add
 from swh.provenance.tests.conftest import (
@@ -82,7 +83,7 @@
     'cur' is a cursor to the provenance index DB.
     """
     if isinstance(sha1, str):
-        sha1 = bytes.fromhex(sha1)
+        sha1 = hash_to_bytes(sha1)
     cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
     return [date.timestamp() for (date,) in cur.fetchall()]
 
@@ -265,7 +266,7 @@
     db_occurrences = [
         (blob.hex(), rev.hex(), date.timestamp(), path.decode())
         for blob, rev, date, path in provenance.content_find_all(
-            bytes.fromhex(content_id)
+            hash_to_bytes(content_id)
         )
     ]
     if provenance.storage.with_path:
@@ -337,7 +338,7 @@
 
     for content_id, (rev_id, ts, paths) in expected_first.items():
         (r_sha1, r_rev_id, r_ts, r_path) = provenance.content_find_first(
-            bytes.fromhex(content_id)
+            hash_to_bytes(content_id)
         )
         assert r_sha1.hex() == content_id
         assert r_rev_id.hex() == rev_id
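
End to end, the origin layer introduced above can be driven as in the sketch below. This is a hedged usage example, not code from the patch: the CSV file name and the read_statuses helper are hypothetical, and the provenance/archive objects are assumed to be built elsewhere (e.g. as in test_provenance_db.py):

    import csv

    from swh.model.hashutil import hash_to_bytes
    from swh.provenance.origin import CSVOriginIterator, origin_add

    def read_statuses(path):
        # Each CSV row now carries two fields (url, snapshot sha1 hex);
        # the visit date column is gone since origin ids no longer use it.
        with open(path) as csvfile:
            for url, snap in csv.reader(csvfile):
                yield url, hash_to_bytes(snap)

    origins = list(CSVOriginIterator(read_statuses("origin_visits.csv")))
    # With a configured backend and archive (construction elided here):
    # origin_add(provenance, archive, origins)

Note the ordering inside origin_add: provenance.origin_add(origin) runs before the revisions are walked, so the flush in ProvenanceDBBase inserts the (sha1, url) pair into the origin table before the revision_in_origin relations and the preferred-origin update reference it.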