D5924.id21285.diff

diff --git a/swh/provenance/archive.py b/swh/provenance/archive.py
--- a/swh/provenance/archive.py
+++ b/swh/provenance/archive.py
@@ -2,12 +2,12 @@
from typing_extensions import Protocol, runtime_checkable
-from swh.model.model import Revision, Sha1
+from swh.model.model import Revision, Sha1Git
@runtime_checkable
class ArchiveInterface(Protocol):
- def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+ def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
"""List entries for one directory.
Args:
@@ -19,7 +19,7 @@
"""
...
- def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+ def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
"""Given a list of sha1, return the revisions' information
Args:
@@ -32,11 +32,11 @@
"""
...
- def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+ def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
"""List all revisions pointed by one snapshot.
Args:
- snapshot: the snapshot's identifier
+ id: sha1 id of the snapshot.
Yields:
sha1 ids of found revisions.
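For context on the rename above: Sha1Git in swh.model.model is a plain alias for bytes holding the raw 20-byte sha1_git intrinsic identifier, so the change is annotation-only and does not affect runtime behavior. A minimal sketch (hypothetical identifier value):

    from swh.model.model import Sha1Git

    # Sha1Git is an alias for bytes: the raw 20-byte sha1_git of an object.
    snapshot_id: Sha1Git = bytes.fromhex("4a" * 20)  # hypothetical snapshot id
    assert isinstance(snapshot_id, bytes) and len(snapshot_id) == 20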
diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py
--- a/swh/provenance/graph.py
+++ b/swh/provenance/graph.py
@@ -3,6 +3,8 @@
import os
from typing import Dict, Optional, Set
+from swh.model.model import Sha1Git
+
from .archive import ArchiveInterface
from .model import DirectoryEntry, RevisionEntry
from .provenance import ProvenanceInterface
@@ -166,7 +168,7 @@
logging.debug(
f"Recursively creating isochrone graph for revision {revision.id.hex()}..."
)
- fdates: Dict[bytes, datetime] = {} # map {file_id: date}
+ fdates: Dict[Sha1Git, datetime] = {} # map {file_id: date}
while stack:
current = stack.pop()
if current.dbdate is None or current.dbdate > revision.date:
diff --git a/swh/provenance/model.py b/swh/provenance/model.py
--- a/swh/provenance/model.py
+++ b/swh/provenance/model.py
@@ -6,14 +6,17 @@
from datetime import datetime
from typing import Iterable, Iterator, List, Optional
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import origin_identifier
+from swh.model.model import Sha1Git
+
from .archive import ArchiveInterface
class OriginEntry:
- def __init__(self, url: str, date: datetime, snapshot: bytes):
+ def __init__(self, url: str, snapshot: Sha1Git):
self.url = url
- # TODO: this is probably not needed and will be removed!
- # self.date = date
+ self.id: Sha1Git = hash_to_bytes(origin_identifier({"url": self.url}))
self.snapshot = snapshot
self._revisions: Optional[List[RevisionEntry]] = None
@@ -33,19 +36,16 @@
return (x for x in self._revisions)
def __str__(self):
- return (
- f"<MOrigin[{self.url}] "
- f"snap={self.snapshot.hex()}, date={self.date.isoformat()}>"
- )
+ return f"<MOrigin[{self.id.hex()}] url={self.url}, snap={self.snapshot.hex()}>"
class RevisionEntry:
def __init__(
self,
- id: bytes,
+ id: Sha1Git,
date: Optional[datetime] = None,
- root: Optional[bytes] = None,
- parents: Optional[Iterable[bytes]] = None,
+ root: Optional[Sha1Git] = None,
+ parents: Optional[Iterable[Sha1Git]] = None,
):
self.id = id
self.date = date
@@ -91,7 +91,7 @@
class DirectoryEntry:
- def __init__(self, id: bytes, name: bytes = b""):
+ def __init__(self, id: Sha1Git, name: bytes = b""):
self.id = id
self.name = name
self._files: Optional[List[FileEntry]] = None
@@ -141,7 +141,7 @@
class FileEntry:
- def __init__(self, id: bytes, name: bytes):
+ def __init__(self, id: Sha1Git, name: bytes):
self.id = id
self.name = name
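The new OriginEntry.id is the origin's intrinsic identifier: the sha1_git of its URL, as computed by swh.model.identifiers.origin_identifier. A standalone sketch of the same computation (hypothetical URL):

    from swh.model.hashutil import hash_to_bytes
    from swh.model.identifiers import origin_identifier

    url = "https://example.com/project.git"  # hypothetical origin URL
    origin_id = hash_to_bytes(origin_identifier({"url": url}))  # hex -> bytes
    assert len(origin_id) == 20  # a Sha1Git value, i.e. raw sha1_git bytes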
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -1,10 +1,9 @@
-from datetime import datetime, timezone
from itertools import islice
import logging
import time
from typing import Iterable, Iterator, List, Optional, Tuple
-import iso8601
+from swh.model.model import Sha1Git
from .archive import ArchiveInterface
from .graph import HistoryNode, build_history_graph
@@ -16,31 +15,28 @@
"""Iterator over origin visit statuses typically present in the given CSV
file.
- The input is an iterator that produces 3 elements per row:
+ The input is an iterator that produces 2 elements per row:
- (url, date, snap)
+ (url, snap)
where:
- url: is the origin url of the visit
- - date: is the date of the visit
- snap: sha1_git of the snapshot pointed by the visit status
"""
def __init__(
self,
- statuses: Iterable[Tuple[str, datetime, bytes]],
+ statuses: Iterable[Tuple[str, Sha1Git]],
limit: Optional[int] = None,
):
- self.statuses: Iterator[Tuple[str, datetime, bytes]]
+ self.statuses: Iterator[Tuple[str, Sha1Git]]
if limit is not None:
self.statuses = islice(statuses, limit)
else:
self.statuses = iter(statuses)
def __iter__(self):
- for url, date, snap in self.statuses:
- date = iso8601.parse_date(date, default_timezone=timezone.utc)
- yield OriginEntry(url, date, snap)
+ return (OriginEntry(url, snapshot) for url, snapshot in self.statuses)
def origin_add(
@@ -50,6 +46,7 @@
):
start = time.time()
for origin in origins:
+ provenance.origin_add(origin)
origin.retrieve_revisions(archive)
for revision in origin.revisions:
graph = build_history_graph(archive, provenance, revision)
@@ -59,7 +56,7 @@
stop = time.time()
logging.debug(
"Origins "
- ";".join([origin.url + ":" + origin.snapshot.hex() for origin in origins])
+ ";".join([origin.id.hex() + ":" + origin.snapshot.hex() for origin in origins])
+ f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
)
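With the date column gone, CSVOriginIterator consumes two-element rows and no longer needs iso8601 parsing. A usage sketch under the new signature (hypothetical input rows):

    from swh.model.hashutil import hash_to_bytes
    from swh.provenance.origin import CSVOriginIterator

    statuses = [  # hypothetical (url, snapshot sha1_git) rows
        ("https://example.com/project.git",
         hash_to_bytes("0000000000000000000000000000000000000000")),
    ]
    for origin in CSVOriginIterator(statuses, limit=10):
        print(origin.id.hex(), origin.url, origin.snapshot.hex())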
diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py
--- a/swh/provenance/postgresql/archive.py
+++ b/swh/provenance/postgresql/archive.py
@@ -3,7 +3,7 @@
from methodtools import lru_cache
import psycopg2
-from swh.model.model import ObjectType, Revision, Sha1, TargetType
+from swh.model.model import ObjectType, Revision, Sha1Git, TargetType
from swh.storage.postgresql.storage import Storage
@@ -12,14 +12,14 @@
self.conn = conn
self.storage = Storage(conn, objstorage={"cls": "memory"})
- def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+ def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
# TODO: only call directory_ls_internal if the id is not being queried by
# someone else. Otherwise wait until results get properly cached.
entries = self.directory_ls_internal(id)
yield from entries
@lru_cache(maxsize=100000)
- def directory_ls_internal(self, id: Sha1) -> List[Dict[str, Any]]:
+ def directory_ls_internal(self, id: Sha1Git) -> List[Dict[str, Any]]:
# TODO: add file size filtering
with self.conn.cursor() as cursor:
cursor.execute(
@@ -62,7 +62,7 @@
for row in cursor.fetchall()
]
- def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+ def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
with self.conn.cursor() as cursor:
psycopg2.extras.execute_values(
cursor,
@@ -96,7 +96,7 @@
}
)
- def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+ def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
# TODO: this code is duplicated here (same as in swh.provenance.storage.archive)
# but it's just temporary. This method should actually perform a direct query to
# the SQL db of the archive.
@@ -123,7 +123,7 @@
if release is not None and release.target_type == ObjectType.REVISION
)
- revisions: Set[Sha1] = set()
+ revisions: Set[Sha1Git] = set()
for targets in grouper(targets_set, batchsize):
revisions.update(
revision.id
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -6,6 +6,8 @@
import psycopg2
import psycopg2.extras
+from swh.model.model import Sha1Git
+
class ProvenanceDBBase:
def __init__(self, conn: psycopg2.extensions.connection):
@@ -51,6 +53,16 @@
):
self.insert_relation(relation, data[relation])
+ # Insert origins
+ self.insert_origin(
+ {
+ sha1: data["origin"]["data"][sha1]
+ for sha1 in data["origin"]["added"]
+ },
+ )
+ data["origin"]["data"].clear()
+ data["origin"]["added"].clear()
+
# Insert relations from the origin-revision layer
self.insert_origin_head(data["revision_in_origin"])
self.insert_revision_history(data["revision_before_revision"])
@@ -58,12 +70,12 @@
# Update preferred origins
self.update_preferred_origin(
{
- sha1: data["revision_preferred_origin"]["data"][sha1]
- for sha1 in data["revision_preferred_origin"]["added"]
+ sha1: data["revision_origin"]["data"][sha1]
+ for sha1 in data["revision_origin"]["added"]
}
)
- data["revision_preferred_origin"]["data"].clear()
- data["revision_preferred_origin"]["added"].clear()
+ data["revision_origin"]["data"].clear()
+ data["revision_origin"]["added"].clear()
return True
@@ -76,16 +88,16 @@
return False
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
...
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
...
- def get_dates(self, entity: str, ids: List[bytes]) -> Dict[bytes, datetime]:
+ def get_dates(self, entity: str, ids: List[Sha1Git]) -> Dict[Sha1Git, datetime]:
dates = {}
if ids:
values = ", ".join(itertools.repeat("%s", len(ids)))
@@ -96,7 +108,7 @@
dates.update(self.cursor.fetchall())
return dates
- def insert_entity(self, entity: str, data: Dict[bytes, datetime]):
+ def insert_entity(self, entity: str, data: Dict[Sha1Git, datetime]):
if data:
psycopg2.extras.execute_values(
self.cursor,
@@ -112,7 +124,22 @@
# This might be useless!
data.clear()
- def insert_origin_head(self, data: Set[Tuple[bytes, str]]):
+ def insert_origin(self, data: Dict[Sha1Git, str]):
+ if data:
+ psycopg2.extras.execute_values(
+ self.cursor,
+ """
+ LOCK TABLE ONLY origin;
+ INSERT INTO origin(sha1, url) VALUES %s
+ ON CONFLICT DO NOTHING
+ """,
+ data.items(),
+ )
+ # XXX: not sure if Python takes a reference or a copy.
+ # This might be useless!
+ data.clear()
+
+ def insert_origin_head(self, data: Set[Tuple[Sha1Git, Sha1Git]]):
if data:
psycopg2.extras.execute_values(
self.cursor,
@@ -123,16 +150,16 @@
SELECT R.id, O.id
FROM (VALUES %s) AS V(rev, org)
INNER JOIN revision AS R on (R.sha1=V.rev)
- INNER JOIN origin AS O on (O.url=V.org::unix_path)
+ INNER JOIN origin AS O on (O.sha1=V.org)
""",
data,
)
data.clear()
- def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+ def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
...
- def insert_revision_history(self, data: Dict[bytes, bytes]):
+ def insert_revision_history(self, data: Dict[Sha1Git, Sha1Git]):
if data:
values = [[(prev, next) for next in data[prev]] for prev in data]
psycopg2.extras.execute_values(
@@ -150,10 +177,10 @@
)
data.clear()
- def revision_get_preferred_origin(self, revision: bytes) -> Optional[str]:
+ def revision_get_preferred_origin(self, revision: Sha1Git) -> Optional[Sha1Git]:
self.cursor.execute(
"""
- SELECT O.url
+ SELECT O.sha1
FROM revision AS R
JOIN origin as O
ON R.origin=O.id
@@ -161,9 +188,9 @@
(revision,),
)
row = self.cursor.fetchone()
- return str(row[0], encoding="utf-8") if row is not None else None
+ return row[0] if row is not None else None
- def revision_in_history(self, revision: bytes) -> bool:
+ def revision_in_history(self, revision: Sha1Git) -> bool:
self.cursor.execute(
"""
SELECT 1
@@ -176,7 +203,7 @@
)
return self.cursor.fetchone() is not None
- def revision_visited(self, revision: bytes) -> bool:
+ def revision_visited(self, revision: Sha1Git) -> bool:
self.cursor.execute(
"""
SELECT 1
@@ -189,18 +216,18 @@
)
return self.cursor.fetchone() is not None
- def update_preferred_origin(self, data: Dict[bytes, str]):
+ def update_preferred_origin(self, data: Dict[Sha1Git, Sha1Git]):
if data:
# XXX: this is assuming the revision already exists in the db! It should
# be improved by allowing null dates in the revision table.
psycopg2.extras.execute_values(
self.cursor,
"""
- UPDATE revision
+ UPDATE revision R
SET origin=O.id
FROM (VALUES %s) AS V(rev, org)
- INNER JOIN origin AS O on (O.url=V.org::unix_path)
- WHERE sha1=V.rev
+ INNER JOIN origin AS O on (O.sha1=V.org)
+ WHERE R.sha1=V.rev
""",
data.items(),
)
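The new insert_origin follows the same idempotent flush pattern as the other insert helpers: ON CONFLICT DO NOTHING makes re-flushing the same (sha1, url) pairs a no-op. A standalone sketch of the pattern, assuming a psycopg2 cursor and a {sha1: url} mapping:

    import psycopg2.extras

    def insert_origins(cursor, data: dict) -> None:
        """Insert {sha1: url} pairs, skipping already-known origins."""
        if data:
            psycopg2.extras.execute_values(
                cursor,
                "INSERT INTO origin(sha1, url) VALUES %s ON CONFLICT DO NOTHING",
                data.items(),
            )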
diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py
--- a/swh/provenance/postgresql/provenancedb_with_path.py
+++ b/swh/provenance/postgresql/provenancedb_with_path.py
@@ -4,13 +4,15 @@
import psycopg2
import psycopg2.extras
+from swh.model.model import Sha1Git
+
from .provenancedb_base import ProvenanceDBBase
class ProvenanceWithPathDB(ProvenanceDBBase):
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
self.cursor.execute(
"""
SELECT C.sha1 AS blob,
@@ -24,13 +26,13 @@
WHERE C.sha1=%s
ORDER BY date, rev, path ASC LIMIT 1
""",
- (blob,),
+ (id,),
)
return self.cursor.fetchone()
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
self.cursor.execute(
f"""
@@ -61,11 +63,11 @@
WHERE C.sha1=%s)
ORDER BY date, rev, path {early_cut}
""",
- (blob, blob),
+ (id, id),
)
yield from self.cursor.fetchall()
- def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+ def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
"""Insert entries in `relation` from `data`
Also insert missing location entries in the 'location' table.
diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py
--- a/swh/provenance/postgresql/provenancedb_without_path.py
+++ b/swh/provenance/postgresql/provenancedb_without_path.py
@@ -4,17 +4,15 @@
import psycopg2
import psycopg2.extras
-from .provenancedb_base import ProvenanceDBBase
+from swh.model.model import Sha1Git
-########################################################################################
-########################################################################################
-########################################################################################
+from .provenancedb_base import ProvenanceDBBase
class ProvenanceWithoutPathDB(ProvenanceDBBase):
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
self.cursor.execute(
"""
SELECT C.sha1 AS blob,
@@ -27,13 +25,13 @@
WHERE C.sha1=%s
ORDER BY date, rev ASC LIMIT 1
""",
- (blob,),
+ (id,),
)
return self.cursor.fetchone()
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
self.cursor.execute(
f"""
@@ -57,11 +55,11 @@
WHERE C.sha1=%s)
ORDER BY date, rev, path {early_cut}
""",
- (blob, blob),
+ (id, id),
)
yield from self.cursor.fetchall()
- def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+ def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
if data:
assert relation in (
"content_in_revision",
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -6,6 +6,8 @@
import psycopg2
from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable
+from swh.model.model import Sha1Git
+
from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry
@@ -30,13 +32,13 @@
...
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
...
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
...
def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
@@ -44,7 +46,7 @@
def content_get_early_dates(
self, blobs: Iterable[FileEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
...
def content_set_early_date(self, blob: FileEntry, date: datetime) -> None:
@@ -62,7 +64,7 @@
def directory_get_dates_in_isochrone_frontier(
self, dirs: Iterable[DirectoryEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
...
def directory_set_date_in_isochrone_frontier(
@@ -70,6 +72,9 @@
) -> None:
...
+ def origin_add(self, origin: OriginEntry) -> None:
+ ...
+
def revision_add(self, revision: RevisionEntry) -> None:
...
@@ -86,7 +91,9 @@
def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
...
- def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]:
+ def revision_get_preferred_origin(
+ self, revision: RevisionEntry
+ ) -> Optional[Sha1Git]:
...
def revision_in_history(self, revision: RevisionEntry) -> bool:
@@ -102,13 +109,18 @@
class DatetimeCache(TypedDict):
- data: Dict[bytes, datetime]
- added: Set[bytes]
+ data: Dict[Sha1Git, datetime]
+ added: Set[Sha1Git]
class OriginCache(TypedDict):
- data: Dict[bytes, str]
- added: Set[bytes]
+ data: Dict[Sha1Git, str]
+ added: Set[Sha1Git]
+
+
+class RevisionCache(TypedDict):
+ data: Dict[Sha1Git, Sha1Git]
+ added: Set[Sha1Git]
class ProvenanceCache(TypedDict):
@@ -116,13 +128,14 @@
directory: DatetimeCache
revision: DatetimeCache
# below are insertion caches only
- content_in_revision: Set[Tuple[bytes, bytes, bytes]]
- content_in_directory: Set[Tuple[bytes, bytes, bytes]]
- directory_in_revision: Set[Tuple[bytes, bytes, bytes]]
+ content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
+ content_in_directory: Set[Tuple[Sha1Git, Sha1Git, bytes]]
+ directory_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
# these two are for the origin layer
- revision_before_revision: Dict[bytes, Set[bytes]]
- revision_in_origin: Set[Tuple[bytes, str]]
- revision_preferred_origin: OriginCache
+ origin: OriginCache
+ revision_origin: RevisionCache
+ revision_before_revision: Dict[Sha1Git, Set[Sha1Git]]
+ revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]]
def new_cache():
@@ -133,9 +146,10 @@
content_in_revision=set(),
content_in_directory=set(),
directory_in_revision=set(),
+ origin=OriginCache(data={}, added=set()),
+ revision_origin=RevisionCache(data={}, added=set()),
revision_before_revision={},
revision_in_origin=set(),
- revision_preferred_origin=OriginCache(data={}, added=set()),
)
@@ -185,21 +199,21 @@
)
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
- return self.storage.content_find_first(blob)
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
+ return self.storage.content_find_first(id)
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
- yield from self.storage.content_find_all(blob, limit=limit)
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+ yield from self.storage.content_find_all(id, limit=limit)
def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
return self.get_dates("content", [blob.id]).get(blob.id, None)
def content_get_early_dates(
self, blobs: Iterable[FileEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
return self.get_dates("content", [blob.id for blob in blobs])
def content_set_early_date(self, blob: FileEntry, date: datetime):
@@ -220,7 +234,7 @@
def directory_get_dates_in_isochrone_frontier(
self, dirs: Iterable[DirectoryEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
return self.get_dates("directory", [directory.id for directory in dirs])
def directory_set_date_in_isochrone_frontier(
@@ -230,14 +244,18 @@
self.cache["directory"]["added"].add(directory.id)
def get_dates(
- self, entity: Literal["content", "revision", "directory"], ids: List[bytes]
- ) -> Dict[bytes, datetime]:
+ self, entity: Literal["content", "revision", "directory"], ids: List[Sha1Git]
+ ) -> Dict[Sha1Git, datetime]:
cache = self.cache[entity]
missing_ids = set(id for id in ids if id not in cache)
if missing_ids:
cache["data"].update(self.storage.get_dates(entity, list(missing_ids)))
return {sha1: cache["data"][sha1] for sha1 in ids if sha1 in cache["data"]}
+ def origin_add(self, origin: OriginEntry) -> None:
+ self.cache["origin"]["data"][origin.id] = origin.url
+ self.cache["origin"]["added"].add(origin.id)
+
def revision_add(self, revision: RevisionEntry):
# Add current revision to the compact DB
assert revision.date is not None
@@ -252,17 +270,20 @@
)
def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
- self.cache["revision_in_origin"].add((revision.id, origin.url))
+ self.cache["revision_in_origin"].add((revision.id, origin.id))
def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
return self.get_dates("revision", [revision.id]).get(revision.id, None)
- def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]:
- if revision.id not in self.cache["revision_preferred_origin"]["data"]:
- url = self.storage.revision_get_preferred_origin(revision.id)
- if url is not None:
- self.cache["revision_preferred_origin"]["data"][revision.id] = url
- return self.cache["revision_preferred_origin"]["data"].get(revision.id)
+ def revision_get_preferred_origin(
+ self, revision: RevisionEntry
+ ) -> Optional[Sha1Git]:
+ cache = self.cache["revision_origin"]
+ if revision.id not in cache["data"]:
+ origin = self.storage.revision_get_preferred_origin(revision.id)
+ if origin is not None:
+ cache["data"][revision.id] = origin
+ return cache["data"].get(revision.id)
def revision_in_history(self, revision: RevisionEntry) -> bool:
return revision.id in self.cache[
@@ -272,8 +293,8 @@
def revision_set_preferred_origin(
self, origin: OriginEntry, revision: RevisionEntry
):
- self.cache["revision_preferred_origin"]["data"][revision.id] = origin.url
- self.cache["revision_preferred_origin"]["added"].add(revision.id)
+ self.cache["revision_origin"]["data"][revision.id] = origin.id
+ self.cache["revision_origin"]["added"].add(revision.id)
def revision_visited(self, revision: RevisionEntry) -> bool:
return revision.id in dict(
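The cache reshuffle above replaces revision_preferred_origin with two entries: origin (sha1 -> url, flushed by insert_origin) and revision_origin (revision sha1 -> preferred origin sha1). In isolation, the new layout looks like this (hypothetical values):

    cache = {
        "origin": {"data": {}, "added": set()},           # sha1 -> url
        "revision_origin": {"data": {}, "added": set()},  # rev sha1 -> origin sha1
    }
    origin_id, url = b"\x00" * 20, "https://example.com"  # hypothetical
    cache["origin"]["data"][origin_id] = url   # what origin_add caches
    cache["origin"]["added"].add(origin_id)    # marks it for the next flush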
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -8,6 +8,7 @@
import iso8601
from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Sha1Git
from .archive import ArchiveInterface
from .graph import IsochroneNode, build_isochrone_graph
@@ -30,10 +31,10 @@
def __init__(
self,
- revisions: Iterable[Tuple[bytes, datetime, bytes]],
+ revisions: Iterable[Tuple[Sha1Git, datetime, Sha1Git]],
limit: Optional[int] = None,
):
- self.revisions: Iterator[Tuple[bytes, datetime, bytes]]
+ self.revisions: Iterator[Tuple[Sha1Git, datetime, Sha1Git]]
if limit is not None:
self.revisions = islice(revisions, limit)
else:
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -67,9 +67,11 @@
create table origin
(
id bigserial primary key, -- internal identifier of the origin
+ sha1 sha1_git unique not null, -- intrinsic identifier of the origin
url unix_path unique not null -- url of the origin
);
comment on column origin.id is 'Origin internal identifier';
+comment on column origin.sha1 is 'Origin intrinsic identifier';
comment on column origin.url is 'URL of the origin';
-- relation tables
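With the unique, non-null sha1 column in place, callers can resolve an origin through its intrinsic identifier instead of its url. A hypothetical lookup helper over this schema:

    def origin_internal_id(cursor, sha1: bytes):
        """Map an origin's sha1_git to its internal bigserial id."""
        cursor.execute("SELECT id FROM origin WHERE sha1=%s", (sha1,))
        row = cursor.fetchone()
        return row[0] if row is not None else None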
diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py
--- a/swh/provenance/storage/archive.py
+++ b/swh/provenance/storage/archive.py
@@ -1,6 +1,6 @@
from typing import Any, Dict, Iterable, Set
-from swh.model.model import ObjectType, Revision, Sha1, TargetType
+from swh.model.model import ObjectType, Revision, Sha1Git, TargetType
from swh.storage.interface import StorageInterface
@@ -8,17 +8,17 @@
def __init__(self, storage: StorageInterface):
self.storage = storage
- def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+ def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
# TODO: filter unused fields
yield from self.storage.directory_ls(id)
- def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+ def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
# TODO: filter unused fields
yield from (
rev for rev in self.storage.revision_get(list(ids)) if rev is not None
)
- def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+ def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
from swh.core.utils import grouper
from swh.storage.algos.snapshot import snapshot_get_all_branches
@@ -42,7 +42,7 @@
if release is not None and release.target_type == ObjectType.REVISION
)
- revisions: Set[Sha1] = set()
+ revisions: Set[Sha1Git] = set()
for targets in grouper(targets_set, batchsize):
revisions.update(
revision.id
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -13,6 +13,8 @@
from swh.core.db import BaseDb
from swh.journal.serializers import msgpack_ext_hook
+from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Sha1Git
from swh.model.tests.swh_model_data import TEST_OBJECTS
from swh.provenance.postgresql.archive import ArchivePostgreSQL
from swh.provenance.storage.archive import ArchiveStorage
@@ -114,13 +116,13 @@
class SynthRelation(TypedDict):
prefix: Optional[str]
path: str
- src: bytes
- dst: bytes
+ src: Sha1Git
+ dst: Sha1Git
rel_ts: float
class SynthRevision(TypedDict):
- sha1: bytes
+ sha1: Sha1Git
date: float
msg: str
R_C: List[SynthRelation]
@@ -134,7 +136,7 @@
Generated SynthRevision (typed dict) with the following elements:
- "sha1": (bytes) sha1 of the revision,
+ "sha1": (Sha1Git) sha1 of the revision,
"date": (float) timestamp of the revision,
"msg": (str) commit message of the revision,
"R_C": (list) new R---C relations added by this revision
@@ -144,8 +146,8 @@
Each relation above is a SynthRelation typed dict with:
"path": (str) location
- "src": (bytes) sha1 of the source of the relation
- "dst": (bytes) sha1 of the destination of the relation
+ "src": (Sha1Git) sha1 of the source of the relation
+ "dst": (Sha1Git) sha1 of the destination of the relation
"rel_ts": (float) timestamp of the target of the relation
(related to the timestamp of the revision)
@@ -183,7 +185,7 @@
def _mk_synth_rev(synth_rev) -> SynthRevision:
assert synth_rev[0]["type"] == "R"
rev = SynthRevision(
- sha1=bytes.fromhex(synth_rev[0]["sha1"]),
+ sha1=hash_to_bytes(synth_rev[0]["sha1"]),
date=float(synth_rev[0]["ts"]),
msg=synth_rev[0]["revname"],
R_C=[],
@@ -202,7 +204,7 @@
prefix=None,
path=row["path"],
src=rev["sha1"],
- dst=bytes.fromhex(row["sha1"]),
+ dst=hash_to_bytes(row["sha1"]),
rel_ts=float(row["ts"]),
)
)
@@ -214,7 +216,7 @@
prefix=None,
path=row["path"],
src=rev["sha1"],
- dst=bytes.fromhex(row["sha1"]),
+ dst=hash_to_bytes(row["sha1"]),
rel_ts=float(row["ts"]),
)
)
@@ -226,7 +228,7 @@
prefix=current_path,
path=row["path"],
src=rev["R_D"][-1]["dst"],
- dst=bytes.fromhex(row["sha1"]),
+ dst=hash_to_bytes(row["sha1"]),
rel_ts=float(row["ts"]),
)
)
diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py
--- a/swh/provenance/tests/data/generate_storage_from_git.py
+++ b/swh/provenance/tests/data/generate_storage_from_git.py
@@ -11,6 +11,7 @@
import yaml
from swh.loader.git.from_disk import GitLoaderFromDisk
+from swh.model.hashutil import hash_to_bytes
from swh.model.model import (
Origin,
OriginVisit,
@@ -90,7 +91,7 @@
# add a snapshot with branches from the input file
branches = {
f"refs/heads/{name}".encode(): SnapshotBranch(
- target=bytes.fromhex(all_branches[f"refs/heads/{name}"]),
+ target=hash_to_bytes(all_branches[f"refs/heads/{name}"]),
target_type=TargetType.REVISION,
)
for name in visit["branches"]
diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py
--- a/swh/provenance/tests/test_origin_iterator.py
+++ b/swh/provenance/tests/test_origin_iterator.py
@@ -6,9 +6,9 @@
from swh.model.tests.swh_model_data import TEST_OBJECTS
from swh.provenance.origin import CSVOriginIterator
from swh.storage.algos.origin import (
- iter_origins,
- iter_origin_visits,
iter_origin_visit_statuses,
+ iter_origin_visits,
+ iter_origins,
)
@@ -21,9 +21,7 @@
swh_storage_with_objects, origin.url, visit.visit
):
if status.snapshot is not None:
- origins_csv.append(
- (status.origin, status.date.isoformat(), status.snapshot)
- )
+ origins_csv.append((status.origin, status.snapshot))
origins = list(CSVOriginIterator(origins_csv))
assert origins
assert len(origins) == len(
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -26,9 +26,7 @@
archive = ArchiveStorage(swh_storage_with_objects)
for status in TEST_OBJECTS["origin_visit_status"]:
if status.snapshot is not None:
- entry = OriginEntry(
- url=status.origin, date=status.date, snapshot=status.snapshot
- )
+ entry = OriginEntry(url=status.origin, snapshot=status.snapshot)
origin_add(provenance, archive, [entry])
# TODO: check some facts here
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -7,6 +7,7 @@
import pytest
+from swh.model.hashutil import hash_to_bytes
from swh.provenance.model import RevisionEntry
from swh.provenance.revision import revision_add
from swh.provenance.tests.conftest import (
@@ -82,7 +83,7 @@
'cur' is a cursor to the provenance index DB.
"""
if isinstance(sha1, str):
- sha1 = bytes.fromhex(sha1)
+ sha1 = hash_to_bytes(sha1)
cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
return [date.timestamp() for (date,) in cur.fetchall()]
@@ -265,7 +266,7 @@
db_occurrences = [
(blob.hex(), rev.hex(), date.timestamp(), path.decode())
for blob, rev, date, path in provenance.content_find_all(
- bytes.fromhex(content_id)
+ hash_to_bytes(content_id)
)
]
if provenance.storage.with_path:
@@ -337,7 +338,7 @@
for content_id, (rev_id, ts, paths) in expected_first.items():
(r_sha1, r_rev_id, r_ts, r_path) = provenance.content_find_first(
- bytes.fromhex(content_id)
+ hash_to_bytes(content_id)
)
assert r_sha1.hex() == content_id
assert r_rev_id.hex() == rev_id
