D5924.id21285.diff
D5924: Add support for sha1 identifiers for origins
diff --git a/swh/provenance/archive.py b/swh/provenance/archive.py
--- a/swh/provenance/archive.py
+++ b/swh/provenance/archive.py
@@ -2,12 +2,12 @@
from typing_extensions import Protocol, runtime_checkable
-from swh.model.model import Revision, Sha1
+from swh.model.model import Revision, Sha1Git
@runtime_checkable
class ArchiveInterface(Protocol):
- def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+ def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
"""List entries for one directory.
Args:
@@ -19,7 +19,7 @@
"""
...
- def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+ def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
"""Given a list of sha1, return the revisions' information
Args:
@@ -32,11 +32,11 @@
"""
...
- def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+ def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
"""List all revisions pointed by one snapshot.
Args:
- snapshot: the snapshot's identifier
+ id: sha1 id of the snapshot.
Yields:
sha1 ids of found revisions.
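
Note: Sha1Git is a plain bytes alias in swh.model.model, so this rename only affects type checkers and documentation, not runtime behavior. A minimal sketch (not part of the diff, and assuming the protocol declares just the three methods shown here) of what the @runtime_checkable protocol accepts:

    from typing import Any, Dict, Iterable

    from swh.model.model import Revision, Sha1Git
    from swh.provenance.archive import ArchiveInterface

    class InMemoryArchive:
        """Hypothetical stand-in that structurally satisfies ArchiveInterface."""

        def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
            return []

        def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
            return []

        def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
            return []

    # runtime_checkable protocols match on method names only, not signatures:
    assert isinstance(InMemoryArchive(), ArchiveInterface)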
diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py
--- a/swh/provenance/graph.py
+++ b/swh/provenance/graph.py
@@ -3,6 +3,8 @@
import os
from typing import Dict, Optional, Set
+from swh.model.model import Sha1Git
+
from .archive import ArchiveInterface
from .model import DirectoryEntry, RevisionEntry
from .provenance import ProvenanceInterface
@@ -166,7 +168,7 @@
logging.debug(
f"Recursively creating isochrone graph for revision {revision.id.hex()}..."
)
- fdates: Dict[bytes, datetime] = {} # map {file_id: date}
+ fdates: Dict[Sha1Git, datetime] = {} # map {file_id: date}
while stack:
current = stack.pop()
if current.dbdate is None or current.dbdate > revision.date:
diff --git a/swh/provenance/model.py b/swh/provenance/model.py
--- a/swh/provenance/model.py
+++ b/swh/provenance/model.py
@@ -6,14 +6,17 @@
from datetime import datetime
from typing import Iterable, Iterator, List, Optional
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import origin_identifier
+from swh.model.model import Sha1Git
+
from .archive import ArchiveInterface
class OriginEntry:
- def __init__(self, url: str, date: datetime, snapshot: bytes):
+ def __init__(self, url: str, snapshot: Sha1Git):
self.url = url
- # TODO: this is probably not needed and will be removed!
- # self.date = date
+ self.id: Sha1Git = hash_to_bytes(origin_identifier({"url": self.url}))
self.snapshot = snapshot
self._revisions: Optional[List[RevisionEntry]] = None
@@ -33,19 +36,16 @@
return (x for x in self._revisions)
def __str__(self):
- return (
- f"<MOrigin[{self.url}] "
- f"snap={self.snapshot.hex()}, date={self.date.isoformat()}>"
- )
+ return f"<MOrigin[{self.id.hex()}] url={self.url}, snap={self.snapshot.hex()}>"
class RevisionEntry:
def __init__(
self,
- id: bytes,
+ id: Sha1Git,
date: Optional[datetime] = None,
- root: Optional[bytes] = None,
- parents: Optional[Iterable[bytes]] = None,
+ root: Optional[Sha1Git] = None,
+ parents: Optional[Iterable[Sha1Git]] = None,
):
self.id = id
self.date = date
@@ -91,7 +91,7 @@
class DirectoryEntry:
- def __init__(self, id: bytes, name: bytes = b""):
+ def __init__(self, id: Sha1Git, name: bytes = b""):
self.id = id
self.name = name
self._files: Optional[List[FileEntry]] = None
@@ -141,7 +141,7 @@
class FileEntry:
- def __init__(self, id: bytes, name: bytes):
+ def __init__(self, id: Sha1Git, name: bytes):
self.id = id
self.name = name
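
Note: OriginEntry now computes an intrinsic identifier from its URL alone, reusing the archive's origin_identifier (the sha1_git of the origin). A sketch of the derivation used above, with a hypothetical URL:

    from swh.model.hashutil import hash_to_bytes
    from swh.model.identifiers import origin_identifier

    url = "https://example.com/repo.git"  # hypothetical origin URL
    origin_id = hash_to_bytes(origin_identifier({"url": url}))
    assert len(origin_id) == 20  # a 20-byte sha1_git, like any other Sha1Git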
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -1,10 +1,9 @@
-from datetime import datetime, timezone
from itertools import islice
import logging
import time
from typing import Iterable, Iterator, List, Optional, Tuple
-import iso8601
+from swh.model.model import Sha1Git
from .archive import ArchiveInterface
from .graph import HistoryNode, build_history_graph
@@ -16,31 +15,28 @@
"""Iterator over origin visit statuses typically present in the given CSV
file.
- The input is an iterator that produces 3 elements per row:
+ The input is an iterator that produces 2 elements per row:
- (url, date, snap)
+ (url, snap)
where:
- url: is the origin url of the visit
- - date: is the date of the visit
- snap: sha1_git of the snapshot pointed by the visit status
"""
def __init__(
self,
- statuses: Iterable[Tuple[str, datetime, bytes]],
+ statuses: Iterable[Tuple[str, Sha1Git]],
limit: Optional[int] = None,
):
- self.statuses: Iterator[Tuple[str, datetime, bytes]]
+ self.statuses: Iterator[Tuple[str, Sha1Git]]
if limit is not None:
self.statuses = islice(statuses, limit)
else:
self.statuses = iter(statuses)
def __iter__(self):
- for url, date, snap in self.statuses:
- date = iso8601.parse_date(date, default_timezone=timezone.utc)
- yield OriginEntry(url, date, snap)
+ return (OriginEntry(url, snapshot) for url, snapshot in self.statuses)
def origin_add(
@@ -50,6 +46,7 @@
):
start = time.time()
for origin in origins:
+ provenance.origin_add(origin)
origin.retrieve_revisions(archive)
for revision in origin.revisions:
graph = build_history_graph(archive, provenance, revision)
@@ -59,7 +56,7 @@
stop = time.time()
logging.debug(
"Origins "
- ";".join([origin.url + ":" + origin.snapshot.hex() for origin in origins])
+ ";".join([origin.id.hex() + ":" + origin.snapshot.hex() for origin in origins])
+ f" were processed in {stop - start} secs (commit took {stop - done} secs)!"
)
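
Note: CSVOriginIterator no longer parses visit dates; each input row is a plain (url, snapshot) pair and yields an OriginEntry lazily. Hypothetical usage under that assumption:

    from swh.model.hashutil import hash_to_bytes
    from swh.provenance.origin import CSVOriginIterator

    statuses = [
        # hypothetical row: the date column is gone from the input format
        ("https://example.com/repo.git",
         hash_to_bytes("1fc6128f2cac786c0970f189338c14aa4054dbfa")),
    ]
    for origin in CSVOriginIterator(statuses, limit=1):
        print(origin)  # <MOrigin[<origin sha1 hex>] url=..., snap=...>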
diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py
--- a/swh/provenance/postgresql/archive.py
+++ b/swh/provenance/postgresql/archive.py
@@ -3,7 +3,7 @@
from methodtools import lru_cache
import psycopg2
-from swh.model.model import ObjectType, Revision, Sha1, TargetType
+from swh.model.model import ObjectType, Revision, Sha1Git, TargetType
from swh.storage.postgresql.storage import Storage
@@ -12,14 +12,14 @@
self.conn = conn
self.storage = Storage(conn, objstorage={"cls": "memory"})
- def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+ def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
# TODO: only call directory_ls_internal if the id is not being queried by
# someone else. Otherwise wait until results get properly cached.
entries = self.directory_ls_internal(id)
yield from entries
@lru_cache(maxsize=100000)
- def directory_ls_internal(self, id: Sha1) -> List[Dict[str, Any]]:
+ def directory_ls_internal(self, id: Sha1Git) -> List[Dict[str, Any]]:
# TODO: add file size filtering
with self.conn.cursor() as cursor:
cursor.execute(
@@ -62,7 +62,7 @@
for row in cursor.fetchall()
]
- def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+ def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
with self.conn.cursor() as cursor:
psycopg2.extras.execute_values(
cursor,
@@ -96,7 +96,7 @@
}
)
- def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+ def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
# TODO: this code is duplicated here (same as in swh.provenance.storage.archive)
# but it's just temporary. This method should actually perform a direct query to
# the SQL db of the archive.
@@ -123,7 +123,7 @@
if release is not None and release.target_type == ObjectType.REVISION
)
- revisions: Set[Sha1] = set()
+ revisions: Set[Sha1Git] = set()
for targets in grouper(targets_set, batchsize):
revisions.update(
revision.id
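
Note: the cache on directory_ls_internal comes from methodtools.lru_cache which, unlike a bare functools.lru_cache on a method, does not key on self and so does not pin instances in a module-level cache. A self-contained sketch of the decorator's behavior:

    from methodtools import lru_cache

    class Squares:
        @lru_cache(maxsize=100000)  # same decorator as directory_ls_internal
        def compute(self, x: int) -> int:
            print("cache miss:", x)
            return x * x

    s = Squares()
    s.compute(3)  # prints "cache miss: 3"
    s.compute(3)  # served from the per-instance cache; no print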
diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py
--- a/swh/provenance/postgresql/provenancedb_base.py
+++ b/swh/provenance/postgresql/provenancedb_base.py
@@ -6,6 +6,8 @@
import psycopg2
import psycopg2.extras
+from swh.model.model import Sha1Git
+
class ProvenanceDBBase:
def __init__(self, conn: psycopg2.extensions.connection):
@@ -51,6 +53,16 @@
):
self.insert_relation(relation, data[relation])
+ # Insert origins
+ self.insert_origin(
+ {
+ sha1: data["origin"]["data"][sha1]
+ for sha1 in data["origin"]["added"]
+ },
+ )
+ data["origin"]["data"].clear()
+ data["origin"]["added"].clear()
+
# Insert relations from the origin-revision layer
self.insert_origin_head(data["revision_in_origin"])
self.insert_revision_history(data["revision_before_revision"])
@@ -58,12 +70,12 @@
# Update preferred origins
self.update_preferred_origin(
{
- sha1: data["revision_preferred_origin"]["data"][sha1]
- for sha1 in data["revision_preferred_origin"]["added"]
+ sha1: data["revision_origin"]["data"][sha1]
+ for sha1 in data["revision_origin"]["added"]
}
)
- data["revision_preferred_origin"]["data"].clear()
- data["revision_preferred_origin"]["added"].clear()
+ data["revision_origin"]["data"].clear()
+ data["revision_origin"]["added"].clear()
return True
@@ -76,16 +88,16 @@
return False
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
...
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
...
- def get_dates(self, entity: str, ids: List[bytes]) -> Dict[bytes, datetime]:
+ def get_dates(self, entity: str, ids: List[Sha1Git]) -> Dict[Sha1Git, datetime]:
dates = {}
if ids:
values = ", ".join(itertools.repeat("%s", len(ids)))
@@ -96,7 +108,7 @@
dates.update(self.cursor.fetchall())
return dates
- def insert_entity(self, entity: str, data: Dict[bytes, datetime]):
+ def insert_entity(self, entity: str, data: Dict[Sha1Git, datetime]):
if data:
psycopg2.extras.execute_values(
self.cursor,
@@ -112,7 +124,22 @@
# This might be useless!
data.clear()
- def insert_origin_head(self, data: Set[Tuple[bytes, str]]):
+ def insert_origin(self, data: Dict[Sha1Git, str]):
+ if data:
+ psycopg2.extras.execute_values(
+ self.cursor,
+ """
+ LOCK TABLE ONLY origin;
+ INSERT INTO origin(sha1, url) VALUES %s
+ ON CONFLICT DO NOTHING
+ """,
+ data.items(),
+ )
+ # XXX: not sure if Python takes a reference or a copy.
+ # This might be useless!
+ data.clear()
+
+ def insert_origin_head(self, data: Set[Tuple[Sha1Git, Sha1Git]]):
if data:
psycopg2.extras.execute_values(
self.cursor,
@@ -123,16 +150,16 @@
SELECT R.id, O.id
FROM (VALUES %s) AS V(rev, org)
INNER JOIN revision AS R on (R.sha1=V.rev)
- INNER JOIN origin AS O on (O.url=V.org::unix_path)
+ INNER JOIN origin AS O on (O.sha1=V.org)
""",
data,
)
data.clear()
- def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+ def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
...
- def insert_revision_history(self, data: Dict[bytes, bytes]):
+ def insert_revision_history(self, data: Dict[Sha1Git, Sha1Git]):
if data:
values = [[(prev, next) for next in data[prev]] for prev in data]
psycopg2.extras.execute_values(
@@ -150,10 +177,10 @@
)
data.clear()
- def revision_get_preferred_origin(self, revision: bytes) -> Optional[str]:
+ def revision_get_preferred_origin(self, revision: Sha1Git) -> Optional[Sha1Git]:
self.cursor.execute(
"""
- SELECT O.url
+ SELECT O.sha1
FROM revision AS R
JOIN origin as O
ON R.origin=O.id
@@ -161,9 +188,9 @@
(revision,),
)
row = self.cursor.fetchone()
- return str(row[0], encoding="utf-8") if row is not None else None
+ return row[0] if row is not None else None
- def revision_in_history(self, revision: bytes) -> bool:
+ def revision_in_history(self, revision: Sha1Git) -> bool:
self.cursor.execute(
"""
SELECT 1
@@ -176,7 +203,7 @@
)
return self.cursor.fetchone() is not None
- def revision_visited(self, revision: bytes) -> bool:
+ def revision_visited(self, revision: Sha1Git) -> bool:
self.cursor.execute(
"""
SELECT 1
@@ -189,18 +216,18 @@
)
return self.cursor.fetchone() is not None
- def update_preferred_origin(self, data: Dict[bytes, str]):
+ def update_preferred_origin(self, data: Dict[Sha1Git, Sha1Git]):
if data:
# XXX: this is assuming the revision already exists in the db! It should
# be improved by allowing null dates in the revision table.
psycopg2.extras.execute_values(
self.cursor,
"""
- UPDATE revision
+ UPDATE revision R
SET origin=O.id
FROM (VALUES %s) AS V(rev, org)
- INNER JOIN origin AS O on (O.url=V.org::unix_path)
- WHERE sha1=V.rev
+ INNER JOIN origin AS O on (O.sha1=V.org)
+ WHERE R.sha1=V.rev
""",
data.items(),
)
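
Note: insert_origin follows the same flush-and-clear pattern as the other caches. Since Python passes the dict by reference, data.clear() does empty the shared cache (so the XXX above can be resolved: the clear is not useless). A condensed sketch, assuming a psycopg2 cursor:

    import psycopg2.extras

    def flush_origins(cursor, data):  # hypothetical mirror of insert_origin
        """Write {sha1: url} pairs in one round trip, idempotently."""
        if not data:
            return
        psycopg2.extras.execute_values(
            cursor,
            """
            LOCK TABLE ONLY origin;
            INSERT INTO origin(sha1, url) VALUES %s
            ON CONFLICT DO NOTHING
            """,
            data.items(),
        )
        data.clear()  # also empties the caller's dict: same object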
diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py
--- a/swh/provenance/postgresql/provenancedb_with_path.py
+++ b/swh/provenance/postgresql/provenancedb_with_path.py
@@ -4,13 +4,15 @@
import psycopg2
import psycopg2.extras
+from swh.model.model import Sha1Git
+
from .provenancedb_base import ProvenanceDBBase
class ProvenanceWithPathDB(ProvenanceDBBase):
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
self.cursor.execute(
"""
SELECT C.sha1 AS blob,
@@ -24,13 +26,13 @@
WHERE C.sha1=%s
ORDER BY date, rev, path ASC LIMIT 1
""",
- (blob,),
+ (id,),
)
return self.cursor.fetchone()
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
self.cursor.execute(
f"""
@@ -61,11 +63,11 @@
WHERE C.sha1=%s)
ORDER BY date, rev, path {early_cut}
""",
- (blob, blob),
+ (id, id),
)
yield from self.cursor.fetchall()
- def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+ def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
"""Insert entries in `relation` from `data`
Also insert missing location entries in the 'location' table.
diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py
--- a/swh/provenance/postgresql/provenancedb_without_path.py
+++ b/swh/provenance/postgresql/provenancedb_without_path.py
@@ -4,17 +4,15 @@
import psycopg2
import psycopg2.extras
-from .provenancedb_base import ProvenanceDBBase
+from swh.model.model import Sha1Git
-########################################################################################
-########################################################################################
-########################################################################################
+from .provenancedb_base import ProvenanceDBBase
class ProvenanceWithoutPathDB(ProvenanceDBBase):
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
self.cursor.execute(
"""
SELECT C.sha1 AS blob,
@@ -27,13 +25,13 @@
WHERE C.sha1=%s
ORDER BY date, rev ASC LIMIT 1
""",
- (blob,),
+ (id,),
)
return self.cursor.fetchone()
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
early_cut = f"LIMIT {limit}" if limit is not None else ""
self.cursor.execute(
f"""
@@ -57,11 +55,11 @@
WHERE C.sha1=%s)
ORDER BY date, rev, path {early_cut}
""",
- (blob, blob),
+ (id, id),
)
yield from self.cursor.fetchall()
- def insert_relation(self, relation: str, data: Set[Tuple[bytes, bytes, bytes]]):
+ def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]):
if data:
assert relation in (
"content_in_revision",
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -6,6 +6,8 @@
import psycopg2
from typing_extensions import Literal, Protocol, TypedDict, runtime_checkable
+from swh.model.model import Sha1Git
+
from .model import DirectoryEntry, FileEntry, OriginEntry, RevisionEntry
@@ -30,13 +32,13 @@
...
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
...
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
...
def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
@@ -44,7 +46,7 @@
def content_get_early_dates(
self, blobs: Iterable[FileEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
...
def content_set_early_date(self, blob: FileEntry, date: datetime) -> None:
@@ -62,7 +64,7 @@
def directory_get_dates_in_isochrone_frontier(
self, dirs: Iterable[DirectoryEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
...
def directory_set_date_in_isochrone_frontier(
@@ -70,6 +72,9 @@
) -> None:
...
+ def origin_add(self, origin: OriginEntry) -> None:
+ ...
+
def revision_add(self, revision: RevisionEntry) -> None:
...
@@ -86,7 +91,9 @@
def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
...
- def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]:
+ def revision_get_preferred_origin(
+ self, revision: RevisionEntry
+ ) -> Optional[Sha1Git]:
...
def revision_in_history(self, revision: RevisionEntry) -> bool:
@@ -102,13 +109,18 @@
class DatetimeCache(TypedDict):
- data: Dict[bytes, datetime]
- added: Set[bytes]
+ data: Dict[Sha1Git, datetime]
+ added: Set[Sha1Git]
class OriginCache(TypedDict):
- data: Dict[bytes, str]
- added: Set[bytes]
+ data: Dict[Sha1Git, str]
+ added: Set[Sha1Git]
+
+
+class RevisionCache(TypedDict):
+ data: Dict[Sha1Git, Sha1Git]
+ added: Set[Sha1Git]
class ProvenanceCache(TypedDict):
@@ -116,13 +128,14 @@
directory: DatetimeCache
revision: DatetimeCache
# below are insertion caches only
- content_in_revision: Set[Tuple[bytes, bytes, bytes]]
- content_in_directory: Set[Tuple[bytes, bytes, bytes]]
- directory_in_revision: Set[Tuple[bytes, bytes, bytes]]
+ content_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
+ content_in_directory: Set[Tuple[Sha1Git, Sha1Git, bytes]]
+ directory_in_revision: Set[Tuple[Sha1Git, Sha1Git, bytes]]
# these two are for the origin layer
- revision_before_revision: Dict[bytes, Set[bytes]]
- revision_in_origin: Set[Tuple[bytes, str]]
- revision_preferred_origin: OriginCache
+ origin: OriginCache
+ revision_origin: RevisionCache
+ revision_before_revision: Dict[Sha1Git, Set[Sha1Git]]
+ revision_in_origin: Set[Tuple[Sha1Git, Sha1Git]]
def new_cache():
@@ -133,9 +146,10 @@
content_in_revision=set(),
content_in_directory=set(),
directory_in_revision=set(),
+ origin=OriginCache(data={}, added=set()),
+ revision_origin=RevisionCache(data={}, added=set()),
revision_before_revision={},
revision_in_origin=set(),
- revision_preferred_origin=OriginCache(data={}, added=set()),
)
@@ -185,21 +199,21 @@
)
def content_find_first(
- self, blob: bytes
- ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]:
- return self.storage.content_find_first(blob)
+ self, id: Sha1Git
+ ) -> Optional[Tuple[Sha1Git, Sha1Git, datetime, bytes]]:
+ return self.storage.content_find_first(id)
def content_find_all(
- self, blob: bytes, limit: Optional[int] = None
- ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]:
- yield from self.storage.content_find_all(blob, limit=limit)
+ self, id: Sha1Git, limit: Optional[int] = None
+ ) -> Generator[Tuple[Sha1Git, Sha1Git, datetime, bytes], None, None]:
+ yield from self.storage.content_find_all(id, limit=limit)
def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]:
return self.get_dates("content", [blob.id]).get(blob.id, None)
def content_get_early_dates(
self, blobs: Iterable[FileEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
return self.get_dates("content", [blob.id for blob in blobs])
def content_set_early_date(self, blob: FileEntry, date: datetime):
@@ -220,7 +234,7 @@
def directory_get_dates_in_isochrone_frontier(
self, dirs: Iterable[DirectoryEntry]
- ) -> Dict[bytes, datetime]:
+ ) -> Dict[Sha1Git, datetime]:
return self.get_dates("directory", [directory.id for directory in dirs])
def directory_set_date_in_isochrone_frontier(
@@ -230,14 +244,18 @@
self.cache["directory"]["added"].add(directory.id)
def get_dates(
- self, entity: Literal["content", "revision", "directory"], ids: List[bytes]
- ) -> Dict[bytes, datetime]:
+ self, entity: Literal["content", "revision", "directory"], ids: List[Sha1Git]
+ ) -> Dict[Sha1Git, datetime]:
cache = self.cache[entity]
missing_ids = set(id for id in ids if id not in cache)
if missing_ids:
cache["data"].update(self.storage.get_dates(entity, list(missing_ids)))
return {sha1: cache["data"][sha1] for sha1 in ids if sha1 in cache["data"]}
+ def origin_add(self, origin: OriginEntry) -> None:
+ self.cache["origin"]["data"][origin.id] = origin.url
+ self.cache["origin"]["added"].add(origin.id)
+
def revision_add(self, revision: RevisionEntry):
# Add current revision to the compact DB
assert revision.date is not None
@@ -252,17 +270,20 @@
)
def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
- self.cache["revision_in_origin"].add((revision.id, origin.url))
+ self.cache["revision_in_origin"].add((revision.id, origin.id))
def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]:
return self.get_dates("revision", [revision.id]).get(revision.id, None)
- def revision_get_preferred_origin(self, revision: RevisionEntry) -> Optional[str]:
- if revision.id not in self.cache["revision_preferred_origin"]["data"]:
- url = self.storage.revision_get_preferred_origin(revision.id)
- if url is not None:
- self.cache["revision_preferred_origin"]["data"][revision.id] = url
- return self.cache["revision_preferred_origin"]["data"].get(revision.id)
+ def revision_get_preferred_origin(
+ self, revision: RevisionEntry
+ ) -> Optional[Sha1Git]:
+ cache = self.cache["revision_origin"]
+ if revision.id not in cache:
+ origin = self.storage.revision_get_preferred_origin(revision.id)
+ if origin is not None:
+ cache["data"][revision.id] = origin
+ return cache["data"].get(revision.id)
def revision_in_history(self, revision: RevisionEntry) -> bool:
return revision.id in self.cache[
@@ -272,8 +293,8 @@
def revision_set_preferred_origin(
self, origin: OriginEntry, revision: RevisionEntry
):
- self.cache["revision_preferred_origin"]["data"][revision.id] = origin.url
- self.cache["revision_preferred_origin"]["added"].add(revision.id)
+ self.cache["revision_origin"]["data"][revision.id] = origin.id
+ self.cache["revision_origin"]["added"].add(revision.id)
def revision_visited(self, revision: RevisionEntry) -> bool:
return revision.id in dict(
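
Note: putting the cache changes together, new_cache() now produces the following shape (a sketch with the TypedDict wrappers elided):

    cache = {
        "content": {"data": {}, "added": set()},   # Sha1Git -> datetime
        "directory": {"data": {}, "added": set()},
        "revision": {"data": {}, "added": set()},
        # insertion caches
        "content_in_revision": set(),    # (Sha1Git, Sha1Git, bytes path)
        "content_in_directory": set(),
        "directory_in_revision": set(),
        # origin layer
        "origin": {"data": {}, "added": set()},           # sha1 -> url
        "revision_origin": {"data": {}, "added": set()},  # sha1 -> origin sha1
        "revision_before_revision": {},  # Sha1Git -> Set[Sha1Git]
        "revision_in_origin": set(),     # (revision sha1, origin sha1)
    }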
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -8,6 +8,7 @@
import iso8601
from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Sha1Git
from .archive import ArchiveInterface
from .graph import IsochroneNode, build_isochrone_graph
@@ -30,10 +31,10 @@
def __init__(
self,
- revisions: Iterable[Tuple[bytes, datetime, bytes]],
+ revisions: Iterable[Tuple[Sha1Git, datetime, Sha1Git]],
limit: Optional[int] = None,
):
- self.revisions: Iterator[Tuple[bytes, datetime, bytes]]
+ self.revisions: Iterator[Tuple[Sha1Git, datetime, Sha1Git]]
if limit is not None:
self.revisions = islice(revisions, limit)
else:
diff --git a/swh/provenance/sql/30-schema.sql b/swh/provenance/sql/30-schema.sql
--- a/swh/provenance/sql/30-schema.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -67,9 +67,11 @@
create table origin
(
id bigserial primary key, -- internal identifier of the origin
+ sha1 sha1_git unique not null, -- intrinsic identifier of the origin
url unix_path unique not null -- url of the origin
);
comment on column origin.id is 'Origin internal identifier';
+comment on column origin.sha1 is 'Origin intrinsic identifier';
comment on column origin.url is 'URL of the origin';
-- relation tables
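
Note: with the unique, non-null sha1 column in place, origins can be looked up by intrinsic identifier instead of URL. A hypothetical helper against this schema:

    def origin_url_from_sha1(cursor, sha1: bytes):  # hypothetical helper
        cursor.execute("SELECT url FROM origin WHERE sha1=%s", (sha1,))
        row = cursor.fetchone()
        return row[0] if row is not None else None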
diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py
--- a/swh/provenance/storage/archive.py
+++ b/swh/provenance/storage/archive.py
@@ -1,6 +1,6 @@
from typing import Any, Dict, Iterable, Set
-from swh.model.model import ObjectType, Revision, Sha1, TargetType
+from swh.model.model import ObjectType, Revision, Sha1Git, TargetType
from swh.storage.interface import StorageInterface
@@ -8,17 +8,17 @@
def __init__(self, storage: StorageInterface):
self.storage = storage
- def directory_ls(self, id: Sha1) -> Iterable[Dict[str, Any]]:
+ def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]:
# TODO: filter unused fields
yield from self.storage.directory_ls(id)
- def revision_get(self, ids: Iterable[Sha1]) -> Iterable[Revision]:
+ def revision_get(self, ids: Iterable[Sha1Git]) -> Iterable[Revision]:
# TODO: filter unused fields
yield from (
rev for rev in self.storage.revision_get(list(ids)) if rev is not None
)
- def snapshot_get_heads(self, id: Sha1) -> Iterable[Sha1]:
+ def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]:
from swh.core.utils import grouper
from swh.storage.algos.snapshot import snapshot_get_all_branches
@@ -42,7 +42,7 @@
if release is not None and release.target_type == ObjectType.REVISION
)
- revisions: Set[Sha1] = set()
+ revisions: Set[Sha1Git] = set()
for targets in grouper(targets_set, batchsize):
revisions.update(
revision.id
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -13,6 +13,8 @@
from swh.core.db import BaseDb
from swh.journal.serializers import msgpack_ext_hook
+from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Sha1Git
from swh.model.tests.swh_model_data import TEST_OBJECTS
from swh.provenance.postgresql.archive import ArchivePostgreSQL
from swh.provenance.storage.archive import ArchiveStorage
@@ -114,13 +116,13 @@
class SynthRelation(TypedDict):
prefix: Optional[str]
path: str
- src: bytes
- dst: bytes
+ src: Sha1Git
+ dst: Sha1Git
rel_ts: float
class SynthRevision(TypedDict):
- sha1: bytes
+ sha1: Sha1Git
date: float
msg: str
R_C: List[SynthRelation]
@@ -134,7 +136,7 @@
Generated SynthRevision (typed dict) with the following elements:
- "sha1": (bytes) sha1 of the revision,
+ "sha1": (Sha1Git) sha1 of the revision,
"date": (float) timestamp of the revision,
"msg": (str) commit message of the revision,
"R_C": (list) new R---C relations added by this revision
@@ -144,8 +146,8 @@
Each relation above is a SynthRelation typed dict with:
"path": (str) location
- "src": (bytes) sha1 of the source of the relation
- "dst": (bytes) sha1 of the destination of the relation
+ "src": (Sha1Git) sha1 of the source of the relation
+ "dst": (Sha1Git) sha1 of the destination of the relation
"rel_ts": (float) timestamp of the target of the relation
(related to the timestamp of the revision)
@@ -183,7 +185,7 @@
def _mk_synth_rev(synth_rev) -> SynthRevision:
assert synth_rev[0]["type"] == "R"
rev = SynthRevision(
- sha1=bytes.fromhex(synth_rev[0]["sha1"]),
+ sha1=hash_to_bytes(synth_rev[0]["sha1"]),
date=float(synth_rev[0]["ts"]),
msg=synth_rev[0]["revname"],
R_C=[],
@@ -202,7 +204,7 @@
prefix=None,
path=row["path"],
src=rev["sha1"],
- dst=bytes.fromhex(row["sha1"]),
+ dst=hash_to_bytes(row["sha1"]),
rel_ts=float(row["ts"]),
)
)
@@ -214,7 +216,7 @@
prefix=None,
path=row["path"],
src=rev["sha1"],
- dst=bytes.fromhex(row["sha1"]),
+ dst=hash_to_bytes(row["sha1"]),
rel_ts=float(row["ts"]),
)
)
@@ -226,7 +228,7 @@
prefix=current_path,
path=row["path"],
src=rev["R_D"][-1]["dst"],
- dst=bytes.fromhex(row["sha1"]),
+ dst=hash_to_bytes(row["sha1"]),
rel_ts=float(row["ts"]),
)
)
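
Note: the bytes.fromhex -> hash_to_bytes swaps in the tests are behavior-preserving on hex strings; hash_to_bytes additionally passes through values that are already bytes. A quick check of that assumption:

    from swh.model.hashutil import hash_to_bytes

    hex_id = "c0d8929936631ecbcf9147be6b8aa13b13b014e4"  # hypothetical sha1
    assert hash_to_bytes(hex_id) == bytes.fromhex(hex_id)
    assert hash_to_bytes(bytes.fromhex(hex_id)) == bytes.fromhex(hex_id)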
diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py
--- a/swh/provenance/tests/data/generate_storage_from_git.py
+++ b/swh/provenance/tests/data/generate_storage_from_git.py
@@ -11,6 +11,7 @@
import yaml
from swh.loader.git.from_disk import GitLoaderFromDisk
+from swh.model.hashutil import hash_to_bytes
from swh.model.model import (
Origin,
OriginVisit,
@@ -90,7 +91,7 @@
# add a snapshot with branches from the input file
branches = {
f"refs/heads/{name}".encode(): SnapshotBranch(
- target=bytes.fromhex(all_branches[f"refs/heads/{name}"]),
+ target=hash_to_bytes(all_branches[f"refs/heads/{name}"]),
target_type=TargetType.REVISION,
)
for name in visit["branches"]
diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py
--- a/swh/provenance/tests/test_origin_iterator.py
+++ b/swh/provenance/tests/test_origin_iterator.py
@@ -6,9 +6,9 @@
from swh.model.tests.swh_model_data import TEST_OBJECTS
from swh.provenance.origin import CSVOriginIterator
from swh.storage.algos.origin import (
- iter_origins,
- iter_origin_visits,
iter_origin_visit_statuses,
+ iter_origin_visits,
+ iter_origins,
)
@@ -21,9 +21,7 @@
swh_storage_with_objects, origin.url, visit.visit
):
if status.snapshot is not None:
- origins_csv.append(
- (status.origin, status.date.isoformat(), status.snapshot)
- )
+ origins_csv.append((status.origin, status.snapshot))
origins = list(CSVOriginIterator(origins_csv))
assert origins
assert len(origins) == len(
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
--- a/swh/provenance/tests/test_provenance_db.py
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -26,9 +26,7 @@
archive = ArchiveStorage(swh_storage_with_objects)
for status in TEST_OBJECTS["origin_visit_status"]:
if status.snapshot is not None:
- entry = OriginEntry(
- url=status.origin, date=status.date, snapshot=status.snapshot
- )
+ entry = OriginEntry(url=status.origin, snapshot=status.snapshot)
origin_add(provenance, archive, [entry])
# TODO: check some facts here
diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py
--- a/swh/provenance/tests/test_provenance_heuristics.py
+++ b/swh/provenance/tests/test_provenance_heuristics.py
@@ -7,6 +7,7 @@
import pytest
+from swh.model.hashutil import hash_to_bytes
from swh.provenance.model import RevisionEntry
from swh.provenance.revision import revision_add
from swh.provenance.tests.conftest import (
@@ -82,7 +83,7 @@
'cur' is a cursor to the provenance index DB.
"""
if isinstance(sha1, str):
- sha1 = bytes.fromhex(sha1)
+ sha1 = hash_to_bytes(sha1)
cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,))
return [date.timestamp() for (date,) in cur.fetchall()]
@@ -265,7 +266,7 @@
db_occurrences = [
(blob.hex(), rev.hex(), date.timestamp(), path.decode())
for blob, rev, date, path in provenance.content_find_all(
- bytes.fromhex(content_id)
+ hash_to_bytes(content_id)
)
]
if provenance.storage.with_path:
@@ -337,7 +338,7 @@
for content_id, (rev_id, ts, paths) in expected_first.items():
(r_sha1, r_rev_id, r_ts, r_path) = provenance.content_find_first(
- bytes.fromhex(content_id)
+ hash_to_bytes(content_id)
)
assert r_sha1.hex() == content_id
assert r_rev_id.hex() == rev_id
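
Note: for downstream readers, the content_find_* results are now annotated as (Sha1Git, Sha1Git, datetime, bytes) tuples. A hypothetical consumer of the new signatures:

    from swh.model.hashutil import hash_to_bytes

    def describe_first_occurrence(provenance, content_hex: str):
        # hypothetical helper; provenance satisfies ProvenanceInterface
        occurrence = provenance.content_find_first(hash_to_bytes(content_hex))
        if occurrence is None:
            return None
        blob, revision, date, path = occurrence
        return (
            f"{blob.hex()} first seen in revision {revision.hex()} "
            f"at {date.isoformat()} under path {path.decode()}"
        )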