Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/origin.py
from itertools import islice | from itertools import islice | ||||
import logging | import logging | ||||
import time | import time | ||||
from typing import Iterable, Iterator, List, Optional, Tuple | from typing import Generator, Iterable, Iterator, List, Optional, Tuple | ||||
from swh.model.model import Sha1Git | from swh.model.model import Sha1Git | ||||
from .archive import ArchiveInterface | from .archive import ArchiveInterface | ||||
from .graph import HistoryNode, build_history_graph | from .graph import HistoryNode, build_history_graph | ||||
from .model import OriginEntry, RevisionEntry | from .model import OriginEntry, RevisionEntry | ||||
from .provenance import ProvenanceInterface | from .provenance import ProvenanceInterface | ||||
Show All 10 Lines | class CSVOriginIterator: | ||||
- url: is the origin url of the visit | - url: is the origin url of the visit | ||||
- snap: sha1_git of the snapshot pointed by the visit status | - snap: sha1_git of the snapshot pointed by the visit status | ||||
""" | """ | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
statuses: Iterable[Tuple[str, Sha1Git]], | statuses: Iterable[Tuple[str, Sha1Git]], | ||||
limit: Optional[int] = None, | limit: Optional[int] = None, | ||||
): | ) -> None: | ||||
self.statuses: Iterator[Tuple[str, Sha1Git]] | self.statuses: Iterator[Tuple[str, Sha1Git]] | ||||
if limit is not None: | if limit is not None: | ||||
self.statuses = islice(statuses, limit) | self.statuses = islice(statuses, limit) | ||||
else: | else: | ||||
self.statuses = iter(statuses) | self.statuses = iter(statuses) | ||||
def __iter__(self) -> Generator[OriginEntry, None, None]:
    """Yield one ``OriginEntry`` per ``(url, snapshot)`` pair in ``self.statuses``.

    ``self.statuses`` is consumed as the entries are produced.
    """
    for origin_url, snapshot_id in self.statuses:
        yield OriginEntry(origin_url, snapshot_id)
def origin_add(
    provenance: ProvenanceInterface,
    archive: ArchiveInterface,
    origins: List[OriginEntry],
) -> None:
    """Add the given origins and their revision histories, then flush.

    For each origin in ``origins``: the origin is added to ``provenance``,
    its revisions are retrieved through ``archive``, and for every revision
    a history graph is built and handed to ``origin_add_revision``.
    ``provenance.flush()`` is called once, after all origins were handled.
    The total time and the time spent in the flush are logged at debug level.

    Args:
        provenance: object receiving the origins/revisions and the flush.
        archive: object used to retrieve revisions and build history graphs.
        origins: origins to process.
    """
    start = time.time()
    for origin in origins:
        provenance.origin_add(origin)
        origin.retrieve_revisions(archive)
        for revision in origin.revisions:
            graph = build_history_graph(archive, provenance, revision)
            origin_add_revision(provenance, origin, graph)
    # ``done`` marks the end of processing so the flush can be timed on its own.
    done = time.time()
    provenance.flush()
    stop = time.time()

    # Build the id list separately: writing ``"Origins " ";".join(...)`` makes
    # Python merge the two adjacent literals into the separator "Origins ;",
    # which drops the prefix and corrupts the list.
    processed = ";".join(
        f"{origin.id.hex()}:{origin.snapshot.hex()}" for origin in origins
    )
    logging.debug(
        "Origins %s were processed in %s secs (commit took %s secs)!",
        processed,
        stop - start,
        stop - done,
    )
def origin_add_revision( | def origin_add_revision( | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
origin: OriginEntry, | origin: OriginEntry, | ||||
graph: HistoryNode, | graph: HistoryNode, | ||||
): | ) -> None: | ||||
# head is treated separately since it should always be added to the given origin | # head is treated separately since it should always be added to the given origin | ||||
head = graph.entry | head = graph.entry | ||||
check_preferred_origin(provenance, origin, head) | check_preferred_origin(provenance, origin, head) | ||||
provenance.revision_add_to_origin(origin, head) | provenance.revision_add_to_origin(origin, head) | ||||
# head's history should be recursively iterated starting from its parents | # head's history should be recursively iterated starting from its parents | ||||
stack = list(graph.parents) | stack = list(graph.parents) | ||||
while stack: | while stack: | ||||
Show All 11 Lines | while stack: | ||||
for parent in current.parents: | for parent in current.parents: | ||||
stack.append(parent) | stack.append(parent) | ||||
def check_preferred_origin(
    provenance: ProvenanceInterface,
    origin: OriginEntry,
    revision: RevisionEntry,
) -> None:
    """Make ``origin`` the preferred origin of ``revision`` if it has none.

    A revision that already has a preferred origin is left untouched.
    TODO: this should be improved in the future!
    """
    current = provenance.revision_get_preferred_origin(revision)
    if current is not None:
        # A preferred origin is already recorded: keep it.
        return
    provenance.revision_set_preferred_origin(origin, revision)