Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/origin.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from itertools import islice | from itertools import islice | ||||
from typing import Generator, Iterable, Iterator, List, Optional, Tuple | from typing import Generator, Iterable, Iterator, List, Optional, Tuple | ||||
from swh.core.statsd import statsd | from swh.core.statsd import statsd | ||||
from swh.model.model import Sha1Git | from swh.model.model import Sha1Git | ||||
from .archive import ArchiveInterface | from .archive import ArchiveInterface | ||||
from .graph import HistoryGraph | from .graph import HistoryGraph | ||||
from .interface import ProvenanceInterface | from .interface import ProvenanceInterface | ||||
from .model import OriginEntry, RevisionEntry | from .model import OriginEntry, RevisionEntry | ||||
ORIGIN_DURATION_METRIC = "swh_provenance_origin_revision_layer_duration_seconds" | |||||
class CSVOriginIterator: | class CSVOriginIterator: | ||||
"""Iterator over origin visit statuses typically present in the given CSV | """Iterator over origin visit statuses typically present in the given CSV | ||||
file. | file. | ||||
The input is an iterator that produces 2 elements per row: | The input is an iterator that produces 2 elements per row: | ||||
(url, snap) | (url, snap) | ||||
Show All 13 Lines | ) -> None: | ||||
self.statuses = islice(statuses, limit) | self.statuses = islice(statuses, limit) | ||||
else: | else: | ||||
self.statuses = iter(statuses) | self.statuses = iter(statuses) | ||||
def __iter__(self) -> Generator[OriginEntry, None, None]: | def __iter__(self) -> Generator[OriginEntry, None, None]: | ||||
return (OriginEntry(url, snapshot) for url, snapshot in self.statuses) | return (OriginEntry(url, snapshot) for url, snapshot in self.statuses) | ||||
@statsd.timed( | @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "main"}) | ||||
douardda: I get this is not part of this diff, but why is this called "main" instead of "origin_add"? | |||||
Done Inline Actions: This is the main method of the origin-revision layer algorithm. We identify the metrics associated with this layer by the metric tag. It is done in the same way for the other algorithm. aeviso: This is the main method of the origin-revision layer algorithm. We identify the metrics… | |||||
metric="swh_provenance_origin_revision_layer_accesstime_seconds", | |||||
tags={"method": "main"}, | |||||
) | |||||
def origin_add( | def origin_add( | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
archive: ArchiveInterface, | archive: ArchiveInterface, | ||||
origins: List[OriginEntry], | origins: List[OriginEntry], | ||||
) -> None: | ) -> None: | ||||
for origin in origins: | for origin in origins: | ||||
provenance.origin_add(origin) | provenance.origin_add(origin) | ||||
origin.retrieve_revisions(archive) | origin.retrieve_revisions(archive) | ||||
for revision in origin.revisions: | for revision in origin.revisions: | ||||
graph = HistoryGraph(archive, provenance, revision) | graph = HistoryGraph(archive, provenance, revision) | ||||
origin_add_revision(provenance, origin, graph) | origin_add_revision(provenance, origin, graph) | ||||
provenance.flush() | provenance.flush() | ||||
@statsd.timed( | @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "process_revision"}) | ||||
metric="swh_provenance_origin_revision_layer_accesstime_seconds", | |||||
tags={"method": "process_revision"}, | |||||
) | |||||
def origin_add_revision( | def origin_add_revision( | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
origin: OriginEntry, | origin: OriginEntry, | ||||
graph: HistoryGraph, | graph: HistoryGraph, | ||||
) -> None: | ) -> None: | ||||
# XXX: simplified version of the origin-revision algorithm. This is generating flat | # XXX: simplified version of the origin-revision algorithm. This is generating flat | ||||
# models for the history of all head revisions. No previous result is reused now! | # models for the history of all head revisions. No previous result is reused now! | ||||
# The previous implementation was missing some paths from origins to certain | # The previous implementation was missing some paths from origins to certain | ||||
Show All 13 Lines | while stack: | ||||
# create a link between it and the head, and recursively walk its history | # create a link between it and the head, and recursively walk its history | ||||
provenance.revision_add_before_revision(graph.head.entry, current.entry) | provenance.revision_add_before_revision(graph.head.entry, current.entry) | ||||
visited.add(current) | visited.add(current) | ||||
for parent in graph.parents[current]: | for parent in graph.parents[current]: | ||||
if parent not in visited: | if parent not in visited: | ||||
stack.append(parent) | stack.append(parent) | ||||
@statsd.timed( | @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "check_preferred_origin"}) | ||||
metric="swh_provenance_origin_revision_layer_accesstime_seconds", | |||||
tags={"method": "check_preferred_origin"}, | |||||
) | |||||
def check_preferred_origin( | def check_preferred_origin( | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
origin: OriginEntry, | origin: OriginEntry, | ||||
revision: RevisionEntry, | revision: RevisionEntry, | ||||
) -> None: | ) -> None: | ||||
# if the revision has no preferred origin just set the given origin as the | # if the revision has no preferred origin just set the given origin as the | ||||
# preferred one. TODO: this should be improved in the future! | # preferred one. TODO: this should be improved in the future! | ||||
preferred = provenance.revision_get_preferred_origin(revision) | preferred = provenance.revision_get_preferred_origin(revision) | ||||
if preferred is None: | if preferred is None: | ||||
provenance.revision_set_preferred_origin(origin, revision) | provenance.revision_set_preferred_origin(origin, revision) |
I get this is not part of this diff, but why is this called "main" instead of "origin_add"?
The "method" tag should stick to the name of the decorated method/function.