Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/origin.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | |||||
from datetime import datetime | |||||
from itertools import islice | from itertools import islice | ||||
from typing import Generator, Iterable, Iterator, List, Optional, Tuple | from typing import Generator, Iterable, Iterator, List, Optional, Tuple | ||||
from swh.core.statsd import statsd | from swh.core.statsd import statsd | ||||
from swh.model.model import Sha1Git | from swh.model.model import Sha1Git | ||||
from .archive import ArchiveInterface | from .archive import ArchiveInterface | ||||
from .graph import HistoryGraph | from .graph import HistoryGraph | ||||
from .interface import ProvenanceInterface | from .interface import ProvenanceInterface | ||||
from .model import OriginEntry, RevisionEntry | from .model import OriginEntry, RevisionEntry | ||||
ORIGIN_DURATION_METRIC = "swh_provenance_origin_revision_layer_duration_seconds" | ORIGIN_DURATION_METRIC = "swh_provenance_origin_revision_layer_duration_seconds" | ||||
LOG_FORMAT = ( | |||||
"%(levelname) -10s %(asctime)s %(name) -30s %(funcName) " | |||||
"-35s %(lineno) -5d: %(message)s" | |||||
) | |||||
LOGGER = logging.getLogger(__name__) | |||||
class CSVOriginIterator: | class CSVOriginIterator: | ||||
"""Iterator over origin visit statuses typically present in the given CSV | """Iterator over origin visit statuses typically present in the given CSV | ||||
file. | file. | ||||
The input is an iterator that produces 2 elements per row: | The input is an iterator that produces 2 elements per row: | ||||
(url, snap) | (url, snap) | ||||
Show All 25 Lines | def origin_add( | ||||
origins: List[OriginEntry], | origins: List[OriginEntry], | ||||
commit: bool = True, | commit: bool = True, | ||||
) -> None: | ) -> None: | ||||
for origin in origins: | for origin in origins: | ||||
proceed_origin(provenance, archive, origin) | proceed_origin(provenance, archive, origin) | ||||
if commit: | if commit: | ||||
provenance.flush() | provenance.flush() | ||||
@statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "proceed_origin"}) | @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "proceed_origin"}) | ||||
def proceed_origin( | def proceed_origin( | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, archive: ArchiveInterface, origin: OriginEntry | ||||
archive: ArchiveInterface, | ) -> None: | ||||
origin: OriginEntry) -> None: | LOGGER.info("Processing origin %s", origin.url) | ||||
start = datetime.now() | |||||
provenance.origin_add(origin) | provenance.origin_add(origin) | ||||
origin.retrieve_revisions(archive) | origin.retrieve_revisions(archive) | ||||
for revision in origin.revisions: | for revision in origin.revisions: | ||||
if not provenance.revision_is_head(revision): | if not provenance.revision_is_head(revision): | ||||
graph = HistoryGraph(archive, revision) | graph = HistoryGraph(archive, revision) | ||||
origin_add_revision(provenance, origin, graph) | origin_add_revision(provenance, origin, graph) | ||||
# head is treated separately | # head is treated separately | ||||
check_preferred_origin(provenance, origin, revision) | check_preferred_origin(provenance, origin, revision) | ||||
provenance.revision_add_to_origin(origin, revision) | provenance.revision_add_to_origin(origin, revision) | ||||
end = datetime.now() | |||||
LOGGER.info("Processed origin %s in %s", origin.url, (end - start)) | |||||
@statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "process_revision"}) | @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "process_revision"}) | ||||
def origin_add_revision( | def origin_add_revision( | ||||
provenance: ProvenanceInterface, | provenance: ProvenanceInterface, | ||||
origin: OriginEntry, | origin: OriginEntry, | ||||
graph: HistoryGraph, | graph: HistoryGraph, | ||||
) -> None: | ) -> None: | ||||
Show All 26 Lines |