diff --git a/swh/provenance/algos/__init__.py b/swh/provenance/algos/__init__.py new file mode 100644 diff --git a/swh/provenance/directory.py b/swh/provenance/algos/directory.py rename from swh/provenance/directory.py rename to swh/provenance/algos/directory.py --- a/swh/provenance/directory.py +++ b/swh/provenance/algos/directory.py @@ -8,10 +8,9 @@ from swh.core.statsd import statsd from swh.model.model import Sha1Git - -from .archive import ArchiveInterface -from .interface import ProvenanceInterface -from .model import DirectoryEntry +from swh.provenance.archive import ArchiveInterface +from swh.provenance.interface import ProvenanceInterface +from swh.provenance.model import DirectoryEntry REVISION_DURATION_METRIC = "swh_provenance_directory_duration_seconds" diff --git a/swh/provenance/graph.py b/swh/provenance/algos/isochrone_graph.py rename from swh/provenance/graph.py rename to swh/provenance/algos/isochrone_graph.py --- a/swh/provenance/graph.py +++ b/swh/provenance/algos/isochrone_graph.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,10 +11,9 @@ from swh.core.statsd import statsd from swh.model.model import Sha1Git - -from .archive import ArchiveInterface -from .interface import ProvenanceInterface -from .model import DirectoryEntry, RevisionEntry +from swh.provenance.archive import ArchiveInterface +from swh.provenance.interface import ProvenanceInterface +from swh.provenance.model import DirectoryEntry, RevisionEntry GRAPH_DURATION_METRIC = "swh_provenance_graph_duration_seconds" GRAPH_OPERATIONS_METRIC = "swh_provenance_graph_operations_total" @@ -26,50 +25,6 @@ pass -class HistoryGraph: - @statsd.timed(metric=GRAPH_DURATION_METRIC, tags={"method": "build_history_graph"}) - def __init__( - self, - archive: ArchiveInterface, - revision: RevisionEntry, - ) -> None: - self.head_id = revision.id - self._nodes: Set[Sha1Git] = set() - # rev -> set(parents) - self._edges: Dict[Sha1Git, Set[Sha1Git]] = {} - - stack = {self.head_id} - while stack: - current = stack.pop() - - if current not in self._nodes: - self._nodes.add(current) - self._edges.setdefault(current, set()) - for rev, parent in archive.revision_get_some_outbound_edges(current): - self._nodes.add(rev) - self._edges.setdefault(rev, set()).add(parent) - stack.add(parent) - - # don't process nodes for which we've already retrieved outbound edges - stack -= self._nodes - - def parent_ids(self) -> Set[Sha1Git]: - """Get all the known parent ids in the current graph""" - return self._nodes - {self.head_id} - - def __str__(self) -> str: - return f" Dict[str, Any]: - return { - "head": self.head_id.hex(), - "graph": { - node.hex(): sorted(parent.hex() for parent in parents) - for node, parents in self._edges.items() - }, - } - - class IsochroneNode: def __init__( self, diff --git a/swh/provenance/origin.py b/swh/provenance/algos/origin.py rename from swh/provenance/origin.py rename to swh/provenance/algos/origin.py --- a/swh/provenance/origin.py +++ b/swh/provenance/algos/origin.py @@ -6,17 +6,15 @@ from datetime import datetime from itertools import islice import logging -from typing import Generator, Iterable, Iterator, List, Optional, Tuple +from typing import Any, Dict, Generator, Iterable, Iterator, List, Optional, Set, Tuple from swh.core.statsd import statsd from swh.model.model import Sha1Git +from swh.provenance.archive import ArchiveInterface +from swh.provenance.interface import ProvenanceInterface +from swh.provenance.model import OriginEntry, RevisionEntry -from .archive import ArchiveInterface -from .graph import HistoryGraph -from .interface import ProvenanceInterface -from .model import OriginEntry - -ORIGIN_DURATION_METRIC = "swh_provenance_origin_revision_layer_duration_seconds" +ORIGIN_DURATION_METRIC = "swh_provenance_origin_duration_seconds" LOG_FORMAT = ( "%(levelname) -10s %(asctime)s %(name) -30s %(funcName) " @@ -26,6 +24,50 @@ LOGGER = logging.getLogger(__name__) +class HistoryGraph: + @statsd.timed(metric=ORIGIN_DURATION_METRIC, tags={"method": "HistoryGraph"}) + def __init__( + self, + archive: ArchiveInterface, + revision: RevisionEntry, + ) -> None: + self.head_id = revision.id + self._nodes: Set[Sha1Git] = set() + # rev -> set(parents) + self._edges: Dict[Sha1Git, Set[Sha1Git]] = {} + + stack = {self.head_id} + while stack: + current = stack.pop() + + if current not in self._nodes: + self._nodes.add(current) + self._edges.setdefault(current, set()) + for rev, parent in archive.revision_get_some_outbound_edges(current): + self._nodes.add(rev) + self._edges.setdefault(rev, set()).add(parent) + stack.add(parent) + + # don't process nodes for which we've already retrieved outbound edges + stack -= self._nodes + + def parent_ids(self) -> Set[Sha1Git]: + """Get all the known parent ids in the current graph""" + return self._nodes - {self.head_id} + + def __str__(self) -> str: + return f" Dict[str, Any]: + return { + "head": self.head_id.hex(), + "graph": { + node.hex(): sorted(parent.hex() for parent in parents) + for node, parents in self._edges.items() + }, + } + + class CSVOriginIterator: """Iterator over origin visit statuses typically present in the given CSV file. diff --git a/swh/provenance/revision.py b/swh/provenance/algos/revision.py rename from swh/provenance/revision.py rename to swh/provenance/algos/revision.py --- a/swh/provenance/revision.py +++ b/swh/provenance/algos/revision.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -9,14 +9,14 @@ from swh.core.statsd import statsd from swh.model.model import Sha1Git +from swh.provenance.archive import ArchiveInterface +from swh.provenance.interface import ProvenanceInterface +from swh.provenance.model import DirectoryEntry, RevisionEntry -from .archive import ArchiveInterface from .directory import directory_flatten -from .graph import DirectoryTooLarge, IsochroneNode, build_isochrone_graph -from .interface import ProvenanceInterface -from .model import DirectoryEntry, RevisionEntry +from .isochrone_graph import DirectoryTooLarge, IsochroneNode, build_isochrone_graph -REVISION_DURATION_METRIC = "swh_provenance_revision_content_layer_duration_seconds" +REVISION_DURATION_METRIC = "swh_provenance_revision_duration_seconds" logger = logging.getLogger(__name__) diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py --- a/swh/provenance/cli.py +++ b/swh/provenance/cli.py @@ -164,7 +164,7 @@ ) @click.pass_context def origin_from_csv(ctx: click.core.Context, filename: str, limit: Optional[int]): - from .origin import CSVOriginIterator, origin_add + from swh.provenance.algos.origin import CSVOriginIterator, origin_add provenance = ctx.obj["provenance"] archive = ctx.obj["archive"] @@ -298,7 +298,7 @@ min_size: int, max_directory_size: int, ) -> None: - from .revision import CSVRevisionIterator, revision_add + from swh.provenance.algos.revision import CSVRevisionIterator, revision_add provenance = ctx.obj["provenance"] archive = ctx.obj["archive"] @@ -456,7 +456,7 @@ ) @click.pass_context def directory_flatten(ctx: click.core.Context, range_from, range_to, min_size): - from swh.provenance.directory import directory_flatten_range + from swh.provenance.algos.directory import directory_flatten_range provenance = ctx.obj["provenance"] archive = ctx.obj["archive"] @@ -495,9 +495,9 @@ min_size: int, ) -> None: """Process a provided list of directories in the isochrone frontier.""" - from . import get_provenance - from .archive import get_archive - from .directory import CSVDirectoryIterator, directory_add + from swh.provenance import get_provenance + from swh.provenance.algos.directory import CSVDirectoryIterator, directory_add + from swh.provenance.archive import get_archive archive = get_archive(**ctx.obj["config"]["provenance"]["archive"]) directories_provider = generate_directory_ids(filename) @@ -579,9 +579,9 @@ min_size: int, ) -> None: """Process a provided list of revisions.""" - from . import get_provenance - from .archive import get_archive - from .revision import CSVRevisionIterator, revision_add + from swh.provenance import get_provenance + from swh.provenance.algos.revision import CSVRevisionIterator, revision_add + from swh.provenance.archive import get_archive archive = get_archive(**ctx.obj["config"]["provenance"]["archive"]) revisions_provider = generate_revision_tuples(filename) @@ -626,9 +626,9 @@ @deprecated(version="0.0.1", reason="Use `swh provenance origin from-csv` instead") def iter_origins(ctx: click.core.Context, filename: str, limit: Optional[int]) -> None: """Process a provided list of origins.""" - from . import get_provenance - from .archive import get_archive - from .origin import CSVOriginIterator, origin_add + from swh.provenance import get_provenance + from swh.provenance.algos.origin import CSVOriginIterator, origin_add + from swh.provenance.archive import get_archive archive = get_archive(**ctx.obj["config"]["provenance"]["archive"]) origins_provider = generate_origin_tuples(filename) diff --git a/swh/provenance/journal_client.py b/swh/provenance/journal_client.py --- a/swh/provenance/journal_client.py +++ b/swh/provenance/journal_client.py @@ -13,11 +13,11 @@ import sentry_sdk from swh.model.model import TimestampWithTimezone +from swh.provenance.algos.origin import origin_add +from swh.provenance.algos.revision import revision_add from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry, RevisionEntry -from swh.provenance.origin import origin_add -from swh.provenance.revision import revision_add EPOCH = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) diff --git a/swh/provenance/tests/test_consistency.py b/swh/provenance/tests/test_consistency.py --- a/swh/provenance/tests/test_consistency.py +++ b/swh/provenance/tests/test_consistency.py @@ -4,10 +4,10 @@ # See top-level LICENSE file for more information from swh.model.hashutil import hash_to_bytes +from swh.provenance.algos.revision import revision_add from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import RevisionEntry -from swh.provenance.revision import revision_add from swh.provenance.storage.interface import DirectoryData, ProvenanceResult from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py --- a/swh/provenance/tests/test_directory_flatten.py +++ b/swh/provenance/tests/test_directory_flatten.py @@ -8,8 +8,8 @@ from typing import Tuple from swh.model.hashutil import hash_to_bytes +from swh.provenance.algos.directory import directory_add, directory_flatten_range from swh.provenance.archive import ArchiveInterface -from swh.provenance.directory import directory_add, directory_flatten_range from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import DirectoryEntry, FileEntry from swh.provenance.storage.interface import DirectoryData, RelationData, RelationType diff --git a/swh/provenance/tests/test_directory_iterator.py b/swh/provenance/tests/test_directory_iterator.py --- a/swh/provenance/tests/test_directory_iterator.py +++ b/swh/provenance/tests/test_directory_iterator.py @@ -5,7 +5,7 @@ import pytest -from swh.provenance.directory import CSVDirectoryIterator +from swh.provenance.algos.directory import CSVDirectoryIterator from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface diff --git a/swh/provenance/tests/test_history_graph.py b/swh/provenance/tests/test_history_graph.py --- a/swh/provenance/tests/test_history_graph.py +++ b/swh/provenance/tests/test_history_graph.py @@ -7,11 +7,10 @@ import yaml from swh.model.hashutil import hash_to_bytes +from swh.provenance.algos.origin import HistoryGraph, origin_add_revision from swh.provenance.archive import ArchiveInterface -from swh.provenance.graph import HistoryGraph from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry, RevisionEntry -from swh.provenance.origin import origin_add_revision from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data diff --git a/swh/provenance/tests/test_isochrone_graph.py b/swh/provenance/tests/test_isochrone_graph.py --- a/swh/provenance/tests/test_isochrone_graph.py +++ b/swh/provenance/tests/test_isochrone_graph.py @@ -11,11 +11,15 @@ import yaml from swh.model.hashutil import hash_to_bytes +from swh.provenance.algos.isochrone_graph import ( + DirectoryTooLarge, + IsochroneNode, + build_isochrone_graph, +) +from swh.provenance.algos.revision import revision_add from swh.provenance.archive import ArchiveInterface -from swh.provenance.graph import DirectoryTooLarge, IsochroneNode, build_isochrone_graph from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import DirectoryEntry, RevisionEntry -from swh.provenance.revision import revision_add from swh.provenance.tests.conftest import ( fill_storage, get_datafile, diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py --- a/swh/provenance/tests/test_origin_iterator.py +++ b/swh/provenance/tests/test_origin_iterator.py @@ -5,7 +5,7 @@ import pytest -from swh.provenance.origin import CSVOriginIterator +from swh.provenance.algos.origin import CSVOriginIterator from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.algos.origin import ( iter_origin_visit_statuses, diff --git a/swh/provenance/tests/test_origin_revision_layer.py b/swh/provenance/tests/test_origin_revision_layer.py --- a/swh/provenance/tests/test_origin_revision_layer.py +++ b/swh/provenance/tests/test_origin_revision_layer.py @@ -11,10 +11,10 @@ from swh.model.hashutil import hash_to_bytes from swh.model.model import Sha1Git +from swh.provenance.algos.origin import origin_add from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry -from swh.provenance.origin import origin_add from swh.provenance.storage.interface import EntityType, RelationType from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data diff --git a/swh/provenance/tests/test_provenance_storage.py b/swh/provenance/tests/test_provenance_storage.py --- a/swh/provenance/tests/test_provenance_storage.py +++ b/swh/provenance/tests/test_provenance_storage.py @@ -12,12 +12,12 @@ from swh.model.hashutil import hash_to_bytes from swh.model.model import Origin, Sha1Git +from swh.provenance.algos.origin import origin_add +from swh.provenance.algos.revision import revision_add from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry, RevisionEntry -from swh.provenance.origin import origin_add from swh.provenance.provenance import Provenance -from swh.provenance.revision import revision_add from swh.provenance.storage.interface import ( DirectoryData, EntityType, diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py --- a/swh/provenance/tests/test_revision_content_layer.py +++ b/swh/provenance/tests/test_revision_content_layer.py @@ -11,11 +11,11 @@ from swh.model.hashutil import hash_to_bytes from swh.model.model import Sha1Git +from swh.provenance.algos.directory import directory_add +from swh.provenance.algos.revision import revision_add from swh.provenance.archive import ArchiveInterface -from swh.provenance.directory import directory_add from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import DirectoryEntry, RevisionEntry -from swh.provenance.revision import revision_add from swh.provenance.storage.interface import EntityType, RelationType from swh.provenance.tests.conftest import ( fill_storage, diff --git a/swh/provenance/tests/test_revision_iterator.py b/swh/provenance/tests/test_revision_iterator.py --- a/swh/provenance/tests/test_revision_iterator.py +++ b/swh/provenance/tests/test_revision_iterator.py @@ -5,7 +5,7 @@ import pytest -from swh.provenance.revision import CSVRevisionIterator +from swh.provenance.algos.revision import CSVRevisionIterator from swh.provenance.tests.conftest import fill_storage, load_repo_data, ts2dt from swh.storage.interface import StorageInterface