diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py --- a/swh/provenance/cli.py +++ b/swh/provenance/cli.py @@ -5,16 +5,19 @@ # WARNING: do not import unnecessary things here to keep cli startup time under # control +from datetime import datetime, timezone import os -from typing import Any, Dict, Optional +from typing import Any, Dict, Generator, Optional, Tuple import click +import iso8601 import yaml from swh.core import config from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.model import Sha1Git # All generic config code should reside in swh.core.config CONFIG_ENVVAR = "SWH_CONFIG_FILE" @@ -74,7 +77,7 @@ help="""Enable profiling to specified file.""", ) @click.pass_context -def cli(ctx, config_file: Optional[str], profile: str): +def cli(ctx, config_file: Optional[str], profile: str) -> None: if config_file is None and config.config_exists(DEFAULT_PATH): config_file = DEFAULT_PATH @@ -98,7 +101,7 @@ pr = cProfile.Profile() pr.enable() - def exit(): + def exit() -> None: pr.disable() pr.dump_stats(profile) @@ -112,7 +115,14 @@ @click.option("-m", "--min-depth", default=1, type=int) @click.option("-r", "--reuse", default=True, type=bool) @click.pass_context -def iter_revisions(ctx, filename, track_all, limit, min_depth, reuse): +def iter_revisions( + ctx, + filename: str, + track_all: bool, + limit: Optional[int], + min_depth: int, + reuse: bool, +) -> None: # TODO: add file size filtering """Process a provided list of revisions.""" from . import get_archive, get_provenance @@ -120,9 +130,7 @@ archive = get_archive(**ctx.obj["config"]["archive"]) provenance = get_provenance(**ctx.obj["config"]["provenance"]) - revisions_provider = ( - line.strip().split(",") for line in open(filename, "r") if line.strip() - ) + revisions_provider = generate_revision_tuples(filename) revisions = CSVRevisionIterator(revisions_provider, limit=limit) for revision in revisions: @@ -136,30 +144,48 @@ ) +def generate_revision_tuples( + filename: str, +) -> Generator[Tuple[Sha1Git, datetime, Sha1Git], None, None]: + for line in open(filename, "r"): + if line.strip(): + revision, date, root = line.strip().split(",") + yield ( + hash_to_bytes(revision), + iso8601.parse_date(date, default_timezone=timezone.utc), + hash_to_bytes(root), + ) + + @cli.command(name="iter-origins") @click.argument("filename") @click.option("-l", "--limit", type=int) @click.pass_context -def iter_origins(ctx, filename, limit): +def iter_origins(ctx, filename: str, limit: Optional[int]) -> None: """Process a provided list of origins.""" from . import get_archive, get_provenance from .origin import CSVOriginIterator, origin_add archive = get_archive(**ctx.obj["config"]["archive"]) provenance = get_provenance(**ctx.obj["config"]["provenance"]) - origins_provider = ( - line.strip().split(",") for line in open(filename, "r") if line.strip() - ) + origins_provider = generate_origin_tuples(filename) origins = CSVOriginIterator(origins_provider, limit=limit) for origin in origins: origin_add(provenance, archive, [origin]) +def generate_origin_tuples(filename: str) -> Generator[Tuple[str, bytes], None, None]: + for line in open(filename, "r"): + if line.strip(): + url, snapshot = line.strip().split(",") + yield (url, hash_to_bytes(snapshot)) + + @cli.command(name="find-first") @click.argument("swhid") @click.pass_context -def find_first(ctx, swhid): +def find_first(ctx, swhid: str) -> None: """Find first occurrence of the requested blob.""" from . import get_provenance @@ -182,7 +208,7 @@ @click.argument("swhid") @click.option("-l", "--limit", type=int) @click.pass_context -def find_all(ctx, swhid, limit): +def find_all(ctx, swhid: str, limit: Optional[int]) -> None: """Find all occurrences of the requested blob.""" from . import get_provenance diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -1,7 +1,7 @@ from itertools import islice import logging import time -from typing import Iterable, Iterator, List, Optional, Tuple +from typing import Generator, Iterable, Iterator, List, Optional, Tuple from swh.model.model import Sha1Git @@ -28,14 +28,14 @@ self, statuses: Iterable[Tuple[str, Sha1Git]], limit: Optional[int] = None, - ): + ) -> None: self.statuses: Iterator[Tuple[str, Sha1Git]] if limit is not None: self.statuses = islice(statuses, limit) else: self.statuses = iter(statuses) - def __iter__(self): + def __iter__(self) -> Generator[OriginEntry, None, None]: return (OriginEntry(url, snapshot) for url, snapshot in self.statuses) @@ -43,7 +43,7 @@ provenance: ProvenanceInterface, archive: ArchiveInterface, origins: List[OriginEntry], -): +) -> None: start = time.time() for origin in origins: provenance.origin_add(origin) @@ -65,7 +65,7 @@ provenance: ProvenanceInterface, origin: OriginEntry, graph: HistoryNode, -): +) -> None: # head is treated separately since it should always be added to the given origin head = graph.entry check_preferred_origin(provenance, origin, head) @@ -93,7 +93,7 @@ provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry, -): +) -> None: # if the revision has no preferred origin just set the given origin as the # preferred one. TODO: this should be improved in the future! preferred = provenance.revision_get_preferred_origin(revision) diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -3,11 +3,8 @@ import logging import os import time -from typing import Iterable, Iterator, List, Optional, Tuple +from typing import Generator, Iterable, Iterator, List, Optional, Tuple -import iso8601 - -from swh.model.hashutil import hash_to_bytes from swh.model.model import Sha1Git from .archive import ArchiveInterface @@ -33,26 +30,18 @@ self, revisions: Iterable[Tuple[Sha1Git, datetime, Sha1Git]], limit: Optional[int] = None, - ): + ) -> None: self.revisions: Iterator[Tuple[Sha1Git, datetime, Sha1Git]] if limit is not None: self.revisions = islice(revisions, limit) else: self.revisions = iter(revisions) - def __iter__(self): - return self - - def __next__(self): - id, date, root = next(self.revisions) - date = iso8601.parse_date(date) - if date.tzinfo is None: - date = date.replace(tzinfo=timezone.utc) - return RevisionEntry( - hash_to_bytes(id), - date=date, - root=hash_to_bytes(root), - ) + def __iter__(self) -> Generator[RevisionEntry, None, None]: + for id, date, root in self.revisions: + if date.tzinfo is None: + date = date.replace(tzinfo=timezone.utc) + yield RevisionEntry(id, date=date, root=root) def revision_add( @@ -109,7 +98,7 @@ trackall: bool = True, lower: bool = True, mindepth: int = 1, -): +) -> None: assert revision.date is not None provenance.revision_add(revision) diff --git a/swh/provenance/tests/test_revision_iterator.py b/swh/provenance/tests/test_revision_iterator.py --- a/swh/provenance/tests/test_revision_iterator.py +++ b/swh/provenance/tests/test_revision_iterator.py @@ -22,8 +22,7 @@ data = load_repo_data(repo) fill_storage(swh_storage, data) revisions_csv = [ - (rev["id"], ts2dt(rev["date"]).isoformat(), rev["directory"]) - for rev in data["revision"] + (rev["id"], ts2dt(rev["date"]), rev["directory"]) for rev in data["revision"] ] revisions = list(CSVRevisionIterator(revisions_csv)) assert revisions