Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/cli.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
# WARNING: do not import unnecessary things here to keep cli startup time under | # WARNING: do not import unnecessary things here to keep cli startup time under | ||||
# control | # control | ||||
from datetime import datetime, timezone | |||||
import os | import os | ||||
from typing import Any, Dict, Optional | from typing import Any, Dict, Generator, Optional, Tuple | ||||
import click | import click | ||||
import iso8601 | |||||
import yaml | import yaml | ||||
from swh.core import config | from swh.core import config | ||||
from swh.core.cli import CONTEXT_SETTINGS | from swh.core.cli import CONTEXT_SETTINGS | ||||
from swh.core.cli import swh as swh_cli_group | from swh.core.cli import swh as swh_cli_group | ||||
from swh.model.hashutil import hash_to_bytes, hash_to_hex | from swh.model.hashutil import hash_to_bytes, hash_to_hex | ||||
# All generic config code should reside in swh.core.config | # All generic config code should reside in swh.core.config | ||||
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines | |||||
@click.option( | @click.option( | ||||
"-P", | "-P", | ||||
"--profile", | "--profile", | ||||
default=None, | default=None, | ||||
type=click.Path(exists=False, dir_okay=False, path_type=str), | type=click.Path(exists=False, dir_okay=False, path_type=str), | ||||
help="""Enable profiling to specified file.""", | help="""Enable profiling to specified file.""", | ||||
) | ) | ||||
@click.pass_context | @click.pass_context | ||||
def cli(ctx, config_file: Optional[str], profile: Optional[str]) -> None:
    # NOTE: documentation is kept in comments on purpose; click displays a
    # callback's docstring as help text, so adding one would change the CLI
    # output.
    #
    # Loads the configuration into ctx.obj["config"] and, when `profile` is
    # set, profiles the whole run and dumps the stats to that file at exit.

    # Fall back to the default configuration file when none was given and one
    # exists at DEFAULT_PATH.
    if config_file is None and config.config_exists(DEFAULT_PATH):
        config_file = DEFAULT_PATH

    if config_file is None:
        conf = DEFAULT_CONFIG
    else:
        # read_raw_config does not fail on ENOENT, so check explicitly
        if not config.config_exists(config_file):
            raise FileNotFoundError(config_file)
        conf = config.read_raw_config(config.config_basepath(config_file))
        conf = config.merge_configs(DEFAULT_CONFIG, conf)

    ctx.ensure_object(dict)
    ctx.obj["config"] = conf

    if profile:
        # Imported lazily to keep cli startup time under control.
        import atexit
        import cProfile

        print("Profiling...")
        pr = cProfile.Profile()
        pr.enable()

        # Bind the narrowed value so the closure sees a plain str.
        profile_path: str = profile

        # Named to avoid shadowing the ``exit`` builtin.
        def dump_profile() -> None:
            pr.disable()
            pr.dump_stats(profile_path)

        atexit.register(dump_profile)
@cli.command(name="iter-revisions")
@click.argument("filename")
@click.option("-a", "--track-all", default=True, type=bool)
@click.option("-l", "--limit", type=int)
@click.option("-m", "--min-depth", default=1, type=int)
@click.option("-r", "--reuse", default=True, type=bool)
@click.pass_context
def iter_revisions(
    ctx,
    filename: str,
    track_all: bool,
    limit: Optional[int],
    min_depth: int,
    reuse: bool,
) -> None:
    # TODO: add file size filtering
    """Process a provided list of revisions."""
    from . import get_archive, get_provenance
    from .revision import CSVRevisionIterator, revision_add

    # Both backends are built from their sections of the loaded configuration.
    settings = ctx.obj["config"]
    archive = get_archive(**settings["archive"])
    provenance = get_provenance(**settings["provenance"])

    # The CLI flags map onto revision_add's keyword arguments.
    add_options = {
        "trackall": track_all,
        "lower": reuse,
        "mindepth": min_depth,
    }

    # `limit` is forwarded to the iterator; each entry it produces is added
    # on its own, as a single-element list.
    parsed_rows = generate_revision_tuples(filename)
    for entry in CSVRevisionIterator(parsed_rows, limit=limit):
        revision_add(provenance, archive, [entry], **add_options)
def generate_revision_tuples(
    filename: str,
) -> Generator[Tuple[bytes, datetime, bytes], None, None]:
    """Yield ``(revision, date, root)`` tuples parsed from a CSV file.

    Blank lines are skipped. Every other line must split on ``,`` into
    exactly three fields: the first and third are UTF-8 encoded as-is, the
    second is parsed with ``iso8601.parse_date`` using UTC as the default
    timezone.

    Args:
        filename: path of the text file to read.

    Raises:
        ValueError: if a non-blank line does not hold exactly three
            comma-separated fields.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaving the handle to the garbage collector.
    with open(filename, "r") as csv_file:
        for line in csv_file:
            stripped = line.strip()
            if not stripped:
                continue
            revision, date, root = stripped.split(",")
            yield (
                revision.encode("utf-8"),
                iso8601.parse_date(date, default_timezone=timezone.utc),
                root.encode("utf-8"),
            )
@cli.command(name="iter-origins")
@click.argument("filename")
@click.option("-l", "--limit", type=int)
@click.pass_context
def iter_origins(ctx, filename: str, limit: Optional[int]) -> None:
    """Process a provided list of origins."""
    from . import get_archive, get_provenance
    from .origin import CSVOriginIterator, origin_add

    # Both backends are built from their sections of the loaded configuration.
    settings = ctx.obj["config"]
    archive = get_archive(**settings["archive"])
    provenance = get_provenance(**settings["provenance"])

    # `limit` is forwarded to the iterator; each entry it produces is added
    # on its own, as a single-element list.
    parsed_rows = generate_origin_tuples(filename)
    for entry in CSVOriginIterator(parsed_rows, limit=limit):
        origin_add(provenance, archive, [entry])
def generate_origin_tuples(filename: str) -> Generator[Tuple[str, bytes], None, None]:
    """Yield ``(url, snapshot)`` tuples parsed from a CSV file.

    Blank lines are skipped. Every other line must split on ``,`` into
    exactly two fields: the url is yielded as text, the snapshot is UTF-8
    encoded as-is.

    Args:
        filename: path of the text file to read.

    Raises:
        ValueError: if a non-blank line does not hold exactly two
            comma-separated fields.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaving the handle to the garbage collector.
    with open(filename, "r") as csv_file:
        for line in csv_file:
            stripped = line.strip()
            if not stripped:
                continue
            url, snapshot = stripped.split(",")
            yield (url, snapshot.encode("utf-8"))
@cli.command(name="find-first")
@click.argument("swhid")
@click.pass_context
def find_first(ctx, swhid: str) -> None:
    """Find first occurrence of the requested blob."""
    from . import get_provenance

    provenance = get_provenance(**ctx.obj["config"]["provenance"])
    # TODO: return a dictionary with proper keys for each field
    occur = provenance.content_find_first(hash_to_bytes(swhid))

    # Report the miss first so the happy path stays unindented.
    if occur is None:
        print(f"Cannot find a content with the id {swhid}")
        return

    # One comma-separated record: content, revision, date, origin, path.
    fields = [
        f"swh:1:cnt:{hash_to_hex(occur.content)}",
        f"swh:1:rev:{hash_to_hex(occur.revision)}",
        f"{occur.date}",
        f"{occur.origin}",
        f"{os.fsdecode(occur.path)}",
    ]
    print(", ".join(fields))
@cli.command(name="find-all")
@click.argument("swhid")
@click.option("-l", "--limit", type=int)
@click.pass_context
def find_all(ctx, swhid: str, limit: Optional[int]) -> None:
    """Find all occurrences of the requested blob."""
    from . import get_provenance

    provenance = get_provenance(**ctx.obj["config"]["provenance"])
    # TODO: return a dictionary with proper keys for each field
    occurrences = provenance.content_find_all(hash_to_bytes(swhid), limit=limit)
    for occur in occurrences:
        # One comma-separated record: content, revision, date, origin, path.
        fields = [
            f"swh:1:cnt:{hash_to_hex(occur.content)}",
            f"swh:1:rev:{hash_to_hex(occur.revision)}",
            f"{occur.date}",
            f"{occur.origin}",
            f"{os.fsdecode(occur.path)}",
        ]
        print(", ".join(fields))