Changeset View
Changeset View
Standalone View
Standalone View
swh/provenance/cli.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
# WARNING: do not import unnecessary things here to keep cli startup time under | # WARNING: do not import unnecessary things here to keep cli startup time under | ||||
# control | # control | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import os | import os | ||||
from typing import Any, Dict, Generator, Optional, Tuple | from typing import Any, Dict, Generator, Optional, Tuple | ||||
import click | import click | ||||
import iso8601 | import iso8601 | ||||
import yaml | import yaml | ||||
from swh.core import config | from swh.core import config | ||||
from swh.core.cli import CONTEXT_SETTINGS | from swh.core.cli import CONTEXT_SETTINGS | ||||
from swh.core.cli import swh as swh_cli_group | from swh.core.cli import swh as swh_cli_group | ||||
from swh.model.hashutil import hash_to_bytes, hash_to_hex | from swh.model.hashutil import hash_to_bytes, hash_to_hex | ||||
from swh.model.model import Sha1Git | from swh.model.model import Sha1Git | ||||
# All generic config code should reside in swh.core.config | # All generic config code should reside in swh.core.config | ||||
# Name of the environment variable consulted for the configuration file path.
CONFIG_ENVVAR = "SWH_CONFIG_FILENAME"
# Fallback location: "global.yml" inside click's application directory for "swh".
DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml")
# Value of the environment variable when set, DEFAULT_CONFIG_PATH otherwise;
# evaluated once, at import time.
DEFAULT_PATH = os.getenv(CONFIG_ENVVAR, DEFAULT_CONFIG_PATH)

# Built-in settings. Everything lives under the top-level "provenance" key,
# split into "archive" and "storage" sub-sections.
DEFAULT_CONFIG: Dict[str, Any] = {
    "provenance": {
        "archive": {
            # Alternative archive backend, kept for reference:
            #   "cls": "api",
            #   "storage": {
            #       "cls": "remote",
            #       "url": "http://uffizi.internal.softwareheritage.org:5002",
            #   }
            "cls": "direct",
            "db": {
                "host": "db.internal.softwareheritage.org",
                "dbname": "softwareheritage",
                "user": "guest",
            },
        },
        "storage": {
            "cls": "local",
            "db": {"host": "localhost", "dbname": "provenance"},
        },
    }
}
CONFIG_FILE_HELP = f"""Configuration file: | CONFIG_FILE_HELP = f"""Configuration file: | ||||
\b | \b | ||||
The CLI option or the environment variable will fail if invalid. | The CLI option or the environment variable will fail if invalid. | ||||
CLI option is checked first. | CLI option is checked first. | ||||
▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines | def iter_revisions( | ||||
min_depth: int, | min_depth: int, | ||||
reuse: bool, | reuse: bool, | ||||
) -> None: | ) -> None: | ||||
# TODO: add file size filtering | # TODO: add file size filtering | ||||
"""Process a provided list of revisions.""" | """Process a provided list of revisions.""" | ||||
from . import get_archive, get_provenance | from . import get_archive, get_provenance | ||||
from .revision import CSVRevisionIterator, revision_add | from .revision import CSVRevisionIterator, revision_add | ||||
archive = get_archive(**ctx.obj["config"]["archive"]) | archive = get_archive(**ctx.obj["config"]["provenance"]["archive"]) | ||||
provenance = get_provenance(**ctx.obj["config"]["provenance"]) | provenance = get_provenance(**ctx.obj["config"]["provenance"]["storage"]) | ||||
revisions_provider = generate_revision_tuples(filename) | revisions_provider = generate_revision_tuples(filename) | ||||
revisions = CSVRevisionIterator(revisions_provider, limit=limit) | revisions = CSVRevisionIterator(revisions_provider, limit=limit) | ||||
for revision in revisions: | for revision in revisions: | ||||
revision_add( | revision_add( | ||||
provenance, | provenance, | ||||
archive, | archive, | ||||
[revision], | [revision], | ||||
Show All 20 Lines | |||||
@click.argument("filename")
@click.option("-l", "--limit", type=int)
@click.pass_context
def iter_origins(ctx: click.core.Context, filename: str, limit: Optional[int]) -> None:
    """Process a provided list of origins."""
    # NOTE: the docstring is presumably rendered by click as the command's help
    # text, so additional documentation is kept in comments.
    #
    # ctx:      click context; ctx.obj["config"]["provenance"] must provide the
    #           "archive" and "storage" sections.
    # filename: file handed to generate_origin_tuples().
    # limit:    forwarded unchanged to CSVOriginIterator.

    # Imported here rather than at module level (see the startup-time warning
    # at the top of the file).
    from . import get_archive, get_provenance
    from .origin import CSVOriginIterator, origin_add

    settings = ctx.obj["config"]["provenance"]
    archive = get_archive(**settings["archive"])
    provenance = get_provenance(**settings["storage"])

    # Each origin is submitted on its own, as a single-element list.
    origin_rows = CSVOriginIterator(generate_origin_tuples(filename), limit=limit)
    for origin in origin_rows:
        origin_add(provenance, archive, [origin])
def generate_origin_tuples(filename: str) -> Generator[Tuple[str, bytes], None, None]:
    """Yield ``(url, snapshot)`` pairs parsed from a comma-separated text file.

    Every non-blank line of the file is stripped and split on ``","`` into a
    URL and a snapshot identifier; the identifier is converted with
    ``hash_to_bytes`` before being yielded. Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Yields:
        ``(url, snapshot_bytes)`` for each non-blank line, in file order.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            comma-separated fields.
    """
    # The context manager guarantees the file is closed once the generator is
    # exhausted, closed early, or garbage-collected; the previous bare
    # ``open()`` left the handle dangling.
    with open(filename, "r") as origins_file:
        for line in origins_file:
            stripped = line.strip()
            if not stripped:
                continue
            url, snapshot = stripped.split(",")
            yield (url, hash_to_bytes(snapshot))
@cli.command(name="find-first")
@click.argument("swhid")
@click.pass_context
def find_first(ctx: click.core.Context, swhid: str) -> None:
    """Find first occurrence of the requested blob."""
    # NOTE: the docstring is presumably rendered by click as the command's help
    # text, so additional documentation is kept in comments.
    #
    # ctx:   click context; ctx.obj["config"]["provenance"]["storage"] holds the
    #        keyword arguments for get_provenance().
    # swhid: NOTE(review): despite the name, the value is passed to
    #        hash_to_bytes() unchanged -- presumably a bare hex id is expected;
    #        confirm against callers.
    from . import get_provenance

    storage_settings = ctx.obj["config"]["provenance"]["storage"]
    provenance = get_provenance(**storage_settings)
    # TODO: return a dictionary with proper keys for each field
    occur = provenance.content_find_first(hash_to_bytes(swhid))
    if occur is None:
        print(f"Cannot find a content with the id {swhid}")
        return

    # One output line: content id, revision id, date, origin and path.
    fields = [
        f"swh:1:cnt:{hash_to_hex(occur.content)}",
        f"swh:1:rev:{hash_to_hex(occur.revision)}",
        f"{occur.date}",
        f"{occur.origin}",
        f"{os.fsdecode(occur.path)}",
    ]
    print(", ".join(fields))
@cli.command(name="find-all")
@click.argument("swhid")
@click.option("-l", "--limit", type=int)
@click.pass_context
def find_all(ctx: click.core.Context, swhid: str, limit: Optional[int]) -> None:
    """Find all occurrences of the requested blob."""
    # NOTE: the docstring is presumably rendered by click as the command's help
    # text, so additional documentation is kept in comments.
    #
    # ctx:   click context; ctx.obj["config"]["provenance"]["storage"] holds the
    #        keyword arguments for get_provenance().
    # swhid: NOTE(review): passed to hash_to_bytes() unchanged -- presumably a
    #        bare hex id is expected; confirm against callers.
    # limit: forwarded unchanged to content_find_all().
    from . import get_provenance

    storage_settings = ctx.obj["config"]["provenance"]["storage"]
    provenance = get_provenance(**storage_settings)
    # TODO: return a dictionary with proper keys for each field
    matches = provenance.content_find_all(hash_to_bytes(swhid), limit=limit)
    for occur in matches:
        # One output line per occurrence: content id, revision id, date,
        # origin and path.
        fields = [
            f"swh:1:cnt:{hash_to_hex(occur.content)}",
            f"swh:1:rev:{hash_to_hex(occur.revision)}",
            f"{occur.date}",
            f"{occur.origin}",
            f"{os.fsdecode(occur.path)}",
        ]
        print(", ".join(fields))