diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ -swh.core[db,http] >= 0.14.0 +swh.core[db,http] >= 2.9 swh.model >= 0.0.15 swh.objstorage >= 0.2.2 swh.scheduler >= 0.5.2 diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,9 +1,9 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Iterator +from typing import Callable, Dict, Iterator, List, Optional # WARNING: do not import unnecessary things here to keep cli startup time under # control @@ -213,6 +213,12 @@ @indexer_cli_group.command("journal-client") +@click.argument( + "indexer", + type=click.Choice(["origin-intrinsic-metadata", "*"]), + required=False + # TODO: remove required=False after we stop using it +) @click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API") @click.option( "--origin-metadata-task-type", @@ -236,18 +242,27 @@ @click.pass_context def journal_client( ctx, - scheduler_url, - origin_metadata_task_type, - brokers, - prefix, - group_id, - stop_after_objects, + indexer: Optional[str], + scheduler_url: str, + origin_metadata_task_type: str, + brokers: List[str], + prefix: str, + group_id: str, + stop_after_objects: Optional[int], ): - """Listens for new objects from the SWH Journal, and schedules tasks - to run relevant indexers (currently, only origin-intrinsic-metadata) - on these new objects.""" + """ + Listens for new objects from the SWH Journal, and either: + + * runs the indexer with the name passed as argument, if any + * schedules tasks to run relevant indexers (currently, only + origin-intrinsic-metadata) on these new objects otherwise. + + Passing '*' as indexer name runs all indexers. 
+ """ import functools + import warnings + from swh.indexer.indexer import ObjectsDict from swh.indexer.journal_client import process_journal_objects from swh.journal.client import get_journal_client from swh.scheduler import get_scheduler @@ -268,22 +283,50 @@ ) stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects") + object_types = set() + worker_fns: List[Callable[[ObjectsDict], Dict]] = [] + + if indexer is None: + warnings.warn( + "'swh indexer journal-client' with no argument creates scheduler tasks " + "to index, rather than index directly.", + DeprecationWarning, + ) + object_types.add("origin_visit_status") + worker_fns.append( + functools.partial( + process_journal_objects, + scheduler=scheduler, + task_names={ + "origin_metadata": origin_metadata_task_type, + }, + ) + ) + + if indexer in ("origin-intrinsic-metadata", "*"): + from swh.indexer.metadata import OriginMetadataIndexer + + object_types.add("origin_visit_status") + idx = OriginMetadataIndexer() + idx.catch_exceptions = False # don't commit offsets if indexation failed + worker_fns.append(idx.process_journal_objects) + + if not worker_fns: + raise click.ClickException(f"Unknown indexer: {indexer}") + client = get_journal_client( cls="kafka", brokers=brokers, prefix=prefix, group_id=group_id, - object_types=["origin_visit_status"], + object_types=list(object_types), stop_after_objects=stop_after_objects, ) - worker_fn = functools.partial( - process_journal_objects, - scheduler=scheduler, - task_names={ - "origin_metadata": origin_metadata_task_type, - }, - ) + def worker_fn(objects: ObjectsDict): + for fn in worker_fns: + fn(objects) + try: client.process(worker_fn) except KeyboardInterrupt: diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 The Software Heritage developers +# Copyright (C) 2016-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -18,19 +18,21 @@ List, Optional, Set, + Tuple, TypeVar, Union, ) import warnings import sentry_sdk +from typing_extensions import TypedDict from swh.core import utils from swh.core.config import load_from_envvar, merge_configs from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage from swh.indexer.storage.interface import IndexerStorageInterface from swh.model import hashutil -from swh.model.model import Revision, Sha1Git +from swh.model.model import Directory, Origin, Sha1Git from swh.objstorage.exc import ObjNotFoundError from swh.objstorage.factory import get_objstorage from swh.scheduler import CONFIG as SWH_CONFIG @@ -38,6 +40,12 @@ from swh.storage.interface import StorageInterface +class ObjectsDict(TypedDict, total=False): + directory: List[Dict] + origin: List[Dict] + origin_visit_status: List[Dict] + + @contextmanager def write_to_temp(filename: str, data: bytes, working_directory: str) -> Iterator[str]: """Write the sha1's content in a temporary file. @@ -102,7 +110,7 @@ content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level - classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, + classes: :class:`ContentIndexer`, :class:`DirectoryIndexer`, :class:`OriginIndexer`. 
Then you need to implement the following functions: @@ -526,9 +534,29 @@ DeprecationWarning, ) del kwargs["policy_update"] + + origins = [{"url": url} for url in origin_urls] + + return self.process_journal_objects({"origin": origins}) + + def process_journal_objects(self, objects: ObjectsDict) -> Dict: + """Worker function for ``JournalClient``. Expects ``objects`` to have a single + key, either ``origin`` or ``"origin_visit_status"``.""" + origins = [ + Origin(url=status["origin"]) + for status in objects.get("origin_visit_status", []) + if status["status"] == "full" + ] + [Origin(url=origin["url"]) for origin in objects.get("origin", [])] + summary: Dict[str, Any] = {"status": "uneventful"} try: - results = self.index_list(origin_urls, **kwargs) + results = self.index_list( + origins, + check_origin_known=False, + # no need to check they exist, as we just received either an origin or + # visit status; which cannot be created by swh-storage unless the origin + # already exists + ) except Exception: if not self.catch_exceptions: raise @@ -544,23 +572,23 @@ summary.update(summary_persist) return summary - def index_list(self, origin_urls: List[str], **kwargs) -> List[TResult]: + def index_list(self, origins: List[Origin], **kwargs) -> List[TResult]: results = [] - for origin_url in origin_urls: + for origin in origins: try: - results.extend(self.index(origin_url, **kwargs)) + results.extend(self.index(origin.url, **kwargs)) except Exception: - self.log.exception("Problem when processing origin %s", origin_url) + self.log.exception("Problem when processing origin %s", origin.url) sentry_sdk.capture_exception() raise return results -class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]): +class DirectoryIndexer(BaseIndexer[Sha1Git, Directory, TResult], Generic[TResult]): """An object type indexer, inherits from the :class:`BaseIndexer` and - implements Revision indexing using the run method + implements Directory indexing using the run method - Note: the :class:`RevisionIndexer` is not an instantiable object. + Note: the :class:`DirectoryIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. @@ -570,7 +598,7 @@ def run(self, ids: List[Sha1Git], **kwargs) -> Dict: """Given a list of sha1_gits: - - retrieve revisions from storage + - retrieve directories from storage - execute the indexing computations - store the results @@ -584,28 +612,40 @@ DeprecationWarning, ) del kwargs["policy_update"] - summary: Dict[str, Any] = {"status": "uneventful"} - results = [] - revision_ids = [ + directory_ids = [ hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids ] - for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)): - if not rev: - # TODO: call self.index() with rev=None? - self.log.warning( - "Revision %s not found in storage", hashutil.hash_to_hex(rev_id) - ) - continue + + return self._process_directories([(dir_id, None) for dir_id in directory_ids]) + + def process_journal_objects(self, objects: ObjectsDict) -> Dict: + """Worker function for ``JournalClient``. 
Expects ``objects`` to have a single + key, ``"directory"``.""" + assert set(objects) == {"directory"} + return self._process_directories( + [(dir_["id"], Directory.from_dict(dir_)) for dir_ in objects["directory"]] + ) + + def _process_directories( + self, + directories: Union[List[Tuple[Sha1Git, Directory]], List[Tuple[Sha1Git, None]]], + ) -> Dict: + + summary: Dict[str, Any] = {"status": "uneventful"} + results = [] + + # TODO: fetch raw_manifest when useful? + + for (dir_id, dir_) in directories: try: - results.extend(self.index(rev_id, rev)) + results.extend(self.index(dir_id, dir_)) except Exception: if not self.catch_exceptions: raise - self.log.exception("Problem when processing revision") + self.log.exception("Problem when processing directory") sentry_sdk.capture_exception() summary["status"] = "failed" - return summary summary_persist = self.persist_index_computations(results) if summary_persist: diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -21,20 +21,24 @@ from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents -from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer +from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS -from swh.indexer.origin_head import OriginHeadIndexer +from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from swh.model import hashutil -from swh.model.model import Revision, Sha1Git +from swh.model.model import Directory +from swh.model.model import ObjectType as ModelObjectType +from swh.model.model import Origin, Sha1Git +from swh.model.swhids import CoreSWHID, ObjectType REVISION_GET_BATCH_SIZE = 10 +RELEASE_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 @@ -82,7 +86,7 @@ self, id: Sha1, data: Optional[bytes] = None, - log_suffix="unknown revision", + log_suffix="unknown directory", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. 
@@ -144,18 +148,18 @@ } -class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]): - """Revision-level indexer +class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): + """Directory-level indexer This indexer is in charge of: - - filtering revisions already indexed in revision_intrinsic_metadata table + - filtering directories already indexed in directory_intrinsic_metadata table with defined computation tool - - retrieve all entry_files in root directory + - retrieve all entry_files in directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - - store the results for revision + - store the results for directory """ @@ -165,7 +169,7 @@ def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones.""" - yield from self.idx_storage.revision_intrinsic_metadata_missing( + yield from self.idx_storage.directory_intrinsic_metadata_missing( ( { "id": sha1_git, @@ -176,51 +180,52 @@ ) def index( - self, id: Sha1Git, data: Optional[Revision], **kwargs - ) -> List[RevisionIntrinsicMetadataRow]: - """Index rev by processing it and organizing result. + self, id: Sha1Git, data: Optional[Directory] = None, **kwargs + ) -> List[DirectoryIntrinsicMetadataRow]: + """Index directory by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - - if multiple file detected -> translation needed at revision level + - if multiple files detected -> translation needed at directory level Args: - id: sha1_git of the revision - data: revision model object from storage + id: sha1_git of the directory + data: directory model object from storage Returns: - dict: dictionary representing a revision_intrinsic_metadata, with + dict: dictionary representing a directory_intrinsic_metadata, with keys: - - id (str): rev's identifier (sha1_git) + - id: directory's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ - rev = data - assert isinstance(rev, Revision) + if data is None: + dir_ = list(self.storage.directory_ls(id, recursive=False)) + else: + assert isinstance(data, Directory) + dir_ = data.to_dict() try: - root_dir = rev.directory - dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) - if [entry["type"] for entry in dir_ls] == ["dir"]: + if [entry["type"] for entry in dir_] == ["dir"]: # If the root is just a single directory, recurse into it # eg. 
PyPI packages, GNU tarballs - subdir = dir_ls[0]["target"] - dir_ls = list(self.storage.directory_ls(subdir, recursive=False)) - files = [entry for entry in dir_ls if entry["type"] == "file"] + subdir = dir_[0]["target"] + dir_ = list(self.storage.directory_ls(subdir, recursive=False)) + files = [entry for entry in dir_ if entry["type"] == "file"] detected_files = detect_metadata(files) - (mappings, metadata) = self.translate_revision_intrinsic_metadata( + (mappings, metadata) = self.translate_directory_intrinsic_metadata( detected_files, - log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id), + log_suffix="directory=%s" % hashutil.hash_to_hex(id), ) except Exception as e: - self.log.exception("Problem when indexing rev: %r", e) + self.log.exception("Problem when indexing dir: %r", e) sentry_sdk.capture_exception() return [ - RevisionIntrinsicMetadataRow( - id=rev.id, + DirectoryIntrinsicMetadataRow( + id=id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, @@ -228,7 +233,7 @@ ] def persist_index_computations( - self, results: List[RevisionIntrinsicMetadataRow] + self, results: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage. @@ -241,10 +246,10 @@ """ # TODO: add functions in storage to keep data in - # revision_intrinsic_metadata - return self.idx_storage.revision_intrinsic_metadata_add(results) + # directory_intrinsic_metadata + return self.idx_storage.directory_intrinsic_metadata_add(results) - def translate_revision_intrinsic_metadata( + def translate_directory_intrinsic_metadata( self, detected_files: Dict[str, List[Any]], log_suffix: str ) -> Tuple[List[Any], Any]: """ @@ -315,81 +320,129 @@ class OriginMetadataIndexer( - OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]] + OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) - self.origin_head_indexer = OriginHeadIndexer(config=config) - self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) + self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( - self, origin_urls: List[str], **kwargs - ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]: + self, origins: List[Origin], check_origin_known: bool = True, **kwargs + ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] - origins_with_head = [] - origins = list( - call_with_batches( - self.storage.origin_get, - origin_urls, - ORIGIN_GET_BATCH_SIZE, + head_rel_ids = [] + origin_heads: Dict[Origin, CoreSWHID] = {} + + # Filter out origins not in the storage + if check_origin_known: + known_origins = list( + call_with_batches( + self.storage.origin_get, + [origin.url for origin in origins], + ORIGIN_GET_BATCH_SIZE, + ) ) - ) - for origin in origins: + else: + known_origins = list(origins) + + for origin in known_origins: if origin is None: continue - head_results = self.origin_head_indexer.index(origin.url) - if head_results: - (head_result,) = head_results - origins_with_head.append(origin) - head_rev_ids.append(head_result["revision_id"]) - - head_revs = list( - call_with_batches( - self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE + head_swhid = get_head_swhid(self.storage, origin.url) + if head_swhid: + origin_heads[origin] = head_swhid + if head_swhid.object_type == ObjectType.REVISION: + 
head_rev_ids.append(head_swhid.object_id) + elif head_swhid.object_type == ObjectType.RELEASE: + head_rel_ids.append(head_swhid.object_id) + else: + assert False, head_swhid + + head_revs = dict( + zip( + head_rev_ids, + call_with_batches( + self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE + ), + ) + ) + head_rels = dict( + zip( + head_rel_ids, + call_with_batches( + self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE + ), ) ) - assert len(head_revs) == len(head_rev_ids) results = [] - for (origin, rev) in zip(origins_with_head, head_revs): - if not rev: - self.log.warning("Missing head revision of origin %r", origin.url) - continue - - for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev): - # There is at most one rev_metadata + for (origin, head_swhid) in origin_heads.items(): + if head_swhid.object_type == ObjectType.REVISION: + rev = head_revs[head_swhid.object_id] + if not rev: + self.log.warning( + "Missing head object %s of origin %r", head_swhid, origin.url + ) + continue + directory_id = rev.directory + elif head_swhid.object_type == ObjectType.RELEASE: + rel = head_rels[head_swhid.object_id] + if not rel: + self.log.warning( + "Missing head object %s of origin %r", head_swhid, origin.url + ) + continue + if rel.target_type != ModelObjectType.DIRECTORY: + # TODO + self.log.warning( + "Head release %s of %r has unexpected target type %s", + head_swhid, + origin.url, + rel.target_type, + ) + continue + assert rel.target, rel + directory_id = rel.target + else: + assert False, head_swhid + + for dir_metadata in self.directory_metadata_indexer.index(directory_id): + # There is at most one dir_metadata orig_metadata = OriginIntrinsicMetadataRow( - from_revision=rev_metadata.id, + from_directory=dir_metadata.id, id=origin.url, - metadata=rev_metadata.metadata, - mappings=rev_metadata.mappings, - indexer_configuration_id=rev_metadata.indexer_configuration_id, + metadata=dir_metadata.metadata, + mappings=dir_metadata.mappings, + indexer_configuration_id=dir_metadata.indexer_configuration_id, ) - results.append((orig_metadata, rev_metadata)) + results.append((orig_metadata, dir_metadata)) + return results def persist_index_computations( self, - results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]], + results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: - # Deduplicate revisions - rev_metadata: List[RevisionIntrinsicMetadataRow] = [] + # Deduplicate directories + dir_metadata: List[DirectoryIntrinsicMetadataRow] = [] orig_metadata: List[OriginIntrinsicMetadataRow] = [] summary: Dict = {} - for (orig_item, rev_item) in results: - assert rev_item.metadata == orig_item.metadata - if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}): + for (orig_item, dir_item) in results: + assert dir_item.metadata == orig_item.metadata + if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets - if rev_item not in rev_metadata: - rev_metadata.append(rev_item) + if dir_item not in dir_metadata: + dir_metadata.append(dir_item) if orig_item not in orig_metadata: orig_metadata.append(orig_item) - if rev_metadata: - summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata) - summary.update(summary_rev) + if dir_metadata: + summary_dir = self.idx_storage.directory_intrinsic_metadata_add( + dir_metadata + ) + summary.update(summary_dir) if orig_metadata: summary_ori = 
self.idx_storage.origin_intrinsic_metadata_add(orig_metadata) summary.update(summary_ori) diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -6,10 +6,12 @@ from .base import DictMapping, SingleFileMapping -yaml.SafeLoader.yaml_implicit_resolvers = { - k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"] - for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items() -} + +class SafeLoader(yaml.SafeLoader): + yaml_implicit_resolvers = { + k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"] + for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items() + } class CffMapping(DictMapping, SingleFileMapping): diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -133,6 +133,73 @@ author[SCHEMA_URI + "url"] = {"@id": url} return {"@list": [author]} + def normalize_description(self, description): + r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common + mistake that causes issues in the database because of null bytes in JSON. + + >>> NpmMapping().normalize_description("foo bar") + 'foo bar' + >>> NpmMapping().normalize_description( + ... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00" + ... ) + 'foo bar' + >>> NpmMapping().normalize_description( + ... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 " + ... ) + 'foo bar' + >>> NpmMapping().normalize_description( + ... # invalid UTF-16 and meaningless UTF-8: + ... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00" + ... ) is None + True + >>> NpmMapping().normalize_description( + ... # ditto (it looks like little-endian at first) + ... "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00" + ... ) is None + True + >>> NpmMapping().normalize_description(None) is None + True + """ + if description is None: + return None + # XXX: if this function ever needs to support more cases, consider + # switching to https://pypi.org/project/ftfy/ instead of adding more hacks + if description.startswith("\ufffd\ufffd") and "\x00" in description: + # 2 unicode replacement characters followed by '# ' encoded as UTF-16 + # is a common mistake, which indicates a README.md was saved as UTF-16, + # and some NPM tool opened it as UTF-8 and used the first line as + # description. + + description_bytes = description.encode() + + # Strip the two unicode replacement characters + assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd") + description_bytes = description_bytes[6:] + + # If the following attempts fail to recover the description, discard it + # entirely because the current indexer storage backend (postgresql) cannot + # store zero bytes in JSON columns. 
+ description = None + + if not description_bytes.startswith(b"\x00"): + # try UTF-16 little-endian (the most common) first + try: + description = description_bytes.decode("utf-16le") + except UnicodeDecodeError: + pass + if description is None: + # if it fails, try UTF-16 big-endian + try: + description = description_bytes.decode("utf-16be") + except UnicodeDecodeError: + pass + + if description: + if description.startswith("# "): + description = description[2:] + return description.rstrip() + return description + def normalize_license(self, s): """https://docs.npmjs.com/files/package.json#license diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -1,159 +1,120 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging import re -from typing import Any, Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union -import click - -from swh.indexer.indexer import OriginIndexer from swh.model.model import SnapshotBranch, TargetType +from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches -class OriginHeadIndexer(OriginIndexer[Dict]): - """Origin-level indexer. - - This indexer is in charge of looking up the revision that acts as the - "head" of an origin. - - In git, this is usually the commit pointed to by the 'master' branch.""" - - USE_TOOLS = False - - def persist_index_computations(self, results: Any) -> Dict[str, int]: - """Do nothing. The indexer's results are not persistent, they - should only be piped to another indexer.""" - return {} - - # Dispatch - - def index(self, id: str, data: None = None, **kwargs) -> List[Dict]: - origin_url = id - visit_status = origin_get_latest_visit_status( - self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True - ) - if not visit_status: - return [] - assert visit_status.snapshot is not None - snapshot = snapshot_get_all_branches(self.storage, visit_status.snapshot) - if snapshot is None: - return [] - method = getattr( - self, "_try_get_%s_head" % visit_status.type, self._try_get_head_generic - ) - - rev_id = method(snapshot.branches) # type: ignore - if rev_id is not None: - return [ - { - "origin_url": origin_url, - "revision_id": rev_id, - } - ] - - # could not find a head revision - return [] - - # Tarballs - - _archive_filename_re = re.compile( - rb"^" - rb"(?P.*)[-_]" - rb"(?P[0-9]+(\.[0-9])*)" - rb"(?P[-+][a-zA-Z0-9.~]+?)?" 
- rb"(?P(\.[a-zA-Z0-9]+)+)" - rb"$" +def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]: + """Returns the SWHID of the head revision or release of an origin""" + visit_status = origin_get_latest_visit_status( + storage, origin_url, allowed_statuses=["full"], require_snapshot=True ) + if not visit_status: + return None + assert visit_status.snapshot is not None + snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) + if snapshot is None: + return None + + if visit_status.type == "ftp": + return _try_get_ftp_head(dict(snapshot.branches)) + else: + return _try_get_head_generic(dict(snapshot.branches)) + + +_archive_filename_re = re.compile( + rb"^" + rb"(?P.*)[-_]" + rb"(?P[0-9]+(\.[0-9])*)" + rb"(?P[-+][a-zA-Z0-9.~]+?)?" + rb"(?P(\.[a-zA-Z0-9]+)+)" + rb"$" +) - @classmethod - def _parse_version(cls: Any, filename: bytes) -> Tuple[Union[float, int], ...]: - """Extracts the release version from an archive filename, - to get an ordering whose maximum is likely to be the last - version of the software - - >>> OriginHeadIndexer._parse_version(b'foo') - (-inf,) - >>> OriginHeadIndexer._parse_version(b'foo.tar.gz') - (-inf,) - >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz') - (0, 0, 1, 0) - >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') - (0, 0, 1, -1, 'beta2') - >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') - (0, 0, 1, 1, 'foobar') - """ - res = cls._archive_filename_re.match(filename) - if res is None: - return (float("-infinity"),) - version = [int(n) for n in res.group("version").decode().split(".")] - if res.group("preversion") is None: - version.append(0) + +def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]: + """Extracts the release version from an archive filename, + to get an ordering whose maximum is likely to be the last + version of the software + + >>> _parse_version(b'foo') + (-inf,) + >>> _parse_version(b'foo.tar.gz') + (-inf,) + >>> _parse_version(b'gnu-hello-0.0.1.tar.gz') + (0, 0, 1, 0) + >>> _parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') + (0, 0, 1, -1, 'beta2') + >>> _parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') + (0, 0, 1, 1, 'foobar') + """ + res = _archive_filename_re.match(filename) + if res is None: + return (float("-infinity"),) + version: List[Union[float, int, str]] = [ + int(n) for n in res.group("version").decode().split(".") + ] + if res.group("preversion") is None: + version.append(0) + else: + preversion = res.group("preversion").decode() + if preversion.startswith("-"): + version.append(-1) + version.append(preversion[1:]) + elif preversion.startswith("+"): + version.append(1) + version.append(preversion[1:]) else: - preversion = res.group("preversion").decode() - if preversion.startswith("-"): - version.append(-1) - version.append(preversion[1:]) - elif preversion.startswith("+"): - version.append(1) - version.append(preversion[1:]) - else: - assert False, res.group("preversion") - return tuple(version) - - def _try_get_ftp_head(self, branches: Dict[bytes, SnapshotBranch]) -> Any: - archive_names = list(branches) - max_archive_name = max(archive_names, key=self._parse_version) - r = self._try_resolve_target(branches, max_archive_name) - return r - - # Generic - - def _try_get_head_generic(self, branches: Dict[bytes, SnapshotBranch]) -> Any: - # Works on 'deposit', 'pypi', and VCSs. 
- return self._try_resolve_target(branches, b"HEAD") or self._try_resolve_target( - branches, b"master" - ) - - def _try_resolve_target( - self, branches: Dict[bytes, SnapshotBranch], branch_name: bytes - ) -> Any: - try: - branch = branches[branch_name] - if branch is None: - return None - while branch.target_type == TargetType.ALIAS: - branch = branches[branch.target] - if branch is None: - return None - - if branch.target_type == TargetType.REVISION: - return branch.target - elif branch.target_type == TargetType.CONTENT: - return None # TODO - elif branch.target_type == TargetType.DIRECTORY: - return None # TODO - elif branch.target_type == TargetType.RELEASE: - return None # TODO - else: - assert False, branch - except KeyError: - return None + assert False, res.group("preversion") + return tuple(version) -@click.command() -@click.option( - "--origins", "-i", help='Origins to lookup, in the "type+url" format', multiple=True -) -def main(origins: List[str]) -> None: - rev_metadata_indexer = OriginHeadIndexer() - rev_metadata_indexer.run(origins) +def _try_get_ftp_head( + branches: Dict[bytes, Optional[SnapshotBranch]] +) -> Optional[CoreSWHID]: + archive_names = list(branches) + max_archive_name = max(archive_names, key=_parse_version) + return _try_resolve_target(branches, max_archive_name) -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - main() +def _try_get_head_generic( + branches: Dict[bytes, Optional[SnapshotBranch]] +) -> Optional[CoreSWHID]: + # Works on 'deposit', 'pypi', and VCSs. + return _try_resolve_target(branches, b"HEAD") or _try_resolve_target( + branches, b"master" + ) + + +def _try_resolve_target( + branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes +) -> Optional[CoreSWHID]: + try: + branch = branches[branch_name] + if branch is None: + return None + while branch.target_type == TargetType.ALIAS: + branch = branches[branch.target] + if branch is None: + return None + + if branch.target_type == TargetType.REVISION: + return CoreSWHID(object_type=ObjectType.REVISION, object_id=branch.target) + elif branch.target_type == TargetType.CONTENT: + return None # TODO + elif branch.target_type == TargetType.DIRECTORY: + return None # TODO + elif branch.target_type == TargetType.RELEASE: + return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target) + else: + assert False, branch + except KeyError: + return None diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql --- a/swh/indexer/sql/30-schema.sql +++ b/swh/indexer/sql/30-schema.sql @@ -99,34 +99,34 @@ comment on column content_metadata.metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; --- The table revision_intrinsic_metadata provides a minimal set of intrinsic +-- The table directory_intrinsic_metadata provides a minimal set of intrinsic -- metadata detected with the detection tool (indexer_configuration_id) and -- aggregated from the content_metadata translation. 
-create table revision_intrinsic_metadata( +create table directory_intrinsic_metadata( id sha1_git not null, metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); -comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; -comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; -comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; -comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; -comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; +comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory'; +comment on column directory_intrinsic_metadata.id is 'sha1_git of directory'; +comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; +comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; +comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; create table origin_intrinsic_metadata( id text not null, -- origin url metadata jsonb, indexer_configuration_id bigint not null, - from_revision sha1_git not null, + from_directory sha1_git not null, metadata_tsvector tsvector, mappings text array not null ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; comment on column origin_intrinsic_metadata.id is 'url of the origin'; -comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; +comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; -comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; +comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.'; comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql --- a/swh/indexer/sql/50-func.sql +++ b/swh/indexer/sql/50-func.sql @@ -273,25 +273,25 @@ -- end content_metadata functions --- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata, +-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata, -- overwriting duplicates. -- -- If filtering duplicates is in order, the call to --- swh_revision_intrinsic_metadata_missing must take place before calling this +-- swh_directory_intrinsic_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to --- tmp_revision_intrinsic_metadata, 2. call this function -create or replace function swh_revision_intrinsic_metadata_add() +-- tmp_directory_intrinsic_metadata, 2. 
call this function +create or replace function swh_directory_intrinsic_metadata_add() returns bigint language plpgsql as $$ declare res bigint; begin - insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) select id, metadata, mappings, indexer_configuration_id - from tmp_revision_intrinsic_metadata tcm + from tmp_directory_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, @@ -302,19 +302,19 @@ end $$; -comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata'; +comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata'; --- create a temporary table for retrieving revision_intrinsic_metadata -create or replace function swh_mktemp_revision_intrinsic_metadata() +-- create a temporary table for retrieving directory_intrinsic_metadata +create or replace function swh_mktemp_directory_intrinsic_metadata() returns void language sql as $$ - create temporary table if not exists tmp_revision_intrinsic_metadata ( - like revision_intrinsic_metadata including defaults + create temporary table if not exists tmp_directory_intrinsic_metadata ( + like directory_intrinsic_metadata including defaults ) on commit delete rows; $$; -comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata'; +comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata'; -- create a temporary table for retrieving origin_intrinsic_metadata create or replace function swh_mktemp_origin_intrinsic_metadata() @@ -380,8 +380,8 @@ begin perform swh_origin_intrinsic_metadata_compute_tsvector(); - insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select id, metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(id, indexer_configuration_id) @@ -389,7 +389,7 @@ metadata = excluded.metadata, metadata_tsvector = excluded.metadata_tsvector, mappings = excluded.mappings, - from_revision = excluded.from_revision; + from_directory = excluded.from_directory; get diagnostics res = ROW_COUNT; return res; diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql --- a/swh/indexer/sql/60-indexes.sql +++ b/swh/indexer/sql/60-indexes.sql @@ -25,12 +25,12 @@ alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; --- revision_intrinsic_metadata -create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id); -alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey; +-- directory_intrinsic_metadata +create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id); +alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey; 
-alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey; +alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey; -- content_mimetype create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); diff --git a/swh/indexer/sql/upgrades/134.sql b/swh/indexer/sql/upgrades/134.sql new file mode 100644 --- /dev/null +++ b/swh/indexer/sql/upgrades/134.sql @@ -0,0 +1,154 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 133 +-- to_version: 134 +-- description: replace revision_intrinsic_metadata with directory_intrinsic_metadata +-- and origin_intrinsic_metadata.from_revision with origin_intrinsic_metadata.from_directory +-- This migration works by dropping both tables and reindexing from scratch. + +insert into dbversion(version, release, description) + values(134, now(), 'Work In Progress'); + +drop table origin_intrinsic_metadata; +drop table revision_intrinsic_metadata; +drop function swh_revision_intrinsic_metadata_add; +drop function swh_mktemp_revision_intrinsic_metadata; + + +create table directory_intrinsic_metadata( + id sha1_git not null, + metadata jsonb not null, + indexer_configuration_id bigint not null, + mappings text array not null +); + +comment on table directory_intrinsic_metadata is 'metadata semantically detected and translated in a directory'; +comment on column directory_intrinsic_metadata.id is 'sha1_git of directory'; +comment on column directory_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; +comment on column directory_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; +comment on column directory_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; + +create table origin_intrinsic_metadata( + id text not null, -- origin url + metadata jsonb, + indexer_configuration_id bigint not null, + from_directory sha1_git not null, + metadata_tsvector tsvector, + mappings text array not null +); + +comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; +comment on column origin_intrinsic_metadata.id is 'url of the origin'; +comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a directory'; +comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; +comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.'; +comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; + +-- add tmp_directory_intrinsic_metadata entries to directory_intrinsic_metadata, +-- overwriting duplicates. +-- +-- If filtering duplicates is in order, the call to +-- swh_directory_intrinsic_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_directory_intrinsic_metadata, 2. 
call this function +create or replace function swh_directory_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into directory_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_directory_intrinsic_metadata tcm + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_directory_intrinsic_metadata_add() IS 'Add new directory intrinsic metadata'; + +-- create a temporary table for retrieving directory_intrinsic_metadata +create or replace function swh_mktemp_directory_intrinsic_metadata() + returns void + language sql +as $$ + create temporary table if not exists tmp_directory_intrinsic_metadata ( + like directory_intrinsic_metadata including defaults + ) on commit delete rows; +$$; + +comment on function swh_mktemp_directory_intrinsic_metadata() is 'Helper table to add directory intrinsic metadata'; + +-- create a temporary table for retrieving origin_intrinsic_metadata +create or replace function swh_mktemp_origin_intrinsic_metadata() + returns void + language sql +as $$ + create temporary table if not exists tmp_origin_intrinsic_metadata ( + like origin_intrinsic_metadata including defaults + ) on commit delete rows; +$$; + +comment on function swh_mktemp_origin_intrinsic_metadata() is 'Helper table to add origin intrinsic metadata'; + +-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, +-- overwriting duplicates. +-- +-- If filtering duplicates is in order, the call to +-- swh_origin_intrinsic_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_origin_intrinsic_metadata, 2. 
call this function +create or replace function swh_origin_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); + + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_directory, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_directory, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_directory = excluded.from_directory; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata'; + + + +-- directory_intrinsic_metadata +create unique index directory_intrinsic_metadata_pkey on directory_intrinsic_metadata(id, indexer_configuration_id); +alter table directory_intrinsic_metadata add primary key using index directory_intrinsic_metadata_pkey; + +alter table directory_intrinsic_metadata add constraint directory_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table directory_intrinsic_metadata validate constraint directory_intrinsic_metadata_indexer_configuration_id_fkey; + +-- origin_intrinsic_metadata +create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id); +alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; + +alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; + +create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); +create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings); diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -30,8 +30,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from .writer import JournalWriter @@ -120,7 +120,9 @@ class IndexerStorage: - """SWH Indexer Storage""" + """SWH Indexer Storage Datastore""" + + current_version = 134 def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None): """ @@ -152,10 +154,6 @@ if db is not self._db: db.put_conn() - @db_transaction() - def get_current_version(self, *, db=None, cur=None): - return db.current_version - @timed @db_transaction() def check_config(self, *, check_write, db=None, cur=None): @@ -522,52 +520,52 @@ @timed @db_transaction() - def revision_intrinsic_metadata_missing( + def directory_intrinsic_metadata_missing( self, metadata: Iterable[Dict], db=None, 
cur=None ) -> List[Tuple[Sha1, int]]: return [ obj[0] - for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur) + for obj in db.directory_intrinsic_metadata_missing_from_list(metadata, cur) ] @timed @db_transaction() - def revision_intrinsic_metadata_get( + def directory_intrinsic_metadata_get( self, ids: Iterable[Sha1], db=None, cur=None - ) -> List[RevisionIntrinsicMetadataRow]: + ) -> List[DirectoryIntrinsicMetadataRow]: return [ - RevisionIntrinsicMetadataRow.from_dict( + DirectoryIntrinsicMetadataRow.from_dict( converters.db_to_metadata( - dict(zip(db.revision_intrinsic_metadata_cols, c)) + dict(zip(db.directory_intrinsic_metadata_cols, c)) ) ) - for c in db.revision_intrinsic_metadata_get_from_list(ids, cur) + for c in db.directory_intrinsic_metadata_get_from_list(ids, cur) ] @timed @process_metrics @db_transaction() - def revision_intrinsic_metadata_add( + def directory_intrinsic_metadata_add( self, - metadata: List[RevisionIntrinsicMetadataRow], + metadata: List[DirectoryIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) - self.journal_writer.write_additions("revision_intrinsic_metadata", metadata) + self.journal_writer.write_additions("directory_intrinsic_metadata", metadata) - db.mktemp_revision_intrinsic_metadata(cur) + db.mktemp_directory_intrinsic_metadata(cur) db.copy_to( [m.to_dict() for m in metadata], - "tmp_revision_intrinsic_metadata", + "tmp_directory_intrinsic_metadata", ["id", "metadata", "mappings", "indexer_configuration_id"], cur, ) - count = db.revision_intrinsic_metadata_add_from_temp(cur) + count = db.directory_intrinsic_metadata_add_from_temp(cur) return { - "revision_intrinsic_metadata:add": count, + "directory_intrinsic_metadata:add": count, } @timed @@ -602,7 +600,13 @@ db.copy_to( [m.to_dict() for m in metadata], "tmp_origin_intrinsic_metadata", - ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"], + [ + "id", + "metadata", + "indexer_configuration_id", + "from_directory", + "mappings", + ], cur, ) count = db.origin_intrinsic_metadata_add_from_temp(cur) diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -86,10 +86,10 @@ if type == "local": vcfg = cfg["indexer_storage"] cls = vcfg.get("cls") - if cls != "local": + if cls not in ("local", "postgresql"): raise ValueError( "The indexer_storage backend can only be started with a " - "'local' configuration" + "'postgresql' configuration" ) if not vcfg.get("db"): diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -16,7 +16,6 @@ """Proxy to the SWH Indexer DB, with wrappers around stored procedures""" content_mimetype_hash_keys = ["id", "indexer_configuration_id"] - current_version = 133 def _missing_from_list( self, table: str, data: Iterable[Dict], hash_keys: List[str], cur=None @@ -350,18 +349,18 @@ "content_metadata", ids, self.content_metadata_cols, cur=cur ) - revision_intrinsic_metadata_hash_keys = ["id", "indexer_configuration_id"] + directory_intrinsic_metadata_hash_keys = 
["id", "indexer_configuration_id"] - def revision_intrinsic_metadata_missing_from_list(self, metadata, cur=None): + def directory_intrinsic_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata.""" yield from self._missing_from_list( - "revision_intrinsic_metadata", + "directory_intrinsic_metadata", metadata, - self.revision_intrinsic_metadata_hash_keys, + self.directory_intrinsic_metadata_hash_keys, cur=cur, ) - revision_intrinsic_metadata_cols = [ + directory_intrinsic_metadata_cols = [ "id", "metadata", "mappings", @@ -371,27 +370,27 @@ "tool_configuration", ] - @stored_procedure("swh_mktemp_revision_intrinsic_metadata") - def mktemp_revision_intrinsic_metadata(self, cur=None): + @stored_procedure("swh_mktemp_directory_intrinsic_metadata") + def mktemp_directory_intrinsic_metadata(self, cur=None): pass - def revision_intrinsic_metadata_add_from_temp(self, cur=None): + def directory_intrinsic_metadata_add_from_temp(self, cur=None): cur = self._cursor(cur) - cur.execute("select * from swh_revision_intrinsic_metadata_add()") + cur.execute("select * from swh_directory_intrinsic_metadata_add()") return cur.fetchone()[0] - def revision_intrinsic_metadata_get_from_list(self, ids, cur=None): + def directory_intrinsic_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( - "revision_intrinsic_metadata", + "directory_intrinsic_metadata", ids, - self.revision_intrinsic_metadata_cols, + self.directory_intrinsic_metadata_cols, cur=cur, ) origin_intrinsic_metadata_cols = [ "id", "metadata", - "from_revision", + "from_directory", "mappings", "tool_id", "tool_name", diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -38,8 +38,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from .writer import JournalWriter @@ -250,8 +250,8 @@ self._content_ctags = SubStorage(ContentCtagsRow, *args) self._licenses = SubStorage(ContentLicenseRow, *args) self._content_metadata = SubStorage(ContentMetadataRow, *args) - self._revision_intrinsic_metadata = SubStorage( - RevisionIntrinsicMetadataRow, *args + self._directory_intrinsic_metadata = SubStorage( + DirectoryIntrinsicMetadataRow, *args ) self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args) @@ -369,21 +369,21 @@ added = self._content_metadata.add(metadata) return {"content_metadata:add": added} - def revision_intrinsic_metadata_missing( + def directory_intrinsic_metadata_missing( self, metadata: Iterable[Dict] ) -> List[Tuple[Sha1, int]]: - return self._revision_intrinsic_metadata.missing(metadata) + return self._directory_intrinsic_metadata.missing(metadata) - def revision_intrinsic_metadata_get( + def directory_intrinsic_metadata_get( self, ids: Iterable[Sha1] - ) -> List[RevisionIntrinsicMetadataRow]: - return self._revision_intrinsic_metadata.get(ids) + ) -> List[DirectoryIntrinsicMetadataRow]: + return self._directory_intrinsic_metadata.get(ids) - def revision_intrinsic_metadata_add( - self, metadata: List[RevisionIntrinsicMetadataRow] + def directory_intrinsic_metadata_add( + self, metadata: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: - added = self._revision_intrinsic_metadata.add(metadata) - return {"revision_intrinsic_metadata:add": added} + added = self._directory_intrinsic_metadata.add(metadata) + return {"directory_intrinsic_metadata:add": added} def 
origin_intrinsic_metadata_get( self, urls: Iterable[str] diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py --- a/swh/indexer/storage/interface.py +++ b/swh/indexer/storage/interface.py @@ -15,8 +15,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) TResult = TypeVar("TResult") @@ -341,8 +341,8 @@ """ ... - @remote_api_endpoint("revision_intrinsic_metadata/missing") - def revision_intrinsic_metadata_missing( + @remote_api_endpoint("directory_intrinsic_metadata/missing") + def directory_intrinsic_metadata_missing( self, metadata: Iterable[Dict] ) -> List[Tuple[Sha1, int]]: """List metadata missing from storage. @@ -350,7 +350,7 @@ Args: metadata (iterable): dictionaries with keys: - - **id** (bytes): sha1_git revision identifier + - **id** (bytes): sha1_git directory identifier - **indexer_configuration_id** (int): tool used to compute the results @@ -360,11 +360,11 @@ """ ... - @remote_api_endpoint("revision_intrinsic_metadata") - def revision_intrinsic_metadata_get( + @remote_api_endpoint("directory_intrinsic_metadata") + def directory_intrinsic_metadata_get( self, ids: Iterable[Sha1] - ) -> List[RevisionIntrinsicMetadataRow]: - """Retrieve revision metadata per id. + ) -> List[DirectoryIntrinsicMetadataRow]: + """Retrieve directory metadata per id. Args: ids (iterable): sha1 checksums @@ -375,10 +375,10 @@ """ ... - @remote_api_endpoint("revision_intrinsic_metadata/add") - def revision_intrinsic_metadata_add( + @remote_api_endpoint("directory_intrinsic_metadata/add") + def directory_intrinsic_metadata_add( self, - metadata: List[RevisionIntrinsicMetadataRow], + metadata: List[DirectoryIntrinsicMetadataRow], ) -> Dict[str, int]: """Add metadata not present in storage. 
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py --- a/swh/indexer/storage/model.py +++ b/swh/indexer/storage/model.py @@ -120,8 +120,8 @@ @attr.s -class RevisionIntrinsicMetadataRow(BaseRow): - object_type: Final = "revision_intrinsic_metadata" +class DirectoryIntrinsicMetadataRow(BaseRow): + object_type: Final = "directory_intrinsic_metadata" id = attr.ib(type=Sha1Git) metadata = attr.ib(type=Dict[str, Any]) @@ -134,5 +134,5 @@ id = attr.ib(type=str) metadata = attr.ib(type=Dict[str, Any]) - from_revision = attr.ib(type=Sha1Git) + from_directory = attr.ib(type=Sha1Git) mappings = attr.ib(type=List[str]) diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -13,9 +13,8 @@ from pytest_postgresql import factories import yaml -from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact -from swh.indexer.storage import get_indexer_storage -from swh.indexer.storage.db import Db as IndexerDb +from swh.core.db.pytest_plugin import initialize_database_for_module +from swh.indexer.storage import IndexerStorage, get_indexer_storage from swh.objstorage.factory import get_objstorage from swh.storage import get_storage @@ -23,23 +22,22 @@ TASK_NAMES: List[Tuple[str, str]] = [ # (scheduler-task-type, task-class-test-name) - ("index-revision-metadata", "revision_intrinsic_metadata"), + ("index-directory-metadata", "directory_intrinsic_metadata"), ("index-origin-metadata", "origin_intrinsic_metadata"), ] idx_postgresql_proc = factories.postgresql_proc( - dbname="indexer_storage", load=[ partial( initialize_database_for_module, modname="indexer", - version=IndexerDb.current_version, + version=IndexerStorage.current_version, ) ], ) -idx_storage_postgresql = postgresql_fact("idx_postgresql_proc") +idx_storage_postgresql = factories.postgresql("idx_postgresql_proc") @pytest.fixture diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py --- a/swh/indexer/tests/storage/conftest.py +++ b/swh/indexer/tests/storage/conftest.py @@ -41,9 +41,9 @@ data.tools = tools data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689") data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7") - data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238") - data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321") - data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320") + data.directory_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238") + data.directory_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321") + data.directory_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320") data.origin_url_1 = "file:///dev/0/zero" # 44434341 data.origin_url_2 = "file:///dev/1/one" # 44434342 data.origin_url_3 = "file:///dev/2/two" # 54974445 diff --git a/swh/indexer/tests/storage/test_server.py b/swh/indexer/tests/storage/test_server.py --- a/swh/indexer/tests/storage/test_server.py +++ b/swh/indexer/tests/storage/test_server.py @@ -57,13 +57,13 @@ def test_load_and_check_config_remote_config_local_type_raise( 
class_storage, tmpdir ) -> None: - """Any other configuration than 'local' (the default) is rejected""" + """Any other configuration than 'postgresql' (the default) is rejected""" assert class_storage != "local" incompatible_config = {"indexer_storage": {"cls": class_storage}} config_path = prepare_config_file(tmpdir, incompatible_config) expected_error = ( - "The indexer_storage backend can only be started with a 'local' " + "The indexer_storage backend can only be started with a 'postgresql' " "configuration" ) with pytest.raises(ValueError, match=expected_error): @@ -82,8 +82,8 @@ def test_load_and_check_config_local_incomplete_configuration(tmpdir) -> None: - """Incomplete 'local' configuration should raise""" - config = {"indexer_storage": {"cls": "local"}} + """Incomplete 'postgresql' configuration should raise""" + config = {"indexer_storage": {"cls": "postgresql"}} expected_error = "Invalid configuration; missing 'db' config entry" config_path = prepare_config_file(tmpdir, config) @@ -95,10 +95,10 @@ """'Complete 'local' configuration is fine""" config = { "indexer_storage": { - "cls": "local", + "cls": "postgresql", "db": "db", } } config_path = prepare_config_file(tmpdir, config) - cfg = load_and_check_config(config_path, type="local") + cfg = load_and_check_config(config_path, type="postgresql") assert cfg == config diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -19,8 +19,8 @@ ContentLicenseRow, ContentMetadataRow, ContentMimetypeRow, + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from swh.model.hashutil import hash_to_bytes @@ -289,37 +289,37 @@ etype = self.endpoint_type tool = data.tools[self.tool_name] - data_rev1 = self.row_class.from_dict( + data_dir1 = self.row_class.from_dict( { - "id": data.revision_id_2, + "id": data.directory_id_2, **self.example_data[0], "indexer_configuration_id": tool["id"], } ) - data_rev2 = self.row_class.from_dict( + data_dir2 = self.row_class.from_dict( { - "id": data.revision_id_2, + "id": data.directory_id_2, **self.example_data[1], "indexer_configuration_id": tool["id"], } ) # when - summary = endpoint(storage, etype, "add")([data_rev1]) + summary = endpoint(storage, etype, "add")([data_dir1]) assert summary == expected_summary(1, etype) with pytest.raises(DuplicateId): - endpoint(storage, etype, "add")([data_rev2, data_rev2]) + endpoint(storage, etype, "add")([data_dir2, data_dir2]) # then actual_data = list( - endpoint(storage, etype, "get")([data.revision_id_2, data.revision_id_1]) + endpoint(storage, etype, "get")([data.directory_id_2, data.directory_id_1]) ) expected_data = [ self.row_class.from_dict( - {"id": data.revision_id_2, **self.example_data[0], "tool": tool} + {"id": data.directory_id_2, **self.example_data[0], "tool": tool} ) ] assert actual_data == expected_data @@ -806,11 +806,11 @@ row_class = ContentMetadataRow -class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester): - """Test Indexer Storage revision_intrinsic_metadata related methods""" +class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester): + """Test Indexer Storage directory_intrinsic_metadata related methods""" tool_name = "swh-metadata-detector" - endpoint_type = "revision_intrinsic_metadata" + endpoint_type = "directory_intrinsic_metadata" example_data = [ { "metadata": { @@ -830,7 +830,7 @@ "mappings": ["mapping2"], }, ] - row_class = 
RevisionIntrinsicMetadataRow + row_class = DirectoryIntrinsicMetadataRow class TestIndexerStorageContentFossologyLicense(StorageETypeTester): @@ -1102,8 +1102,8 @@ "version": None, "name": None, } - metadata_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata, mappings=["mapping1"], indexer_configuration_id=tool_id, @@ -1113,11 +1113,11 @@ metadata=metadata, indexer_configuration_id=tool_id, mappings=["mapping1"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata_rev]) + storage.directory_intrinsic_metadata_add([metadata_dir]) storage.origin_intrinsic_metadata_add([metadata_origin]) # then @@ -1130,7 +1130,7 @@ id=data.origin_url_1, metadata=metadata, tool=data.tools["swh-metadata-detector"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, mappings=["mapping1"], ) ] @@ -1156,8 +1156,8 @@ "version": None, "name": None, } - metadata_rev_v1 = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir_v1 = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata_v1, mappings=[], indexer_configuration_id=tool_id, @@ -1167,11 +1167,11 @@ metadata=metadata_v1.copy(), indexer_configuration_id=tool_id, mappings=[], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # given - storage.revision_intrinsic_metadata_add([metadata_rev_v1]) + storage.directory_intrinsic_metadata_add([metadata_dir_v1]) storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when @@ -1185,7 +1185,7 @@ id=data.origin_url_1, metadata=metadata_v1, tool=data.tools["swh-metadata-detector"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, mappings=[], ) ] @@ -1199,16 +1199,16 @@ "author": "MG", } ) - metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2) + metadata_dir_v2 = attr.evolve(metadata_dir_v1, metadata=metadata_v2) metadata_origin_v2 = OriginIntrinsicMetadataRow( id=data.origin_url_1, metadata=metadata_v2.copy(), indexer_configuration_id=tool_id, mappings=["npm"], - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) - storage.revision_intrinsic_metadata_add([metadata_rev_v2]) + storage.directory_intrinsic_metadata_add([metadata_dir_v2]) storage.origin_intrinsic_metadata_add([metadata_origin_v2]) actual_metadata = list( @@ -1220,7 +1220,7 @@ id=data.origin_url_1, metadata=metadata_v2, tool=data.tools["swh-metadata-detector"], - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, mappings=["npm"], ) ] @@ -1252,8 +1252,8 @@ "mappings": [], } - metadata_rev_v1 = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir_v1 = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata={ "version": None, "name": None, @@ -1265,7 +1265,7 @@ data_v1 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, indexer_configuration_id=tool_id, **example_data1, ) @@ -1274,7 +1274,7 @@ data_v2 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, indexer_configuration_id=tool_id, **example_data2, ) @@ -1287,7 +1287,7 @@ data_v2b = list(reversed(data_v2[0:-1])) # given - storage.revision_intrinsic_metadata_add([metadata_rev_v1]) + storage.directory_intrinsic_metadata_add([metadata_dir_v1]) storage.origin_intrinsic_metadata_add(data_v1) # when @@ 
-1296,7 +1296,7 @@ expected_data_v1 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, tool=data.tools["swh-metadata-detector"], **example_data1, ) @@ -1326,7 +1326,7 @@ expected_data_v2 = [ OriginIntrinsicMetadataRow( id=origin, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, tool=data.tools["swh-metadata-detector"], **example_data2, ) @@ -1351,8 +1351,8 @@ "developmentStatus": None, "name": None, } - metadata_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata, mappings=["mapping1"], indexer_configuration_id=tool_id, @@ -1362,11 +1362,11 @@ metadata=metadata, indexer_configuration_id=tool_id, mappings=["mapping1"], - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata_rev]) + storage.directory_intrinsic_metadata_add([metadata_dir]) with pytest.raises(DuplicateId): storage.origin_intrinsic_metadata_add([metadata_origin, metadata_origin]) @@ -1381,8 +1381,8 @@ metadata1 = { "author": "John Doe", } - metadata1_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_1, + metadata1_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, @@ -1392,13 +1392,13 @@ metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) metadata2 = { "author": "Jane Doe", } - metadata2_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata2_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, @@ -1408,13 +1408,13 @@ metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata1_rev]) + storage.directory_intrinsic_metadata_add([metadata1_dir]) storage.origin_intrinsic_metadata_add([metadata1_origin]) - storage.revision_intrinsic_metadata_add([metadata2_rev]) + storage.directory_intrinsic_metadata_add([metadata2_dir]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then @@ -1444,8 +1444,8 @@ "Jane Doe", ] } - metadata1_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_1, + metadata1_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_1, metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, @@ -1455,7 +1455,7 @@ metadata=metadata1, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) metadata2 = { "author": [ @@ -1463,8 +1463,8 @@ "Jane Doe", ] } - metadata2_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata2_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, @@ -1474,13 +1474,13 @@ metadata=metadata2, mappings=[], indexer_configuration_id=tool_id, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) # when - storage.revision_intrinsic_metadata_add([metadata1_rev]) + storage.directory_intrinsic_metadata_add([metadata1_dir]) storage.origin_intrinsic_metadata_add([metadata1_origin]) - storage.revision_intrinsic_metadata_add([metadata2_rev]) + storage.directory_intrinsic_metadata_add([metadata2_dir]) 
storage.origin_intrinsic_metadata_add([metadata2_origin]) # then @@ -1508,8 +1508,8 @@ "@context": "foo", "author": "John Doe", } - metadata1_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_1, + metadata1_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_1, metadata=metadata1, mappings=["npm"], indexer_configuration_id=tool1_id, @@ -1519,14 +1519,14 @@ metadata=metadata1, mappings=["npm"], indexer_configuration_id=tool1_id, - from_revision=data.revision_id_1, + from_directory=data.directory_id_1, ) metadata2 = { "@context": "foo", "author": "Jane Doe", } - metadata2_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_2, + metadata2_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_2, metadata=metadata2, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, @@ -1536,13 +1536,13 @@ metadata=metadata2, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) metadata3 = { "@context": "foo", } - metadata3_rev = RevisionIntrinsicMetadataRow( - id=data.revision_id_3, + metadata3_dir = DirectoryIntrinsicMetadataRow( + id=data.directory_id_3, metadata=metadata3, mappings=["npm", "gemspec"], indexer_configuration_id=tool2_id, @@ -1552,14 +1552,14 @@ metadata=metadata3, mappings=["pkg-info"], indexer_configuration_id=tool2_id, - from_revision=data.revision_id_3, + from_directory=data.directory_id_3, ) - storage.revision_intrinsic_metadata_add([metadata1_rev]) + storage.directory_intrinsic_metadata_add([metadata1_dir]) storage.origin_intrinsic_metadata_add([metadata1_origin]) - storage.revision_intrinsic_metadata_add([metadata2_rev]) + storage.directory_intrinsic_metadata_add([metadata2_dir]) storage.origin_intrinsic_metadata_add([metadata2_origin]) - storage.revision_intrinsic_metadata_add([metadata3_rev]) + storage.directory_intrinsic_metadata_add([metadata3_dir]) storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer( @@ -1685,7 +1685,7 @@ }, mappings=["npm", "gemspec"], tool=tool2, - from_revision=data.revision_id_2, + from_directory=data.directory_id_2, ) ], next_page_token=None, diff --git a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py --- a/swh/indexer/tests/tasks.py +++ b/swh/indexer/tests/tasks.py @@ -1,13 +1,12 @@ from celery import current_app as app -from swh.indexer.metadata import OriginMetadataIndexer, RevisionMetadataIndexer +from swh.indexer.metadata import DirectoryMetadataIndexer, OriginMetadataIndexer from .test_metadata import ContentMetadataTestIndexer -from .test_origin_head import OriginHeadTestIndexer from .utils import BASE_TEST_CONFIG -class RevisionMetadataTestIndexer(RevisionMetadataIndexer): +class DirectoryMetadataTestIndexer(DirectoryMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. 
""" @@ -30,13 +29,12 @@ return {**BASE_TEST_CONFIG, "tools": []} def _prepare_sub_indexers(self): - self.origin_head_indexer = OriginHeadTestIndexer() - self.revision_metadata_indexer = RevisionMetadataTestIndexer() + self.directory_metadata_indexer = DirectoryMetadataTestIndexer() @app.task -def revision_intrinsic_metadata(*args, **kwargs): - indexer = RevisionMetadataTestIndexer() +def directory_intrinsic_metadata(*args, **kwargs): + indexer = DirectoryMetadataTestIndexer() indexer.run(*args, **kwargs) print("REV RESULT=", indexer.results) diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -16,13 +16,15 @@ from swh.indexer.cli import indexer_cli_group from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( + DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, - RevisionIntrinsicMetadataRow, ) from swh.journal.writer import get_journal_writer from swh.model.hashutil import hash_to_bytes from swh.model.model import OriginVisitStatus +from .utils import DIRECTORY2, REVISION + def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]: tools: List[Dict[str, Any]] = [ @@ -38,15 +40,15 @@ origin_metadata = [ OriginIntrinsicMetadataRow( id="file://dev/%04d" % origin_id, - from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)), + from_directory=hash_to_bytes("abcd{:0>36}".format(origin_id)), indexer_configuration_id=tools[origin_id % 2]["id"], metadata={"name": "origin %d" % origin_id}, mappings=["mapping%d" % (origin_id % 10)], ) for origin_id in range(nb_rows) ] - revision_metadata = [ - RevisionIntrinsicMetadataRow( + directory_metadata = [ + DirectoryIntrinsicMetadataRow( id=hash_to_bytes("abcd{:0>36}".format(origin_id)), indexer_configuration_id=tools[origin_id % 2]["id"], metadata={"name": "origin %d" % origin_id}, @@ -55,7 +57,7 @@ for origin_id in range(nb_rows) ] - idx_storage.revision_intrinsic_metadata_add(revision_metadata) + idx_storage.directory_intrinsic_metadata_add(directory_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool["id"] for tool in tools] @@ -400,7 +402,7 @@ return datetime.datetime.now(tz=datetime.timezone.utc) -def test_cli_journal_client( +def test_cli_journal_client_schedule( cli_runner, swh_config, indexer_scheduler, @@ -523,3 +525,131 @@ ], catch_exceptions=False, ) + + +@pytest.mark.parametrize("indexer_name", ["origin-intrinsic-metadata", "*"]) +def test_cli_journal_client_index( + cli_runner, + swh_config, + kafka_prefix: str, + kafka_server, + consumer: Consumer, + idx_storage, + storage, + mocker, + swh_indexer_config, + indexer_name: str, +): + """Test the 'swh indexer journal-client' cli tool.""" + journal_writer = get_journal_writer( + "kafka", + brokers=[kafka_server], + prefix=kafka_prefix, + client_id="test producer", + value_sanitizer=lambda object_type, value: value, + flush_timeout=3, # fail early if something is going wrong + ) + + visit_statuses = [ + OriginVisitStatus( + origin="file:///dev/zero", + visit=1, + date=now(), + status="full", + snapshot=None, + ), + OriginVisitStatus( + origin="file:///dev/foobar", + visit=2, + date=now(), + status="full", + snapshot=None, + ), + OriginVisitStatus( + origin="file:///tmp/spamegg", + visit=3, + date=now(), + status="full", + snapshot=None, + ), + OriginVisitStatus( + origin="file:///dev/0002", + visit=6, + date=now(), + status="full", + snapshot=None, + ), + OriginVisitStatus( # will 
be filtered out due to its 'partial' status + origin="file:///dev/0000", + visit=4, + date=now(), + status="partial", + snapshot=None, + ), + OriginVisitStatus( # will be filtered out due to its 'ongoing' status + origin="file:///dev/0001", + visit=5, + date=now(), + status="ongoing", + snapshot=None, + ), + ] + + journal_writer.write_additions("origin_visit_status", visit_statuses) + visit_statuses_full = [vs for vs in visit_statuses if vs.status == "full"] + storage.revision_add([REVISION]) + + mocker.patch( + "swh.indexer.metadata.get_head_swhid", + return_value=REVISION.swhid(), + ) + + mocker.patch( + "swh.indexer.metadata.DirectoryMetadataIndexer.index", + return_value=[ + DirectoryIntrinsicMetadataRow( + id=DIRECTORY2.id, + indexer_configuration_id=1, + mappings=["cff"], + metadata={"foo": "bar"}, + ) + ], + ) + result = cli_runner.invoke( + indexer_cli_group, + [ + "-C", + swh_config, + "journal-client", + indexer_name, + "--broker", + kafka_server, + "--prefix", + kafka_prefix, + "--group-id", + "test-consumer", + "--stop-after-objects", + len(visit_statuses), + ], + catch_exceptions=False, + ) + + # Check the output + expected_output = "Done.\n" + assert result.exit_code == 0, result.output + assert result.output == expected_output + + results = idx_storage.origin_intrinsic_metadata_get( + [status.origin for status in visit_statuses] + ) + expected_results = [ + OriginIntrinsicMetadataRow( + id=status.origin, + from_directory=DIRECTORY2.id, + tool={"id": 1, **swh_indexer_config["tools"]}, + mappings=["cff"], + metadata={"foo": "bar"}, + ) + for status in sorted(visit_statuses_full, key=lambda r: r.origin) + ] + assert sorted(results, key=lambda r: r.id) == expected_results diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py --- a/swh/indexer/tests/test_indexer.py +++ b/swh/indexer/tests/test_indexer.py @@ -11,13 +11,13 @@ from swh.indexer.indexer import ( ContentIndexer, ContentPartitionIndexer, + DirectoryIndexer, OriginIndexer, - RevisionIndexer, ) from swh.indexer.storage import PagedResult, Sha1 from swh.model.model import Content -from .utils import BASE_TEST_CONFIG +from .utils import BASE_TEST_CONFIG, DIRECTORY2 class _TestException(Exception): @@ -49,7 +49,7 @@ pass -class CrashingRevisionIndexer(CrashingIndexerMixin, RevisionIndexer): +class CrashingDirectoryIndexer(CrashingIndexerMixin, DirectoryIndexer): pass @@ -86,29 +86,43 @@ indexer.run([b"foo"]) -def test_revision_indexer_catch_exceptions(): - indexer = CrashingRevisionIndexer(config=BASE_TEST_CONFIG) +def test_directory_indexer_catch_exceptions(): + indexer = CrashingDirectoryIndexer(config=BASE_TEST_CONFIG) indexer.storage = Mock() - indexer.storage.revision_get.return_value = ["rev"] + indexer.storage.directory_get.return_value = [DIRECTORY2] assert indexer.run([b"foo"]) == {"status": "failed"} + assert indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) == { + "status": "failed" + } + indexer.catch_exceptions = False with pytest.raises(_TestException): indexer.run([b"foo"]) + with pytest.raises(_TestException): + indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) + def test_origin_indexer_catch_exceptions(): indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG) assert indexer.run(["http://example.org"]) == {"status": "failed"} + assert indexer.process_journal_objects( + {"origin": [{"url": "http://example.org"}]} + ) == {"status": "failed"} + indexer.catch_exceptions = False with pytest.raises(_TestException): 
indexer.run(["http://example.org"]) + with pytest.raises(_TestException): + indexer.process_journal_objects({"origin": [{"url": "http://example.org"}]}) + def test_content_partition_indexer_catch_exceptions(): indexer = CrashingContentPartitionIndexer( diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,24 +1,25 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json -import unittest +import logging from hypothesis import HealthCheck, given, settings, strategies +import pytest from swh.indexer.codemeta import CODEMETA_TERMS -from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer +from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping -from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow -from swh.indexer.tests.utils import DIRECTORY2, REVISION +from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow +from swh.indexer.tests.utils import DIRECTORY2 from swh.model.hashutil import hash_to_bytes -from swh.model.model import Directory, DirectoryEntry, Revision +from swh.model.model import Directory, DirectoryEntry from .utils import ( BASE_TEST_CONFIG, @@ -42,25 +43,21 @@ """ def parse_config_file(self, *args, **kwargs): - assert False, "should not be called; the rev indexer configures it." + assert False, "should not be called; the dir indexer configures it." 
-REVISION_METADATA_CONFIG = { +DIRECTORY_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } -class Metadata(unittest.TestCase): +class TestMetadata: """ Tests metadata_mock_tool tool for Metadata detection """ - def setUp(self): - """ - shows the entire diff in the results - """ - self.maxDiff = None + def setup_method(self): self.npm_mapping = MAPPINGS["NpmMapping"]() self.codemeta_mapping = MAPPINGS["CodemetaMapping"]() self.maven_mapping = MAPPINGS["MavenMapping"]() @@ -81,7 +78,7 @@ # when result = self.npm_mapping.translate(content) # then - self.assertEqual(declared_metadata, result) + assert declared_metadata == result def test_compute_metadata_cff(self): """ @@ -160,7 +157,7 @@ # when result = self.cff_mapping.translate(content) # then - self.assertEqual(expected, result) + assert expected == result def test_compute_metadata_npm(self): """ @@ -201,7 +198,7 @@ # when result = self.npm_mapping.translate(content) # then - self.assertEqual(declared_metadata, result) + assert declared_metadata == result def test_index_content_metadata_npm(self): """ @@ -275,7 +272,7 @@ del result.tool["id"] # The assertion below returns False sometimes because of nested lists - self.assertEqual(expected_results, results) + assert expected_results == results def test_npm_bugs_normalization(self): # valid dictionary @@ -287,15 +284,12 @@ } }""" result = self.npm_mapping.translate(package_json) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "issueTracker": "https://github.com/owner/project/issues", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "issueTracker": "https://github.com/owner/project/issues", + "type": "SoftwareSourceCode", + } # "invalid" dictionary package_json = b"""{ @@ -305,14 +299,11 @@ } }""" result = self.npm_mapping.translate(package_json) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "type": "SoftwareSourceCode", + } # string package_json = b"""{ @@ -320,15 +311,12 @@ "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "issueTracker": "https://github.com/owner/project/issues", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "issueTracker": "https://github.com/owner/project/issues", + "type": "SoftwareSourceCode", + } def test_npm_repository_normalization(self): # normal @@ -340,15 +328,12 @@ } }""" result = self.npm_mapping.translate(package_json) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "codeRepository": "git+https://github.com/npm/cli.git", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "codeRepository": "git+https://github.com/npm/cli.git", + "type": "SoftwareSourceCode", + } # missing url package_json = b"""{ @@ -358,14 +343,11 @@ } }""" result = self.npm_mapping.translate(package_json) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - 
"name": "foo", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "type": "SoftwareSourceCode", + } # github shortcut package_json = b"""{ @@ -379,7 +361,7 @@ "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } - self.assertEqual(result, expected_result) + assert result == expected_result # github shortshortcut package_json = b"""{ @@ -387,7 +369,7 @@ "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) - self.assertEqual(result, expected_result) + assert result == expected_result # gitlab shortcut package_json = b"""{ @@ -395,52 +377,48 @@ "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) - self.assertEqual( - result, + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "foo", + "codeRepository": "git+https://gitlab.com/user/repo.git", + "type": "SoftwareSourceCode", + } + + @pytest.mark.parametrize( + "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"] + ) + def test_detect_metadata_package_json(self, filename): + # given + df = [ { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "name": "foo", - "codeRepository": "git+https://gitlab.com/user/repo.git", - "type": "SoftwareSourceCode", + "sha1_git": b"abc", + "name": b"index.js", + "target": b"abc", + "length": 897, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"bcd", }, - ) - - def test_detect_metadata_package_json(self): - filenames = [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"] - - for filename in filenames: - with self.subTest(filename=filename): - # given - df = [ - { - "sha1_git": b"abc", - "name": b"index.js", - "target": b"abc", - "length": 897, - "status": "visible", - "type": "file", - "perms": 33188, - "dir_id": b"dir_a", - "sha1": b"bcd", - }, - { - "sha1_git": b"aab", - "name": filename, - "target": b"aab", - "length": 712, - "status": "visible", - "type": "file", - "perms": 33188, - "dir_id": b"dir_a", - "sha1": b"cde", - }, - ] - # when - results = detect_metadata(df) + { + "sha1_git": b"aab", + "name": filename, + "target": b"aab", + "length": 712, + "status": "visible", + "type": "file", + "perms": 33188, + "dir_id": b"dir_a", + "sha1": b"cde", + }, + ] + # when + results = detect_metadata(df) - expected_results = {"NpmMapping": [b"cde"]} - # then - self.assertEqual(expected_results, results) + expected_results = {"NpmMapping": [b"cde"]} + # then + assert expected_results == results def test_detect_metadata_codemeta_json_uppercase(self): # given @@ -473,7 +451,7 @@ expected_results = {"CodemetaMapping": [b"bcd"]} # then - self.assertEqual(expected_results, results) + assert expected_results == results def test_compute_metadata_valid_codemeta(self): raw_content = b"""{ @@ -580,7 +558,7 @@ "programmingLanguage": "JSON-LD", } result = self.codemeta_mapping.translate(raw_content) - self.assertEqual(result, expected_result) + assert result == expected_result def test_compute_metadata_codemeta_alternate_context(self): raw_content = b"""{ @@ -594,7 +572,7 @@ "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) - self.assertEqual(result, expected_result) + assert result == expected_result def test_compute_metadata_maven(self): raw_content = b""" @@ -625,33 +603,27 @@ """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - 
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", - "codeRepository": ( - "http://repo1.maven.org/maven2/com/mycompany/app/my-app" - ), - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", + "codeRepository": ( + "http://repo1.maven.org/maven2/com/mycompany/app/my-app" + ), + } def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } def test_compute_metadata_maven_almost_empty(self): raw_content = b""" @@ -659,81 +631,85 @@ """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } - def test_compute_metadata_maven_invalid_xml(self): + def test_compute_metadata_maven_invalid_xml(self, caplog): expected_warning = ( - "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" - "Error parsing XML from foo" + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error parsing XML from foo", ) + caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") raw_content = b""" """ - with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - self.assertEqual(cm.output, [expected_warning]) - self.assertEqual(result, None) + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == [expected_warning] + assert result is None raw_content = b""" """ - with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - self.assertEqual(cm.output, [expected_warning]) - self.assertEqual(result, None) + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == [expected_warning] + assert result is None - def test_compute_metadata_maven_unknown_encoding(self): + def test_compute_metadata_maven_unknown_encoding(self, caplog): expected_warning = ( - "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" - "Error detecting XML encoding from foo" + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error detecting XML encoding from foo", ) + caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") raw_content = b""" """ - with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - self.assertEqual(cm.output, [expected_warning]) - self.assertEqual(result, None) + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == 
[expected_warning] + assert result is None raw_content = b""" """ - with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - self.assertEqual(cm.output, [expected_warning]) - self.assertEqual(result, None) + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples == [expected_warning] + assert result is None - def test_compute_metadata_maven_invalid_encoding(self): + def test_compute_metadata_maven_invalid_encoding(self, caplog): expected_warning = [ # libexpat1 <= 2.2.10-2+deb11u1 [ ( - "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" - "Error unidecoding XML from foo" + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error unidecoding XML from foo", ) ], # libexpat1 >= 2.2.10-2+deb11u2 [ ( - "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" - "Error parsing XML from foo" + "swh.indexer.metadata_dictionary.maven.MavenMapping", + logging.WARNING, + "Error parsing XML from foo", ) ], ] + caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") raw_content = b""" """ - with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: - result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) - self.assertIn(cm.output, expected_warning) - self.assertEqual(result, None) + caplog.clear() + result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) + assert caplog.record_tuples in expected_warning + assert result is None def test_compute_metadata_maven_minimal(self): raw_content = b""" @@ -745,19 +721,16 @@ 1.2.3 """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" @@ -771,19 +744,16 @@ """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } raw_content = b""" @@ -794,18 +764,15 @@ """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - }, - ) + assert result == { + 
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } raw_content = b""" @@ -816,18 +783,15 @@ 1.2.3 """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } raw_content = b""" @@ -840,19 +804,16 @@ """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } raw_content = b""" @@ -860,14 +821,11 @@ 1.2.3 """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "version": "1.2.3", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.2.3", + } def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" @@ -882,19 +840,16 @@ """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "codeRepository": ( - "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" - ), - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "codeRepository": ( + "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" + ), + } def test_compute_metadata_maven_multiple(self): """Tests when there are multiple code repos and licenses.""" @@ -936,24 +891,21 @@ """ result = self.maven_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "Maven Default Project", - "identifier": "com.mycompany.app", - "version": "1.2.3", - "license": [ - "https://www.apache.org/licenses/LICENSE-2.0.txt", - "https://opensource.org/licenses/MIT", - ], - "codeRepository": [ - "http://repo1.maven.org/maven2/com/mycompany/app/my-app", - "http://example.org/maven2/com/mycompany/app/my-app", - ], - }, - ) + assert result == { + "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "identifier": "com.mycompany.app", + "version": "1.2.3", + "license": [ + "https://www.apache.org/licenses/LICENSE-2.0.txt", + "https://opensource.org/licenses/MIT", + ], + "codeRepository": [ + "http://repo1.maven.org/maven2/com/mycompany/app/my-app", + "http://example.org/maven2/com/mycompany/app/my-app", + ], + } def test_compute_metadata_pkginfo(self): raw_content = b"""\ @@ -987,40 +939,33 @@ Provides-Extra: testing """ # noqa result = self.pkginfo_mapping.translate(raw_content) - self.assertCountEqual( - result["description"], - [ - "Software Heritage core utilities", # note the comma here - "swh-core\n" - "========\n" - "\n" - "core library for swh's modules:\n" - "- config parser\n" - "- hash computations\n" - "- serialization\n" - "- logging mechanism\n" - "", - ], - result, - ) + assert result["description"] == [ + "Software Heritage core utilities", # note the comma here + "swh-core\n" + "========\n" + "\n" + "core library for swh's modules:\n" + "- config parser\n" + "- hash computations\n" + "- serialization\n" + "- logging mechanism\n" + "", + ], result del result["description"] - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "url": "https://forge.softwareheritage.org/diffusion/DCORE/", - "name": "swh.core", - "author": [ - { - "type": "Person", - "name": "Software Heritage developers", - "email": "swh-devel@inria.fr", - } - ], - "version": "0.0.49", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "url": "https://forge.softwareheritage.org/diffusion/DCORE/", + "name": "swh.core", + "author": [ + { + "type": "Person", + "name": "Software Heritage developers", + "email": "swh-devel@inria.fr", + } + ], + "version": "0.0.49", + } def test_compute_metadata_pkginfo_utf8(self): raw_content = b"""\ @@ -1031,15 +976,12 @@ Hydrology N\xc2\xb083 """ # noqa result = self.pkginfo_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "snowpyt", - "description": "foo\nHydrology N°83", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "snowpyt", + "description": "foo\nHydrology N°83", + } def test_compute_metadata_pkginfo_keywords(self): raw_content = b"""\ @@ -1048,15 +990,12 @@ Keywords: foo bar baz """ # noqa result = self.pkginfo_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "foo", - "keywords": ["foo", "bar", "baz"], - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "foo", + "keywords": ["foo", "bar", "baz"], + } def test_compute_metadata_pkginfo_license(self): raw_content = b"""\ @@ -1065,15 +1004,12 @@ License: MIT """ # noqa result = self.pkginfo_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "foo", - "license": "MIT", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "foo", + "license": "MIT", + } def 
test_gemspec_base(self): raw_content = b""" @@ -1090,23 +1026,20 @@ s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" result = self.gemspec_mapping.translate(raw_content) - self.assertCountEqual( - result.pop("description"), - ["This is an example!", "Much longer explanation of the example!"], - ) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "author": [{"type": "Person", "name": "Ruby Coder"}], - "name": "example", - "license": "https://spdx.org/licenses/MIT", - "codeRepository": "https://rubygems.org/gems/example", - "email": "rubycoder@example.com", - "version": "0.1.0", - }, - ) + assert set(result.pop("description")) == { + "This is an example!", + "Much longer explanation of the example!", + } + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"type": "Person", "name": "Ruby Coder"}], + "name": "example", + "license": "https://spdx.org/licenses/MIT", + "codeRepository": "https://rubygems.org/gems/example", + "email": "rubycoder@example.com", + "version": "0.1.0", + } def test_gemspec_two_author_fields(self): raw_content = b""" @@ -1115,20 +1048,20 @@ s.author = "Ruby Coder2" end""" result = self.gemspec_mapping.translate(raw_content) - self.assertCountEqual( - result.pop("author"), + assert result.pop("author") in ( [ {"type": "Person", "name": "Ruby Coder1"}, {"type": "Person", "name": "Ruby Coder2"}, ], + [ + {"type": "Person", "name": "Ruby Coder2"}, + {"type": "Person", "name": "Ruby Coder1"}, + ], ) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } def test_gemspec_invalid_author(self): raw_content = b""" @@ -1136,38 +1069,29 @@ s.author = ["Ruby Coder"] end""" result = self.gemspec_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } raw_content = b""" Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" result = self.gemspec_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + } raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" result = self.gemspec_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "author": [{"type": "Person", "name": "Ruby Coder1"}], - }, - ) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"type": "Person", "name": "Ruby Coder1"}], + } def test_gemspec_alternative_header(self): raw_content = b""" @@ -1179,15 +1103,12 @@ } """ result = self.gemspec_mapping.translate(raw_content) - self.assertEqual( - result, - { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", - "type": "SoftwareSourceCode", - "name": "rb-system-with-aliases", - "description": "execute system 
commands with aliases",
-            },
-        )
+        assert result == {
+            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+            "type": "SoftwareSourceCode",
+            "name": "rb-system-with-aliases",
+            "description": "execute system commands with aliases",
+        }
 
     @settings(suppress_health_check=[HealthCheck.too_slow])
     @given(json_document_strategy(keys=list(NpmMapping.mapping)))
@@ -1233,8 +1154,8 @@
         parts.append(b"end\n")
         self.gemspec_mapping.translate(b"".join(parts))
 
-    def test_revision_metadata_indexer(self):
-        metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+    def test_directory_metadata_indexer(self):
+        metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         fill_obj_storage(metadata_indexer.objstorage)
         fill_storage(metadata_indexer.storage)
 
@@ -1242,8 +1163,7 @@
             {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
         )
         assert tool is not None
-        rev = REVISION
-        assert rev.directory == DIRECTORY2.id
+        dir_ = DIRECTORY2
 
         metadata_indexer.idx_storage.content_metadata_add(
             [
@@ -1255,15 +1175,17 @@
             ]
         )
 
-        metadata_indexer.run([rev.id])
+        metadata_indexer.run([dir_.id])
 
         results = list(
-            metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
+            metadata_indexer.idx_storage.directory_intrinsic_metadata_get(
+                [DIRECTORY2.id]
+            )
         )
 
         expected_results = [
-            RevisionIntrinsicMetadataRow(
-                id=rev.id,
+            DirectoryIntrinsicMetadataRow(
+                id=dir_.id,
                 tool=TRANSLATOR_TOOL,
                 metadata=YARN_PARSER_METADATA,
                 mappings=["npm"],
@@ -1274,35 +1196,29 @@
             del result.tool["id"]
 
         # then
-        self.assertEqual(results, expected_results)
+        assert results == expected_results
 
-    def test_revision_metadata_indexer_single_root_dir(self):
-        metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
+    def test_directory_metadata_indexer_single_root_dir(self):
+        metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         fill_obj_storage(metadata_indexer.objstorage)
         fill_storage(metadata_indexer.storage)
 
         # Add a parent directory, that is the only directory at the root
-        # of the revision
-        rev = REVISION
-        assert rev.directory == DIRECTORY2.id
+        # of the directory
+        dir_ = DIRECTORY2
 
-        directory = Directory(
+        new_dir = Directory(
             entries=(
                 DirectoryEntry(
                     name=b"foobar-1.0.0",
                     type="dir",
-                    target=rev.directory,
+                    target=dir_.id,
                     perms=16384,
                 ),
             ),
         )
-        assert directory.id is not None
-        metadata_indexer.storage.directory_add([directory])
-
-        new_rev_dict = {**rev.to_dict(), "directory": directory.id}
-        new_rev_dict.pop("id")
-        new_rev = Revision.from_dict(new_rev_dict)
-        metadata_indexer.storage.revision_add([new_rev])
+        assert new_dir.id is not None
+        metadata_indexer.storage.directory_add([new_dir])
 
         tool = metadata_indexer.idx_storage.indexer_configuration_get(
             {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
@@ -1319,15 +1235,15 @@
             ]
         )
 
-        metadata_indexer.run([new_rev.id])
+        metadata_indexer.run([new_dir.id])
 
         results = list(
-            metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
+            metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
        )
 
         expected_results = [
-            RevisionIntrinsicMetadataRow(
-                id=new_rev.id,
+            DirectoryIntrinsicMetadataRow(
+                id=new_dir.id,
                 tool=TRANSLATOR_TOOL,
                 metadata=YARN_PARSER_METADATA,
                 mappings=["npm"],
@@ -1338,4 +1254,4 @@
             del result.tool["id"]
 
         # then
-        self.assertEqual(results, expected_results)
+        assert results == expected_results
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -1,15 +1,13 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import copy
 from datetime import datetime, timezone
-import unittest
 
 import pytest
 
-from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.origin_head import get_head_swhid
 from swh.indexer.tests.utils import fill_storage
 from swh.model.model import (
     Origin,
@@ -19,37 +17,9 @@
     SnapshotBranch,
     TargetType,
 )
+from swh.model.swhids import CoreSWHID
 from swh.storage.utils import now
 
-
-@pytest.fixture
-def swh_indexer_config(swh_indexer_config):
-    config = copy.deepcopy(swh_indexer_config)
-    config.update(
-        {
-            "tools": {
-                "name": "origin-metadata",
-                "version": "0.0.1",
-                "configuration": {},
-            },
-            "tasks": {
-                "revision_intrinsic_metadata": None,
-                "origin_intrinsic_metadata": None,
-            },
-        }
-    )
-    return config
-
-
-class OriginHeadTestIndexer(OriginHeadIndexer):
-    """Specific indexer whose configuration is enough to satisfy the
-    indexing tests.
-    """
-
-    def persist_index_computations(self, results):
-        self.results = results
-
-
 SAMPLE_SNAPSHOT = Snapshot(
     branches={
         b"foo": None,
@@ -61,156 +31,127 @@
 )
 
 
-class OriginHead(unittest.TestCase):
-    @pytest.fixture(autouse=True)
-    def init(self, swh_config):
-        super().setUp()
-        self.indexer = OriginHeadTestIndexer()
-        self.indexer.catch_exceptions = False
-        fill_storage(self.indexer.storage)
-
-    def test_git(self):
-        origin_url = "https://github.com/SoftwareHeritage/swh-storage"
-        self.indexer.run([origin_url])
-        rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm"
-        self.assertEqual(
-            self.indexer.results,
-            [
-                {
-                    "revision_id": rev_id,
-                    "origin_url": origin_url,
-                }
-            ],
-        )
-
-    def test_git_partial_snapshot(self):
-        """Checks partial snapshots are ignored."""
-        origin_url = "https://github.com/SoftwareHeritage/swh-core"
-        self.indexer.storage.origin_add([Origin(url=origin_url)])
-        visit = self.indexer.storage.origin_visit_add(
-            [
-                OriginVisit(
-                    origin=origin_url,
-                    date=datetime(2019, 2, 27, tzinfo=timezone.utc),
-                    type="git",
-                )
-            ]
-        )[0]
-        self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
-        visit_status = OriginVisitStatus(
-            origin=origin_url,
-            visit=visit.visit,
-            date=now(),
-            status="partial",
-            snapshot=SAMPLE_SNAPSHOT.id,
-        )
-        self.indexer.storage.origin_visit_status_add([visit_status])
-        self.indexer.run([origin_url])
-        self.assertEqual(self.indexer.results, [])
-
-    def test_vcs_missing_snapshot(self):
-        origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
-        self.indexer.storage.origin_add([Origin(url=origin_url)])
-        self.indexer.run([origin_url])
-        self.assertEqual(self.indexer.results, [])
-
-    def test_pypi_missing_branch(self):
-        origin_url = "https://pypi.org/project/abcdef/"
-        self.indexer.storage.origin_add(
-            [
-                Origin(
-                    url=origin_url,
-                )
-            ]
-        )
-        visit = self.indexer.storage.origin_visit_add(
-            [
-                OriginVisit(
-                    origin=origin_url,
-                    date=datetime(2019, 2, 27, tzinfo=timezone.utc),
-                    type="pypi",
-                )
-            ]
-        )[0]
-        self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
-        visit_status = OriginVisitStatus(
-            origin=origin_url,
-            visit=visit.visit,
-            date=now(),
-            status="full",
-            snapshot=SAMPLE_SNAPSHOT.id,
-        )
-        self.indexer.storage.origin_visit_status_add([visit_status])
-        self.indexer.run(["https://pypi.org/project/abcdef/"])
-        self.assertEqual(self.indexer.results, [])
-
-    def test_ftp(self):
-        origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
-        self.indexer.run([origin_url])
-        rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by"
-        self.assertEqual(
-            self.indexer.results,
-            [
-                {
-                    "revision_id": rev_id,
-                    "origin_url": origin_url,
-                }
-            ],
-        )
-
-    def test_ftp_missing_snapshot(self):
-        origin_url = "rsync://ftp.gnu.org/gnu/foobar"
-        self.indexer.storage.origin_add([Origin(url=origin_url)])
-        self.indexer.run([origin_url])
-        self.assertEqual(self.indexer.results, [])
-
-    def test_deposit(self):
-        origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
-        self.indexer.storage.origin_add([Origin(url=origin_url)])
-        self.indexer.run([origin_url])
-        rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb"
-        self.assertEqual(
-            self.indexer.results,
-            [
-                {
-                    "revision_id": rev_id,
-                    "origin_url": origin_url,
-                }
-            ],
-        )
-
-    def test_deposit_missing_snapshot(self):
-        origin_url = "https://forge.softwareheritage.org/source/foobar"
-        self.indexer.storage.origin_add(
-            [
-                Origin(
-                    url=origin_url,
-                )
-            ]
-        )
-        self.indexer.run([origin_url])
-        self.assertEqual(self.indexer.results, [])
-
-    def test_pypi(self):
-        origin_url = "https://pypi.org/project/limnoria/"
-        self.indexer.run([origin_url])
-
-        rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t"
-        self.assertEqual(
-            self.indexer.results,
-            [{"revision_id": rev_id, "origin_url": origin_url}],
-        )
-
-    def test_svn(self):
-        origin_url = "http://0-512-md.googlecode.com/svn/"
-        self.indexer.run([origin_url])
-        rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18"
-        self.assertEqual(
-            self.indexer.results,
-            [
-                {
-                    "revision_id": rev_id,
-                    "origin_url": origin_url,
-                }
-            ],
-        )
+@pytest.fixture
+def storage(swh_storage):
+    fill_storage(swh_storage)
+    return swh_storage
+
+
+def test_git(storage):
+    origin_url = "https://github.com/SoftwareHeritage/swh-storage"
+    assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+        "swh:1:rev:384b12006403cce45d6253e38f7bd77dacef726d"
+    )
+
+
+def test_git_partial_snapshot(storage):
+    """Checks partial snapshots are ignored."""
+    origin_url = "https://github.com/SoftwareHeritage/swh-core"
+    storage.origin_add([Origin(url=origin_url)])
+    visit = storage.origin_visit_add(
+        [
+            OriginVisit(
+                origin=origin_url,
+                date=datetime(2019, 2, 27, tzinfo=timezone.utc),
+                type="git",
+            )
+        ]
+    )[0]
+    storage.snapshot_add([SAMPLE_SNAPSHOT])
+    visit_status = OriginVisitStatus(
+        origin=origin_url,
+        visit=visit.visit,
+        date=now(),
+        status="partial",
+        snapshot=SAMPLE_SNAPSHOT.id,
+    )
+    storage.origin_visit_status_add([visit_status])
+    assert get_head_swhid(storage, origin_url) is None
+
+
+def test_vcs_missing_snapshot(storage):
+    origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
+    storage.origin_add([Origin(url=origin_url)])
+    assert get_head_swhid(storage, origin_url) is None
+
+
+def test_pypi_missing_branch(storage):
+    origin_url = "https://pypi.org/project/abcdef/"
+    storage.origin_add(
+        [
+            Origin(
+                url=origin_url,
+            )
+        ]
+    )
+    visit = storage.origin_visit_add(
+        [
+            OriginVisit(
+                origin=origin_url,
+                date=datetime(2019, 2, 27, tzinfo=timezone.utc),
+                type="pypi",
+            )
+        ]
+    )[0]
+    storage.snapshot_add([SAMPLE_SNAPSHOT])
+    visit_status = OriginVisitStatus(
+        origin=origin_url,
+        visit=visit.visit,
+        date=now(),
+        status="full",
+        snapshot=SAMPLE_SNAPSHOT.id,
+    )
+    storage.origin_visit_status_add([visit_status])
+    assert get_head_swhid(storage, origin_url) is None
+
+
+def test_ftp(storage):
+    origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
+    assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+        "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79"
+    )
+
+
+def test_ftp_missing_snapshot(storage):
+    origin_url = "rsync://ftp.gnu.org/gnu/foobar"
+    storage.origin_add([Origin(url=origin_url)])
+    assert get_head_swhid(storage, origin_url) is None
+
+
+def test_deposit(storage):
+    origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
+    storage.origin_add([Origin(url=origin_url)])
+    assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+        "swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb"
+    )
+
+
+def test_deposit_missing_snapshot(storage):
+    origin_url = "https://forge.softwareheritage.org/source/foobar"
+    storage.origin_add(
+        [
+            Origin(
+                url=origin_url,
+            )
+        ]
+    )
+    assert get_head_swhid(storage, origin_url) is None
+
+
+def test_pypi(storage):
+    origin_url = "https://old-pypi.example.org/project/limnoria/"
+    assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+        "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
+    )
+
+    origin_url = "https://pypi.org/project/limnoria/"
+    assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+        "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874"
+    )
+
+
+def test_svn(storage):
+    origin_url = "http://0-512-md.googlecode.com/svn/"
+    assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
+        "swh:1:rev:e43f72e12c88abece79a87b8c9ad232e1b773d18"
+    )
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -11,14 +11,14 @@
 from swh.indexer.metadata import OriginMetadataIndexer
 from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.indexer.storage.model import (
+    DirectoryIntrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
-    RevisionIntrinsicMetadataRow,
 )
 from swh.model.model import Origin
 from swh.storage.interface import StorageInterface
 
 from .test_metadata import TRANSLATOR_TOOL
-from .utils import REVISION, YARN_PARSER_METADATA
+from .utils import DIRECTORY2, YARN_PARSER_METADATA
 
 
 @pytest.fixture
@@ -29,7 +29,47 @@
     return cfg
 
 
-def test_origin_metadata_indexer(
+def test_origin_metadata_indexer_release(
+    swh_indexer_config,
+    idx_storage: IndexerStorageInterface,
+    storage: StorageInterface,
+    obj_storage,
+) -> None:
+    indexer = OriginMetadataIndexer(config=swh_indexer_config)
+    origin = "https://npm.example.org/yarn-parser"
+    indexer.run([origin])
+
+    tool = swh_indexer_config["tools"]
+
+    dir_id = DIRECTORY2.id
+    dir_metadata = DirectoryIntrinsicMetadataRow(
+        id=dir_id,
+        tool=tool,
+        metadata=YARN_PARSER_METADATA,
+        mappings=["npm"],
+    )
+    origin_metadata = OriginIntrinsicMetadataRow(
+        id=origin,
+        tool=tool,
+        from_directory=dir_id,
+        metadata=YARN_PARSER_METADATA,
+        mappings=["npm"],
+    )
+
+    dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    for dir_result in dir_results:
+        assert dir_result.tool
+        del dir_result.tool["id"]
+    assert dir_results == [dir_metadata]
+
+    orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
+    for orig_result in orig_results:
+        assert orig_result.tool
+        del orig_result.tool["id"]
+    assert orig_results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_revision(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
@@ -41,9 +81,9 @@
 
     tool = swh_indexer_config["tools"]
 
-    rev_id = REVISION.id
-    rev_metadata = RevisionIntrinsicMetadataRow(
-        id=rev_id,
+    dir_id = DIRECTORY2.id
+    dir_metadata = DirectoryIntrinsicMetadataRow(
+        id=dir_id,
         tool=tool,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
@@ -51,16 +91,16 @@
     origin_metadata = OriginIntrinsicMetadataRow(
         id=origin,
         tool=tool,
-        from_revision=rev_id,
+        from_directory=dir_id,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
     )
 
-    rev_results = list(idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    for rev_result in rev_results:
-        assert rev_result.tool
-        del rev_result.tool["id"]
-    assert rev_results == [rev_metadata]
+    dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    for dir_result in dir_results:
+        assert dir_result.tool
+        del dir_result.tool["id"]
+    assert dir_results == [dir_metadata]
 
     orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
     for orig_result in orig_results:
@@ -82,10 +122,10 @@
     indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
 
     origin = "https://github.com/librariesio/yarn-parser"
-    rev_id = REVISION.id
+    dir_id = DIRECTORY2.id
 
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert len(rev_results) == 1
+    dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    assert len(dir_results) == 1
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert len(orig_results) == 1
@@ -121,15 +161,15 @@
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.run([origin1, origin2])
 
-    rev_id = REVISION.id
+    dir_id = DIRECTORY2.id
 
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert rev_results == [
-        RevisionIntrinsicMetadataRow(
-            id=rev_id,
+    dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    assert dir_results == [
+        DirectoryIntrinsicMetadataRow(
+            id=dir_id,
             metadata=YARN_PARSER_METADATA,
             mappings=["npm"],
-            tool=rev_results[0].tool,
+            tool=dir_results[0].tool,
         )
     ]
 
@@ -140,7 +180,7 @@
     assert orig_results == [
         OriginIntrinsicMetadataRow(
             id=origin2,
-            from_revision=rev_id,
+            from_directory=dir_id,
             metadata=YARN_PARSER_METADATA,
             mappings=["npm"],
             tool=orig_results[0].tool,
@@ -148,7 +188,7 @@
     ]
 
 
-def test_origin_metadata_indexer_duplicate_revision(
+def test_origin_metadata_indexer_duplicate_directory(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
@@ -162,10 +202,10 @@
     origin2 = "https://github.com/librariesio/yarn-parser.git"
     indexer.run([origin1, origin2])
 
-    rev_id = REVISION.id
+    dir_id = DIRECTORY2.id
 
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert len(rev_results) == 1
+    dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    assert len(dir_results) == 1
 
     orig_results = list(
         indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
@@ -185,10 +225,10 @@
     with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
         indexer.run([origin])
 
-    rev_id = REVISION.id
+    dir_id = DIRECTORY2.id
 
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert rev_results == []
+    dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    assert dir_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
@@ -204,16 +244,16 @@
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
    origin = "https://github.com/librariesio/yarn-parser"
     with patch(
-        "swh.indexer.metadata.RevisionMetadataIndexer"
-        ".translate_revision_intrinsic_metadata",
+        "swh.indexer.metadata.DirectoryMetadataIndexer"
+        ".translate_directory_intrinsic_metadata",
         return_value=(["npm"], {"@context": "foo"}),
     ):
         indexer.run([origin])
 
-    rev_id = REVISION.id
+    dir_id = DIRECTORY2.id
 
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert rev_results == []
+    dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    assert dir_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
@@ -229,16 +269,16 @@
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     with patch(
-        "swh.indexer.metadata.RevisionMetadataIndexer"
-        ".translate_revision_intrinsic_metadata",
+        "swh.indexer.metadata.DirectoryMetadataIndexer"
+        ".translate_directory_intrinsic_metadata",
         return_value=None,
     ):
         indexer.run([origin])
 
-    rev_id = REVISION.id
+    dir_id = DIRECTORY2.id
 
-    rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
-    assert rev_results == []
+    dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    assert dir_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
@@ -252,5 +292,5 @@
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
 
-    result = indexer.index_list(["https://unknown.org/foo"])
+    result = indexer.index_list([Origin("https://unknown.org/foo")])
     assert not result
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -19,10 +19,12 @@
     Content,
     Directory,
     DirectoryEntry,
+    ObjectType,
     Origin,
     OriginVisit,
     OriginVisitStatus,
     Person,
+    Release,
     Revision,
     RevisionType,
     Snapshot,
@@ -39,27 +41,26 @@
 }
 
 
-ORIGINS = [
-    Origin(url="https://github.com/SoftwareHeritage/swh-storage"),
-    Origin(url="rsync://ftp.gnu.org/gnu/3dldf"),
-    Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"),
-    Origin(url="https://pypi.org/project/limnoria/"),
-    Origin(url="http://0-512-md.googlecode.com/svn/"),
-    Origin(url="https://github.com/librariesio/yarn-parser"),
-    Origin(url="https://github.com/librariesio/yarn-parser.git"),
-]
-
-
 ORIGIN_VISITS = [
-    {"type": "git", "origin": ORIGINS[0].url},
-    {"type": "ftp", "origin": ORIGINS[1].url},
-    {"type": "deposit", "origin": ORIGINS[2].url},
-    {"type": "pypi", "origin": ORIGINS[3].url},
-    {"type": "svn", "origin": ORIGINS[4].url},
-    {"type": "git", "origin": ORIGINS[5].url},
-    {"type": "git", "origin": ORIGINS[6].url},
+    {"type": "git", "origin": "https://github.com/SoftwareHeritage/swh-storage"},
+    {"type": "ftp", "origin": "rsync://ftp.gnu.org/gnu/3dldf"},
+    {
+        "type": "deposit",
+        "origin": "https://forge.softwareheritage.org/source/jesuisgpl/",
+    },
+    {
+        "type": "pypi",
+        "origin": "https://old-pypi.example.org/project/limnoria/",
+    },  # with rev head
+    {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"},  # with rel head
+    {"type": "svn", "origin": "http://0-512-md.googlecode.com/svn/"},
+    {"type": "git", "origin": "https://github.com/librariesio/yarn-parser"},
+    {"type": "git", "origin": "https://github.com/librariesio/yarn-parser.git"},
+    {"type": "git", "origin": "https://npm.example.org/yarn-parser"},
 ]
 
+ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS]
+
 
 DIRECTORY = Directory(
     id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
@@ -97,6 +98,8 @@
     ),
 )
 
+_utc_plus_2 = datetime.timezone(datetime.timedelta(minutes=120))
+
 REVISION = Revision(
     id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"),
     message=b"Improve search functionality",
@@ -111,28 +114,12 @@
         email=b"andrewnez@gmail.com",
     ),
     committer_date=TimestampWithTimezone.from_datetime(
-        datetime.datetime(
-            2013,
-            10,
-            4,
-            12,
-            50,
-            49,
-            tzinfo=datetime.timezone(datetime.timedelta(minutes=120)),
-        )
+        datetime.datetime(2013, 10, 4, 12, 50, 49, tzinfo=_utc_plus_2)
     ),
     type=RevisionType.GIT,
     synthetic=False,
     date=TimestampWithTimezone.from_datetime(
-        datetime.datetime(
-            2017,
-            2,
-            20,
-            16,
-            14,
-            16,
-            tzinfo=datetime.timezone(datetime.timedelta(minutes=120)),
-        )
+        datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2)
     ),
     directory=DIRECTORY2.id,
     parents=(),
@@ -140,7 +127,26 @@
 
 REVISIONS = [REVISION]
 
+RELEASE = Release(
+    name=b"v0.0.0",
+    message=None,
+    author=Person(
+        name=b"Andrew Nesbitt",
+        fullname=b"Andrew Nesbitt <andrewnez@gmail.com>",
+        email=b"andrewnez@gmail.com",
+    ),
+    synthetic=False,
+    date=TimestampWithTimezone.from_datetime(
+        datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2)
+    ),
+    target_type=ObjectType.DIRECTORY,
+    target=DIRECTORY2.id,
+)
+
+RELEASES = [RELEASE]
+
 SNAPSHOTS = [
+    # https://github.com/SoftwareHeritage/swh-storage
     Snapshot(
         id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"),
         branches={
@@ -161,6 +167,7 @@
             ),
         },
     ),
+    # rsync://ftp.gnu.org/gnu/3dldf
     Snapshot(
         id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"),
         branches={
@@ -186,6 +193,7 @@
             ),
         },
     ),
+    # https://forge.softwareheritage.org/source/jesuisgpl/
     Snapshot(
         id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"),
         branches={
@@ -195,6 +203,7 @@
             )
         },
     ),
+    # https://old-pypi.example.org/project/limnoria/
     Snapshot(
         id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"),
         branches={
@@ -211,6 +220,23 @@
             ),
         },
     ),
+    # https://pypi.org/project/limnoria/
+    Snapshot(
+        branches={
+            b"HEAD": SnapshotBranch(
+                target=b"releases/2018.09.09", target_type=TargetType.ALIAS
+            ),
+            b"releases/2018.09.01": SnapshotBranch(
+                target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf",
+                target_type=TargetType.RELEASE,
+            ),
+            b"releases/2018.09.09": SnapshotBranch(
+                target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t",  # noqa
+                target_type=TargetType.RELEASE,
+            ),
+        },
+    ),
+    # http://0-512-md.googlecode.com/svn/
     Snapshot(
         id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"),
         branches={
@@ -220,6 +246,7 @@
             )
         },
     ),
+    # https://github.com/librariesio/yarn-parser
     Snapshot(
         id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
         branches={
@@ -229,6 +256,7 @@
             )
         },
     ),
+    # https://github.com/librariesio/yarn-parser.git
     Snapshot(
         id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
         branches={
@@ -238,8 +266,19 @@
             )
         },
     ),
+    # https://npm.example.org/yarn-parser
+    Snapshot(
+        branches={
+            b"HEAD": SnapshotBranch(
+                target=RELEASE.id,
+                target_type=TargetType.RELEASE,
+            )
+        },
+    ),
 ]
 
+assert len(SNAPSHOTS) == len(ORIGIN_VISITS)
+
 
 SHA1_TO_LICENSES = {
     "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"],
@@ -582,6 +621,7 @@
     storage.origin_add(ORIGINS)
     storage.directory_add([DIRECTORY, DIRECTORY2])
     storage.revision_add(REVISIONS)
+    storage.release_add(RELEASES)
     storage.snapshot_add(SNAPSHOTS)
 
     for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):