diff --git a/PKG-INFO b/PKG-INFO
index 4912824..3fa533b 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,64 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.9.0
+Version: 2.9.1
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS

 swh-indexer
 ============

 Tools to compute multiple indexes on SWH's raw contents:

 - content:
   - mimetype
-  - ctags
-  - language
   - fossology-license
   - metadata
-- revision:
-  - metadata
+- origin:
+  - metadata (intrinsic, using the content indexer; and extrinsic)

 An indexer is in charge of:

 - looking up objects
 - extracting information from those objects
 - storing that information in the swh-indexer db

 There are multiple indexers working on different object types:

 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers

 Indexation procedure:

 - receive a batch of ids
 - retrieve the associated data, depending on object type
 - compute some index for that object
 - store the result in swh's storage

 Current content indexers:

 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and
   mimetype

-- language (queue swh_indexer_content_language): detect the
-  programming language
-
-- ctags (queue swh_indexer_content_ctags): compute tags information
-
 - fossology-license (queue swh_indexer_fossology_license): compute the license

-- metadata: translate file into translated_metadata dict
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta vocabulary)

-Current revision indexers:
+Current origin indexers:

-- metadata: detects files containing metadata and retrieves translated_metadata
-  in content_metadata table in storage or run content indexer to translate
-  files.
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta and ForgeFed vocabularies)
diff --git a/README.md b/README.md
index f4f2481..56e255b 100644
--- a/README.md
+++ b/README.md
@@ -1,49 +1,42 @@
 swh-indexer
 ============

 Tools to compute multiple indexes on SWH's raw contents:

 - content:
   - mimetype
-  - ctags
-  - language
   - fossology-license
   - metadata
-- revision:
-  - metadata
+- origin:
+  - metadata (intrinsic, using the content indexer; and extrinsic)

 An indexer is in charge of:

 - looking up objects
 - extracting information from those objects
 - storing that information in the swh-indexer db

 There are multiple indexers working on different object types:

 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers

 Indexation procedure:

 - receive a batch of ids
 - retrieve the associated data, depending on object type
 - compute some index for that object
 - store the result in swh's storage

 Current content indexers:

 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and
   mimetype

-- language (queue swh_indexer_content_language): detect the
-  programming language
-
-- ctags (queue swh_indexer_content_ctags): compute tags information
-
 - fossology-license (queue swh_indexer_fossology_license): compute the license

-- metadata: translate file into translated_metadata dict
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta vocabulary)

-Current revision indexers:
+Current origin indexers:

-- metadata: detects files containing metadata and retrieves translated_metadata
-  in content_metadata table in storage or run content indexer to translate
-  files.
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta and ForgeFed vocabularies)
diff --git a/docs/README.md b/docs/README.md
index f4f2481..56e255b 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,49 +1,42 @@
 swh-indexer
 ============

 Tools to compute multiple indexes on SWH's raw contents:

 - content:
   - mimetype
-  - ctags
-  - language
   - fossology-license
   - metadata
-- revision:
-  - metadata
+- origin:
+  - metadata (intrinsic, using the content indexer; and extrinsic)

 An indexer is in charge of:

 - looking up objects
 - extracting information from those objects
 - storing that information in the swh-indexer db

 There are multiple indexers working on different object types:

 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers

 Indexation procedure:

 - receive a batch of ids
 - retrieve the associated data, depending on object type
 - compute some index for that object
 - store the result in swh's storage

 Current content indexers:

 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and
   mimetype

-- language (queue swh_indexer_content_language): detect the
-  programming language
-
-- ctags (queue swh_indexer_content_ctags): compute tags information
-
 - fossology-license (queue swh_indexer_fossology_license): compute the license

-- metadata: translate file into translated_metadata dict
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta vocabulary)

-Current revision indexers:
+Current origin indexers:

-- metadata: detects files containing metadata and retrieves translated_metadata
-  in content_metadata table in storage or run content indexer to translate
-  files.
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta and ForgeFed vocabularies)
diff --git a/docs/dev-info.rst b/docs/dev-info.rst
index 9ef8497..4720098 100644
--- a/docs/dev-info.rst
+++ b/docs/dev-info.rst
@@ -1,198 +1,190 @@
 Hacking on swh-indexer
 ======================

 This tutorial will guide you through hacking on swh-indexer. If you do not
 have a local copy of the Software Heritage archive, go to the
 :ref:`getting started tutorial `.

 Configuration files
 -------------------

 You will need the following YAML configuration files to run the swh-indexer
 commands:

 - Orchestrator at ``~/.config/swh/indexer/orchestrator.yml``

 .. code-block:: yaml

   indexers:
     mimetype:
       check_presence: false
       batch_size: 100

 - Orchestrator-text at ``~/.config/swh/indexer/orchestrator-text.yml``

 .. code-block:: yaml

   indexers:
-    # language:
-    #   batch_size: 10
-    #   check_presence: false
     fossology_license:
       batch_size: 10
       check_presence: false
-    # ctags:
-    #   batch_size: 2
-    #   check_presence: false

 - Mimetype indexer at ``~/.config/swh/indexer/mimetype.yml``

 .. code-block:: yaml

   # storage to read sha1's metadata (path)
   # storage:
   #   cls: local
   #   db: "service=swh-dev"
   #   objstorage:
   #     cls: pathslicing
   #     root: /home/storage/swh-storage/
   #     slicing: 0:1/1:5

   storage:
     cls: remote
     url: http://localhost:5002/

   indexer_storage:
     cls: remote
     args:
       url: http://localhost:5007/

   # storage to read sha1's content
   # adapt this to your need
   # locally: this needs to match your storage's setup
   objstorage:
     cls: pathslicing
     slicing: 0:1/1:5
     root: /home/storage/swh-storage/

   destination_task: swh.indexer.tasks.SWHOrchestratorTextContentsTask
   rescheduling_task: swh.indexer.tasks.SWHContentMimetypeTask

 - Fossology indexer at ``~/.config/swh/indexer/fossology_license.yml``

 .. code-block:: yaml

   # storage to read sha1's metadata (path)
   # storage:
   #   cls: local
   #   db: "service=swh-dev"
   #   objstorage:
   #     cls: pathslicing
   #     root: /home/storage/swh-storage/
   #     slicing: 0:1/1:5

   storage:
     cls: remote
     url: http://localhost:5002/

   indexer_storage:
     cls: remote
     args:
       url: http://localhost:5007/

   # storage to read sha1's content
   # adapt this to your need
   # locally: this needs to match your storage's setup
   objstorage:
     cls: pathslicing
     slicing: 0:1/1:5
     root: /home/storage/swh-storage/

   workdir: /tmp/swh/worker.indexer/license/

   tools:
     name: 'nomos'
     version: '3.1.0rc2-31-ga2cbb8c'
     configuration:
       command_line: 'nomossa '

 - Worker at ``~/.config/swh/worker.yml``

 .. code-block:: yaml

   task_broker: amqp://guest@localhost//
   task_modules:
     - swh.loader.svn.tasks
     - swh.loader.tar.tasks
     - swh.loader.git.tasks
     - swh.storage.archiver.tasks
     - swh.indexer.tasks
     - swh.indexer.orchestrator
   task_queues:
     - swh_loader_svn
     - swh_loader_tar
     - swh_reader_git_to_azure_archive
     - swh_storage_archive_worker_to_backend
     - swh_indexer_orchestrator_content_all
     - swh_indexer_orchestrator_content_text
     - swh_indexer_content_mimetype
-    - swh_indexer_content_language
-    - swh_indexer_content_ctags
     - swh_indexer_content_fossology_license
     - swh_loader_svn_mount_and_load
     - swh_loader_git_express
     - swh_loader_git_archive
     - swh_loader_svn_archive
   task_soft_time_limit: 0

 Database
 --------

 swh-indexer uses a database to store the indexed content. The default
 db is expected to be called swh-indexer-dev.

 Create or add ``swh-dev`` and ``swh-indexer-dev`` to the
 ``~/.pg_service.conf`` and ``~/.pgpass`` files, which are PostgreSQL's
 configuration files.
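For reference, here is a minimal sketch of what these entries could look like.
The host, port, user, and password values below are placeholders (not taken
from the repository); adapt them to your local setup:

.. code-block:: ini

  # ~/.pg_service.conf (hypothetical values)
  [swh-dev]
  host=localhost
  port=5432
  dbname=swh-dev
  user=swhdev

  [swh-indexer-dev]
  host=localhost
  port=5432
  dbname=swh-indexer-dev
  user=swhdev

The matching ``~/.pgpass`` lines use PostgreSQL's
``host:port:database:user:password`` format (the file must have 0600
permissions):

.. code-block:: text

  localhost:5432:swh-dev:swhdev:secret
  localhost:5432:swh-indexer-dev:swhdev:secret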
 Add data to local DB
 --------------------

 From within the ``swh-environment``, run the following command::

   make rebuild-testdata

 and fetch some real data to work with, using::

   python3 -m swh.loader.git.updater --origin-url

 Then you can list all content files using this script::

   #!/usr/bin/env bash

   psql service=swh-dev -c "copy (select sha1 from content) to stdout" | sed -e 's/^\\x//g'

 Run the indexers
 -----------------

 Use the list of contents to feed the indexers with the following
 command::

   ./list-sha1.sh | python3 -m swh.indexer.producer --batch 100 --task-name orchestrator_all

 Activate the workers
 --------------------

 To send messages to different queues using rabbitmq (which should already be
 installed through dependencies installation), run the following command in a
 dedicated terminal::

   python3 -m celery worker --app=swh.scheduler.celery_backend.config.app \
                            --pool=prefork \
                            --concurrency=1 \
                            -Ofair \
                            --loglevel=info \
                            --without-gossip \
                            --without-mingle \
                            --without-heartbeat 2>&1

 With this command, rabbitmq will consume messages using the worker
 configuration file.

 Note: for the fossology_license indexer, you need the fossology-nomossa
 package, which is in our `public debian repository `_.
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index 4912824..3fa533b 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,71 +1,64 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.9.0
+Version: 2.9.1
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS

 swh-indexer
 ============

 Tools to compute multiple indexes on SWH's raw contents:

 - content:
   - mimetype
-  - ctags
-  - language
   - fossology-license
   - metadata
-- revision:
-  - metadata
+- origin:
+  - metadata (intrinsic, using the content indexer; and extrinsic)

 An indexer is in charge of:

 - looking up objects
 - extracting information from those objects
 - storing that information in the swh-indexer db

 There are multiple indexers working on different object types:

 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers

 Indexation procedure:

 - receive a batch of ids
 - retrieve the associated data, depending on object type
 - compute some index for that object
 - store the result in swh's storage

 Current content indexers:

 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and
   mimetype

-- language (queue swh_indexer_content_language): detect the
-  programming language
-
-- ctags (queue swh_indexer_content_ctags): compute tags information
-
 - fossology-license (queue swh_indexer_fossology_license): compute the license

-- metadata: translate file into translated_metadata dict
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta vocabulary)

-Current revision indexers:
+Current origin indexers:

-- metadata: detects files containing metadata and retrieves translated_metadata
-  in content_metadata table in storage or run content indexer to translate
-  files.
+- metadata: translate files from ecosystem-specific formats to JSON-LD
+  (using the schema.org/CodeMeta and ForgeFed vocabularies)
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 14212a3..5a7a25c 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,567 +1,567 @@
 # Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from copy import deepcopy
 import hashlib
-import itertools
 import logging
 import time
 from typing import (
     Any,
     Callable,
     Dict,
     Iterable,
     Iterator,
     List,
     Optional,
     Tuple,
     TypeVar,
     cast,
 )
 from urllib.parse import urlparse

 import pkg_resources
 import sentry_sdk

 from swh.core.config import merge_configs
 from swh.core.utils import grouper
 from swh.indexer.codemeta import merge_documents
 from swh.indexer.indexer import (
     BaseIndexer,
     ContentIndexer,
     DirectoryIndexer,
     ObjectsDict,
     OriginIndexer,
 )
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS
 from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
 from swh.indexer.origin_head import get_head_swhid
 from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
 from swh.indexer.storage.model import (
     ContentMetadataRow,
     DirectoryIntrinsicMetadataRow,
     OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from swh.model import hashutil
 from swh.model.model import Directory, MetadataAuthorityType
 from swh.model.model import ObjectType as ModelObjectType
 from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git
 from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType

 REVISION_GET_BATCH_SIZE = 10
 RELEASE_GET_BATCH_SIZE = 10
 ORIGIN_GET_BATCH_SIZE = 10

 T1 = TypeVar("T1")
 T2 = TypeVar("T2")

 logger = logging.getLogger(__name__)


 def call_with_batches(
     f: Callable[[List[T1]], Iterable[T2]],
     args: List[T1],
     batch_size: int,
 ) -> Iterator[T2]:
     """Calls a function with batches of args, and concatenates the results."""
     groups = grouper(args, batch_size)
     for group in groups:
         yield from f(list(group))


 class ExtrinsicMetadataIndexer(
     BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow]
 ):
     def process_journal_objects(self, objects: ObjectsDict) -> Dict:
         summary: Dict[str, Any] = {"status": "uneventful"}
         try:
             results = {}
             for item in objects.get("raw_extrinsic_metadata", []):
                 remd = RawExtrinsicMetadata.from_dict(item)
                 sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid()))
-                results[remd.target] = self.index(remd.id, data=remd)
+                for result in self.index(remd.id, data=remd):
+                    results[result.id] = result
         except Exception:
             if not self.catch_exceptions:
                 raise
             summary["status"] = "failed"
             return summary

-        self.results = list(itertools.chain.from_iterable(results.values()))
+        self.results = list(results.values())
         summary_persist = self.persist_index_computations(self.results)
         if summary_persist:
             for value in summary_persist.values():
                 if value > 0:
                     summary["status"] = "eventful"
             summary.update(summary_persist)
         return summary

     def index(
         self,
         id: Sha1Git,
data: Optional[RawExtrinsicMetadata], **kwargs, ) -> List[OriginExtrinsicMetadataRow]: if data is None: raise NotImplementedError( "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" ) if data.target.object_type == ExtendedObjectType.ORIGIN: origin_sha1 = data.target.object_id elif data.origin is not None: # HACK: As swh-search does (yet?) not support searching on directories # and traversing back to origins, we index metadata on non-origins with # an origin context as if they were on the origin itself. origin_sha1 = hashlib.sha1(data.origin.encode()).digest() else: # other types are not supported yet return [] if data.authority.type == MetadataAuthorityType.REGISTRY: # metadata provided by a third-party; don't trust it # (technically this could be handled below, but we check it here # to return early; sparing a translation and origin lookup) # TODO: add ways to define trusted authorities return [] metadata_items = [] mappings: List[str] = [] for mapping_cls in EXTRINSIC_MAPPINGS.values(): if data.format in mapping_cls.extrinsic_metadata_formats(): mapping = mapping_cls() metadata_item = mapping.translate(data.metadata) if metadata_item is not None: metadata_items.append(metadata_item) mappings.append(mapping.name) if not metadata_items: # Don't have any mapping to parse it, ignore return [] # TODO: batch requests to origin_get_by_sha1() for _ in range(6): origins = self.storage.origin_get_by_sha1([origin_sha1]) try: (origin,) = origins if origin is not None: break except ValueError: pass # The origin does not exist. This may be due to some replication lag # between the loader's DB/journal and the DB we are consuming from. # Wait a bit and try again logger.debug("Origin %s not found, sleeping for 10s.", data.target) time.sleep(10) else: # Does not exist, or replication lag > 60s. raise ValueError(f"Unknown origin {data.target}") from None if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc: # metadata provided by a third-party; don't trust it # TODO: add ways to define trusted authorities return [] metadata = merge_documents(metadata_items) return [ OriginExtrinsicMetadataRow( id=origin["url"], indexer_configuration_id=self.tool["id"], from_remd_id=data.id, mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[OriginExtrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.origin_extrinsic_metadata_add(results) class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.content_metadata_missing( ( { "id": sha1, "indexer_configuration_id": self.tool["id"], } for sha1 in ids ) ) def index( self, id: Sha1, data: Optional[bytes] = None, log_suffix="unknown directory", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. Args: id: content's identifier data: raw content in bytes Returns: dict: dictionary representing a content_metadata. 
If the translation wasn't successful the metadata keys will be returned as None """ assert isinstance(id, bytes) assert data is not None metadata = None try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) sentry_sdk.capture_exception() if metadata is None: return [] return [ ContentMetadataRow( id=id, indexer_configuration_id=self.tool["id"], metadata=metadata, ) ] def persist_index_computations( self, results: List[ContentMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.content_metadata_add(results) DEFAULT_CONFIG: Dict[str, Any] = { "tools": { "name": "swh.indexer.metadata", "version": pkg_resources.get_distribution("swh.indexer").version, "configuration": {}, }, } class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): """Directory-level indexer This indexer is in charge of: - filtering directories already indexed in directory_intrinsic_metadata table with defined computation tool - retrieve all entry_files in directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for directory """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = merge_configs(DEFAULT_CONFIG, self.config) def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.directory_intrinsic_metadata_missing( ( { "id": sha1_git, "indexer_configuration_id": self.tool["id"], } for sha1_git in sha1_gits ) ) def index( self, id: Sha1Git, data: Optional[Directory] = None, **kwargs ) -> List[DirectoryIntrinsicMetadataRow]: """Index directory by processing it and organizing result. use metadata_detector to iterate on filenames, passes them to the content indexers, then merges (if more than one) Args: id: sha1_git of the directory data: should always be None Returns: dict: dictionary representing a directory_intrinsic_metadata, with keys: - id: directory's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ dir_: List[DirectoryLsEntry] assert data is None, "Unexpected directory object" dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(id, recursive=False)), ) try: if [entry["type"] for entry in dir_] == ["dir"]: # If the root is just a single directory, recurse into it # eg. 
PyPI packages, GNU tarballs subdir = dir_[0]["target"] dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(subdir, recursive=False)), ) files = [entry for entry in dir_ if entry["type"] == "file"] (mappings, metadata) = self.translate_directory_intrinsic_metadata( files, log_suffix="directory=%s" % hashutil.hash_to_hex(id), ) except Exception as e: self.log.exception("Problem when indexing dir: %r", e) sentry_sdk.capture_exception() return [] return [ DirectoryIntrinsicMetadataRow( id=id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" # TODO: add functions in storage to keep data in # directory_intrinsic_metadata return self.idx_storage.directory_intrinsic_metadata_add(results) def translate_directory_intrinsic_metadata( self, files: List[DirectoryLsEntry], log_suffix: str ) -> Tuple[List[Any], Any]: """ Determine plan of action to translate metadata in the given root directory Args: files: list of file entries, as returned by :meth:`swh.storage.interface.StorageInterface.directory_ls` Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ metadata = [] # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage", "tools"] } all_detected_files = detect_metadata(files) used_mappings = [ INTRINSIC_MAPPINGS[context].name for context in all_detected_files ] for (mapping_name, detected_files) in all_detected_files.items(): cfg = deepcopy(config) cfg["tools"]["configuration"]["context"] = mapping_name c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get(detected_files) for c in metadata_generator: # extracting metadata sha1 = c.id sha1s_in_storage.append(sha1) local_metadata = c.metadata # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files if item not in sha1s_in_storage ] if sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result.metadata metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") sentry_sdk.capture_exception() metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer( OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( self, origins: List[Origin], *, check_origin_known: bool = True, **kwargs, ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] head_rel_ids = [] origin_heads: Dict[Origin, CoreSWHID] = {} # Filter out origins not in the storage if check_origin_known: known_origins = list( call_with_batches( self.storage.origin_get, [origin.url for origin in origins], ORIGIN_GET_BATCH_SIZE, ) ) else: known_origins = list(origins) for origin in known_origins: if origin is None: continue head_swhid = 
get_head_swhid(self.storage, origin.url) if head_swhid: origin_heads[origin] = head_swhid if head_swhid.object_type == ObjectType.REVISION: head_rev_ids.append(head_swhid.object_id) elif head_swhid.object_type == ObjectType.RELEASE: head_rel_ids.append(head_swhid.object_id) else: assert False, head_swhid head_revs = dict( zip( head_rev_ids, call_with_batches( self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE ), ) ) head_rels = dict( zip( head_rel_ids, call_with_batches( self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE ), ) ) results = [] for (origin, head_swhid) in origin_heads.items(): sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) sentry_sdk.set_tag("swh-indexer-origin-head-swhid", str(head_swhid)) if head_swhid.object_type == ObjectType.REVISION: rev = head_revs[head_swhid.object_id] if not rev: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue directory_id = rev.directory elif head_swhid.object_type == ObjectType.RELEASE: rel = head_rels[head_swhid.object_id] if not rel: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue if rel.target_type != ModelObjectType.DIRECTORY: # TODO self.log.warning( "Head release %s of %r has unexpected target type %s", head_swhid, origin.url, rel.target_type, ) continue assert rel.target, rel directory_id = rel.target else: assert False, head_swhid for dir_metadata in self.directory_metadata_indexer.index(directory_id): # There is at most one dir_metadata orig_metadata = OriginIntrinsicMetadataRow( from_directory=dir_metadata.id, id=origin.url, metadata=dir_metadata.metadata, mappings=dir_metadata.mappings, indexer_configuration_id=dir_metadata.indexer_configuration_id, ) results.append((orig_metadata, dir_metadata)) return results def persist_index_computations( self, results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: # Deduplicate directories dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {} orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {} summary: Dict = {} for (orig_item, dir_item) in results: assert dir_item.metadata == orig_item.metadata if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets if dir_item.id not in dir_metadata: dir_metadata[dir_item.id] = dir_item if orig_item.id not in orig_metadata: orig_metadata[orig_item.id] = orig_item if dir_metadata: summary_dir = self.idx_storage.directory_intrinsic_metadata_add( list(dir_metadata.values()) ) summary.update(summary_dir) if orig_metadata: summary_ori = self.idx_storage.origin_intrinsic_metadata_add( list(orig_metadata.values()) ) summary.update(summary_ori) return summary diff --git a/swh/indexer/metadata_dictionary/utils.py b/swh/indexer/metadata_dictionary/utils.py index 8a5fdb9..6aaf4fd 100644 --- a/swh/indexer/metadata_dictionary/utils.py +++ b/swh/indexer/metadata_dictionary/utils.py @@ -1,112 +1,116 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from typing import Any, Callable, Iterable, Optional, Sequence, TypeVar import urllib.parse from pyld import jsonld from rdflib import RDF, Graph, URIRef import rdflib.term from swh.indexer.codemeta import _document_loader def prettyprint_graph(graph: Graph, root: URIRef): s = 
graph.serialize(format="application/ld+json") jsonld_graph = json.loads(s) translated_metadata = jsonld.frame( jsonld_graph, {"@id": str(root)}, options={ "documentLoader": _document_loader, "processingMode": "json-ld-1.1", }, ) print(json.dumps(translated_metadata, indent=4)) def add_list( graph: Graph, subject: rdflib.term.Node, predicate: rdflib.term.Identifier, objects: Sequence[rdflib.term.Node], ) -> None: """Adds triples to the ``graph`` so that they are equivalent to this JSON-LD object:: { "@id": subject, predicate: {"@list": objects} } This is a naive implementation of https://json-ld.org/spec/latest/json-ld-api/#list-to-rdf-conversion """ # JSON-LD's @list is syntactic sugar for a linked list / chain in the RDF graph, # which is what we are going to construct, starting from the end: last_link: rdflib.term.Node last_link = RDF.nil for item in reversed(objects): link = rdflib.BNode() graph.add((link, RDF.first, item)) graph.add((link, RDF.rest, last_link)) last_link = link graph.add((subject, predicate, last_link)) TValue = TypeVar("TValue") def add_map( graph: Graph, subject: rdflib.term.Node, predicate: rdflib.term.Identifier, f: Callable[[Graph, TValue], Optional[rdflib.term.Node]], values: Iterable[TValue], ) -> None: """Helper for :func:`add_list` that takes a mapper function ``f``.""" nodes = [f(graph, value) for value in values] add_list(graph, subject, predicate, [node for node in nodes if node]) def add_url_if_valid( graph: Graph, subject: rdflib.term.Node, predicate: rdflib.term.Identifier, url: Any, ) -> None: """Adds ``(subject, predicate, url)`` to the graph if ``url`` is well-formed. This is meant as a workaround for https://github.com/digitalbazaar/pyld/issues/91 to drop URLs that are blatantly invalid early, so PyLD does not crash. >>> from pprint import pprint >>> graph = Graph() >>> subject = rdflib.term.URIRef("http://example.org/test-software") >>> predicate = rdflib.term.URIRef("http://schema.org/license") >>> add_url_if_valid( ... graph, subject, predicate, "https//www.apache.org/licenses/LICENSE-2.0.txt" ... ) >>> add_url_if_valid( ... graph, subject, predicate, "http:s//www.apache.org/licenses/LICENSE-2.0.txt" ... ) >>> add_url_if_valid( ... graph, subject, predicate, "https://www.apache.org/licenses/LICENSE-2.0.txt" ... ) >>> add_url_if_valid( ... graph, subject, predicate, 42 ... 
)
     >>> pprint(set(graph.triples((subject, predicate, None))))
     {(rdflib.term.URIRef('http://example.org/test-software'),
       rdflib.term.URIRef('http://schema.org/license'),
       rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
     """
     if not isinstance(url, str):
         return
-    if " " in url or not urllib.parse.urlparse(url).netloc:
+    try:
+        parsed_url = urllib.parse.urlparse(url)
+    except Exception:
+        return
+    if " " in url or not parsed_url.netloc:
         return
     graph.add((subject, predicate, rdflib.term.URIRef(url)))
diff --git a/swh/indexer/sql/upgrades/137.sql b/swh/indexer/sql/upgrades/137.sql
index a7d69f1..152ae0e 100644
--- a/swh/indexer/sql/upgrades/137.sql
+++ b/swh/indexer/sql/upgrades/137.sql
@@ -1,23 +1,19 @@
 -- SWH Indexer DB schema upgrade
 -- from_version: 136
 -- to_version: 137
 -- description: Drop content_language and content_ctags tables and related functions

-insert into dbversion(version, release, description)
-  values(137, now(), 'Work In Progress');
+drop function if exists swh_content_language_add;
+drop function if exists swh_mktemp_content_language();
+drop function if exists swh_mktemp_content_ctags();
+drop function if exists swh_content_ctags_add();
+drop function if exists swh_content_ctags_search;

-drop function swh_content_language_add;
-drop function swh_mktemp_content_language();
-drop function swh_mktemp_content_ctags();
-drop function swh_content_ctags_add();
-drop function swh_content_ctags_search;
+drop type if exists content_ctags_signature;

-drop index content_language_pkey;
+drop table if exists content_language;
+drop table if exists content_ctags;

-drop table content_language;
-drop table content_ctags;
-
-drop type languages;
-drop type ctags_languages;
-drop type content_ctags_signature;
+drop type if exists languages;
+drop type if exists ctags_languages;
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
index 9b52bfd..08f8ea6 100644
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -1,449 +1,460 @@
 # Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import json

 from hypothesis import HealthCheck, given, settings
 import pytest

 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.storage.model import ContentMetadataRow

 from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
 from ..utils import (
     BASE_TEST_CONFIG,
     MAPPING_DESCRIPTION_CONTENT_SHA1,
     json_document_strategy,
 )


 def test_compute_metadata_none():
     """
     Translating empty content should return None.
     """
     content = b""

     # None if no metadata was found or an error occurred
     declared_metadata = None
     result = MAPPINGS["NpmMapping"]().translate(content)
     assert declared_metadata == result


 def test_compute_metadata_npm():
     """
     Test metadata computation with the hard-coded npm mapping.
     """
     content = b"""
 {
     "name": "test_metadata",
     "version": "0.0.2",
     "description": "Simple package.json test for indexer",
     "repository": {
         "type": "git",
         "url": "https://github.com/moranegg/metadata_test"
     },
     "author": {
         "email": "moranegg@example.com",
         "name": "Morane G"
     }
 }
 """
     declared_metadata = {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name":
"test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", "author": [ { "type": "Person", "name": "Morane G", "email": "moranegg@example.com", } ], } result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_compute_metadata_invalid_description_npm(): """ testing only computation of metadata with hard_mapping_npm """ content = b""" { "name": "test_metadata", "version": "0.0.2", "description": 1234 } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", } result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_index_content_metadata_npm(storage, obj_storage): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ sha1s = [ MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"], MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"], MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"], ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) metadata_indexer.run(sha1s, log_suffix="unknown content") results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) expected_results = [ ContentMetadataRow( id=sha1s[0], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "codeRepository": "git+https://github.com/moranegg/metadata_test", "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, ), ContentMetadataRow( id=sha1s[1], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "issueTracker": "https://github.com/npm/npm/issues", "author": [ { "type": "Person", "name": "Isaac Z. 
Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "url": "https://docs.npmjs.com/", }, ), ] for result in results: del result.tool["id"] result.metadata.pop("keywords", None) # The assertion below returns False sometimes because of nested lists assert expected_results == results def test_npm_null_list_item_normalization(): package_json = b"""{ "name": "foo", "keywords": [ "foo", null ], "homepage": [ "http://example.org/", null ] }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", "url": "http://example.org/", "keywords": "foo", } def test_npm_bugs_normalization(): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", } # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", } # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", } def test_npm_repository_normalization(): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", } # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } assert result == expected_result # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == expected_result # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", } def test_npm_author(): package_json = rb"""{ "version": "1.0.0", 
"author": "Foo Bar (@example)" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "Foo Bar", "type": "Person"}], "version": "1.0.0", } def test_npm_invalid_uris(): package_json = rb"""{ "version": "1.0.0", "homepage": "", "author": { "name": "foo", "url": "http://example.org" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], "version": "1.0.0", } package_json = rb"""{ "version": "1.0.0", "homepage": "http://example.org", "author": { "name": "foo", "url": "" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person"}], "url": "http://example.org", "version": "1.0.0", } package_json = rb"""{ "version": "1.0.0", "homepage": "", "author": { "name": "foo", "url": "" }, "bugs": "" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person"}], "version": "1.0.0", } package_json = rb"""{ "version": "1.0.0", "homepage": "http:example.org", "author": { "name": "foo", "url": "http:example.com" }, "bugs": { "url": "http:example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person"}], "version": "1.0.0", } package_json = rb"""{ "version": "1.0.0", "repository": "git+https://g ithub.com/foo/bar.git" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "version": "1.0.0", } + package_json = rb"""{ + "version": "1.0.0", + "repository": "git+http://\\u001b[D\\u001b[D\\u001b[Ds\\u001b[C\\u001b[C\\u001b[D\\u001b://github.com/dearzoe/array-combination" +}""" # noqa + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "version": "1.0.0", + } + def test_npm_invalid_licenses(): package_json = rb"""{ "version": "1.0.0", "license": "SEE LICENSE IN LICENSE.md", "author": { "name": "foo", "url": "http://example.org" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], "version": "1.0.0", } @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping))) # type: ignore def test_npm_adversarial(doc): raw = json.dumps(doc).encode() MAPPINGS["NpmMapping"]().translate(raw) @pytest.mark.parametrize( "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"] ) def test_detect_metadata_package_json(filename): df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", 
"name": filename, "target": b"aab", "length": 712, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"cde", }, ] results = detect_metadata(df) expected_results = {"NpmMapping": [b"cde"]} assert expected_results == results diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index bb6b883..61c71cd 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,419 +1,452 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from unittest.mock import call import attr from swh.indexer.metadata import ( ContentMetadataIndexer, DirectoryMetadataIndexer, ExtrinsicMetadataIndexer, ) from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginExtrinsicMetadataRow, ) from swh.indexer.tests.utils import DIRECTORY2 from swh.model.model import ( Directory, DirectoryEntry, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, RawExtrinsicMetadata, ) from swh.model.swhids import ExtendedObjectType, ExtendedSWHID from .utils import ( BASE_TEST_CONFIG, MAPPING_DESCRIPTION_CONTENT_SHA1, MAPPING_DESCRIPTION_CONTENT_SHA1GIT, YARN_PARSER_METADATA, fill_obj_storage, fill_storage, ) TRANSLATOR_TOOL = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {"type": "local", "context": "NpmMapping"}, } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the dir indexer configures it." 
DIRECTORY_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } DEPOSIT_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=b"\x02" * 20, ), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://example.org/", ), fetcher=MetadataFetcher( name="example-fetcher", version="1.0.0", ), format="sword-v2-atom-codemeta-v2", metadata=""" My Software Author 1 foo@example.org Author 2 """.encode(), origin="https://example.org/jdoe/myrepo", ) GITHUB_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.ORIGIN, object_id=b"\x01" * 20, ), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://example.org/", ), fetcher=MetadataFetcher( name="example-fetcher", version="1.0.0", ), format="application/vnd.github.v3+json", metadata=b'{"full_name": "test software", "html_url": "http://example.org/"}', ) class TestMetadata: """ Tests metadata_mock_tool tool for Metadata detection """ def test_directory_metadata_indexer(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([dir_.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=dir_.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] assert results == expected_results def test_directory_metadata_indexer_single_root_dir(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the directory dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) new_dir = Directory( entries=( DirectoryEntry( name=b"foobar-1.0.0", type="dir", target=dir_.id, perms=16384, ), ), ) assert new_dir.id is not None metadata_indexer.storage.directory_add([new_dir]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([new_dir.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=new_dir.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] assert results == expected_results def 
test_extrinsic_metadata_indexer_unknown_format(self, mocker): """Should be ignored when unknown format""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve(GITHUB_REMD, format="unknown format") results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_github(self, mocker): """Nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [GITHUB_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [ OriginExtrinsicMetadataRow( id="https://example.org/jdoe/myrepo", tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, from_remd_id=GITHUB_REMD.id, mappings=["github"], ) ] def test_extrinsic_metadata_indexer_firstparty_deposit(self, mocker): """Also nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1( [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] ) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [ OriginExtrinsicMetadataRow( id="https://example.org/jdoe/myrepo", tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [ {"email": "foo@example.org", "name": "Author 1"}, {"name": "Author 2"}, ], "name": "My Software", }, from_remd_id=DEPOSIT_REMD.id, mappings=["sword-codemeta"], ) ] def test_extrinsic_metadata_indexer_thirdparty_deposit(self, mocker): """Metadata-only deposit: currently ignored""" origin = "https://not-from-example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = 
metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} ) == {"status": "uneventful", "origin_extrinsic_metadata:add": 0} assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1( [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] ) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [] def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker): """Early abort on non-forge authorities""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve( GITHUB_REMD, authority=attr.evolve( GITHUB_REMD.authority, type=MetadataAuthorityType.REGISTRY ), ) results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker): """Should be ignored when authority URL does not match the origin""" origin = "https://different-domain.example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None results = metadata_indexer.index(GITHUB_REMD.id, data=GITHUB_REMD) assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] assert results == [] def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker): - """Nominal case, calling the mapping and storing the result""" + """Two metadata objects with the same origin target""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( { "raw_extrinsic_metadata": [ GITHUB_REMD.to_dict(), {**GITHUB_REMD.to_dict(), "id": b"\x00" * 20}, ] } ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert len(results) == 1, results assert results[0].from_remd_id == b"\x00" * 20 + + def test_extrinsic_directory_metadata_indexer_duplicate_origin(self, mocker): + """Two metadata objects on directories, but with an origin context""" + origin = DEPOSIT_REMD.origin + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert 
metadata_indexer.process_journal_objects( + { + "raw_extrinsic_metadata": [ + DEPOSIT_REMD.to_dict(), + { + **DEPOSIT_REMD.to_dict(), + "id": b"\x00" * 20, + "target": "swh:1:dir:" + "01" * 20, + }, + ] + } + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert len(results) == 1, results + assert results[0].from_remd_id == b"\x00" * 20
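# Editor's sketch (not part of the diff): the deduplication rule these two tests
# pin down. process_journal_objects() now keys results by the *result* id (the
# origin URL) rather than by the raw metadata object's target, so two metadata
# objects resolving to the same origin yield a single row, last writer winning.
rows = [
    {"id": "https://example.org/jdoe/myrepo", "from_remd_id": b"\x01" * 20},
    {"id": "https://example.org/jdoe/myrepo", "from_remd_id": b"\x00" * 20},
]
results = {}
for row in rows:  # one entry per self.index() result, in journal order
    results[row["id"]] = row
assert [r["from_remd_id"] for r in results.values()] == [b"\x00" * 20]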