diff --git a/PKG-INFO b/PKG-INFO
index 10c99f5..7c5cbd3 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.7.3
+Version: 2.8.0
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-indexer
 ============
 
 Tools to compute multiple indexes on SWH's raw contents:
 - content:
   - mimetype
   - ctags
   - language
   - fossology-license
   - metadata
 - revision:
   - metadata
 
 An indexer is in charge of:
 - looking up objects
 - extracting information from those objects
 - store those information in the swh-indexer db
 
 There are multiple indexers working on different object types:
 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers
 
 Indexation procedure:
 - receive batch of ids
 - retrieve the associated data depending on object type
 - compute for that object some index
 - store the result to swh's storage
 
 Current content indexers:
 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype
 - language (queue swh_indexer_content_language): detect the programming language
 - ctags (queue swh_indexer_content_ctags): compute tags information
 - fossology-license (queue swh_indexer_fossology_license): compute the license
 - metadata: translate file into translated_metadata dict
 
 Current revision indexers:
 - metadata: detects files containing metadata and retrieves translated_metadata
   in content_metadata table in storage or run content indexer to translate files.
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index 10c99f5..7c5cbd3 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.7.3
+Version: 2.8.0
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-indexer
 ============
 
 Tools to compute multiple indexes on SWH's raw contents:
 - content:
   - mimetype
   - ctags
   - language
   - fossology-license
   - metadata
 - revision:
   - metadata
 
 An indexer is in charge of:
 - looking up objects
 - extracting information from those objects
 - store those information in the swh-indexer db
 
 There are multiple indexers working on different object types:
 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers
 
 Indexation procedure:
 - receive batch of ids
 - retrieve the associated data depending on object type
 - compute for that object some index
 - store the result to swh's storage
 
 Current content indexers:
 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype
 - language (queue swh_indexer_content_language): detect the programming language
 - ctags (queue swh_indexer_content_ctags): compute tags information
 - fossology-license (queue swh_indexer_fossology_license): compute the license
 - metadata: translate file into translated_metadata dict
 
 Current revision indexers:
 - metadata: detects files containing metadata and retrieves translated_metadata
   in content_metadata table in storage or run content indexer to translate files.
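One of the main changes in 2.8.0 is in the swh/indexer/metadata.py hunk that follows: ExtrinsicMetadataIndexer now also indexes raw extrinsic metadata whose target is not an origin (for example a deposited directory that carries an origin context), by falling back to sha1 of the origin URL for the storage.origin_get_by_sha1() lookup. Below is a minimal sketch of that selection logic; the helper name and the plain-string target_type argument are hypothetical simplifications, since the real code reads these fields from a RawExtrinsicMetadata object.

```python
import hashlib
from typing import Optional


def resolve_origin_sha1(
    target_type: str, target_object_id: bytes, origin_url: Optional[str]
) -> Optional[bytes]:
    """Pick the sha1 passed to storage.origin_get_by_sha1(), mirroring the
    hunk below (hypothetical helper with simplified arguments)."""
    if target_type == "origin":
        # metadata attached directly to an origin: reuse the target's object id
        return target_object_id
    if origin_url is not None:
        # metadata on another object type (e.g. a directory) with an origin
        # context: index it as if it were attached to the origin itself
        return hashlib.sha1(origin_url.encode()).digest()
    # no origin context: such targets are not supported yet
    return None
```

The updated tests in swh/indexer/tests/test_metadata.py further down assert the concrete digest this produces for the deposit origin https://example.org/jdoe/myrepo.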
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index f1e1e6a..14212a3 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,559 +1,567 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy +import hashlib import itertools import logging import time from typing import ( Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar, cast, ) from urllib.parse import urlparse import pkg_resources import sentry_sdk from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ( BaseIndexer, ContentIndexer, DirectoryIndexer, ObjectsDict, OriginIndexer, ) from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginExtrinsicMetadataRow, OriginIntrinsicMetadataRow, ) from swh.model import hashutil from swh.model.model import Directory, MetadataAuthorityType from swh.model.model import ObjectType as ModelObjectType from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType REVISION_GET_BATCH_SIZE = 10 RELEASE_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 T1 = TypeVar("T1") T2 = TypeVar("T2") logger = logging.getLogger(__name__) def call_with_batches( f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int, ) -> Iterator[T2]: """Calls a function with batches of args, and concatenates the results.""" groups = grouper(args, batch_size) for group in groups: yield from f(list(group)) class ExtrinsicMetadataIndexer( BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow] ): def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} try: results = {} for item in objects.get("raw_extrinsic_metadata", []): remd = RawExtrinsicMetadata.from_dict(item) sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid())) results[remd.target] = self.index(remd.id, data=remd) except Exception: if not self.catch_exceptions: raise summary["status"] = "failed" return summary self.results = list(itertools.chain.from_iterable(results.values())) summary_persist = self.persist_index_computations(self.results) if summary_persist: for value in summary_persist.values(): if value > 0: summary["status"] = "eventful" summary.update(summary_persist) return summary def index( self, id: Sha1Git, data: Optional[RawExtrinsicMetadata], **kwargs, ) -> List[OriginExtrinsicMetadataRow]: if data is None: raise NotImplementedError( "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" ) - if data.target.object_type != ExtendedObjectType.ORIGIN: + if data.target.object_type == ExtendedObjectType.ORIGIN: + origin_sha1 = data.target.object_id + elif data.origin is not None: + # HACK: As swh-search does (yet?) 
not support searching on directories + # and traversing back to origins, we index metadata on non-origins with + # an origin context as if they were on the origin itself. + origin_sha1 = hashlib.sha1(data.origin.encode()).digest() + else: # other types are not supported yet return [] if data.authority.type == MetadataAuthorityType.REGISTRY: # metadata provided by a third-party; don't trust it # (technically this could be handled below, but we check it here # to return early; sparing a translation and origin lookup) # TODO: add ways to define trusted authorities return [] metadata_items = [] mappings: List[str] = [] for mapping_cls in EXTRINSIC_MAPPINGS.values(): if data.format in mapping_cls.extrinsic_metadata_formats(): mapping = mapping_cls() metadata_item = mapping.translate(data.metadata) if metadata_item is not None: metadata_items.append(metadata_item) mappings.append(mapping.name) if not metadata_items: # Don't have any mapping to parse it, ignore return [] # TODO: batch requests to origin_get_by_sha1() for _ in range(6): - origins = self.storage.origin_get_by_sha1([data.target.object_id]) + origins = self.storage.origin_get_by_sha1([origin_sha1]) try: (origin,) = origins if origin is not None: break except ValueError: pass # The origin does not exist. This may be due to some replication lag # between the loader's DB/journal and the DB we are consuming from. # Wait a bit and try again logger.debug("Origin %s not found, sleeping for 10s.", data.target) time.sleep(10) else: # Does not exist, or replication lag > 60s. raise ValueError(f"Unknown origin {data.target}") from None if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc: # metadata provided by a third-party; don't trust it # TODO: add ways to define trusted authorities return [] metadata = merge_documents(metadata_items) return [ OriginExtrinsicMetadataRow( id=origin["url"], indexer_configuration_id=self.tool["id"], from_remd_id=data.id, mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[OriginExtrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.origin_extrinsic_metadata_add(results) class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.content_metadata_missing( ( { "id": sha1, "indexer_configuration_id": self.tool["id"], } for sha1 in ids ) ) def index( self, id: Sha1, data: Optional[bytes] = None, log_suffix="unknown directory", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. Args: id: content's identifier data: raw content in bytes Returns: dict: dictionary representing a content_metadata. 
If the translation wasn't successful the metadata keys will be returned as None """ assert isinstance(id, bytes) assert data is not None metadata = None try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) sentry_sdk.capture_exception() if metadata is None: return [] return [ ContentMetadataRow( id=id, indexer_configuration_id=self.tool["id"], metadata=metadata, ) ] def persist_index_computations( self, results: List[ContentMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.content_metadata_add(results) DEFAULT_CONFIG: Dict[str, Any] = { "tools": { "name": "swh.indexer.metadata", "version": pkg_resources.get_distribution("swh.indexer").version, "configuration": {}, }, } class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): """Directory-level indexer This indexer is in charge of: - filtering directories already indexed in directory_intrinsic_metadata table with defined computation tool - retrieve all entry_files in directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for directory """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = merge_configs(DEFAULT_CONFIG, self.config) def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.directory_intrinsic_metadata_missing( ( { "id": sha1_git, "indexer_configuration_id": self.tool["id"], } for sha1_git in sha1_gits ) ) def index( self, id: Sha1Git, data: Optional[Directory] = None, **kwargs ) -> List[DirectoryIntrinsicMetadataRow]: """Index directory by processing it and organizing result. use metadata_detector to iterate on filenames, passes them to the content indexers, then merges (if more than one) Args: id: sha1_git of the directory data: should always be None Returns: dict: dictionary representing a directory_intrinsic_metadata, with keys: - id: directory's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ dir_: List[DirectoryLsEntry] assert data is None, "Unexpected directory object" dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(id, recursive=False)), ) try: if [entry["type"] for entry in dir_] == ["dir"]: # If the root is just a single directory, recurse into it # eg. 
PyPI packages, GNU tarballs subdir = dir_[0]["target"] dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(subdir, recursive=False)), ) files = [entry for entry in dir_ if entry["type"] == "file"] (mappings, metadata) = self.translate_directory_intrinsic_metadata( files, log_suffix="directory=%s" % hashutil.hash_to_hex(id), ) except Exception as e: self.log.exception("Problem when indexing dir: %r", e) sentry_sdk.capture_exception() return [] return [ DirectoryIntrinsicMetadataRow( id=id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" # TODO: add functions in storage to keep data in # directory_intrinsic_metadata return self.idx_storage.directory_intrinsic_metadata_add(results) def translate_directory_intrinsic_metadata( self, files: List[DirectoryLsEntry], log_suffix: str ) -> Tuple[List[Any], Any]: """ Determine plan of action to translate metadata in the given root directory Args: files: list of file entries, as returned by :meth:`swh.storage.interface.StorageInterface.directory_ls` Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ metadata = [] # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage", "tools"] } all_detected_files = detect_metadata(files) used_mappings = [ INTRINSIC_MAPPINGS[context].name for context in all_detected_files ] for (mapping_name, detected_files) in all_detected_files.items(): cfg = deepcopy(config) cfg["tools"]["configuration"]["context"] = mapping_name c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get(detected_files) for c in metadata_generator: # extracting metadata sha1 = c.id sha1s_in_storage.append(sha1) local_metadata = c.metadata # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files if item not in sha1s_in_storage ] if sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result.metadata metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") sentry_sdk.capture_exception() metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer( OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( self, origins: List[Origin], *, check_origin_known: bool = True, **kwargs, ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] head_rel_ids = [] origin_heads: Dict[Origin, CoreSWHID] = {} # Filter out origins not in the storage if check_origin_known: known_origins = list( call_with_batches( self.storage.origin_get, [origin.url for origin in origins], ORIGIN_GET_BATCH_SIZE, ) ) else: known_origins = list(origins) for origin in known_origins: if origin is None: continue head_swhid = 
get_head_swhid(self.storage, origin.url) if head_swhid: origin_heads[origin] = head_swhid if head_swhid.object_type == ObjectType.REVISION: head_rev_ids.append(head_swhid.object_id) elif head_swhid.object_type == ObjectType.RELEASE: head_rel_ids.append(head_swhid.object_id) else: assert False, head_swhid head_revs = dict( zip( head_rev_ids, call_with_batches( self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE ), ) ) head_rels = dict( zip( head_rel_ids, call_with_batches( self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE ), ) ) results = [] for (origin, head_swhid) in origin_heads.items(): sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) sentry_sdk.set_tag("swh-indexer-origin-head-swhid", str(head_swhid)) if head_swhid.object_type == ObjectType.REVISION: rev = head_revs[head_swhid.object_id] if not rev: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue directory_id = rev.directory elif head_swhid.object_type == ObjectType.RELEASE: rel = head_rels[head_swhid.object_id] if not rel: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue if rel.target_type != ModelObjectType.DIRECTORY: # TODO self.log.warning( "Head release %s of %r has unexpected target type %s", head_swhid, origin.url, rel.target_type, ) continue assert rel.target, rel directory_id = rel.target else: assert False, head_swhid for dir_metadata in self.directory_metadata_indexer.index(directory_id): # There is at most one dir_metadata orig_metadata = OriginIntrinsicMetadataRow( from_directory=dir_metadata.id, id=origin.url, metadata=dir_metadata.metadata, mappings=dir_metadata.mappings, indexer_configuration_id=dir_metadata.indexer_configuration_id, ) results.append((orig_metadata, dir_metadata)) return results def persist_index_computations( self, results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: # Deduplicate directories dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {} orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {} summary: Dict = {} for (orig_item, dir_item) in results: assert dir_item.metadata == orig_item.metadata if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets if dir_item.id not in dir_metadata: dir_metadata[dir_item.id] = dir_item if orig_item.id not in orig_metadata: orig_metadata[orig_item.id] = orig_item if dir_metadata: summary_dir = self.idx_storage.directory_intrinsic_metadata_add( list(dir_metadata.values()) ) summary.update(summary_dir) if orig_metadata: summary_ori = self.idx_storage.origin_intrinsic_metadata_add( list(orig_metadata.values()) ) summary.update(summary_ori) return summary diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py index 2d9ff6d..82ac133 100644 --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -1,120 +1,156 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union -from swh.model.model import SnapshotBranch, TargetType +from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.origin import origin_get_latest_visit_status from 
swh.storage.algos.snapshot import snapshot_get_all_branches +from swh.storage.interface import PartialBranches, StorageInterface -def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]: +def get_head_swhid(storage: StorageInterface, origin_url: str) -> Optional[CoreSWHID]: """Returns the SWHID of the head revision or release of an origin""" visit_status = origin_get_latest_visit_status( storage, origin_url, allowed_statuses=["full"], require_snapshot=True ) if not visit_status: return None assert visit_status.snapshot is not None - snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) - if snapshot is None: - return None if visit_status.type == "ftp": - return _try_get_ftp_head(dict(snapshot.branches)) + # We need to fetch all branches in order to find the largest one + snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) + if snapshot is None: + return None + return _try_get_ftp_head(storage, snapshot) else: - return _try_get_head_generic(dict(snapshot.branches)) + # Peak into the snapshot, without fetching too many refs. + # If the snapshot is small, this gets all of it in a single request. + # If the snapshot is large, we will query specific branches as we need them. + partial_branches = storage.snapshot_get_branches( + visit_status.snapshot, branches_count=100 + ) + if partial_branches is None: + # Snapshot does not exist + return None + return _try_get_head_generic(storage, partial_branches) _archive_filename_re = re.compile( rb"^" rb"(?P.*)[-_]" rb"(?P[0-9]+(\.[0-9])*)" rb"(?P[-+][a-zA-Z0-9.~]+?)?" rb"(?P(\.[a-zA-Z0-9]+)+)" rb"$" ) def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]: """Extracts the release version from an archive filename, to get an ordering whose maximum is likely to be the last version of the software >>> _parse_version(b'foo') (-inf,) >>> _parse_version(b'foo.tar.gz') (-inf,) >>> _parse_version(b'gnu-hello-0.0.1.tar.gz') (0, 0, 1, 0) >>> _parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') (0, 0, 1, -1, 'beta2') >>> _parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') (0, 0, 1, 1, 'foobar') """ res = _archive_filename_re.match(filename) if res is None: return (float("-infinity"),) version: List[Union[float, int, str]] = [ int(n) for n in res.group("version").decode().split(".") ] if res.group("preversion") is None: version.append(0) else: preversion = res.group("preversion").decode() if preversion.startswith("-"): version.append(-1) version.append(preversion[1:]) elif preversion.startswith("+"): version.append(1) version.append(preversion[1:]) else: assert False, res.group("preversion") return tuple(version) def _try_get_ftp_head( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, snapshot: Snapshot ) -> Optional[CoreSWHID]: - archive_names = list(branches) + archive_names = list(snapshot.branches) max_archive_name = max(archive_names, key=_parse_version) - return _try_resolve_target(branches, max_archive_name) + return _try_resolve_target( + storage, + {"id": snapshot.id, "branches": dict(snapshot.branches), "next_branch": None}, + branch_name=max_archive_name, + ) def _try_get_head_generic( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, partial_branches: PartialBranches ) -> Optional[CoreSWHID]: # Works on 'deposit', 'pypi', and VCSs. 
- return _try_resolve_target(branches, b"HEAD") or _try_resolve_target( - branches, b"master" - ) + return _try_resolve_target( + storage, partial_branches, branch_name=b"HEAD" + ) or _try_resolve_target(storage, partial_branches, branch_name=b"master") + + +def _get_branch( + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes +) -> Optional[SnapshotBranch]: + """Given a ``branch_name``, gets it from ``partial_branches`` if present, + and fetches it from the storage otherwise.""" + if branch_name in partial_branches["branches"]: + return partial_branches["branches"][branch_name] + elif partial_branches["next_branch"] is not None: + # Branch is not in `partial_branches`, and `partial_branches` indeed partial + res = storage.snapshot_get_branches( + partial_branches["id"], branches_from=branch_name, branches_count=1 + ) + assert res is not None, "Snapshot does not exist anymore" + return res["branches"].get(branch_name) + else: + # Branch is not in `partial_branches`, but `partial_branches` is the full + # list of branches, which means it is a dangling reference. + return None def _try_resolve_target( - branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes ) -> Optional[CoreSWHID]: try: - branch = branches[branch_name] + branch = _get_branch(storage, partial_branches, branch_name) if branch is None: return None + while branch.target_type == TargetType.ALIAS: - branch = branches[branch.target] + branch = _get_branch(storage, partial_branches, branch.target) if branch is None: return None if branch.target_type == TargetType.REVISION: return CoreSWHID(object_type=ObjectType.REVISION, object_id=branch.target) elif branch.target_type == TargetType.CONTENT: return None # TODO elif branch.target_type == TargetType.DIRECTORY: return None # TODO elif branch.target_type == TargetType.RELEASE: return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target) else: assert False, branch except KeyError: return None diff --git a/swh/indexer/storage/writer.py b/swh/indexer/storage/writer.py index b4fa365..7c06800 100644 --- a/swh/indexer/storage/writer.py +++ b/swh/indexer/storage/writer.py @@ -1,69 +1,73 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Callable, Dict, Iterable, Optional import attr try: from swh.journal.writer import JournalWriterInterface, get_journal_writer except ImportError: get_journal_writer = None # type: ignore # mypy limitation, see https://github.com/python/mypy/issues/1153 from .model import BaseRow class JournalWriter: """Journal writer storage collaborator. It's in charge of adding objects to the journal. 
""" journal: Optional[JournalWriterInterface] def __init__(self, tool_getter: Callable[[int], Dict[str, Any]], journal_writer): """ Args: tool_getter: a callable that takes a tool_id and return a dict representing a tool object journal_writer: configuration passed to `swh.journal.writer.get_journal_writer` """ self._tool_getter = tool_getter if journal_writer: if get_journal_writer is None: raise EnvironmentError( "You need the swh.journal package to use the " "journal_writer feature" ) self.journal = get_journal_writer( **journal_writer, value_sanitizer=lambda object_type, value_dict: value_dict, ) else: self.journal = None def write_additions(self, obj_type, entries: Iterable[BaseRow]) -> None: if not self.journal: return + translated = [] + # usually, all the additions in a batch are from the same indexer, # so this cache allows doing a single query for all the entries. tool_cache = {} for entry in entries: assert entry.object_type == obj_type # type: ignore # get the tool used to generate this addition tool_id = entry.indexer_configuration_id assert tool_id if tool_id not in tool_cache: tool_cache[tool_id] = self._tool_getter(tool_id) entry = attr.evolve( entry, tool=tool_cache[tool_id], indexer_configuration_id=None ) - # write to kafka - self.journal.write_addition(obj_type, entry) + translated.append(entry) + + # write to kafka + self.journal.write_additions(obj_type, translated) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 37b574c..bb6b883 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,414 +1,419 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from unittest.mock import call import attr from swh.indexer.metadata import ( ContentMetadataIndexer, DirectoryMetadataIndexer, ExtrinsicMetadataIndexer, ) from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginExtrinsicMetadataRow, ) from swh.indexer.tests.utils import DIRECTORY2 from swh.model.model import ( Directory, DirectoryEntry, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, RawExtrinsicMetadata, ) from swh.model.swhids import ExtendedObjectType, ExtendedSWHID from .utils import ( BASE_TEST_CONFIG, MAPPING_DESCRIPTION_CONTENT_SHA1, MAPPING_DESCRIPTION_CONTENT_SHA1GIT, YARN_PARSER_METADATA, fill_obj_storage, fill_storage, ) TRANSLATOR_TOOL = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {"type": "local", "context": "NpmMapping"}, } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the dir indexer configures it." 
DIRECTORY_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } DEPOSIT_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( - object_type=ExtendedObjectType.ORIGIN, - object_id=b"\x01" * 20, + object_type=ExtendedObjectType.DIRECTORY, + object_id=b"\x02" * 20, ), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://example.org/", ), fetcher=MetadataFetcher( name="example-fetcher", version="1.0.0", ), format="sword-v2-atom-codemeta-v2", metadata=""" My Software Author 1 foo@example.org Author 2 """.encode(), + origin="https://example.org/jdoe/myrepo", ) GITHUB_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.ORIGIN, object_id=b"\x01" * 20, ), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://example.org/", ), fetcher=MetadataFetcher( name="example-fetcher", version="1.0.0", ), format="application/vnd.github.v3+json", metadata=b'{"full_name": "test software", "html_url": "http://example.org/"}', ) class TestMetadata: """ Tests metadata_mock_tool tool for Metadata detection """ def test_directory_metadata_indexer(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([dir_.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=dir_.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] assert results == expected_results def test_directory_metadata_indexer_single_root_dir(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the directory dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) new_dir = Directory( entries=( DirectoryEntry( name=b"foobar-1.0.0", type="dir", target=dir_.id, perms=16384, ), ), ) assert new_dir.id is not None metadata_indexer.storage.directory_add([new_dir]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([new_dir.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=new_dir.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del 
result.tool["id"] assert results == expected_results def test_extrinsic_metadata_indexer_unknown_format(self, mocker): """Should be ignored when unknown format""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve(GITHUB_REMD, format="unknown format") results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_github(self, mocker): """Nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [GITHUB_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [ OriginExtrinsicMetadataRow( id="https://example.org/jdoe/myrepo", tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, from_remd_id=GITHUB_REMD.id, mappings=["github"], ) ] def test_extrinsic_metadata_indexer_firstparty_deposit(self, mocker): """Also nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ - call.origin_get_by_sha1([b"\x01" * 20]) + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [ OriginExtrinsicMetadataRow( id="https://example.org/jdoe/myrepo", tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [ {"email": "foo@example.org", "name": "Author 1"}, {"name": "Author 2"}, ], "name": "My Software", }, from_remd_id=DEPOSIT_REMD.id, mappings=["sword-codemeta"], ) ] def test_extrinsic_metadata_indexer_thirdparty_deposit(self, mocker): """Metadata-only deposit: currently ignored""" origin = "https://not-from-example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = 
mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} ) == {"status": "uneventful", "origin_extrinsic_metadata:add": 0} assert metadata_indexer.storage.method_calls == [ - call.origin_get_by_sha1([b"\x01" * 20]) + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [] def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker): """Early abort on non-forge authorities""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve( GITHUB_REMD, authority=attr.evolve( GITHUB_REMD.authority, type=MetadataAuthorityType.REGISTRY ), ) results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker): """Should be ignored when authority URL does not match the origin""" origin = "https://different-domain.example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None results = metadata_indexer.index(GITHUB_REMD.id, data=GITHUB_REMD) assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] assert results == [] def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker): """Nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( { "raw_extrinsic_metadata": [ GITHUB_REMD.to_dict(), {**GITHUB_REMD.to_dict(), "id": b"\x00" * 20}, ] } ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert len(results) == 1, results assert results[0].from_remd_id == b"\x00" * 20 diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index 999084b..e44ca71 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -1,157 +1,271 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone +import 
itertools import pytest from swh.indexer.origin_head import get_head_swhid from swh.indexer.tests.utils import fill_storage from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import CoreSWHID from swh.storage.utils import now + +@pytest.fixture +def swh_storage_backend_config(): + """In-memory storage, to make tests go faster.""" + return {"cls": "memory"} + + SAMPLE_SNAPSHOT = Snapshot( branches={ b"foo": None, b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"foo", ), }, ) +def _add_snapshot_to_origin(storage, origin_url, visit_type, snapshot): + storage.origin_add([Origin(url=origin_url)]) + visit = storage.origin_visit_add( + [ + OriginVisit( + origin=origin_url, + date=datetime(2019, 2, 27, tzinfo=timezone.utc), + type="pypi", + ) + ] + )[0] + storage.snapshot_add([snapshot]) + visit_status = OriginVisitStatus( + origin=origin_url, + visit=visit.visit, + date=now(), + status="full", + snapshot=snapshot.id, + ) + storage.origin_visit_status_add([visit_status]) + + @pytest.fixture def storage(swh_storage): fill_storage(swh_storage) return swh_storage def test_git(storage): origin_url = "https://github.com/SoftwareHeritage/swh-storage" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:384b12006403cce45d6253e38f7bd77dacef726d" ) def test_git_partial_snapshot(storage): """Checks partial snapshots are ignored.""" origin_url = "https://github.com/SoftwareHeritage/swh-core" storage.origin_add([Origin(url=origin_url)]) visit = storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="git", ) ] )[0] storage.snapshot_add([SAMPLE_SNAPSHOT]) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="partial", snapshot=SAMPLE_SNAPSHOT.id, ) storage.origin_visit_status_add([visit_status]) assert get_head_swhid(storage, origin_url) is None def test_vcs_missing_snapshot(storage): origin_url = "https://github.com/SoftwareHeritage/swh-indexer" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) is None def test_pypi_missing_branch(storage): origin_url = "https://pypi.org/project/abcdef/" - storage.origin_add( - [ - Origin( - url=origin_url, - ) - ] + _add_snapshot_to_origin(storage, origin_url, "pypi", SAMPLE_SNAPSHOT) + assert get_head_swhid(storage, origin_url) is None + + +@pytest.mark.parametrize( + "branches_start,branches_middle,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot(storage, branches_start, branches_middle, branches_end): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(branches_middle)] + + [ + ( + b"refs/heads/foo", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) ) - visit = storage.origin_visit_add( - [ - OriginVisit( - origin=origin_url, - date=datetime(2019, 2, 27, tzinfo=timezone.utc), - type="pypi", - ) - ] - )[0] - storage.snapshot_add([SAMPLE_SNAPSHOT]) - visit_status = OriginVisitStatus( - origin=origin_url, - visit=visit.visit, - date=now(), - status="full", - 
snapshot=SAMPLE_SNAPSHOT.id, + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" ) - storage.origin_visit_status_add([visit_status]) + + +def test_large_snapshot_chained_aliases(storage): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(200)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/alias2" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/alias2", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/branch" + ), + ) + ] + + [(f"refs/heads/bbbb{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/branch", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + ) + + +@pytest.mark.parametrize( + "branches_start,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot_dangling_alias(storage, branches_start, branches_end): + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + assert get_head_swhid(storage, origin_url) is None def test_ftp(storage): origin_url = "rsync://ftp.gnu.org/gnu/3dldf" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" ) def test_ftp_missing_snapshot(storage): origin_url = "rsync://ftp.gnu.org/gnu/foobar" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) is None def test_deposit(storage): origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb" ) def test_deposit_missing_snapshot(storage): origin_url = "https://forge.softwareheritage.org/source/foobar" storage.origin_add( [ Origin( url=origin_url, ) ] ) assert get_head_swhid(storage, origin_url) is None def test_pypi(storage): origin_url = "https://old-pypi.example.org/project/limnoria/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" ) origin_url = "https://pypi.org/project/limnoria/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" ) def test_svn(storage): origin_url = "http://0-512-md.googlecode.com/svn/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:e43f72e12c88abece79a87b8c9ad232e1b773d18" )
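The swh/indexer/tests/test_origin_head.py additions above exercise the other main change in this release: for non-FTP origins, get_head_swhid() no longer fetches every snapshot branch up front, but peeks at the first 100 branches and resolves further branch names lazily. As a readability aid, here is a minimal sketch of that lookup pattern; it is a simplified restatement of _get_branch() from the origin_head.py hunk, not the exact code.

```python
from typing import Optional

from swh.model.model import SnapshotBranch
from swh.storage.interface import PartialBranches, StorageInterface


def lookup_branch(
    storage: StorageInterface,
    partial_branches: PartialBranches,
    branch_name: bytes,
) -> Optional[SnapshotBranch]:
    if branch_name in partial_branches["branches"]:
        # branch was in the first page (branches_count=100) already fetched
        return partial_branches["branches"][branch_name]
    if partial_branches["next_branch"] is not None:
        # the first page was truncated: query the storage for this one branch
        res = storage.snapshot_get_branches(
            partial_branches["id"], branches_from=branch_name, branches_count=1
        )
        return res["branches"].get(branch_name) if res else None
    # the full branch list was already in hand, so the name is a dangling
    # reference (e.g. an alias pointing at a missing branch)
    return None
```

The parametrized tests above place the HEAD alias and its target at varying positions relative to that 100-branch window, covering the in-page, paged, and dangling-alias cases.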