diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 5163c4a..ac0920b 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,408 +1,450 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from typing import ( Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar, ) import sentry_sdk from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, ) from swh.model import hashutil -from swh.model.model import Directory, Origin, Sha1Git -from swh.model.swhids import ObjectType +from swh.model.model import Directory +from swh.model.model import ObjectType as ModelObjectType +from swh.model.model import Origin, Sha1Git +from swh.model.swhids import CoreSWHID, ObjectType REVISION_GET_BATCH_SIZE = 10 +RELEASE_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 T1 = TypeVar("T1") T2 = TypeVar("T2") def call_with_batches( f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int, ) -> Iterator[T2]: """Calls a function with batches of args, and concatenates the results.""" groups = grouper(args, batch_size) for group in groups: yield from f(list(group)) class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.content_metadata_missing( ( { "id": sha1, "indexer_configuration_id": self.tool["id"], } for sha1 in ids ) ) def index( self, id: Sha1, data: Optional[bytes] = None, log_suffix="unknown directory", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. Args: id: content's identifier data: raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the metadata keys will be returned as None """ assert isinstance(id, bytes) assert data is not None metadata = None try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) metadata = MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) sentry_sdk.capture_exception() if metadata is None: return [] return [ ContentMetadataRow( id=id, indexer_configuration_id=self.tool["id"], metadata=metadata, ) ] def persist_index_computations( self, results: List[ContentMetadataRow] ) -> Dict[str, int]: """Persist the results in storage. 
Args: results: list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - metadata (jsonb): detected metadata """ return self.idx_storage.content_metadata_add(results) DEFAULT_CONFIG: Dict[str, Any] = { "tools": { "name": "swh-metadata-detector", "version": "0.0.2", "configuration": {}, }, } class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): """Directory-level indexer This indexer is in charge of: - filtering directories already indexed in directory_intrinsic_metadata table with defined computation tool - retrieve all entry_files in directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for directory """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = merge_configs(DEFAULT_CONFIG, self.config) def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.directory_intrinsic_metadata_missing( ( { "id": sha1_git, "indexer_configuration_id": self.tool["id"], } for sha1_git in sha1_gits ) ) def index( self, id: Sha1Git, data: Optional[Directory] = None, **kwargs ) -> List[DirectoryIntrinsicMetadataRow]: """Index directory by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at directory level Args: id: sha1_git of the directory data: directory model object from storage Returns: dict: dictionary representing a directory_intrinsic_metadata, with keys: - id: directory's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ if data is None: dir_ = list(self.storage.directory_ls(id, recursive=False)) else: assert isinstance(data, Directory) dir_ = data.to_dict() try: if [entry["type"] for entry in dir_] == ["dir"]: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_[0]["target"] dir_ = list(self.storage.directory_ls(subdir, recursive=False)) files = [entry for entry in dir_ if entry["type"] == "file"] detected_files = detect_metadata(files) (mappings, metadata) = self.translate_directory_intrinsic_metadata( detected_files, log_suffix="directory=%s" % hashutil.hash_to_hex(id), ) except Exception as e: self.log.exception("Problem when indexing dir: %r", e) sentry_sdk.capture_exception() return [ DirectoryIntrinsicMetadataRow( id=id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage. 
Args: results: list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ # TODO: add functions in storage to keep data in # directory_intrinsic_metadata return self.idx_storage.directory_intrinsic_metadata_add(results) def translate_directory_intrinsic_metadata( self, detected_files: Dict[str, List[Any]], log_suffix: str ) -> Tuple[List[Any], Any]: """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files: dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ used_mappings = [MAPPINGS[context].name for context in detected_files] metadata = [] tool = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {}, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} config["tools"] = [tool] for context in detected_files.keys(): cfg = deepcopy(config) cfg["tools"][0]["configuration"]["context"] = context c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context] ) for c in metadata_generator: # extracting metadata sha1 = c.id sha1s_in_storage.append(sha1) local_metadata = c.metadata # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files[context] if item not in sha1s_in_storage ] if sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result.metadata metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") sentry_sdk.capture_exception() metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer( OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( self, origins: List[Origin], check_origin_known: bool = True, **kwargs ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] - origins_with_head = [] + head_rel_ids = [] + origin_heads: Dict[Origin, CoreSWHID] = {} # Filter out origins not in the storage if check_origin_known: known_origins = list( call_with_batches( self.storage.origin_get, [origin.url for origin in origins], ORIGIN_GET_BATCH_SIZE, ) ) else: known_origins = list(origins) for origin in known_origins: if origin is None: continue head_swhid = get_head_swhid(self.storage, origin.url) if head_swhid: - # TODO: add support for releases - assert head_swhid.object_type == ObjectType.REVISION, head_swhid - origins_with_head.append(origin) - head_rev_ids.append(head_swhid.object_id) - - head_revs = list( - call_with_batches( - self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE + origin_heads[origin] = head_swhid + if head_swhid.object_type == ObjectType.REVISION: + 
                    head_rev_ids.append(head_swhid.object_id)
+                elif head_swhid.object_type == ObjectType.RELEASE:
+                    head_rel_ids.append(head_swhid.object_id)
+                else:
+                    assert False, head_swhid
+
+        head_revs = dict(
+            zip(
+                head_rev_ids,
+                call_with_batches(
+                    self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
+                ),
+            )
+        )
+        head_rels = dict(
+            zip(
+                head_rel_ids,
+                call_with_batches(
+                    self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE
+                ),
             )
         )
-        assert len(head_revs) == len(head_rev_ids)

         results = []
-        for (origin, rev) in zip(origins_with_head, head_revs):
-            if not rev:
-                self.log.warning("Missing head revision of origin %r", origin.url)
-                continue
+        for (origin, head_swhid) in origin_heads.items():
+            if head_swhid.object_type == ObjectType.REVISION:
+                rev = head_revs[head_swhid.object_id]
+                if not rev:
+                    self.log.warning(
+                        "Missing head object %s of origin %r", head_swhid, origin.url
+                    )
+                    continue
+                directory_id = rev.directory
+            elif head_swhid.object_type == ObjectType.RELEASE:
+                rel = head_rels[head_swhid.object_id]
+                if not rel:
+                    self.log.warning(
+                        "Missing head object %s of origin %r", head_swhid, origin.url
+                    )
+                    continue
+                if rel.target_type != ModelObjectType.DIRECTORY:
+                    # TODO
+                    self.log.warning(
+                        "Head release %s of %r has unexpected target type %s",
+                        head_swhid,
+                        origin.url,
+                        rel.target_type,
+                    )
+                    continue
+                assert rel.target, rel
+                directory_id = rel.target
+            else:
+                assert False, head_swhid

-            for dir_metadata in self.directory_metadata_indexer.index(rev.directory):
+            for dir_metadata in self.directory_metadata_indexer.index(directory_id):
                 # There is at most one dir_metadata
                 orig_metadata = OriginIntrinsicMetadataRow(
                     from_directory=dir_metadata.id,
                     id=origin.url,
                     metadata=dir_metadata.metadata,
                     mappings=dir_metadata.mappings,
                     indexer_configuration_id=dir_metadata.indexer_configuration_id,
                 )
                 results.append((orig_metadata, dir_metadata))

         return results

     def persist_index_computations(
         self,
         results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
     ) -> Dict[str, int]:
         # Deduplicate directories
         dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
         orig_metadata: List[OriginIntrinsicMetadataRow] = []
         summary: Dict = {}
         for (orig_item, dir_item) in results:
             assert dir_item.metadata == orig_item.metadata
             if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
                 # Only store non-empty metadata sets
                 if dir_item not in dir_metadata:
                     dir_metadata.append(dir_item)
                 if orig_item not in orig_metadata:
                     orig_metadata.append(orig_item)

         if dir_metadata:
             summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
                 dir_metadata
             )
             summary.update(summary_dir)
         if orig_metadata:
             summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
             summary.update(summary_ori)

         return summary
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
index 6e79e1e..2d9ff6d 100644
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -1,120 +1,120 @@
 # Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import re
 from typing import Dict, List, Optional, Tuple, Union

 from swh.model.model import SnapshotBranch, TargetType
 from swh.model.swhids import CoreSWHID, ObjectType
 from swh.storage.algos.origin import origin_get_latest_visit_status
 from swh.storage.algos.snapshot import snapshot_get_all_branches

 def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]:
     """Returns the SWHID of the head revision or release of an origin"""
     visit_status = origin_get_latest_visit_status(
         storage, origin_url, allowed_statuses=["full"], require_snapshot=True
     )
     if not visit_status:
         return None
     assert visit_status.snapshot is not None
     snapshot = snapshot_get_all_branches(storage, visit_status.snapshot)
     if snapshot is None:
         return None

     if visit_status.type == "ftp":
         return _try_get_ftp_head(dict(snapshot.branches))
     else:
         return _try_get_head_generic(dict(snapshot.branches))


 _archive_filename_re = re.compile(
     rb"^"
     rb"(?P<pkgname>.*)[-_]"
     rb"(?P<version>[0-9]+(\.[0-9])*)"
     rb"(?P<preversion>[-+][a-zA-Z0-9.~]+?)?"
     rb"(?P<extension>(\.[a-zA-Z0-9]+)+)"
     rb"$"
 )


 def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]:
     """Extracts the release version from an archive filename,
     to get an ordering whose maximum is likely to be the last
     version of the software

     >>> _parse_version(b'foo')
     (-inf,)
     >>> _parse_version(b'foo.tar.gz')
     (-inf,)
     >>> _parse_version(b'gnu-hello-0.0.1.tar.gz')
     (0, 0, 1, 0)
     >>> _parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
     (0, 0, 1, -1, 'beta2')
     >>> _parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
     (0, 0, 1, 1, 'foobar')
     """
     res = _archive_filename_re.match(filename)
     if res is None:
         return (float("-infinity"),)
     version: List[Union[float, int, str]] = [
         int(n) for n in res.group("version").decode().split(".")
     ]
     if res.group("preversion") is None:
         version.append(0)
     else:
         preversion = res.group("preversion").decode()
         if preversion.startswith("-"):
             version.append(-1)
             version.append(preversion[1:])
         elif preversion.startswith("+"):
             version.append(1)
             version.append(preversion[1:])
         else:
             assert False, res.group("preversion")
     return tuple(version)


 def _try_get_ftp_head(
     branches: Dict[bytes, Optional[SnapshotBranch]]
 ) -> Optional[CoreSWHID]:
     archive_names = list(branches)
     max_archive_name = max(archive_names, key=_parse_version)
     return _try_resolve_target(branches, max_archive_name)


 def _try_get_head_generic(
     branches: Dict[bytes, Optional[SnapshotBranch]]
 ) -> Optional[CoreSWHID]:
     # Works on 'deposit', 'pypi', and VCSs.
     return _try_resolve_target(branches, b"HEAD") or _try_resolve_target(
         branches, b"master"
     )


 def _try_resolve_target(
     branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes
 ) -> Optional[CoreSWHID]:
     try:
         branch = branches[branch_name]
         if branch is None:
             return None
         while branch.target_type == TargetType.ALIAS:
             branch = branches[branch.target]
             if branch is None:
                 return None

         if branch.target_type == TargetType.REVISION:
             return CoreSWHID(object_type=ObjectType.REVISION, object_id=branch.target)
         elif branch.target_type == TargetType.CONTENT:
             return None  # TODO
         elif branch.target_type == TargetType.DIRECTORY:
             return None  # TODO
         elif branch.target_type == TargetType.RELEASE:
-            return None  # TODO
+            return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target)
         else:
             assert False, branch
     except KeyError:
         return None
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
index 21f8637..999084b 100644
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -1,152 +1,157 @@
 # Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone

 import pytest

 from swh.indexer.origin_head import get_head_swhid
 from swh.indexer.tests.utils import fill_storage
 from swh.model.model import (
     Origin,
     OriginVisit,
     OriginVisitStatus,
     Snapshot,
     SnapshotBranch,
     TargetType,
 )
 from swh.model.swhids import CoreSWHID
 from swh.storage.utils import now

 SAMPLE_SNAPSHOT = Snapshot(
     branches={
         b"foo": None,
         b"HEAD": SnapshotBranch(
             target_type=TargetType.ALIAS,
             target=b"foo",
         ),
     },
 )


 @pytest.fixture
 def storage(swh_storage):
     fill_storage(swh_storage)
     return swh_storage


 def test_git(storage):
     origin_url = "https://github.com/SoftwareHeritage/swh-storage"
     assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string(
         "swh:1:rev:384b12006403cce45d6253e38f7bd77dacef726d"
     )


 def test_git_partial_snapshot(storage):
     """Checks partial snapshots are ignored."""
     origin_url = "https://github.com/SoftwareHeritage/swh-core"
     storage.origin_add([Origin(url=origin_url)])
     visit = storage.origin_visit_add(
         [
             OriginVisit(
                 origin=origin_url,
                 date=datetime(2019, 2, 27, tzinfo=timezone.utc),
                 type="git",
             )
         ]
     )[0]
     storage.snapshot_add([SAMPLE_SNAPSHOT])
     visit_status = OriginVisitStatus(
         origin=origin_url,
         visit=visit.visit,
         date=now(),
         status="partial",
         snapshot=SAMPLE_SNAPSHOT.id,
     )
     storage.origin_visit_status_add([visit_status])
     assert get_head_swhid(storage, origin_url) is None


 def test_vcs_missing_snapshot(storage):
     origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
     storage.origin_add([Origin(url=origin_url)])
     assert get_head_swhid(storage, origin_url) is None


 def test_pypi_missing_branch(storage):
     origin_url = "https://pypi.org/project/abcdef/"
     storage.origin_add(
         [
             Origin(
                 url=origin_url,
             )
         ]
     )
     visit = storage.origin_visit_add(
         [
             OriginVisit(
                 origin=origin_url,
                 date=datetime(2019, 2, 27, tzinfo=timezone.utc),
                 type="pypi",
             )
         ]
     )[0]
     storage.snapshot_add([SAMPLE_SNAPSHOT])
     visit_status = OriginVisitStatus(
         origin=origin_url,
         visit=visit.visit,
         date=now(),
         status="full",
         snapshot=SAMPLE_SNAPSHOT.id,
     )
     storage.origin_visit_status_add([visit_status])
     assert get_head_swhid(storage, origin_url) is None


 def test_ftp(storage):
     origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
     assert get_head_swhid(storage, origin_url) ==
CoreSWHID.from_string( "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" ) def test_ftp_missing_snapshot(storage): origin_url = "rsync://ftp.gnu.org/gnu/foobar" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) is None def test_deposit(storage): origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb" ) def test_deposit_missing_snapshot(storage): origin_url = "https://forge.softwareheritage.org/source/foobar" storage.origin_add( [ Origin( url=origin_url, ) ] ) assert get_head_swhid(storage, origin_url) is None def test_pypi(storage): - origin_url = "https://pypi.org/project/limnoria/" + origin_url = "https://old-pypi.example.org/project/limnoria/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" ) + origin_url = "https://pypi.org/project/limnoria/" + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" + ) + def test_svn(storage): origin_url = "http://0-512-md.googlecode.com/svn/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:e43f72e12c88abece79a87b8c9ad232e1b773d18" ) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 529680f..f5179c7 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,256 +1,296 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy from unittest.mock import patch import pytest from swh.indexer.metadata import OriginMetadataIndexer from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, ) from swh.model.model import Origin from swh.storage.interface import StorageInterface from .test_metadata import TRANSLATOR_TOOL from .utils import DIRECTORY2, YARN_PARSER_METADATA @pytest.fixture def swh_indexer_config(swh_indexer_config): """Override the default configuration to override the tools entry""" cfg = copy.deepcopy(swh_indexer_config) cfg["tools"] = TRANSLATOR_TOOL return cfg -def test_origin_metadata_indexer( +def test_origin_metadata_indexer_release( + swh_indexer_config, + idx_storage: IndexerStorageInterface, + storage: StorageInterface, + obj_storage, +) -> None: + indexer = OriginMetadataIndexer(config=swh_indexer_config) + origin = "https://npm.example.org/yarn-parser" + indexer.run([origin]) + + tool = swh_indexer_config["tools"] + + dir_id = DIRECTORY2.id + dir_metadata = DirectoryIntrinsicMetadataRow( + id=dir_id, + tool=tool, + metadata=YARN_PARSER_METADATA, + mappings=["npm"], + ) + origin_metadata = OriginIntrinsicMetadataRow( + id=origin, + tool=tool, + from_directory=dir_id, + metadata=YARN_PARSER_METADATA, + mappings=["npm"], + ) + + dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id])) + for dir_result in dir_results: + assert dir_result.tool + del dir_result.tool["id"] + assert dir_results == [dir_metadata] + + orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin])) + for orig_result in orig_results: + 
assert orig_result.tool + del orig_result.tool["id"] + assert orig_results == [origin_metadata] + + +def test_origin_metadata_indexer_revision( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" indexer.run([origin]) tool = swh_indexer_config["tools"] dir_id = DIRECTORY2.id dir_metadata = DirectoryIntrinsicMetadataRow( id=dir_id, tool=tool, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) origin_metadata = OriginIntrinsicMetadataRow( id=origin, tool=tool, from_directory=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id])) for dir_result in dir_results: assert dir_result.tool del dir_result.tool["id"] assert dir_results == [dir_metadata] orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin])) for orig_result in orig_results: assert orig_result.tool del orig_result.tool["id"] assert orig_results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["https://github.com/librariesio/yarn-parser"]) indexer.run(["https://github.com/librariesio/yarn-parser"] * 2) origin = "https://github.com/librariesio/yarn-parser" dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert len(dir_results) == 1 orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert len(orig_results) == 1 def test_origin_metadata_indexer_missing_head( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: storage.origin_add([Origin(url="https://example.com")]) indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.run(["https://example.com"]) origin = "https://example.com" results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: origin1 = "https://example.com" origin2 = "https://github.com/librariesio/yarn-parser" storage.origin_add([Origin(url=origin1)]) indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.run([origin1, origin2]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [ DirectoryIntrinsicMetadataRow( id=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], tool=dir_results[0].tool, ) ] orig_results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) for orig_result in orig_results: assert orig_results == [ OriginIntrinsicMetadataRow( id=origin2, from_directory=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], tool=orig_results[0].tool, ) ] def test_origin_metadata_indexer_duplicate_directory( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.storage = storage indexer.idx_storage = idx_storage indexer.catch_exceptions = False origin1 = "https://github.com/librariesio/yarn-parser" origin2 = 
"https://github.com/librariesio/yarn-parser.git" indexer.run([origin1, origin2]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert len(dir_results) == 1 orig_results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) assert len(orig_results) == 2 def test_origin_metadata_indexer_no_metadata_file( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): indexer.run([origin]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] def test_origin_metadata_indexer_no_metadata( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.DirectoryMetadataIndexer" ".translate_directory_intrinsic_metadata", return_value=(["npm"], {"@context": "foo"}), ): indexer.run([origin]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] def test_origin_metadata_indexer_error( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.DirectoryMetadataIndexer" ".translate_directory_intrinsic_metadata", return_value=None, ): indexer.run([origin]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] def test_origin_metadata_indexer_unknown_origin( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) result = indexer.index_list([Origin("https://unknown.org/foo")]) assert not result diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index 45f48a2..5171bae 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,723 +1,783 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import datetime import functools from typing import Any, Dict import unittest from hypothesis import strategies from swh.core.api.classes import stream_results from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Content, Directory, DirectoryEntry, + ObjectType, Origin, OriginVisit, OriginVisitStatus, Person, + Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from 
swh.storage.utils import now BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = { "storage": {"cls": "memory"}, "objstorage": {"cls": "memory"}, INDEXER_CFG_KEY: {"cls": "memory"}, } ORIGIN_VISITS = [ {"type": "git", "origin": "https://github.com/SoftwareHeritage/swh-storage"}, {"type": "ftp", "origin": "rsync://ftp.gnu.org/gnu/3dldf"}, { "type": "deposit", "origin": "https://forge.softwareheritage.org/source/jesuisgpl/", }, - {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"}, + { + "type": "pypi", + "origin": "https://old-pypi.example.org/project/limnoria/", + }, # with rev head + {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"}, # with rel head {"type": "svn", "origin": "http://0-512-md.googlecode.com/svn/"}, {"type": "git", "origin": "https://github.com/librariesio/yarn-parser"}, {"type": "git", "origin": "https://github.com/librariesio/yarn-parser.git"}, + {"type": "git", "origin": "https://npm.example.org/yarn-parser"}, ] ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS] DIRECTORY = Directory( id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), entries=( DirectoryEntry( name=b"index.js", type="file", target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"), perms=0o100644, ), DirectoryEntry( name=b"package.json", type="file", target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), perms=0o100644, ), DirectoryEntry( name=b".github", type="dir", target=Directory(entries=()).id, perms=0o040000, ), ), ) DIRECTORY2 = Directory( id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6", entries=( DirectoryEntry( name=b"package.json", type="file", target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"), perms=0o100644, ), ), ) _utc_plus_2 = datetime.timezone(datetime.timedelta(minutes=120)) REVISION = Revision( id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"), message=b"Improve search functionality", author=Person( name=b"Andrew Nesbitt", fullname=b"Andrew Nesbitt ", email=b"andrewnez@gmail.com", ), committer=Person( name=b"Andrew Nesbitt", fullname=b"Andrew Nesbitt ", email=b"andrewnez@gmail.com", ), committer_date=TimestampWithTimezone.from_datetime( datetime.datetime(2013, 10, 4, 12, 50, 49, tzinfo=_utc_plus_2) ), type=RevisionType.GIT, synthetic=False, date=TimestampWithTimezone.from_datetime( datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2) ), directory=DIRECTORY2.id, parents=(), ) REVISIONS = [REVISION] +RELEASE = Release( + name=b"v0.0.0", + message=None, + author=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + synthetic=False, + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2) + ), + target_type=ObjectType.DIRECTORY, + target=DIRECTORY2.id, +) + +RELEASES = [RELEASE] + SNAPSHOTS = [ + # https://github.com/SoftwareHeritage/swh-storage Snapshot( id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"), branches={ b"refs/heads/add-revision-origin-cache": SnapshotBranch( target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e', target_type=TargetType.REVISION, ), b"refs/head/master": SnapshotBranch( target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm", target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch( target=b"refs/head/master", target_type=TargetType.ALIAS ), b"refs/tags/v0.0.103": SnapshotBranch( target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd', target_type=TargetType.RELEASE, ), }, ), + # rsync://ftp.gnu.org/gnu/3dldf 
Snapshot( id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"), branches={ b"3DLDF-1.1.4.tar.gz": SnapshotBranch( target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11', target_type=TargetType.REVISION, ), b"3DLDF-2.0.2.tar.gz": SnapshotBranch( target=b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", # noqa target_type=TargetType.REVISION, ), b"3DLDF-2.0.3-examples.tar.gz": SnapshotBranch( target=b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97\xfe\xadZ\x80\x80\xc1\x83\xff", # noqa target_type=TargetType.REVISION, ), b"3DLDF-2.0.3.tar.gz": SnapshotBranch( target=b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by", # noqa target_type=TargetType.REVISION, ), b"3DLDF-2.0.tar.gz": SnapshotBranch( target=b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G\xd3\xd1m", target_type=TargetType.REVISION, ), }, ), + # https://forge.softwareheritage.org/source/jesuisgpl/", Snapshot( id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"), branches={ b"master": SnapshotBranch( target=b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb", # noqa target_type=TargetType.REVISION, ) }, ), + # https://old-pypi.example.org/project/limnoria/ Snapshot( id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"), branches={ b"HEAD": SnapshotBranch( target=b"releases/2018.09.09", target_type=TargetType.ALIAS ), b"releases/2018.09.01": SnapshotBranch( target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf", target_type=TargetType.REVISION, ), b"releases/2018.09.09": SnapshotBranch( target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa target_type=TargetType.REVISION, ), }, ), + # https://pypi.org/project/limnoria/ + Snapshot( + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/2018.09.09", target_type=TargetType.ALIAS + ), + b"releases/2018.09.01": SnapshotBranch( + target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf", + target_type=TargetType.RELEASE, + ), + b"releases/2018.09.09": SnapshotBranch( + target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa + target_type=TargetType.RELEASE, + ), + }, + ), + # http://0-512-md.googlecode.com/svn/ Snapshot( id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"), branches={ b"master": SnapshotBranch( target=b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18", target_type=TargetType.REVISION, ) }, ), + # https://github.com/librariesio/yarn-parser Snapshot( id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), branches={ b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ) }, ), + # https://github.com/librariesio/yarn-parser.git Snapshot( id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), branches={ b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ) }, ), + # https://npm.example.org/yarn-parser + Snapshot( + branches={ + b"HEAD": SnapshotBranch( + target=RELEASE.id, + target_type=TargetType.RELEASE, + ) + }, + ), ] +assert len(SNAPSHOTS) == len(ORIGIN_VISITS) + SHA1_TO_LICENSES = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"], "02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"], "103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"], "688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"], "da39a3ee5e6b4b0d3255bfef95601890afd80709": [], } SHA1_TO_CTAGS = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [ { "name": "foo", "kind": "str", "line": 10, "lang": "bar", } ], "d4c647f0fc257591cc9ba1722484229780d1c607": [ { "name": "let", "kind": "int", 
"line": 100, "lang": "haskell", } ], "688a5ef812c53907562fe379d4b3851e69c7cb15": [ { "name": "symbol", "kind": "float", "line": 99, "lang": "python", } ], } OBJ_STORAGE_DATA = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text", "688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text", "8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text", "02fb2c89e14f7fab46701478c83779c7beb7b069": b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, "103bc087db1d26afc3a0283f38663d081e9b01e6": b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, "93666f74f1cf635c8c8ac118879da6ec5623c410": b""" (should 'pygments (recognize 'lisp 'easily)) """, "26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, "d4c647f0fc257591cc9ba1722484229780d1c607": b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, "a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b""" """, "da39a3ee5e6b4b0d3255bfef95601890afd80709": b"", # was 626364 / b'bcd' "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd", # was 636465 / b'cde' now yarn-parser package.json "f5305243b3ce7ef8dc864ebc73794da304025beb": b""" { "name": "yarn-parser", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "main": "index.js", "scripts": { "start": "node index.js", "test": "mocha" }, "engines": { "node": "9.8.0" }, "repository": { "type": "git", "url": "git+https://github.com/librariesio/yarn-parser.git" }, "keywords": [ "yarn", "parse", "lock", "dependencies" ], "author": "Andrew Nesbitt", "license": "AGPL-3.0", "bugs": { "url": "https://github.com/librariesio/yarn-parser/issues" }, "homepage": "https://github.com/librariesio/yarn-parser#readme", "dependencies": { "@yarnpkg/lockfile": "^1.0.0", "body-parser": "^1.15.2", "express": "^4.14.0" }, "devDependencies": { "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", "test": "^0.6.0" } } """, } YARN_PARSER_METADATA = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "url": "https://github.com/librariesio/yarn-parser#readme", "codeRepository": "git+git+https://github.com/librariesio/yarn-parser.git", "author": [{"type": "Person", "name": "Andrew Nesbitt"}], "license": "https://spdx.org/licenses/AGPL-3.0", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "issueTracker": "https://github.com/librariesio/yarn-parser/issues", "name": "yarn-parser", "keywords": ["yarn", "parse", "lock", 
"dependencies"], "type": "SoftwareSourceCode", } json_dict_keys = strategies.one_of( strategies.characters(), strategies.just("type"), strategies.just("url"), strategies.just("name"), strategies.just("email"), strategies.just("@id"), strategies.just("@context"), strategies.just("repository"), strategies.just("license"), strategies.just("repositories"), strategies.just("licenses"), ) """Hypothesis strategy that generates strings, with an emphasis on those that are often used as dictionary keys in metadata files.""" generic_json_document = strategies.recursive( strategies.none() | strategies.booleans() | strategies.floats() | strategies.characters(), lambda children: ( strategies.lists(children, min_size=1) | strategies.dictionaries(json_dict_keys, children, min_size=1) ), ) """Hypothesis strategy that generates possible values for values of JSON metadata files.""" def json_document_strategy(keys=None): """Generates an hypothesis strategy that generates metadata files for a JSON-based format that uses the given keys.""" if keys is None: keys = strategies.characters() else: keys = strategies.one_of(map(strategies.just, keys)) return strategies.dictionaries(keys, generic_json_document, min_size=1) def _tree_to_xml(root, xmlns, data): def encode(s): "Skips unpaired surrogates generated by json_document_strategy" return s.encode("utf8", "replace") def to_xml(data, indent=b" "): if data is None: return b"" elif isinstance(data, (bool, str, int, float)): return indent + encode(str(data)) elif isinstance(data, list): return b"\n".join(to_xml(v, indent=indent) for v in data) elif isinstance(data, dict): lines = [] for (key, value) in data.items(): lines.append(indent + encode("<{}>".format(key))) lines.append(to_xml(value, indent=indent + b" ")) lines.append(indent + encode("".format(key))) return b"\n".join(lines) else: raise TypeError(data) return b"\n".join( [ '<{} xmlns="{}">'.format(root, xmlns).encode(), to_xml(data), "".format(root).encode(), ] ) class TreeToXmlTest(unittest.TestCase): def test_leaves(self): self.assertEqual( _tree_to_xml("root", "http://example.com", None), b'\n\n', ) self.assertEqual( _tree_to_xml("root", "http://example.com", True), b'\n True\n', ) self.assertEqual( _tree_to_xml("root", "http://example.com", "abc"), b'\n abc\n', ) self.assertEqual( _tree_to_xml("root", "http://example.com", 42), b'\n 42\n', ) self.assertEqual( _tree_to_xml("root", "http://example.com", 3.14), b'\n 3.14\n', ) def test_dict(self): self.assertIn( _tree_to_xml("root", "http://example.com", {"foo": "bar", "baz": "qux"}), [ b'\n' b" \n bar\n \n" b" \n qux\n \n" b"", b'\n' b" \n qux\n \n" b" \n bar\n \n" b"", ], ) def test_list(self): self.assertEqual( _tree_to_xml( "root", "http://example.com", [ {"foo": "bar"}, {"foo": "baz"}, ], ), b'\n' b" \n bar\n \n" b" \n baz\n \n" b"", ) def xml_document_strategy(keys, root, xmlns): """Generates an hypothesis strategy that generates metadata files for an XML format that uses the given keys.""" return strategies.builds( functools.partial(_tree_to_xml, root, xmlns), json_document_strategy(keys) ) def filter_dict(d, keys): "return a copy of the dict with keys deleted" if not isinstance(keys, (list, tuple)): keys = (keys,) return dict((k, v) for (k, v) in d.items() if k not in keys) def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) def fill_storage(storage): storage.origin_add(ORIGINS) storage.directory_add([DIRECTORY, 
DIRECTORY2]) storage.revision_add(REVISIONS) + storage.release_add(RELEASES) storage.snapshot_add(SNAPSHOTS) for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS): assert snapshot.id is not None visit = storage.origin_visit_add( [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])] )[0] visit_status = OriginVisitStatus( origin=visit.origin, visit=visit.visit, date=now(), status="full", snapshot=snapshot.id, ) storage.origin_visit_status_add([visit_status]) contents = [] for (obj_id, content) in OBJ_STORAGE_DATA.items(): content_hashes = hashutil.MultiHash.from_data(content).digest() contents.append( Content( data=content, length=len(content), status="visible", sha1=hash_to_bytes(obj_id), sha1_git=hash_to_bytes(obj_id), sha256=content_hashes["sha256"], blake2s256=content_hashes["blake2s256"], ) ) storage.content_add(contents) class CommonContentIndexerTest(metaclass=abc.ABCMeta): def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_results_ok(self, sha1s, expected_results=None): sha1s = [ sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s ] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(expected_results, actual_results) def test_index(self): """Known sha1 have their data indexed""" sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s) self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s) self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [ self.id1, "799a5ef812c53907562fe379d4b3851e69c7cb15", # unknown "800a5ef812c53907562fe379d4b3851e69c7cb15", ] # unknown # when self.indexer.run(sha1s) # then expected_results = [ res for res in self.expected_results if hashutil.hash_to_hex(res.id) in sha1s ] self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerPartitionTest: """Allows to factorize tests on range indexer.""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, partition_id, nb_partitions, actual_results): expected_ids = [ c.sha1 for c in stream_results( self.indexer.storage.content_get_partition, partition_id=partition_id, nb_partitions=nb_partitions, ) ] actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data.id assert _id in expected_ids _tool_id = indexed_data.indexer_configuration_id assert _tool_id == self.indexer.tool["id"] def test__index_contents(self): """Indexing contents without existing data results in indexed data""" partition_id = 0 nb_partitions = 4 actual_results = list( self.indexer._index_contents(partition_id, nb_partitions, indexed={}) ) self.assert_results_ok(partition_id, nb_partitions, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data""" partition_id = 3 nb_partitions = 4 # first pass actual_results = list( self.indexer._index_contents(partition_id, nb_partitions, indexed={}), ) self.assert_results_ok(partition_id, nb_partitions, actual_results) indexed_ids = {res.id for res in actual_results} actual_results = list( self.indexer._index_contents( partition_id, nb_partitions, indexed=indexed_ids ) ) # already indexed, so nothing new assert actual_results == [] def test_generate_content_get(self): """Optimal indexing should result in indexed data""" partition_id = 0 nb_partitions = 1 
         actual_results = self.indexer.run(
             partition_id, nb_partitions, skip_existing=False
         )

         assert actual_results["status"] == "eventful", actual_results

     def test_generate_content_get_no_result(self):
         """No result indexed returns False"""
         actual_results = self.indexer.run(1, 2**512, incremental=False)

         assert actual_results == {"status": "uneventful"}
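
Usage sketch (not part of the patch): the snippet below illustrates how the release-aware head resolution composes end to end. get_head_swhid() may now return either a revision or a release SWHID, and the caller maps both to a root directory id, mirroring the branching added to OriginMetadataIndexer.index_list(). The helper name head_directory_id and the storage/origin_url arguments are illustrative only; storage is assumed to be a populated swh.storage instance (for example the in-memory test storage set up by fill_storage()).

from swh.indexer.origin_head import get_head_swhid
from swh.model.model import ObjectType as ModelObjectType
from swh.model.swhids import ObjectType


def head_directory_id(storage, origin_url: str):
    """Return the sha1_git of the root directory of the origin's head,
    or None if it cannot be resolved. (Illustrative helper, not in the patch.)"""
    head_swhid = get_head_swhid(storage, origin_url)
    if head_swhid is None:
        return None
    if head_swhid.object_type == ObjectType.REVISION:
        # A revision head: its directory is the metadata root.
        (rev,) = storage.revision_get([head_swhid.object_id])
        return rev.directory if rev else None
    elif head_swhid.object_type == ObjectType.RELEASE:
        # A release head: like the patch, only releases that target a
        # directory are handled; release -> revision targets are still a
        # TODO in OriginMetadataIndexer.index_list().
        (rel,) = storage.release_get([head_swhid.object_id])
        if rel and rel.target_type == ModelObjectType.DIRECTORY:
            return rel.target
        return None
    else:
        # get_head_swhid() only emits revision or release SWHIDs.
        return None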