diff --git a/swh/indexer/data/package-json/CITATION b/swh/indexer/data/package-json/CITATION new file mode 100644 index 0000000..52a13c0 --- /dev/null +++ b/swh/indexer/data/package-json/CITATION @@ -0,0 +1 @@ +swh:1:dir:49dd6f75450a37243dfcc4b418ca5bf5e0010748;origin=https://github.com/Bartvds/package.json-schema diff --git a/swh/indexer/data/package-json/LICENSE b/swh/indexer/data/package-json/LICENSE new file mode 100644 index 0000000..3651abe --- /dev/null +++ b/swh/indexer/data/package-json/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2014 Bart van der Schoor + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/swh/indexer/data/package-json/schema.json b/swh/indexer/data/package-json/schema.json new file mode 100644 index 0000000..e5f799f --- /dev/null +++ b/swh/indexer/data/package-json/schema.json @@ -0,0 +1,377 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema", + "id": "lib://package.json", + "title": "package.json-schema", + "description": "JSON Schema for node/npm package.json", + "$ref": "lib://package.json#/definitions/standard", + "definitions": { + "minimal": { + "allOf": [ + { + "$ref": "lib://package.json#/definitions/structure" + }, + { + "required": [ + "name", + "version" + ] + } + ] + }, + "standard": { + "allOf": [ + { + "$ref": "lib://package.json#/definitions/structure" + }, + { + "required": [ + "name", + "version", + "description", + "keywords", + "author", + "homepage", + "repository", + "bugs", + "licenses", + "engines", + "main", + "scripts", + "dependencies", + "devDependencies" + ], + "properties": { + "scripts": { + "type": "object", + "properties": { + "test": { + "type" : "string", + "pattern": "[a-zA-Z]" + } + } + }, + "author": { + "$ref": "lib://package.json#/definitions/person-object" + }, + "contributors": { + "type": "array", + "items": { + "$ref": "lib://package.json#/definitions/person-object" + } + }, + "maintainers": { + "type": "array", + "items": { + "$ref": "lib://package.json#/definitions/person-object" + } + } + } + } + ] + }, + "structure": { + "type": "object", + "properties": { + "name": { + "$ref": "lib://package.json#/definitions/name" + }, + "version": { + "$ref": "lib://package.json#/definitions/semver" + }, + "description": { + "type": "string", + "minLength": 1 + }, + "keywords": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/name" + } + }, + "author": { + "$ref": "lib://package.json#/definitions/person" + }, + "contributors": { + "type": "array", + "uniqueItems": true, + "items": { + 
"$ref": "lib://package.json#/definitions/person" + } + }, + "maintainers": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/person" + } + }, + "homepage": { + "$ref": "lib://package.json#/definitions/uri-http" + }, + "repository": { + "$ref": "lib://package.json#/definitions/repository" + }, + "man": { + "oneOf": [ + { + "$ref": "lib://package.json#/definitions/path" + }, + { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/path" + } + } + ] + }, + "bugs": { + "oneOf": [ + { + "$ref": "lib://package.json#/definitions/uri-http" + }, + { + "type": "object", + "required": [ + "url" + ], + "properties": { + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + }, + "email": { + "$ref": "lib://package.json#/definitions/email" + } + } + } + ] + }, + "license": { + "$ref": "lib://package.json#/definitions/licence" + }, + "licenses": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/licence" + } + }, + "private": { + "type": "boolean" + }, + "preferGlobal": { + "type": "boolean" + }, + "engines": { + "$ref": "lib://package.json#/definitions/string-map" + }, + "engineStrict": { + "type": "boolean" + }, + "main": { + "$ref": "lib://package.json#/definitions/path" + }, + "bin": { + "oneOf": [ + { + "$ref": "lib://package.json#/definitions/path" + }, + { + + "$ref": "lib://package.json#/definitions/path-map" + } + ] + }, + "files": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/path" + } + }, + "os": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/identifier" + } + }, + "cpu": { + "type": "array", + "uniqueItems": true, + "items": { + "$ref": "lib://package.json#/definitions/identifier" + } + }, + "config": { + "type": "object" + }, + "publishConfig": { + "type": "object" + }, + "directories": { + "type": "object", + "properties": { + "lib": { + "$ref": "lib://package.json#/definitions/path" + }, + "bin": { + "$ref": "lib://package.json#/definitions/path" + }, + "man": { + "$ref": "lib://package.json#/definitions/path" + }, + "doc": { + "$ref": "lib://package.json#/definitions/path" + }, + "example": { + "$ref": "lib://package.json#/definitions/path" + } + } + }, + "scripts": { + "$ref": "lib://package.json#/definitions/string-map" + }, + "dependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "devDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "bundledDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "bundleDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "optionalDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + }, + "peerDependencies": { + "$ref": "lib://package.json#/definitions/dependency-map" + } + } + }, + "uri-http": { + "type": "string", + "pattern": "^https?:\/\/" + }, + "email": { + "type": "string", + "pattern": "^([0-9a-zA-Z]([-\\.\\w]*[0-9a-zA-Z])*@([0-9a-zA-Z][-\\w]*[0-9a-zA-Z]\\.)+[a-zA-Z]{2,9})$" + }, + "path": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "pattern": "^[A-Za-z](?:[_\\.-]?[A-Za-z0-9]+)*$" + }, + "identifier": { + "type": "string", + "pattern": "^[A-Za-z](?:[_-]?[A-Za-z0-9]+)*$" + }, + "semver": { + "type": "string", + "pattern": "^\\d+\\.\\d+\\.\\d+(?:-[a-z]+(?:[_\\.-]*[a-z0-9]+)*)*$" + }, + "type-url": { + "type": 
"object", + "additionalProperties": false, + "required": [ + "type", + "url" + ], + "properties": { + "type": { + "type": "string", + "pattern": "[a-zA-Z]" + }, + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + } + } + + }, + "repository": { + "$ref": "lib://package.json#/definitions/type-url" + }, + "licence": { + "oneOf": [ + { + "type": "string", + "pattern": "[a-zA-Z]" + }, + { + "$ref": "lib://package.json#/definitions/licence-object" + } + ] + }, + "licence-object": { + "type": "object", + "additionalProperties": false, + "properties": { + "type": { + "type": "string", + "pattern": "[a-zA-Z]" + }, + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + } + } + }, + "person": { + "oneOf": [ + { + "type": "string", + "pattern": "[a-zA-Z]" + }, + { + "$ref": "lib://package.json#/definitions/person-object" + } + ] + }, + "person-object": { + "type": "object", + "required": [ + "name" + ], + "properties": { + "name": { + "type": "string", + "pattern": "[a-zA-Z]" + }, + "email": { + "$ref": "lib://package.json#/definitions/email" + }, + "url": { + "$ref": "lib://package.json#/definitions/uri-http" + } + } + }, + "string-map": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + ".+": { + "type": "string" + } + } + }, + "path-map": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + ".+": { + "$ref": "lib://package.json#/definitions/path", + "pattern": "[a-zA-Z]" + } + } + }, + "dependency-map": { + "$ref": "lib://package.json#/definitions/string-map" + } + } +} diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 836d77d..8b2df2d 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,335 +1,335 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import logging from copy import deepcopy from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ # Note: This used when the content metadata indexer is used alone # (not the case for example in the case of the RevisionMetadataIndexer) CONFIG_BASE_FILENAME = 'indexer/content_metadata' def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data, log_suffix='unknown revision'): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. 
            If the translation wasn't successful, None is returned
            instead.

        """
        result = {
            'id': id,
            'indexer_configuration_id': self.tool['id'],
            'translated_metadata': None
        }
        try:
            mapping_name = self.tool['tool_configuration']['context']
            log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
            result['translated_metadata'] = \
                MAPPINGS[mapping_name](log_suffix).translate(data)
        except Exception:
            self.log.exception(
                "Problem during metadata translation "
                "for content %s" % hashutil.hash_to_hex(id))
        if result['translated_metadata'] is None:
            return None
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_metadata, dict with the
              following keys:

              - id (bytes): content's identifier (sha1)
              - translated_metadata (jsonb): detected metadata

            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them

        """
        self.idx_storage.content_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))


class RevisionMetadataIndexer(RevisionIndexer):
    """Revision-level indexer

    This indexer is in charge of:

    - filtering out revisions already indexed in the revision_metadata table
      with the configured computation tool
    - retrieving all entry files in the root directory
    - running the metadata detector on file names that may contain metadata
    - computing the metadata translation if necessary and possible
      (depends on the tool)
    - sending sha1s to content indexing if possible
    - storing the results for the revision

    """
    CONFIG_BASE_FILENAME = 'indexer/revision_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'swh-metadata-detector',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': list(MAPPINGS),
            },
        }),
    }

    def filter(self, sha1_gits):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.idx_storage.revision_metadata_missing((
            {
                'id': sha1_git,
                'indexer_configuration_id': self.tool['id'],
            } for sha1_git in sha1_gits
        ))

    def index(self, rev):
        """Index rev by processing it and organizing result.

        Uses metadata_detector to iterate on filenames:

        - if one filename is detected -> the file is sent to the content
          indexer
        - if multiple files are detected -> translation is needed at the
          revision level

        Args:
            rev (dict): revision artifact from storage

        Returns:
            dict: dictionary representing a revision_metadata, with keys:

            - id (str): rev's identifier (sha1_git)
            - indexer_configuration_id (bytes): tool used
            - translated_metadata: dict of retrieved metadata

        """
        result = {
            'id': rev['id'],
            'indexer_configuration_id': self.tool['id'],
            'mappings': None,
            'translated_metadata': None
        }

        try:
            root_dir = rev['directory']
            dir_ls = self.storage.directory_ls(root_dir, recursive=False)
            files = [entry for entry in dir_ls if entry['type'] == 'file']
            detected_files = detect_metadata(files)
            (mappings, metadata) = self.translate_revision_metadata(
                detected_files,
                log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
            result['mappings'] = mappings
            result['translated_metadata'] = metadata
        except Exception as e:
            self.log.exception(
                'Problem when indexing rev: %r', e)
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of revision_metadata, dict with the
              following keys:

              - id (bytes): revision's identifier (sha1_git)
              - mappings ([str]): list of mappings used
              - translated_metadata (jsonb): detected metadata

            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them

        """
        # TODO: add functions in storage to keep data in revision_metadata
        self.idx_storage.revision_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))

    def translate_revision_metadata(self, detected_files, log_suffix):
        """
        Determine how to translate metadata when one or multiple
        metadata files are detected:

        Args:
            detected_files (dict): dictionary mapping context names (e.g.,
              "npm", "authors") to list of sha1

        Returns:
            (List[str], dict): list of mappings used and dict with
            translated metadata according to the CodeMeta vocabulary

        """
        used_mappings = [MAPPINGS[context].name for context in detected_files]
        translated_metadata = []
        tool = {
            'name': 'swh-metadata-translator',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': None
            },
        }
        # TODO: iterate on each context, on each file
        # -> get raw_contents
        # -> translate each content
        config = {
            k: self.config[k]
            for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
        }
        config['tools'] = [tool]
        for context in detected_files.keys():
            cfg = deepcopy(config)
            cfg['tools'][0]['configuration']['context'] = context
            c_metadata_indexer = ContentMetadataIndexer(config=cfg)
            # sha1s that are in content_metadata table
            sha1s_in_storage = []
            metadata_generator = self.idx_storage.content_metadata_get(
                detected_files[context])
            for c in metadata_generator:
                # extracting translated_metadata
                sha1 = c['id']
                sha1s_in_storage.append(sha1)
                local_metadata = c['translated_metadata']
                # local metadata is aggregated
                if local_metadata:
                    translated_metadata.append(local_metadata)

            sha1s_filtered = [item for item in detected_files[context]
                              if item not in sha1s_in_storage]

            if sha1s_filtered:
                # content indexing
                try:
                    c_metadata_indexer.run(sha1s_filtered,
                                           policy_update='ignore-dups',
                                           log_suffix=log_suffix)
                    # on the fly possibility:
                    for result in c_metadata_indexer.results:
                        local_metadata = result['translated_metadata']
                        translated_metadata.append(local_metadata)

                except Exception:
                    self.log.exception(
                        "Exception while indexing metadata on contents")

        # transform translated_metadata into min set with swh-metadata-detector
        min_metadata = extract_minimal_metadata_dict(translated_metadata)
        return (used_mappings, min_metadata)


class OriginMetadataIndexer(OriginIndexer):
    CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('list', [])
    }

    USE_TOOLS = False

    def __init__(self):
        super().__init__()
        self.origin_head_indexer = OriginHeadIndexer()
        self.revision_metadata_indexer = RevisionMetadataIndexer()

    def index_list(self, origins):
        head_rev_ids = []
        for origin in origins:
            head_result = self.origin_head_indexer.index(origin)
            if not head_result:
                continue
            head_rev_ids.append(head_result['revision_id'])
        head_revs = list(self.storage.revision_get(head_rev_ids))
        assert len(head_revs) == len(head_rev_ids)

        results = []
-        for (orig, rev) in zip(origins, head_revs):
+        for (origin, rev) in zip(origins, head_revs):
            if not rev:
                self.log.warning('Missing head revision of origin %r', origin)
                continue

            rev_metadata = self.revision_metadata_indexer.index(rev)
            orig_metadata = {
                'from_revision': rev_metadata['id'],
                'origin_id': origin['id'],
                'metadata': rev_metadata['translated_metadata'],
                'mappings': rev_metadata['mappings'],
                'indexer_configuration_id':
                    rev_metadata['indexer_configuration_id'],
            }
            results.append((orig_metadata, rev_metadata))
        return results

    def persist_index_computations(self, results, policy_update):
        conflict_update = (policy_update == 'update-dups')

        # Deduplicate revisions
        rev_metadata = []
        orig_metadata = []
        for (orig_item, rev_item) in results:
            if rev_item not in rev_metadata:
                rev_metadata.append(rev_item)
            if orig_item not in orig_metadata:
                orig_metadata.append(orig_item)

        self.idx_storage.revision_metadata_add(
            rev_metadata, conflict_update=conflict_update)
        self.idx_storage.origin_intrinsic_metadata_add(
            orig_metadata, conflict_update=conflict_update)


@click.command()
@click.option('--revs', '-i',
              help='Default sha1_git to lookup', multiple=True)
def main(revs):
    _git_sha1s = list(map(hashutil.hash_to_bytes, revs))
    rev_metadata_indexer = RevisionMetadataIndexer()
    rev_metadata_indexer.run(_git_sha1s, 'update-dups')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 2d5e653..fb72a5a 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,163 +1,188 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest
from unittest.mock import patch

from swh.model.hashutil import hash_to_bytes

from swh.indexer.metadata import OriginMetadataIndexer

from .utils import BASE_TEST_CONFIG, YARN_PARSER_METADATA
from .test_metadata import REVISION_METADATA_CONFIG

ORIGIN_HEAD_CONFIG = {
    **BASE_TEST_CONFIG,
    'tools': {
        'name': 'origin-metadata',
        'version': '0.0.1',
        'configuration': {},
    },
    'tasks': {
        'revision_metadata': 'revision_metadata',
        'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
    }
}


@pytest.fixture
def origin_metadata_indexer():
    prefix = 'swh.indexer.'
suffix = '.parse_config_file' with patch(prefix + 'metadata.OriginMetadataIndexer' + suffix) as omi, \ patch(prefix + 'origin_head.OriginHeadIndexer' + suffix) as ohi, \ patch(prefix + 'metadata.RevisionMetadataIndexer' + suffix) as rmi: omi.return_value = BASE_TEST_CONFIG ohi.return_value = ORIGIN_HEAD_CONFIG rmi.return_value = REVISION_METADATA_CONFIG yield OriginMetadataIndexer() def test_origin_metadata_indexer( idx_storage, storage, obj_storage, origin_metadata_indexer): indexer = OriginMetadataIndexer() indexer.run(["git+https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'translated_metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'origin_id': origin['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list(indexer.idx_storage.revision_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) for result in results: del result['tool'] assert results == [origin_metadata] -def test_origin_metadata_indexer_duplicates( +def test_origin_metadata_indexer_duplicate_origin( idx_storage, storage, obj_storage, origin_metadata_indexer): indexer = OriginMetadataIndexer() indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["git+https://github.com/librariesio/yarn-parser"]) indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2) origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list(indexer.idx_storage.revision_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head( idx_storage, storage, obj_storage, origin_metadata_indexer): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer() indexer.run(["git+https://example.com"]) origin = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage, origin_metadata_indexer): storage.origin_add([{ 'type': 'git', 'url': 'https://example.com' }]) indexer = OriginMetadataIndexer() indexer.run(["git+https://example.com", "git+https://github.com/librariesio/yarn-parser"]) origin1 = storage.origin_get({ 'type': 'git', 'url': 'https://example.com'}) origin2 = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { 'id': rev_id, 'translated_metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { 'origin_id': origin2['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } results = list(indexer.idx_storage.revision_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin1['id'], origin2['id']])) for result in results: del result['tool'] assert results == 
[origin_metadata] + + +def test_origin_metadata_indexer_duplicate_revision( + idx_storage, storage, obj_storage, origin_metadata_indexer): + indexer = OriginMetadataIndexer() + indexer.storage = storage + indexer.idx_storage = idx_storage + indexer.run(["git+https://github.com/librariesio/yarn-parser", + "git+https://github.com/librariesio/yarn-parser.git"]) + + origin1 = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + origin2 = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser.git'}) + assert origin1['id'] != origin2['id'] + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') + + results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + assert len(results) == 1 + + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin1['id'], origin2['id']])) + assert len(results) == 2 diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index c19bb7e..2cfc437 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,666 +1,678 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import datetime import hashlib import random from hypothesis import strategies from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG = { 'storage': { 'cls': 'memory', 'args': { }, }, 'objstorage': { 'cls': 'memory', 'args': { }, }, INDEXER_CFG_KEY: { 'cls': 'memory', 'args': { }, }, } ORIGINS = [ { 'id': 52189575, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { 'id': 4423668, 'lister': None, 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { 'id': 77775770, 'lister': None, 'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { 'id': 85072327, 'lister': None, 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { 'id': 49908349, 'lister': None, 'project': None, 'type': 'svn', 'url': 'http://0-512-md.googlecode.com/svn/'}, { 'id': 54974445, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, + { + 'id': 54974446, + 'lister': None, + 'project': None, + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser.git'}, ] SNAPSHOTS = { 52189575: { 'branches': { b'refs/heads/add-revision-origin-cache': { 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' b's\xe7/\xe9l\x1e', 'target_type': 'revision'}, b'HEAD': { 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' b'\xac\xefrm', 'target_type': 'revision'}, b'refs/tags/v0.0.103': { 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b'\x0f\xdd', 'target_type': 'release'}, }}, 4423668: { 'branches': { b'3DLDF-1.1.4.tar.gz': { 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' b'"G\x99\x11', 'target_type': 'revision'}, b'3DLDF-2.0.2.tar.gz': { 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', 'target_type': 'revision'}, b'3DLDF-2.0.3-examples.tar.gz': { 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' b'\xfe\xadZ\x80\x80\xc1\x83\xff', 'target_type': 'revision'}, b'3DLDF-2.0.3.tar.gz': { 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' 
b'\xcc\x1a\xb4`\x8c\x8by', 'target_type': 'revision'}, b'3DLDF-2.0.tar.gz': { 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' b'\xd3\xd1m', b'target_type': 'revision'} }}, 77775770: { 'branches': { b'master': { 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'target_type': 'revision'} }, 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r "}, 85072327: { 'branches': { b'HEAD': { 'target': b'releases/2018.09.09', 'target_type': 'alias'}, b'releases/2018.09.01': { 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' b'\xbb\xdfF\xfdw\xcf', 'target_type': 'revision'}, b'releases/2018.09.09': { 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'target_type': 'revision'}}, 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' b'\x12\x9e\xd6\xb3'}, 49908349: { 'branches': { b'master': { 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'target_type': 'revision'}}, 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' b'\x05\xea\xb8\x1f\xc4H\xf4s'}, 54974445: { 'branches': { b'HEAD': { 'target': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), - 'target_type': 'revision'}}} + 'target_type': 'revision'}}}, + 54974446: { + 'branches': { + b'HEAD': { + 'target': hash_to_bytes( + '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), + 'target_type': 'revision'}}}, } REVISIONS = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'author': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] DIRECTORY_ID = b'10' DIRECTORY = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'cde' }, { 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None } ] SHA1_TO_LICENSES = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], } SHA1_TO_CTAGS = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{ 'name': 'foo', 'kind': 'str', 'line': 10, 'lang': 'bar', }], 'd4c647f0fc257591cc9ba1722484229780d1c607': [{ 'name': 'let', 'kind': 'int', 'line': 100, 'lang': 'haskell', }], '688a5ef812c53907562fe379d4b3851e69c7cb15': [{ 'name': 'symbol', 'kind': 'float', 'line': 99, 'lang': 'python', }], } OBJ_STORAGE_DATA = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, 
tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', '636465': b""" { "name": "yarn-parser", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "main": "index.js", "scripts": { "start": "node index.js", "test": "mocha" }, "engines": { "node": "9.8.0" }, "repository": { "type": "git", "url": "git+https://github.com/librariesio/yarn-parser.git" }, "keywords": [ "yarn", "parse", "lock", "dependencies" ], "author": "Andrew Nesbitt", "license": "AGPL-3.0", "bugs": { "url": "https://github.com/librariesio/yarn-parser/issues" }, "homepage": "https://github.com/librariesio/yarn-parser#readme", "dependencies": { "@yarnpkg/lockfile": "^1.0.0", "body-parser": "^1.15.2", "express": "^4.14.0" }, "devDependencies": { "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", "test": "^0.6.0" } } """ } YARN_PARSER_METADATA = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+git+https://github.com/librariesio/yarn-parser.git', 'author': [{ 'type': 'Person', 'name': 'Andrew Nesbitt' }], 'license': 'https://spdx.org/licenses/AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } json_dict_keys = strategies.one_of( strategies.characters(), *map(strategies.just, ['type', 'url', 'name', 'email', '@id', '@context', 'repository', 'license', ]), ) """Hypothesis strategy that generates strings, with an emphasis on those that are often used as dictionary keys in metadata files.""" generic_json_document = strategies.recursive( strategies.none() | strategies.booleans() | strategies.floats() | strategies.characters(), lambda children: ( strategies.lists(children, 1) | strategies.dictionaries(json_dict_keys, children, min_size=1) ) ) """Hypothesis strategy that generates possible values for values of JSON metadata files.""" def json_document_strategy(keys=None): """Generates an hypothesis strategy that generates metadata files for a format that uses the given keys.""" if keys is None: keys = strategies.characters() else: keys 
= strategies.one_of(map(strategies.just, keys)) return strategies.dictionaries(keys, generic_json_document, min_size=1) def filter_dict(d, keys): 'return a copy of the dict with keys deleted' if not isinstance(keys, (list, tuple)): keys = (keys, ) return dict((k, v) for (k, v) in d.items() if k not in keys) def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) def fill_storage(storage): for origin in ORIGINS: origin = origin.copy() del origin['id'] storage.origin_add_one(origin) for (orig_pseudo_id, snap) in SNAPSHOTS.items(): for orig in ORIGINS: if orig_pseudo_id == orig['id']: origin_id = storage.origin_get( {'type': orig['type'], 'url': orig['url']})['id'] break else: assert False visit = storage.origin_visit_add(origin_id, datetime.datetime.now()) snap_id = snap.get('id') or \ bytes([random.randint(0, 255) for _ in range(32)]) storage.snapshot_add(origin_id, visit['visit'], { 'id': snap_id, 'branches': snap['branches'] }) storage.revision_add(REVISIONS) storage.directory_add([{ 'id': DIRECTORY_ID, 'entries': DIRECTORY, }]) for (obj_id, content) in OBJ_STORAGE_DATA.items(): # TODO: use MultiHash if hasattr(hashlib, 'blake2s'): blake2s256 = hashlib.blake2s(content, digest_size=32).digest() else: # fallback for Python <3.6 blake2s256 = bytes([random.randint(0, 255) for _ in range(32)]) storage.content_add([{ 'data': content, 'length': len(content), 'status': 'visible', 'sha1': hash_to_bytes(obj_id), 'sha1_git': hash_to_bytes(obj_id), 'sha256': hashlib.sha256(content).digest(), 'blake2s256': blake2s256 }]) class CommonContentIndexerTest(metaclass=abc.ABCMeta): legacy_get_format = False """True if and only if the tested indexer uses the legacy format. 
see: https://forge.softwareheritage.org/T1433 """ def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_legacy_results_ok(self, sha1s, expected_results=None): # XXX old format, remove this when all endpoints are # updated to the new one # see: https://forge.softwareheritage.org/T1433 sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: _id = indexed_data['id'] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data['id'] = _id self.assertEqual(indexed_data, expected_data) def assert_results_ok(self, sha1s, expected_results=None): if self.legacy_get_format: self.assert_legacy_results_ok(sha1s, expected_results) return sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual(len(expected_results), len(actual_results), (expected_results, actual_results)) for indexed_data in actual_results: (_id, indexed_data) = list(indexed_data.items())[0] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data = [expected_data] self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update='update-dups') self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [self.id1, '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data['id'] assert isinstance(_id, bytes) indexed_data = indexed_data.copy() indexed_data['id'] = hash_to_hex(indexed_data['id']) self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) self.assertTrue(start <= _id <= end) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents( start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(map(hash_to_bytes, data_indexed))) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok( start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertTrue(actual_results) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( # checks the bytes input this time start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertTrue(actual_results) def test_generate_content_get_no_result(self): """No result indexed returns False""" _start, _end = ['0000000000000000000000000000000000000000', '0000000000000000000000000000000000000001'] start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( start, end, incremental=False) # then self.assertFalse(actual_results)