diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -215,7 +215,15 @@
 @indexer_cli_group.command("journal-client")
 @click.argument(
     "indexer",
-    type=click.Choice(["origin-intrinsic-metadata", "extrinsic-metadata", "*"]),
+    type=click.Choice(
+        [
+            "origin-intrinsic-metadata",
+            "extrinsic-metadata",
+            "content-mimetype",
+            "content-fossology-license",
+            "*",
+        ]
+    ),
     required=False
     # TODO: remove required=False after we stop using it
 )
@@ -321,6 +329,22 @@
         idx.catch_exceptions = False  # don't commit offsets if indexation failed
         worker_fns.append(idx.process_journal_objects)
 
+    if indexer in ("content-mimetype", "*"):
+        from swh.indexer.mimetype import MimetypeIndexer
+
+        object_types.add("content")
+        idx = MimetypeIndexer()
+        idx.catch_exceptions = False  # don't commit offsets if indexation failed
+        worker_fns.append(idx.process_journal_objects)
+
+    if indexer in ("content-fossology-license", "*"):
+        from swh.indexer.fossology_license import FossologyLicenseIndexer
+
+        object_types.add("content")
+        idx = FossologyLicenseIndexer()
+        idx.catch_exceptions = False  # don't commit offsets if indexation failed
+        worker_fns.append(idx.process_journal_objects)
+
     if not worker_fns:
         raise click.ClickException(f"Unknown indexer: {indexer}")
 
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -41,6 +41,9 @@
 
 
 class ObjectsDict(TypedDict, total=False):
+    """Typed objects."""
+
+    content: List[Dict]
     directory: List[Dict]
     origin: List[Dict]
     origin_visit_status: List[Dict]
@@ -282,12 +285,23 @@
         """
         return {}
 
+    def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+        """Read swh message objects (content, origin, ...) from the journal to:
+
+        - retrieve the associated objects from the storage backend (e.g. storage,
+          objstorage...)
+        - execute the associated indexing computations
+        - store the results in the indexer storage
+
+        """
+        raise NotImplementedError()
+
 
 class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]):
-    """A content indexer working on a list of ids directly.
+    """A content indexer working on the journal (method `process_journal_objects`) or on
+    a list of ids directly (method `run`).
 
-    To work on indexer partition, use the :class:`ContentPartitionIndexer`
-    instead.
+    To work on indexer partition, use the :class:`ContentPartitionIndexer` instead.
 
     Note: :class:`ContentIndexer` is not an instantiable object. To
     use it, one should inherit from this class and override the
@@ -295,6 +309,58 @@
 
     """
 
+    def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+        """Read content objects from the journal, retrieve their raw content and compute
+        content indexing (e.g. mimetype, fossology license, ...).
+
+        Note that once this is deployed, this supersedes the main ContentIndexer.run
+        method call and the class ContentPartitionIndexer.
+
+        Args:
+            objects: journal messages grouped by object type; only the "content"
+                entries are read and each of them must provide a "sha1" key
+
+        Returns:
+            Summary dict whose "status" is "uneventful", "eventful" or "failed",
+            updated with what persist_index_computations returned on success
+
+        Raises:
+            Exception: any indexing error, when ``self.catch_exceptions`` is false
+
+        """
+        summary: Dict[str, Any] = {"status": "uneventful"}
+        try:
+            results = []
+            contents = objects.get("content", [])
+            # FIXME: with swh.objstorage > v2.0: self.objstorage.get_batch(contents)
+            content_data = self.objstorage.get_batch(c["sha1"] for c in contents)
+            for item, raw_content in zip(contents, content_data):
+                id_ = item["sha1"]
+                # get_batch is expected to yield None for a missing object; an empty
+                # content (b"") is falsy as well but is valid and must be indexed
+                if raw_content is None:
+                    self.log.warning(
+                        "Content %s not found in objstorage", hashutil.hash_to_hex(id_)
+                    )
+                    continue
+
+                results.extend(self.index(id_, data=raw_content))
+        except Exception:
+            if not self.catch_exceptions:
+                raise
+            # do not fail silently: keep the traceback in the logs
+            self.log.exception("Problem when indexing contents from the journal")
+            summary["status"] = "failed"
+            return summary
+
+        summary_persist = self.persist_index_computations(results)
+        self.results = results
+        if summary_persist:
+            if any(value > 0 for value in summary_persist.values()):
+                summary["status"] = "eventful"
+            summary.update(summary_persist)
+        return summary
+
     def run(self, ids: List[Sha1], **kwargs) -> Dict:
         """Given a list of ids:
 
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -72,11 +72,22 @@
 
 @pytest.fixture
 def swh_indexer_config(
-    swh_storage_backend_config, idx_storage_backend_config, swh_scheduler_config
+    swh_storage_backend_config,
+    idx_storage_backend_config,
+    swh_scheduler_config,
+    tmp_path,
 ):
+    from os import makedirs
+
+    objstore_rootdir = f"{tmp_path}/objstorage/objects"
+    makedirs(objstore_rootdir)
     return {
         "storage": swh_storage_backend_config,
-        "objstorage": {"cls": "memory"},
+        "objstorage": {
+            "cls": "pathslicing",
+            "root": objstore_rootdir,
+            "slicing": "0:2/0:5",
+        },
         "indexer_storage": idx_storage_backend_config,
         "scheduler": {"cls": "local", **swh_scheduler_config},
         "tools": {
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -11,13 +11,11 @@
from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.storage.model import ContentMetadataRow
-from swh.model.hashutil import hash_to_bytes
 
 from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
 from ..utils import (
     BASE_TEST_CONFIG,
-    fill_obj_storage,
-    fill_storage,
+    MAPPING_DESCRIPTION_CONTENT_SHA1,
     json_document_strategy,
 )
 
@@ -96,31 +94,29 @@
     assert declared_metadata == result
 
 
-def test_index_content_metadata_npm():
+def test_index_content_metadata_npm(storage, obj_storage):
     """
     testing NPM with package.json
     - one sha1 uses a file that can't be translated to metadata and
       should return None in the translated metadata
     """
     sha1s = [
-        hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
-        hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
-        hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
+        MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"],
+        MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"],
+        MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"],
     ]
+
     # this metadata indexer computes only metadata for package.json
     # in npm context with a hard mapping
     config = BASE_TEST_CONFIG.copy()
     config["tools"] = [TRANSLATOR_TOOL]
     metadata_indexer = ContentMetadataTestIndexer(config=config)
-    fill_obj_storage(metadata_indexer.objstorage)
-    fill_storage(metadata_indexer.storage)
-
-    metadata_indexer.run(sha1s)
+    metadata_indexer.run(sha1s, log_suffix="unknown content")
     results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
 
     expected_results = [
         ContentMetadataRow(
-            id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
+            id=sha1s[0],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
@@ -132,7 +128,7 @@
             },
         ),
         ContentMetadataRow(
-            id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
+            id=sha1s[1],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -17,16 +17,17 @@
 from swh.indexer.cli import indexer_cli_group
 from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.indexer.storage.model import (
+    ContentMimetypeRow,
     DirectoryIntrinsicMetadataRow,
     OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from swh.journal.writer import get_journal_writer
 from swh.model.hashutil import hash_to_bytes
-from swh.model.model import Origin, OriginVisitStatus
+from swh.model.model import Content, Origin, OriginVisitStatus
 
 from .test_metadata import REMD
-from .utils import DIRECTORY2, REVISION
+from .utils import DIRECTORY2, RAW_CONTENTS, REVISION
 
 
 def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
@@ -731,3 +732,85 @@
         )
     ]
     assert sorted(results, key=lambda r: r.id) == expected_results
+
+
+def test_cli_journal_client_index__content_mimetype(
+    cli_runner,
+    swh_config,
+    kafka_prefix: str,
+    kafka_server,
+    consumer: Consumer,
+    idx_storage,
+    obj_storage,
+    storage,
+    mocker,
+    swh_indexer_config,
+):
+    """Test 'swh indexer journal-client content-mimetype' indexes journal contents."""
+    journal_writer = get_journal_writer(
+        "kafka",
+        brokers=[kafka_server],
+        prefix=kafka_prefix,
+        client_id="test producer",
+        value_sanitizer=lambda object_type, value: value,
+        flush_timeout=3,  # fail early if something is going wrong
+    )
+
+    contents = []
+    expected_results = []
+    content_ids = []
+    for content_id, content_d in RAW_CONTENTS.items():
+        raw_content = content_d[0]
+        content = Content.from_data(raw_content)
+
+        assert content_id == content.sha1
+
+        contents.append(content)
+        content_ids.append(content_id)
+
+        if isinstance(content_d[1], tuple):
+            mimetype = content_d[1][1]
+        else:
+            mimetype = content_d[1]
+        encoding = content_d[2]
+        content_mimetype_row = ContentMimetypeRow(
+            id=content.sha1,
+            tool={"id": 1, **swh_indexer_config["tools"]},
+            mimetype=mimetype,
+            encoding=encoding,
+        )
+        expected_results.append(content_mimetype_row)
+
+    assert len(contents) == len(RAW_CONTENTS)
+
+    storage.content_add(contents)
+    journal_writer.write_additions("content", contents)
+
+    result = cli_runner.invoke(
+        indexer_cli_group,
+        [
+            "-C",
+            swh_config,
+            "journal-client",
+            "content-mimetype",
+            "--broker",
+            kafka_server,
+            "--prefix",
+            kafka_prefix,
+            "--group-id",
+            "test-consumer",
+            "--stop-after-objects",
+            str(len(contents)),  # command-line arguments must be strings
+        ],
+        catch_exceptions=False,
+    )
+
+    # Check the output
+    expected_output = "Done.\n"
+    assert result.exit_code == 0, result.output
+    assert result.output == expected_output
+
+    results = idx_storage.content_mimetype_get(content_ids)
+    assert len(results) == len(expected_results)
+    for row in results:  # distinct name: do not shadow the CLI `result` above
+        assert row in expected_results
diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py
--- a/swh/indexer/tests/test_ctags.py
+++ b/swh/indexer/tests/test_ctags.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -15,6 +15,7 @@
 from swh.indexer.tests.utils import (
     BASE_TEST_CONFIG,
     OBJ_STORAGE_DATA,
+    RAW_CONTENT_IDS,
     SHA1_TO_CTAGS,
     CommonContentIndexerTest,
     fill_obj_storage,
@@ -99,16 +100,14 @@
         fill_obj_storage(self.indexer.objstorage)
 
         # Prepare test input
-        self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5"
-
self.id1 = "d4c647f0fc257591cc9ba1722484229780d1c607" - self.id2 = "688a5ef812c53907562fe379d4b3851e69c7cb15" + self.id0, self.id1, self.id2 = RAW_CONTENT_IDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} self.expected_results = [ *[ ContentCtagsRow( - id=hash_to_bytes(self.id0), + id=self.id0, tool=tool, **kwargs, ) @@ -116,7 +115,7 @@ ], *[ ContentCtagsRow( - id=hash_to_bytes(self.id1), + id=self.id1, tool=tool, **kwargs, ) @@ -124,7 +123,7 @@ ], *[ ContentCtagsRow( - id=hash_to_bytes(self.id2), + id=self.id2, tool=tool, **kwargs, ) @@ -137,7 +136,7 @@ def _set_mocks(self): def find_ctags_for_content(raw_content): for (sha1, ctags) in SHA1_TO_CTAGS.items(): - if OBJ_STORAGE_DATA[sha1] == raw_content: + if OBJ_STORAGE_DATA[hash_to_bytes(sha1)] == raw_content: return ctags else: raise ValueError( @@ -155,7 +154,7 @@ id_ = cmd[-1].split("/")[-1] return "\n".join( json.dumps({"language": ctag["lang"], **ctag}) - for ctag in SHA1_TO_CTAGS[id_] + for ctag in SHA1_TO_CTAGS[hash_to_bytes(id_)] ) self._real_check_output = swh.indexer.ctags.subprocess.check_output diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -18,6 +18,7 @@ from swh.indexer.storage.model import ContentLicenseRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, + RAW_CONTENT_IDS, SHA1_TO_LICENSES, CommonContentIndexerPartitionTest, CommonContentIndexerTest, @@ -55,8 +56,8 @@ if isinstance(id, bytes): path = path.decode("utf-8") # path is something like /tmp/tmpXXX/ so we keep only the sha1 part 
- path = path.split("/")[-1] - return {"licenses": SHA1_TO_LICENSES.get(path, [])} + id_ = path.split("/")[-1] + return {"licenses": SHA1_TO_LICENSES.get(hash_to_bytes(id_), [])} CONFIG = { @@ -97,23 +98,18 @@ fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) - self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5" - self.id1 = "688a5ef812c53907562fe379d4b3851e69c7cb15" - self.id2 = "da39a3ee5e6b4b0d3255bfef95601890afd80709" # empty content + self.id0, self.id1, self.id2 = RAW_CONTENT_IDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} + # then self.expected_results = [ *[ - ContentLicenseRow( - id=hash_to_bytes(self.id0), tool=tool, license=license - ) + ContentLicenseRow(id=self.id0, tool=tool, license=license) for license in SHA1_TO_LICENSES[self.id0] ], *[ - ContentLicenseRow( - id=hash_to_bytes(self.id1), tool=tool, license=license - ) + ContentLicenseRow(id=self.id1, tool=tool, license=license) for license in SHA1_TO_LICENSES[self.id1] ], *[], # self.id2 diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -31,6 +31,8 @@ from .utils import ( BASE_TEST_CONFIG, + MAPPING_DESCRIPTION_CONTENT_SHA1, + MAPPING_DESCRIPTION_CONTENT_SHA1GIT, YARN_PARSER_METADATA, fill_obj_storage, fill_storage, @@ -92,10 +94,17 @@ assert tool is not None dir_ = DIRECTORY2 + assert ( + dir_.entries[0].target + == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] + ) + metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( - id=DIRECTORY2.entries[0].target, + id=MAPPING_DESCRIPTION_CONTENT_SHA1[ + "json:yarn-parser-package.json" + ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) @@ -105,9 +114,7 @@ metadata_indexer.run([dir_.id]) results = list( - metadata_indexer.idx_storage.directory_intrinsic_metadata_get( - [DIRECTORY2.id] - ) + 
metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id]) ) expected_results = [ @@ -132,6 +139,10 @@ # Add a parent directory, that is the only directory at the root # of the directory dir_ = DIRECTORY2 + assert ( + dir_.entries[0].target + == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] + ) new_dir = Directory( entries=( @@ -154,7 +165,9 @@ metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( - id=DIRECTORY2.entries[0].target, + id=MAPPING_DESCRIPTION_CONTENT_SHA1[ + "json:yarn-parser-package.json" + ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -16,22 +16,19 @@ from swh.indexer.storage.model import ContentMimetypeRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, + RAW_CONTENT_IDS, + RAW_CONTENTS, CommonContentIndexerPartitionTest, CommonContentIndexerTest, fill_obj_storage, fill_storage, filter_dict, ) -from swh.model.hashutil import hash_to_bytes @pytest.mark.parametrize( "raw_text,mimetype,encoding", - [ - ("du français".encode(), "text/plain", "utf-8"), - (b"def __init__(self):", ("text/x-python", "text/x-script.python"), "us-ascii"), - (b"\xff\xfe\x00\x00\x00\x00\xff\xfe\xff\xff", "application/octet-stream", ""), - ], + RAW_CONTENTS.values(), ) def test_compute_mimetype_encoding(raw_text, mimetype, encoding): """Compute mimetype encoding should return results""" @@ -79,32 +76,25 @@ fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) - self.id0 = 
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5" - self.id1 = "688a5ef812c53907562fe379d4b3851e69c7cb15" - self.id2 = "da39a3ee5e6b4b0d3255bfef95601890afd80709" + self.id0, self.id1, self.id2 = RAW_CONTENT_IDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} - self.expected_results = [ - ContentMimetypeRow( - id=hash_to_bytes(self.id0), - tool=tool, - mimetype="text/plain", - encoding="us-ascii", - ), - ContentMimetypeRow( - id=hash_to_bytes(self.id1), - tool=tool, - mimetype="text/plain", - encoding="us-ascii", - ), - ContentMimetypeRow( - id=hash_to_bytes(self.id2), - tool=tool, - mimetype="application/x-empty", - encoding="binary", - ), - ] + results = [] + for raw_content_id in RAW_CONTENT_IDS: + content_t = RAW_CONTENTS[raw_content_id] + # New magic version can return different results, this deals with such a case + if isinstance(content_t[1], tuple): + mimetype = content_t[1][1] + else: + mimetype = content_t[1] + encoding = content_t[2] + mimetype_row = ContentMimetypeRow( + id=raw_content_id, tool=tool, mimetype=mimetype, encoding=encoding + ) + results.append(mimetype_row) + + self.expected_results = results RANGE_CONFIG = dict(list(CONFIG.items()) + [("write_batch_size", 100)]) diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,14 +6,13 @@ import abc import datetime import functools -from typing import Any, Dict +from typing import Any, Dict, List, Tuple import unittest from hypothesis import strategies from swh.core.api.classes import stream_results from swh.indexer.storage import INDEXER_CFG_KEY -from swh.model import 
hashutil from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Content, @@ -40,7 +39,6 @@ INDEXER_CFG_KEY: {"cls": "memory"}, } - ORIGIN_VISITS = [ {"type": "git", "origin": "https://github.com/SoftwareHeritage/swh-storage"}, {"type": "ftp", "origin": "rsync://ftp.gnu.org/gnu/3dldf"}, @@ -61,20 +59,230 @@ ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS] +OBJ_STORAGE_RAW_CONTENT: Dict[str, bytes] = { + "text:some": b"this is some text", + "text:another": b"another text", + "text:yet": b"yet another text", + "python:code": b""" + import unittest + import logging + from swh.indexer.mimetype import MimetypeIndexer + from swh.indexer.tests.test_utils import MockObjStorage + + class MockStorage(): + def content_mimetype_add(self, mimetypes): + self.state = mimetypes + + def indexer_configuration_add(self, tools): + return [{ + 'id': 10, + }] + """, + "c:struct": b""" + #ifndef __AVL__ + #define __AVL__ + + typedef struct _avl_tree avl_tree; + + typedef struct _data_t { + int content; + } data_t; + """, + "lisp:assertion": b""" + (should 'pygments (recognize 'lisp 'easily)) + + """, + "json:test-metadata-package.json": b""" + { + "name": "test_metadata", + "version": "0.0.1", + "description": "Simple package.json test for indexer", + "repository": { + "type": "git", + "url": "https://github.com/moranegg/metadata_test" + } + } + """, + "json:npm-package.json": b""" + { + "version": "5.0.3", + "name": "npm", + "description": "a package manager for JavaScript", + "keywords": [ + "install", + "modules", + "package manager", + "package.json" + ], + "preferGlobal": true, + "config": { + "publishtest": false + }, + "homepage": "https://docs.npmjs.com/", + "author": "Isaac Z. 
Schlueter (http://blog.izs.me)", + "repository": { + "type": "git", + "url": "https://github.com/npm/npm" + }, + "bugs": { + "url": "https://github.com/npm/npm/issues" + }, + "dependencies": { + "JSONStream": "~1.3.1", + "abbrev": "~1.1.0", + "ansi-regex": "~2.1.1", + "ansicolors": "~0.3.2", + "ansistyles": "~0.1.3" + }, + "devDependencies": { + "tacks": "~1.2.6", + "tap": "~10.3.2" + }, + "license": "Artistic-2.0" + } + + """, + "text:carriage-return": b""" + """, + "text:empty": b"", + # was 626364 / b'bcd' + "text:unimportant": b"unimportant content for bcd", + # was 636465 / b'cde' now yarn-parser package.json + "json:yarn-parser-package.json": b""" + { + "name": "yarn-parser", + "version": "1.0.0", + "description": "Tiny web service for parsing yarn.lock files", + "main": "index.js", + "scripts": { + "start": "node index.js", + "test": "mocha" + }, + "engines": { + "node": "9.8.0" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/librariesio/yarn-parser.git" + }, + "keywords": [ + "yarn", + "parse", + "lock", + "dependencies" + ], + "author": "Andrew Nesbitt", + "license": "AGPL-3.0", + "bugs": { + "url": "https://github.com/librariesio/yarn-parser/issues" + }, + "homepage": "https://github.com/librariesio/yarn-parser#readme", + "dependencies": { + "@yarnpkg/lockfile": "^1.0.0", + "body-parser": "^1.15.2", + "express": "^4.14.0" + }, + "devDependencies": { + "chai": "^4.1.2", + "mocha": "^5.2.0", + "request": "^2.87.0", + "test": "^0.6.0" + } + } + +""", +} + +MAPPING_DESCRIPTION_CONTENT_SHA1GIT: Dict[str, bytes] = {} +MAPPING_DESCRIPTION_CONTENT_SHA1: Dict[str, bytes] = {} +OBJ_STORAGE_DATA: Dict[bytes, bytes] = {} + +for key_description, data in OBJ_STORAGE_RAW_CONTENT.items(): + content = Content.from_data(data) + MAPPING_DESCRIPTION_CONTENT_SHA1GIT[key_description] = content.sha1_git + MAPPING_DESCRIPTION_CONTENT_SHA1[key_description] = content.sha1 + OBJ_STORAGE_DATA[content.sha1] = data + + +RAW_CONTENT_METADATA = [ + ( + "du 
français".encode(), + "text/plain", + "utf-8", + ), + ( + b"def __init__(self):", + ("text/x-python", "text/x-script.python"), + "us-ascii", + ), + ( + b"\xff\xfe\x00\x00\x00\x00\xff\xfe\xff\xff", + "application/octet-stream", + "", + ), +] + +RAW_CONTENTS: Dict[bytes, Tuple] = {} +RAW_CONTENT_IDS: List[bytes] = [] + +for index, raw_content_d in enumerate(RAW_CONTENT_METADATA): + raw_content = raw_content_d[0] + content = Content.from_data(raw_content) + RAW_CONTENTS[content.sha1] = raw_content_d + RAW_CONTENT_IDS.append(content.sha1) + # and write it to objstorage data so it's flushed in the objstorage + OBJ_STORAGE_DATA[content.sha1] = raw_content + + +SHA1_TO_LICENSES: Dict[bytes, List[str]] = { + RAW_CONTENT_IDS[0]: ["GPL"], + RAW_CONTENT_IDS[1]: ["AGPL"], + RAW_CONTENT_IDS[2]: [], +} + + +SHA1_TO_CTAGS: Dict[bytes, List[Dict[str, Any]]] = { + RAW_CONTENT_IDS[0]: [ + { + "name": "foo", + "kind": "str", + "line": 10, + "lang": "bar", + } + ], + RAW_CONTENT_IDS[1]: [ + { + "name": "symbol", + "kind": "float", + "line": 99, + "lang": "python", + } + ], + RAW_CONTENT_IDS[2]: [ + { + "name": "let", + "kind": "int", + "line": 100, + "lang": "haskell", + } + ], +} + DIRECTORY = Directory( - id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), entries=( DirectoryEntry( name=b"index.js", type="file", - target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"), + target=MAPPING_DESCRIPTION_CONTENT_SHA1GIT["text:some"], perms=0o100644, ), DirectoryEntry( name=b"package.json", type="file", - target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + target=MAPPING_DESCRIPTION_CONTENT_SHA1GIT[ + "json:test-metadata-package.json" + ], perms=0o100644, ), DirectoryEntry( @@ -87,12 +295,11 @@ ) DIRECTORY2 = Directory( - id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6", entries=( DirectoryEntry( name=b"package.json", type="file", - target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"), + 
target=MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"], perms=0o100644, ), ), @@ -101,7 +308,6 @@ _utc_plus_2 = datetime.timezone(datetime.timedelta(minutes=120)) REVISION = Revision( - id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"), message=b"Improve search functionality", author=Person( name=b"Andrew Nesbitt", @@ -148,7 +354,6 @@ SNAPSHOTS = [ # https://github.com/SoftwareHeritage/swh-storage Snapshot( - id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"), branches={ b"refs/heads/add-revision-origin-cache": SnapshotBranch( target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e', @@ -169,7 +374,6 @@ ), # rsync://ftp.gnu.org/gnu/3dldf Snapshot( - id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"), branches={ b"3DLDF-1.1.4.tar.gz": SnapshotBranch( target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11', @@ -195,7 +399,6 @@ ), # https://forge.softwareheritage.org/source/jesuisgpl/", Snapshot( - id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"), branches={ b"master": SnapshotBranch( target=b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb", # noqa @@ -205,7 +408,6 @@ ), # https://old-pypi.example.org/project/limnoria/ Snapshot( - id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"), branches={ b"HEAD": SnapshotBranch( target=b"releases/2018.09.09", target_type=TargetType.ALIAS @@ -238,7 +440,6 @@ ), # http://0-512-md.googlecode.com/svn/ Snapshot( - id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"), branches={ b"master": SnapshotBranch( target=b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18", @@ -248,7 +449,6 @@ ), # https://github.com/librariesio/yarn-parser Snapshot( - id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), branches={ b"HEAD": SnapshotBranch( target=REVISION.id, @@ -258,7 +458,6 @@ ), # https://github.com/librariesio/yarn-parser.git Snapshot( - id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), 
branches={ b"HEAD": SnapshotBranch( target=REVISION.id, @@ -280,178 +479,6 @@ assert len(SNAPSHOTS) == len(ORIGIN_VISITS) -SHA1_TO_LICENSES = { - "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"], - "02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"], - "103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"], - "688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"], - "da39a3ee5e6b4b0d3255bfef95601890afd80709": [], -} - - -SHA1_TO_CTAGS = { - "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [ - { - "name": "foo", - "kind": "str", - "line": 10, - "lang": "bar", - } - ], - "d4c647f0fc257591cc9ba1722484229780d1c607": [ - { - "name": "let", - "kind": "int", - "line": 100, - "lang": "haskell", - } - ], - "688a5ef812c53907562fe379d4b3851e69c7cb15": [ - { - "name": "symbol", - "kind": "float", - "line": 99, - "lang": "python", - } - ], -} - - -OBJ_STORAGE_DATA = { - "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text", - "688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text", - "8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text", - "02fb2c89e14f7fab46701478c83779c7beb7b069": b""" - import unittest - import logging - from swh.indexer.mimetype import MimetypeIndexer - from swh.indexer.tests.test_utils import MockObjStorage - - class MockStorage(): - def content_mimetype_add(self, mimetypes): - self.state = mimetypes - - def indexer_configuration_add(self, tools): - return [{ - 'id': 10, - }] - """, - "103bc087db1d26afc3a0283f38663d081e9b01e6": b""" - #ifndef __AVL__ - #define __AVL__ - - typedef struct _avl_tree avl_tree; - - typedef struct _data_t { - int content; - } data_t; - """, - "93666f74f1cf635c8c8ac118879da6ec5623c410": b""" - (should 'pygments (recognize 'lisp 'easily)) - - """, - "26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b""" - { - "name": "test_metadata", - "version": "0.0.1", - "description": "Simple package.json test for indexer", - "repository": { - "type": "git", - "url": "https://github.com/moranegg/metadata_test" - } - } - """, 
- "d4c647f0fc257591cc9ba1722484229780d1c607": b""" - { - "version": "5.0.3", - "name": "npm", - "description": "a package manager for JavaScript", - "keywords": [ - "install", - "modules", - "package manager", - "package.json" - ], - "preferGlobal": true, - "config": { - "publishtest": false - }, - "homepage": "https://docs.npmjs.com/", - "author": "Isaac Z. Schlueter (http://blog.izs.me)", - "repository": { - "type": "git", - "url": "https://github.com/npm/npm" - }, - "bugs": { - "url": "https://github.com/npm/npm/issues" - }, - "dependencies": { - "JSONStream": "~1.3.1", - "abbrev": "~1.1.0", - "ansi-regex": "~2.1.1", - "ansicolors": "~0.3.2", - "ansistyles": "~0.1.3" - }, - "devDependencies": { - "tacks": "~1.2.6", - "tap": "~10.3.2" - }, - "license": "Artistic-2.0" - } - - """, - "a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b""" - """, - "da39a3ee5e6b4b0d3255bfef95601890afd80709": b"", - # was 626364 / b'bcd' - "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd", - # was 636465 / b'cde' now yarn-parser package.json - "f5305243b3ce7ef8dc864ebc73794da304025beb": b""" - { - "name": "yarn-parser", - "version": "1.0.0", - "description": "Tiny web service for parsing yarn.lock files", - "main": "index.js", - "scripts": { - "start": "node index.js", - "test": "mocha" - }, - "engines": { - "node": "9.8.0" - }, - "repository": { - "type": "git", - "url": "git+https://github.com/librariesio/yarn-parser.git" - }, - "keywords": [ - "yarn", - "parse", - "lock", - "dependencies" - ], - "author": "Andrew Nesbitt", - "license": "AGPL-3.0", - "bugs": { - "url": "https://github.com/librariesio/yarn-parser/issues" - }, - "homepage": "https://github.com/librariesio/yarn-parser#readme", - "dependencies": { - "@yarnpkg/lockfile": "^1.0.0", - "body-parser": "^1.15.2", - "express": "^4.14.0" - }, - "devDependencies": { - "chai": "^4.1.2", - "mocha": "^5.2.0", - "request": "^2.87.0", - "test": "^0.6.0" - } - } - -""", -} - - YARN_PARSER_METADATA = { "@context": 
"https://doi.org/10.5063/schema/codemeta-2.0", "url": "https://github.com/librariesio/yarn-parser#readme", @@ -613,17 +640,19 @@ def fill_obj_storage(obj_storage): """Add some content in an object storage.""" - for (obj_id, content) in OBJ_STORAGE_DATA.items(): - obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) + for obj_id, content in OBJ_STORAGE_DATA.items(): + obj_storage.add(content, obj_id) def fill_storage(storage): - storage.origin_add(ORIGINS) + """Fill in storage with consistent test dataset.""" + storage.content_add([Content.from_data(data) for data in OBJ_STORAGE_DATA.values()]) storage.directory_add([DIRECTORY, DIRECTORY2]) storage.revision_add(REVISIONS) storage.release_add(RELEASES) storage.snapshot_add(SNAPSHOTS) + storage.origin_add(ORIGINS) for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS): assert snapshot.id is not None @@ -639,22 +668,6 @@ ) storage.origin_visit_status_add([visit_status]) - contents = [] - for (obj_id, content) in OBJ_STORAGE_DATA.items(): - content_hashes = hashutil.MultiHash.from_data(content).digest() - contents.append( - Content( - data=content, - length=len(content), - status="visible", - sha1=hash_to_bytes(obj_id), - sha1_git=hash_to_bytes(obj_id), - sha256=content_hashes["sha256"], - blake2s256=content_hashes["blake2s256"], - ) - ) - storage.content_add(contents) - class CommonContentIndexerTest(metaclass=abc.ABCMeta): def get_indexer_results(self, ids): @@ -662,9 +675,7 @@ return self.indexer.idx_storage.state def assert_results_ok(self, sha1s, expected_results=None): - sha1s = [ - sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s - ] + sha1s = [hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: @@ -698,11 +709,7 @@ self.indexer.run(sha1s) # then - expected_results = [ - res - for res in self.expected_results - if hashutil.hash_to_hex(res.id) in sha1s - ] + expected_results = [res for res in self.expected_results if 
res.id in sha1s] self.assert_results_ok(sha1s, expected_results) @@ -711,6 +718,7 @@ """Allows to factorize tests on range indexer.""" def setUp(self): + # still useful? self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, partition_id, nb_partitions, actual_results):