diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -215,7 +215,15 @@ @indexer_cli_group.command("journal-client") @click.argument( "indexer", - type=click.Choice(["origin-intrinsic-metadata", "extrinsic-metadata", "*"]), + type=click.Choice( + [ + "origin-intrinsic-metadata", + "extrinsic-metadata", + "content-mimetype", + "content-fossology-license", + "*", + ] + ), required=False # TODO: remove required=False after we stop using it ) diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -72,11 +72,22 @@ @pytest.fixture def swh_indexer_config( - swh_storage_backend_config, idx_storage_backend_config, swh_scheduler_config + swh_storage_backend_config, + idx_storage_backend_config, + swh_scheduler_config, + tmp_path, ): + from os import makedirs + + objstore_rootdir = f"{tmp_path}/objstorage/objects" + makedirs(objstore_rootdir) return { "storage": swh_storage_backend_config, - "objstorage": {"cls": "memory"}, + "objstorage": { + "cls": "pathslicing", + "root": objstore_rootdir, + "slicing": "0:2/0:5", + }, "indexer_storage": idx_storage_backend_config, "scheduler": {"cls": "local", **swh_scheduler_config}, "tools": { diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -17,16 +17,17 @@ from swh.indexer.cli import indexer_cli_group from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( + ContentMimetypeRow, DirectoryIntrinsicMetadataRow, 
OriginExtrinsicMetadataRow, OriginIntrinsicMetadataRow, ) from swh.journal.writer import get_journal_writer from swh.model.hashutil import hash_to_bytes -from swh.model.model import Origin, OriginVisitStatus +from swh.model.model import Content, Origin, OriginVisitStatus from .test_metadata import REMD -from .utils import DIRECTORY2, REVISION +from .utils import DIRECTORY2, RAW_CONTENTS, REVISION, fill_obj_storage def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]: @@ -731,3 +732,87 @@ ) ] assert sorted(results, key=lambda r: r.id) == expected_results + + +def test_cli_journal_client_index__content_mimetype( + cli_runner, + swh_config, + kafka_prefix: str, + kafka_server, + consumer: Consumer, + idx_storage, + obj_storage, + storage, + mocker, + swh_indexer_config, +): + """Test the 'swh indexer journal-client' cli tool.""" + journal_writer = get_journal_writer( + "kafka", + brokers=[kafka_server], + prefix=kafka_prefix, + client_id="test producer", + value_sanitizer=lambda object_type, value: value, + flush_timeout=3, # fail early if something is going wrong + ) + + fill_obj_storage(obj_storage) + + contents = [] + expected_results = [] + content_ids = [] + for content_id, content_d in RAW_CONTENTS.items(): + raw_content = content_d[0] + content = Content.from_data(raw_content) + + assert content_id == content.sha1 + + contents.append(content) + content_ids.append(content_id) + + if isinstance(content_d[1], tuple): + mimetype = content_d[1][1] + else: + mimetype = content_d[1] + encoding = content_d[2] + content_mimetype_row = ContentMimetypeRow( + id=content.sha1, + tool={"id": 1, **swh_indexer_config["tools"]}, + mimetype=mimetype, + encoding=encoding, + ) + expected_results.append(content_mimetype_row) + + assert len(contents) == len(RAW_CONTENTS) + + storage.content_add(contents) + journal_writer.write_additions("content", contents) + + result = cli_runner.invoke( + indexer_cli_group, + [ + "-C", + swh_config, + 
"journal-client", + "content-mimetype", + "--broker", + kafka_server, + "--prefix", + kafka_prefix, + "--group-id", + "test-consumer", + "--stop-after-objects", + len(contents), + ], + catch_exceptions=False, + ) + + # Check the output + expected_output = "Done.\n" + assert result.exit_code == 0, result.output + assert result.output == expected_output + + results = idx_storage.content_mimetype_get(content_ids) + assert len(results) == len(expected_results) + for result in results: + assert result in expected_results diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py --- a/swh/indexer/tests/test_ctags.py +++ b/swh/indexer/tests/test_ctags.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,6 +15,7 @@ from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, OBJ_STORAGE_DATA, + RAW_CONTENT_IDS, SHA1_TO_CTAGS, CommonContentIndexerTest, fill_obj_storage, @@ -99,16 +100,14 @@ fill_obj_storage(self.indexer.objstorage) # Prepare test input - self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5" - self.id1 = "d4c647f0fc257591cc9ba1722484229780d1c607" - self.id2 = "688a5ef812c53907562fe379d4b3851e69c7cb15" + self.id0, self.id1, self.id2 = RAW_CONTENT_IDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} self.expected_results = [ *[ ContentCtagsRow( - id=hash_to_bytes(self.id0), + id=self.id0, tool=tool, **kwargs, ) @@ -116,7 +115,7 @@ ], *[ ContentCtagsRow( - id=hash_to_bytes(self.id1), + id=self.id1, tool=tool, **kwargs, ) @@ -124,7 +123,7 @@ ], *[ ContentCtagsRow( - id=hash_to_bytes(self.id2), + id=self.id2, tool=tool, **kwargs, ) @@ -137,7 +136,7 @@ def _set_mocks(self): def find_ctags_for_content(raw_content): for (sha1, ctags) in 
SHA1_TO_CTAGS.items(): - if OBJ_STORAGE_DATA[sha1] == raw_content: + if OBJ_STORAGE_DATA[hash_to_bytes(sha1)] == raw_content: return ctags else: raise ValueError( @@ -155,7 +154,7 @@ id_ = cmd[-1].split("/")[-1] return "\n".join( json.dumps({"language": ctag["lang"], **ctag}) - for ctag in SHA1_TO_CTAGS[id_] + for ctag in SHA1_TO_CTAGS[hash_to_bytes(id_)] ) self._real_check_output = swh.indexer.ctags.subprocess.check_output diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -18,6 +18,7 @@ from swh.indexer.storage.model import ContentLicenseRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, + RAW_CONTENT_IDS, SHA1_TO_LICENSES, CommonContentIndexerPartitionTest, CommonContentIndexerTest, @@ -55,8 +56,8 @@ if isinstance(id, bytes): path = path.decode("utf-8") # path is something like /tmp/tmpXXX/ so we keep only the sha1 part - path = path.split("/")[-1] - return {"licenses": SHA1_TO_LICENSES.get(path, [])} + id_ = path.split("/")[-1] + return {"licenses": SHA1_TO_LICENSES.get(hash_to_bytes(id_), [])} CONFIG = { @@ -97,23 +98,18 @@ fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) - self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5" - self.id1 = "688a5ef812c53907562fe379d4b3851e69c7cb15" - self.id2 = "da39a3ee5e6b4b0d3255bfef95601890afd80709" # empty content + self.id0, self.id1, self.id2 = RAW_CONTENT_IDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} + # then self.expected_results = [ *[ - ContentLicenseRow( - 
id=hash_to_bytes(self.id0), tool=tool, license=license - ) + ContentLicenseRow(id=self.id0, tool=tool, license=license) for license in SHA1_TO_LICENSES[self.id0] ], *[ - ContentLicenseRow( - id=hash_to_bytes(self.id1), tool=tool, license=license - ) + ContentLicenseRow(id=self.id1, tool=tool, license=license) for license in SHA1_TO_LICENSES[self.id1] ], *[], # self.id2 diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -95,7 +95,7 @@ metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( - id=DIRECTORY2.entries[0].target, + id=dir_.entries[0].target, indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) @@ -105,9 +105,7 @@ metadata_indexer.run([dir_.id]) results = list( - metadata_indexer.idx_storage.directory_intrinsic_metadata_get( - [DIRECTORY2.id] - ) + metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id]) ) expected_results = [ diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -16,22 +16,19 @@ from swh.indexer.storage.model import ContentMimetypeRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, + RAW_CONTENT_IDS, + RAW_CONTENTS, CommonContentIndexerPartitionTest, CommonContentIndexerTest, fill_obj_storage, fill_storage, filter_dict, ) -from swh.model.hashutil import hash_to_bytes @pytest.mark.parametrize( "raw_text,mimetype,encoding", - [ - ("du français".encode(), "text/plain", "utf-8"), - (b"def __init__(self):", ("text/x-python", 
"text/x-script.python"), "us-ascii"), - (b"\xff\xfe\x00\x00\x00\x00\xff\xfe\xff\xff", "application/octet-stream", ""), - ], + RAW_CONTENTS.values(), ) def test_compute_mimetype_encoding(raw_text, mimetype, encoding): """Compute mimetype encoding should return results""" @@ -79,32 +76,25 @@ fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) - self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5" - self.id1 = "688a5ef812c53907562fe379d4b3851e69c7cb15" - self.id2 = "da39a3ee5e6b4b0d3255bfef95601890afd80709" + self.id0, self.id1, self.id2 = RAW_CONTENT_IDS tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} - self.expected_results = [ - ContentMimetypeRow( - id=hash_to_bytes(self.id0), - tool=tool, - mimetype="text/plain", - encoding="us-ascii", - ), - ContentMimetypeRow( - id=hash_to_bytes(self.id1), - tool=tool, - mimetype="text/plain", - encoding="us-ascii", - ), - ContentMimetypeRow( - id=hash_to_bytes(self.id2), - tool=tool, - mimetype="application/x-empty", - encoding="binary", - ), - ] + results = [] + for raw_content_id in RAW_CONTENT_IDS: + content_t = RAW_CONTENTS[raw_content_id] + # New magic version can return different results, this deals with such a case + if isinstance(content_t[1], tuple): + mimetype = content_t[1][1] + else: + mimetype = content_t[1] + encoding = content_t[2] + mimetype_row = ContentMimetypeRow( + id=raw_content_id, tool=tool, mimetype=mimetype, encoding=encoding + ) + results.append(mimetype_row) + + self.expected_results = results RANGE_CONFIG = dict(list(CONFIG.items()) + [("write_batch_size", 100)]) diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, 
or any later version # See top-level LICENSE file for more information @@ -6,14 +6,13 @@ import abc import datetime import functools -from typing import Any, Dict +from typing import Any, Dict, List, Tuple import unittest from hypothesis import strategies from swh.core.api.classes import stream_results from swh.indexer.storage import INDEXER_CFG_KEY -from swh.model import hashutil from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Content, @@ -40,7 +39,6 @@ INDEXER_CFG_KEY: {"cls": "memory"}, } - ORIGIN_VISITS = [ {"type": "git", "origin": "https://github.com/SoftwareHeritage/swh-storage"}, {"type": "ftp", "origin": "rsync://ftp.gnu.org/gnu/3dldf"}, @@ -61,20 +59,229 @@ ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS] +OBJ_STORAGE_RAW_CONTENT: Dict[str, bytes] = { + "text:some": b"this is some text", + "text:another": b"another text", + "text:yet": b"yet another text", + "python:code": b""" + import unittest + import logging + from swh.indexer.mimetype import MimetypeIndexer + from swh.indexer.tests.test_utils import MockObjStorage + + class MockStorage(): + def content_mimetype_add(self, mimetypes): + self.state = mimetypes + + def indexer_configuration_add(self, tools): + return [{ + 'id': 10, + }] + """, + "c:struct": b""" + #ifndef __AVL__ + #define __AVL__ + + typedef struct _avl_tree avl_tree; + + typedef struct _data_t { + int content; + } data_t; + """, + "lisp:assertion": b""" + (should 'pygments (recognize 'lisp 'easily)) + + """, + "json:test-metadata-package.json": b""" + { + "name": "test_metadata", + "version": "0.0.1", + "description": "Simple package.json test for indexer", + "repository": { + "type": "git", + "url": "https://github.com/moranegg/metadata_test" + } + } + """, + "json:npm-package.json": b""" + { + "version": "5.0.3", + "name": "npm", + "description": "a package manager for JavaScript", + "keywords": [ + "install", + "modules", + "package manager", + "package.json" + ], + "preferGlobal": true, 
+ "config": { + "publishtest": false + }, + "homepage": "https://docs.npmjs.com/", + "author": "Isaac Z. Schlueter (http://blog.izs.me)", + "repository": { + "type": "git", + "url": "https://github.com/npm/npm" + }, + "bugs": { + "url": "https://github.com/npm/npm/issues" + }, + "dependencies": { + "JSONStream": "~1.3.1", + "abbrev": "~1.1.0", + "ansi-regex": "~2.1.1", + "ansicolors": "~0.3.2", + "ansistyles": "~0.1.3" + }, + "devDependencies": { + "tacks": "~1.2.6", + "tap": "~10.3.2" + }, + "license": "Artistic-2.0" + } + + """, + "text:carriage-return": b""" + """, + "text:empty": b"", + # was 626364 / b'bcd' + "text:unimportant": b"unimportant content for bcd", + # was 636465 / b'cde' now yarn-parser package.json + "json:yarn-parser-package.json": b""" + { + "name": "yarn-parser", + "version": "1.0.0", + "description": "Tiny web service for parsing yarn.lock files", + "main": "index.js", + "scripts": { + "start": "node index.js", + "test": "mocha" + }, + "engines": { + "node": "9.8.0" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/librariesio/yarn-parser.git" + }, + "keywords": [ + "yarn", + "parse", + "lock", + "dependencies" + ], + "author": "Andrew Nesbitt", + "license": "AGPL-3.0", + "bugs": { + "url": "https://github.com/librariesio/yarn-parser/issues" + }, + "homepage": "https://github.com/librariesio/yarn-parser#readme", + "dependencies": { + "@yarnpkg/lockfile": "^1.0.0", + "body-parser": "^1.15.2", + "express": "^4.14.0" + }, + "devDependencies": { + "chai": "^4.1.2", + "mocha": "^5.2.0", + "request": "^2.87.0", + "test": "^0.6.0" + } + } + +""", +} + +MAPPING_CONTENT_ID: Dict[str, bytes] = {} +OBJ_STORAGE_DATA: Dict[bytes, bytes] = {} + +for key_description, data in OBJ_STORAGE_RAW_CONTENT.items(): + content = Content.from_data(data) + MAPPING_CONTENT_ID[key_description] = content.sha1 + OBJ_STORAGE_DATA[content.sha1] = data + + +RAW_CONTENT_METADATA = [ + ( + "du français".encode(), + "text/plain", + "utf-8", + ), + ( + 
b"def __init__(self):", + ("text/x-python", "text/x-script.python"), + "us-ascii", + ), + ( + b"\xff\xfe\x00\x00\x00\x00\xff\xfe\xff\xff", + "application/octet-stream", + "", + ), +] + +RAW_CONTENTS: Dict[bytes, Tuple] = {} +RAW_CONTENT_IDS: List[bytes] = [] + +for index, raw_content_d in enumerate(RAW_CONTENT_METADATA): + raw_content = raw_content_d[0] + content = Content.from_data(raw_content) + RAW_CONTENTS[content.sha1] = raw_content_d + RAW_CONTENT_IDS.append(content.sha1) + # and write it to objstorage data so it's flushed in the objstorage + MAPPING_CONTENT_ID[f"text-key-{index}"] = content.sha1 + OBJ_STORAGE_DATA[content.sha1] = raw_content + + +SHA1_TO_LICENSES: Dict[bytes, List[str]] = { + RAW_CONTENT_IDS[0]: ["GPL"], + MAPPING_CONTENT_ID["python:code"]: ["Apache2.0"], + MAPPING_CONTENT_ID["c:struct"]: ["MIT"], + RAW_CONTENT_IDS[1]: ["AGPL"], + RAW_CONTENT_IDS[2]: [], +} + + +SHA1_TO_CTAGS: Dict[bytes, List[Dict[str, Any]]] = { + RAW_CONTENT_IDS[0]: [ + { + "name": "foo", + "kind": "str", + "line": 10, + "lang": "bar", + } + ], + RAW_CONTENT_IDS[1]: [ + { + "name": "symbol", + "kind": "float", + "line": 99, + "lang": "python", + } + ], + RAW_CONTENT_IDS[2]: [ + { + "name": "let", + "kind": "int", + "line": 100, + "lang": "haskell", + } + ], +} + DIRECTORY = Directory( - id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), entries=( DirectoryEntry( name=b"index.js", type="file", - target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"), + target=RAW_CONTENT_IDS[0], perms=0o100644, ), DirectoryEntry( name=b"package.json", type="file", - target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + target=MAPPING_CONTENT_ID["json:test-metadata-package.json"], perms=0o100644, ), DirectoryEntry( @@ -87,12 +294,11 @@ ) DIRECTORY2 = Directory( - id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6", entries=( DirectoryEntry( name=b"package.json", type="file", - 
target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"), + 
target=MAPPING_CONTENT_ID["json:yarn-parser-package.json"], perms=0o100644, ), ), @@ -280,178 +486,6 @@ assert len(SNAPSHOTS) == len(ORIGIN_VISITS) -SHA1_TO_LICENSES = { - "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"], - "02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"], - "103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"], - "688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"], - "da39a3ee5e6b4b0d3255bfef95601890afd80709": [], -} - - -SHA1_TO_CTAGS = { - "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [ - { - "name": "foo", - "kind": "str", - "line": 10, - "lang": "bar", - } - ], - "d4c647f0fc257591cc9ba1722484229780d1c607": [ - { - "name": "let", - "kind": "int", - "line": 100, - "lang": "haskell", - } - ], - "688a5ef812c53907562fe379d4b3851e69c7cb15": [ - { - "name": "symbol", - "kind": "float", - "line": 99, - "lang": "python", - } - ], -} - - -OBJ_STORAGE_DATA = { - "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text", - "688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text", - "8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text", - "02fb2c89e14f7fab46701478c83779c7beb7b069": b""" - import unittest - import logging - from swh.indexer.mimetype import MimetypeIndexer - from swh.indexer.tests.test_utils import MockObjStorage - - class MockStorage(): - def content_mimetype_add(self, mimetypes): - self.state = mimetypes - - def indexer_configuration_add(self, tools): - return [{ - 'id': 10, - }] - """, - "103bc087db1d26afc3a0283f38663d081e9b01e6": b""" - #ifndef __AVL__ - #define __AVL__ - - typedef struct _avl_tree avl_tree; - - typedef struct _data_t { - int content; - } data_t; - """, - "93666f74f1cf635c8c8ac118879da6ec5623c410": b""" - (should 'pygments (recognize 'lisp 'easily)) - - """, - "26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b""" - { - "name": "test_metadata", - "version": "0.0.1", - "description": "Simple package.json test for indexer", - "repository": { - "type": "git", - "url": 
"https://github.com/moranegg/metadata_test" - } - } - """, - "d4c647f0fc257591cc9ba1722484229780d1c607": b""" - { - "version": "5.0.3", - "name": "npm", - "description": "a package manager for JavaScript", - "keywords": [ - "install", - "modules", - "package manager", - "package.json" - ], - "preferGlobal": true, - "config": { - "publishtest": false - }, - "homepage": "https://docs.npmjs.com/", - "author": "Isaac Z. Schlueter (http://blog.izs.me)", - "repository": { - "type": "git", - "url": "https://github.com/npm/npm" - }, - "bugs": { - "url": "https://github.com/npm/npm/issues" - }, - "dependencies": { - "JSONStream": "~1.3.1", - "abbrev": "~1.1.0", - "ansi-regex": "~2.1.1", - "ansicolors": "~0.3.2", - "ansistyles": "~0.1.3" - }, - "devDependencies": { - "tacks": "~1.2.6", - "tap": "~10.3.2" - }, - "license": "Artistic-2.0" - } - - """, - "a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b""" - """, - "da39a3ee5e6b4b0d3255bfef95601890afd80709": b"", - # was 626364 / b'bcd' - "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd", - # was 636465 / b'cde' now yarn-parser package.json - "f5305243b3ce7ef8dc864ebc73794da304025beb": b""" - { - "name": "yarn-parser", - "version": "1.0.0", - "description": "Tiny web service for parsing yarn.lock files", - "main": "index.js", - "scripts": { - "start": "node index.js", - "test": "mocha" - }, - "engines": { - "node": "9.8.0" - }, - "repository": { - "type": "git", - "url": "git+https://github.com/librariesio/yarn-parser.git" - }, - "keywords": [ - "yarn", - "parse", - "lock", - "dependencies" - ], - "author": "Andrew Nesbitt", - "license": "AGPL-3.0", - "bugs": { - "url": "https://github.com/librariesio/yarn-parser/issues" - }, - "homepage": "https://github.com/librariesio/yarn-parser#readme", - "dependencies": { - "@yarnpkg/lockfile": "^1.0.0", - "body-parser": "^1.15.2", - "express": "^4.14.0" - }, - "devDependencies": { - "chai": "^4.1.2", - "mocha": "^5.2.0", - "request": "^2.87.0", - "test": "^0.6.0" - } 
- } - -""", -} - - YARN_PARSER_METADATA = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "url": "https://github.com/librariesio/yarn-parser#readme", @@ -613,8 +647,8 @@ def fill_obj_storage(obj_storage): """Add some content in an object storage.""" - for (obj_id, content) in OBJ_STORAGE_DATA.items(): - obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) + for obj_id, content in OBJ_STORAGE_DATA.items(): + obj_storage.add(content, obj_id) def fill_storage(storage): @@ -639,21 +673,7 @@ ) storage.origin_visit_status_add([visit_status]) - contents = [] - for (obj_id, content) in OBJ_STORAGE_DATA.items(): - content_hashes = hashutil.MultiHash.from_data(content).digest() - contents.append( - Content( - data=content, - length=len(content), - status="visible", - sha1=hash_to_bytes(obj_id), - sha1_git=hash_to_bytes(obj_id), - sha256=content_hashes["sha256"], - blake2s256=content_hashes["blake2s256"], - ) - ) - storage.content_add(contents) + storage.content_add([Content.from_data(data) for data in OBJ_STORAGE_DATA.values()]) class CommonContentIndexerTest(metaclass=abc.ABCMeta): @@ -662,9 +682,7 @@ return self.indexer.idx_storage.state def assert_results_ok(self, sha1s, expected_results=None): - sha1s = [ - sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s - ] + sha1s = [hash_to_bytes(sha1) for sha1 in sha1s] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: @@ -698,11 +716,7 @@ self.indexer.run(sha1s) # then - expected_results = [ - res - for res in self.expected_results - if hashutil.hash_to_hex(res.id) in sha1s - ] + expected_results = [res for res in self.expected_results if res.id in sha1s] self.assert_results_ok(sha1s, expected_results) @@ -711,6 +725,7 @@ """Allows to factorize tests on range indexer.""" def setUp(self): + # still useful? self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, partition_id, nb_partitions, actual_results):