diff --git a/swh/clearlydefined/mapping_utils.py b/swh/clearlydefined/mapping_utils.py --- a/swh/clearlydefined/mapping_utils.py +++ b/swh/clearlydefined/mapping_utils.py @@ -3,21 +3,81 @@ # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -import json -from typing import Any, Dict, Optional, Tuple, List, Union +from datetime import datetime +from enum import Enum import gzip +import json +from typing import Any, Dict, List, Optional, Tuple -from swh.model.hashutil import hash_to_bytes -from swh.model.hashutil import hash_to_hex -from swh.model.model import MetadataTargetType, Origin +import attr from swh.clearlydefined.error import ( InvalidComponents, - WrongMetadata, - ToolNotFound, NoJsonExtension, RevisionNotFound, + ToolNotFound, ToolNotSupported, + WrongMetadata, +) +from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + + +class ToolType(Enum): + """The type of tool that produced a row to be mapped; DEFINITION marks a 
definition row rather than a + tool harvest.""" + + DEFINITION = "definition" + SCANCODE = "scancode" + CLEARLYDEFINED = "clearlydefined" + LICENSEE = "licensee" + FOSSOLOGY = "fossology" + + +def map_row_data_with_metadata( + swh_id: str, + type: MetadataTargetType, + origin: Optional[Origin], + metadata: Dict, + date: datetime, + format: str, +) -> RawExtrinsicMetadata: + """ + Build and return a RawExtrinsicMetadata object + from the given row data, ready to be written + into swh storage + """ + return RawExtrinsicMetadata( + type=type, + target=parse_swhid(swh_id), + discovery_date=date, + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format=format, + origin=origin.url if origin else None, + metadata=json.dumps(metadata).encode("utf-8"), + ) + + +authority = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://clearlydefined.io/", + metadata={}, +) + + +fetcher = MetadataFetcher( + name="swh-clearlydefined", + version="0.0.1", + metadata={}, ) @@ -50,21 +110,42 @@ def map_sha1_and_add_in_data( - storage, sha1: Optional[str], data: list, mapping_status=True + storage, + sha1: Optional[str], + data: List[RawExtrinsicMetadata], + file: Dict, + date: datetime, + format: str, + mapping_status=True, ) -> bool: + """ + Take sha1, data, file, date, mapping_status as input + and return whether the sha1 exists in content; if it exists, + map the sha1 with a swhid and append a RawExtrinsicMetadata + object built from the row data to the data list + """ if sha1: assert isinstance(sha1, str) swh_id = map_sha1_with_swhid(storage=storage, sha1=sha1) if swh_id: - data.append((swh_id, MetadataTargetType.CONTENT, None)) + data.append( + map_row_data_with_metadata( + swh_id=swh_id, + type=MetadataTargetType.CONTENT, + origin=None, + metadata=file, + date=date, + format=format, + ) + ) else: mapping_status = False return mapping_status def map_scancode( - storage, metadata_string: str -) -> Tuple[bool, List[Tuple[str, 
MetadataTargetType, None]]]: + storage, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping @@ -76,18 +157,20 @@ content = metadata.get("content") or {} files = content.get("files") or {} mapping_status = True - data: list = [] + format = "clearlydefined-harvest-scancode-json" + data: List[RawExtrinsicMetadata] = [] for file in files: sha1 = file.get("sha1") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date, format) + and mapping_status ) return mapping_status, data def map_licensee( - storage, metadata_string: str -) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]: + storage, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping @@ -101,18 +184,20 @@ content = output.get("content") or {} files = content.get("matched_files") or [] mapping_status = True - data: list = [] + format = "clearlydefined-harvest-licensee-json" + data: List[RawExtrinsicMetadata] = [] for file in files: sha1 = file.get("content_hash") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date, format) + and mapping_status ) return mapping_status, data def map_clearlydefined( - storage, metadata_string: str -) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]: + storage, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping @@ -123,19 +208,21 @@ metadata = json.loads(metadata_string) files = metadata.get("files") or [] mapping_status = True - data: list = [] + format = 
"clearlydefined-harvest-clearlydefined-json" + data: List[RawExtrinsicMetadata] = [] for file in files: hashes = file.get("hashes") or {} sha1 = hashes.get("sha1") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date, format) + and mapping_status ) return mapping_status, data def map_harvest( - storage, tool: str, metadata_string: str -) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]: + storage, tool: str, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take tool, metadata_string and storage as input and try to map the sha1 of files with content, return status of @@ -147,12 +234,12 @@ "clearlydefined": map_clearlydefined, } - return tools[tool](storage=storage, metadata_string=metadata_string) + return tools[tool](storage=storage, metadata_string=metadata_string, date=date) def map_definition( - storage, metadata_string: str -) -> Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]]: + storage, metadata_string: str, date: datetime +) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: """ Take metadata_string and storage as input and try to map the sha1 of defintion with content/ gitSha in revision @@ -190,25 +277,24 @@ else: raise WrongMetadata("Wrong metadata") - return True, [(swh_id, metadata_type, origin)] + return True, [ + map_row_data_with_metadata( + swh_id=swh_id, + type=metadata_type, + origin=origin, + metadata=metadata, + date=date, + format="clearlydefined-definition-json", + ) + ] -def map_row( - storage, row: tuple -) -> Union[ - Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]], - Tuple[bool, List[Tuple[str, MetadataTargetType, None]]], -]: +def get_type_of_tool(cd_path) -> ToolType: """ - Take row and storage as input and try to map that row, - if ID of row is invalid then raise exception, - if not able to map that row, then return None - else return 
status of that row and data to be written - in storage + Take cd_path as input; if cd_path is invalid then raise an exception, + else return the type of tool of that row """ - cd_path = row[0] list_cd_path = cd_path.split("/") - # For example: maven/mavencentral/cobol-parser/abc/0.4.0.json if list_cd_path[4] != "revision": raise RevisionNotFound( @@ -220,13 +306,6 @@ raise NoJsonExtension( 'Not a supported/known ID, A valid ID should end with ".json" extension.' ) - - metadata_string = gzip.decompress(row[1]).decode() - # if the row doesn't contain any information in metadata return None so it can be - # mapped later on - if metadata_string == "": - return None - # if the ID of row contains 9 components: # ////revision//tool//.json # then it is a harvest @@ -239,23 +318,44 @@ ) tool = list_cd_path[7] # if the row contains an unknown tool - if tool not in ("scancode", "licensee", "clearlydefined"): + if tool not in ("scancode", "licensee", "clearlydefined", "fossology"): raise ToolNotSupported(f"Tool for this ID {cd_path} is not supported") - return map_harvest( - tool=tool, - metadata_string=metadata_string, - storage=storage, - ) - + return ToolType(tool) elif len(list_cd_path) == 6: - # if the ID of row contains 6 components: - # ////revision/.json - # then it is a defintion - return map_definition( - metadata_string=metadata_string, - storage=storage, - ) + return ToolType.DEFINITION # For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json raise InvalidComponents( "Not a supported/known ID, A valid ID should have 6 or 9 components." 
) + + +def map_row( + storage, metadata: bytes, id: str, date: datetime +) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: + """ + Take storage, metadata, id and date as input and try to map that row, + if ID of row is invalid then raise exception, + if not able to map that row, then return None + else return status of that row and data to be written + in storage + """ + tool = get_type_of_tool(id).value + + # if the row doesn't contain any information in metadata return None so it can be + # mapped later on + metadata_string = gzip.decompress(metadata).decode() + if metadata_string == "": + return None + + if tool == "definition": + return map_definition( + metadata_string=metadata_string, storage=storage, date=date + ) + + else: + return map_harvest( + tool=tool, + metadata_string=metadata_string, + storage=storage, + date=date, + ) diff --git a/swh/clearlydefined/tests/data/clearlydefined_metadata.json b/swh/clearlydefined/tests/data/clearlydefined_metadata.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/clearlydefined_metadata.json @@ -0,0 +1,7 @@ +{ + "path": "package/LICENSE", + "hashes": { + "sha1": "61c2b3a30496d329e21af70dd2d7e097046d07b7", + "sha256": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b" + } +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/clearlydefined_metadata_2.json b/swh/clearlydefined/tests/data/clearlydefined_metadata_2.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/clearlydefined_metadata_2.json @@ -0,0 +1,7 @@ +{ + "path": "package/README.md", + "hashes": { + "sha1": "34973274ccef6ab4dfaaf86599792fa9c3fe4689", + "sha256": "60b9c916c43fba00e2d3ba5207b25bf28109e985c3f739f430bb2056423d5aa9" + } +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/licensee_metadata.json b/swh/clearlydefined/tests/data/licensee_metadata.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/licensee_metadata.json @@ -0,0 
+1,11 @@ +{ + "filename": "package/package.json", + "content": "{\n \"name\": \"@fluidframework/replay-driver\",\n \"version\": \"0.31.0\",\n \"description\": \"Document replay version of Socket.IO implementation\",\n \"homepage\": \"https://fluidframework.com\",\n \"repository\": \"https://github.com/microsoft/FluidFramework\",\n \"license\": \"MIT\",\n \"author\": \"Microsoft\",\n \"sideEffects\": false,\n \"main\": \"dist/index.js\",\n \"module\": \"lib/index.js\",\n \"types\": \"dist/index.d.ts\",\n \"scripts\": {\n \"build\": \"npm run build:genver && concurrently npm:build:compile npm:lint\",\n \"build:compile\": \"concurrently npm:tsc npm:build:esnext\",\n \"build:docs\": \"api-extractor run --local && copyfiles -u 1 ./_api-extractor-temp/doc-models/* ../../../_api-extractor-temp/\",\n \"build:esnext\": \"tsc --project ./tsconfig.esnext.json\",\n \"build:full\": \"npm run build\",\n \"build:full:compile\": \"npm run build:compile\",\n \"build:genver\": \"gen-version\",\n \"clean\": \"rimraf dist lib *.tsbuildinfo *.build.log\",\n \"eslint\": \"eslint --format stylish src\",\n \"eslint:fix\": \"eslint --format stylish src --fix\",\n \"lint\": \"npm run eslint\",\n \"lint:fix\": \"npm run eslint:fix\",\n \"tsc\": \"tsc\",\n \"tsfmt\": \"tsfmt --verify\",\n \"tsfmt:fix\": \"tsfmt --replace\"\n },\n \"dependencies\": {\n \"@fluidframework/common-definitions\": \"^0.19.1\",\n \"@fluidframework/common-utils\": \"^0.26.0\",\n \"@fluidframework/driver-definitions\": \"^0.31.0\",\n \"@fluidframework/driver-utils\": \"^0.31.0\",\n \"@fluidframework/protocol-definitions\": \"^0.1016.1\",\n \"@fluidframework/telemetry-utils\": \"^0.31.0\",\n \"assert\": \"^2.0.0\",\n \"debug\": \"^4.1.1\"\n },\n \"devDependencies\": {\n \"@fluidframework/build-common\": \"^0.19.2\",\n \"@fluidframework/eslint-config-fluid\": \"^0.21.0\",\n \"@microsoft/api-extractor\": \"^7.7.2\",\n \"@types/assert\": \"^1.5.1\",\n \"@types/debug\": \"^4.1.5\",\n \"@types/mocha\": \"^5.2.5\",\n 
\"@types/nock\": \"^9.3.0\",\n \"@types/node\": \"^10.17.24\",\n \"@typescript-eslint/eslint-plugin\": \"~4.2.0\",\n \"@typescript-eslint/parser\": \"~4.2.0\",\n \"concurrently\": \"^5.2.0\",\n \"copyfiles\": \"^2.1.0\",\n \"eslint\": \"~7.9.0\",\n \"eslint-plugin-eslint-comments\": \"~3.2.0\",\n \"eslint-plugin-import\": \"~2.22.0\",\n \"eslint-plugin-no-null\": \"~1.0.2\",\n \"eslint-plugin-prefer-arrow\": \"~1.2.2\",\n \"eslint-plugin-react\": \"~7.21.2\",\n \"eslint-plugin-unicorn\": \"~22.0.0\",\n \"mocha\": \"^8.1.1\",\n \"nock\": \"^10.0.1\",\n \"rimraf\": \"^2.6.2\",\n \"typescript\": \"~3.7.4\",\n \"typescript-formatter\": \"7.1.0\"\n }\n}\n", + "content_hash": "61c2b3a30496d329e21af70dd2d7e097046d07b7", + "content_normalized": null, + "matcher": { + "name": "npmbower", + "confidence": 90 + }, + "matched_license": "MIT" +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/scancode_metadata.json b/swh/clearlydefined/tests/data/scancode_metadata.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/scancode_metadata.json @@ -0,0 +1,87 @@ +{ + "path": "package/LICENSE", + "type": "file", + "name": "LICENSE", + "base_name": "LICENSE", + "extension": "", + "size": 1073, + "date": "1985-10-26", + "sha1": "34973274ccef6ab4dfaaf86599792fa9c3fe4689", + "md5": "dc2a37e472c366af2a7b8bd0f2ba5af4", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "licenses": [ + { + "key": "mit", + "score": 97.66, + "name": "MIT License", + "short_name": "MIT License", + "category": "Permissive", + "is_exception": false, + "owner": "MIT", + "homepage_url": "http://opensource.org/licenses/mit-license.php", + "text_url": "http://opensource.org/licenses/mit-license.php", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:mit", + "spdx_license_key": "MIT", + 
"spdx_url": "https://spdx.org/licenses/MIT", + "start_line": 1, + "end_line": 21, + "matched_rule": { + "identifier": "mit_160.RULE", + "license_expression": "mit", + "licenses": [ + "mit" + ], + "is_license_text": true, + "is_license_notice": false, + "is_license_reference": false, + "is_license_tag": false, + "matcher": "3-seq", + "rule_length": 167, + "matched_length": 167, + "match_coverage": 100, + "rule_relevance": 100 + }, + "matched_text": "The MIT License\n\nCopyright ([c]) [2017] [Google], [Inc].\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE." 
+ } + ], + "license_expressions": [ + "mit" + ], + "holders": [ + { + "value": "Google, Inc.", + "start_line": 3, + "end_line": 3 + } + ], + "copyrights": [ + { + "value": "Copyright (c) 2017 Google, Inc.", + "start_line": 3, + "end_line": 3 + } + ], + "authors": [], + "packages": [], + "emails": [], + "urls": [], + "is_legal": true, + "is_manifest": false, + "is_readme": false, + "is_top_level": true, + "is_key_file": true, + "is_generated": false, + "is_license_text": true, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/test_mapping_utils.py b/swh/clearlydefined/tests/test_mapping_utils.py --- a/swh/clearlydefined/tests/test_mapping_utils.py +++ b/swh/clearlydefined/tests/test_mapping_utils.py @@ -3,38 +3,44 @@ # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.clearlydefined.mapping_utils import map_sha1_with_swhid -from swh.clearlydefined.mapping_utils import map_row -from swh.clearlydefined.mapping_utils import map_definition +from datetime import datetime, timezone +import gzip +import json +import os + +import attr +import pytest + from swh.clearlydefined.error import ( InvalidComponents, - WrongMetadata, NoJsonExtension, RevisionNotFound, - ToolNotSupported, ToolNotFound, + ToolNotSupported, + WrongMetadata, +) +from swh.clearlydefined.mapping_utils import ( + authority, + fetcher, + map_definition, + map_row, + map_sha1_with_swhid, ) - from swh.model import from_disk +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid from swh.model.model import ( - MetadataTargetType, - Origin, Content, - Revision, - Person, - TimestampWithTimezone, - Timestamp, - RevisionType, Directory, DirectoryEntry, + MetadataTargetType, + Person, + RawExtrinsicMetadata, + Revision, + RevisionType, + Timestamp, + TimestampWithTimezone, ) -from 
swh.model.hashutil import hash_to_bytes - -import gzip -import os -from typing import Tuple -import pytest - content_data = [ Content.from_data(b"42\n"), @@ -135,16 +141,6 @@ ] -def make_row(id: str, text: str) -> Tuple[str, bytes]: - """ - Take id and text as input and return a - row like a row present in - clearcode toolkit database - """ - row = (id, gzip.compress(text.encode())) - return row - - def file_data(file_name): with open(file_name) as file: data = file.read() @@ -181,162 +177,330 @@ def test_map_row_for_definitions_with_sha1(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", - text=file_data(os.path.join(datadir, "definitions.json")), - ) expected = ( True, [ - ( - "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd", - MetadataTargetType.CONTENT, - Origin( - url="http://central.maven.org/maven2/za/co/absa/cobrix/" - "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" ), - ) + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-definition-json", + origin="http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/" + "0.4.0/cobol-parser-0.4.0-sources.jar", + metadata=json.dumps( + json.loads(file_data(os.path.join(datadir, "definitions.json"))) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "definitions.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def 
test_map_row_for_definitions_with_gitsha1(swh_storage, datadir): add_revision_data(swh_storage) - row = make_row( - id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", - text=file_data(os.path.join(datadir, "definitions_sha1git.json")), - ) expected = ( True, [ - ( - "swh:1:rev:4c66129b968ab8122964823d1d77677f50884cf6", - MetadataTargetType.REVISION, - Origin( - url="http://central.maven.org/maven2/za/co/absa/cobrix/" - "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + target=parse_swhid( + "swh:1:rev:4c66129b968ab8122964823d1d77677f50884cf6" ), - ) + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-definition-json", + origin="http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/" + "0.4.0/cobol-parser-0.4.0-sources.jar", + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "definitions_sha1git.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "definitions_sha1git.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_scancode(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", - text=file_data(os.path.join(datadir, "scancode.json")), - ) expected = ( False, [ - ( - "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + 
discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-scancode-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "scancode_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "scancode.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_scancode_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", - text=file_data(os.path.join(datadir, "scancode_true.json")), - ) expected = ( True, [ - ( - "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-scancode-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "scancode_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "scancode_true.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def 
test_map_row_for_licensee(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/\ - 9.13.0.json", - text=file_data(os.path.join(datadir, "licensee.json")), - ) expected = ( False, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-licensee-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "licensee_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/" + "9.13.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "licensee.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_licensee_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/\ - 9.13.0.json", - text=file_data(os.path.join(datadir, "licensee_true.json")), - ) expected = ( True, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + 
format="clearlydefined-harvest-licensee-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "licensee_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/" + "9.13.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "licensee_true.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_clearlydefined(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json", - text=file_data(os.path.join(datadir, "clearlydefined.json")), - ) expected = ( False, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "clearlydefined_metadata.json")) + ) + ).encode("utf-8"), + ), + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data( + os.path.join(datadir, "clearlydefined_metadata_2.json") + ) + ) + ).encode("utf-8"), + 
), ], ) - map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/" + "1.3.4.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "clearlydefined.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_clearlydefined_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json", - text=file_data(os.path.join(datadir, "clearlydefined_true.json")), - ) expected = ( True, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "clearlydefined_metadata.json")) + ) + ).encode("utf-8"), + ), + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data( + os.path.join(datadir, "clearlydefined_metadata_2.json") + ) + ) + ).encode("utf-8"), + ), ], ) - map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/" 
+ "1.3.4.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "clearlydefined_true.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_sha1git_not_in_revision(swh_storage, datadir): @@ -347,6 +511,7 @@ os.path.join(datadir, "definitions_not_mapped_sha1_git.json") ), storage=swh_storage, + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) is None ) @@ -360,6 +525,7 @@ os.path.join(datadir, "definitions_not_mapped.json") ), storage=swh_storage, + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) is None ) @@ -370,64 +536,74 @@ map_definition( metadata_string=file_data(os.path.join(datadir, "licensee.json")), storage=swh_storage, + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) def test_map_row_with_invalid_ID(swh_storage): - row = make_row( - id="maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json", text="abc" - ) with pytest.raises(InvalidComponents): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json", + metadata=gzip.compress(" ".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) def test_map_row_with_empty_metadata_string(swh_storage): - row = make_row( + map_row( + storage=swh_storage, id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", - text="", - ) - assert map_row(storage=swh_storage, row=row) is None + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) is None def test_map_row_with_invalid_ID_without_revision(swh_storage): - row = make_row( - id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/abc/0.4.0.json", - text="abc", - ) with pytest.raises(RevisionNotFound): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/abc/0.4.0.json", + 
metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) def test_map_row_with_invalid_ID_without_json_extension(swh_storage): - row = make_row( - id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.txt", - text="abc", - ) with pytest.raises(NoJsonExtension): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.txt", + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) def test_map_row_with_invalid_ID_without_6_or_9_length(swh_storage): - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/3.2.2.json", text="abc" - ) with pytest.raises(InvalidComponents): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/3.2.2.json", + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) def test_map_row_with_invalid_tool(swh_storage): - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/abc/3.2.2.json", - text="abc", - ) with pytest.raises(ToolNotSupported): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/abc/3.2.2.json", + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) def test_map_row_with_invalid_harvest_ID(swh_storage): - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/abc/scancode/3.2.2.json", - text="abc", - ) with pytest.raises(ToolNotFound): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/abc/scancode/3.2.2.json", + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + )