diff --git a/swh/clearlydefined/mapping_utils.py b/swh/clearlydefined/mapping_utils.py index 45cdb76..4a28686 100644 --- a/swh/clearlydefined/mapping_utils.py +++ b/swh/clearlydefined/mapping_utils.py @@ -1,261 +1,361 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -import json -from typing import Any, Dict, Optional, Tuple, List, Union +from datetime import datetime +from enum import Enum import gzip +import json +from typing import Any, Dict, List, Optional, Tuple -from swh.model.hashutil import hash_to_bytes -from swh.model.hashutil import hash_to_hex -from swh.model.model import MetadataTargetType, Origin +import attr from swh.clearlydefined.error import ( InvalidComponents, - WrongMetadata, - ToolNotFound, NoJsonExtension, RevisionNotFound, + ToolNotFound, ToolNotSupported, + WrongMetadata, +) +from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + + +class ToolType(Enum): + """The type of content pointed to by a snapshot branch. 
Usually a + revision or an alias.""" + + DEFINITION = "definition" + SCANCODE = "scancode" + CLEARLYDEFINED = "clearlydefined" + LICENSEE = "licensee" + FOSSOLOGY = "fossology" + + +def map_row_data_with_metadata( + swh_id: str, + type: MetadataTargetType, + origin: Optional[Origin], + metadata: Dict, + date: datetime, + format: str, +) -> RawExtrinsicMetadata: + """ + Take and data_list as input and write + data inside RawExtrensicMetadata table inside + swh storage + """ + return RawExtrinsicMetadata( + type=type, + target=parse_swhid(swh_id), + discovery_date=date, + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format=format, + origin=origin.url if origin else None, + metadata=json.dumps(metadata).encode("utf-8"), + ) + + +authority = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://clearlydefined.io/", + metadata={}, +) + + +fetcher = MetadataFetcher( + name="swh-clearlydefined", + version="0.0.1", + metadata={}, ) def map_sha1_with_swhid(storage, sha1: str) -> Optional[str]: """ Take sha1 and storage as input and give the corresponding swhID for that sha1 """ if not sha1: return None content = storage.content_get([hash_to_bytes(sha1)])[0] if not content: return None sha1_git = hash_to_hex(content.sha1_git) swh_id = "swh:1:cnt:{sha1_git}".format(sha1_git=sha1_git) return swh_id def sha1_git_in_revisions(storage, sha1_git: str) -> bool: """ Take sha1_git and storage as input and tell whether that sha1_git exists in revision table """ sha1_git_bytes = hash_to_bytes(sha1_git) missing_revision = storage.revision_missing([sha1_git_bytes]) if len(list(missing_revision)) == 0: return True return False def map_sha1_and_add_in_data( - storage, sha1: Optional[str], data: list, mapping_status=True + storage, + sha1: Optional[str], + data: List[RawExtrinsicMetadata], + file: Dict, + date: datetime, + format: str, + mapping_status=True, ) -> bool: + """ + Take sha1, data, file, date, mapping_status 
as input + and return whether the sha1 exists in content, if it exists + map sha1 with swhid and push RawExtrensicMetadata object that got + mapping row data with RawExtrensicMetadata + """ if sha1: assert isinstance(sha1, str) swh_id = map_sha1_with_swhid(storage=storage, sha1=sha1) if swh_id: - data.append((swh_id, MetadataTargetType.CONTENT, None)) + data.append( + map_row_data_with_metadata( + swh_id=swh_id, + type=MetadataTargetType.CONTENT, + origin=None, + metadata=file, + date=date, + format=format, + ) + ) else: mapping_status = False return mapping_status def map_scancode( - storage, metadata_string: str -) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]: + storage, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping status of harvest (True if able to map every sha1, False if not able to map every sha1) and data to be written in storage """ metadata = json.loads(metadata_string) content = metadata.get("content") or {} files = content.get("files") or {} mapping_status = True - data: list = [] + format = "clearlydefined-harvest-scancode-json" + data: List[RawExtrinsicMetadata] = [] for file in files: sha1 = file.get("sha1") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date, format) + and mapping_status ) return mapping_status, data def map_licensee( - storage, metadata_string: str -) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]: + storage, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping status of harvest (True if able to map every sha1, False if not able to map every sha1) and data to be written in storage """ metadata = json.loads(metadata_string) licensee = 
metadata.get("licensee") or {} output = licensee.get("output") or {} content = output.get("content") or {} files = content.get("matched_files") or [] mapping_status = True - data: list = [] + format = "clearlydefined-harvest-licensee-json" + data: List[RawExtrinsicMetadata] = [] for file in files: sha1 = file.get("content_hash") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date, format) + and mapping_status ) return mapping_status, data def map_clearlydefined( - storage, metadata_string: str -) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]: + storage, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping status of harvest (True if able to map every sha1, False if not able to map every sha1) and data to be written in storage """ metadata = json.loads(metadata_string) files = metadata.get("files") or [] mapping_status = True - data: list = [] + format = "clearlydefined-harvest-clearlydefined-json" + data: List[RawExtrinsicMetadata] = [] for file in files: hashes = file.get("hashes") or {} sha1 = hashes.get("sha1") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date, format) + and mapping_status ) return mapping_status, data def map_harvest( - storage, tool: str, metadata_string: str -) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]: + storage, tool: str, metadata_string: str, date: datetime +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take tool, metadata_string and storage as input and try to map the sha1 of files with content, return status of harvest and data to be written in storage """ tools = { "scancode": map_scancode, "licensee": map_licensee, "clearlydefined": map_clearlydefined, } - return 
tools[tool](storage=storage, metadata_string=metadata_string) + return tools[tool](storage=storage, metadata_string=metadata_string, date=date) def map_definition( - storage, metadata_string: str -) -> Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]]: + storage, metadata_string: str, date: datetime +) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: """ Take metadata_string and storage as input and try to map the sha1 of defintion with content/ gitSha in revision return None if not able to map else return data to be written in storage """ metadata: Dict[str, Dict[str, Optional[Dict]]] = json.loads(metadata_string) described: Dict[str, Optional[Dict[str, Any]]] = metadata.get("described") or {} hashes: Dict[str, str] = described.get("hashes") or {} sha1_git = hashes.get("gitSha") source: Dict[str, str] = described.get("sourceLocation") or {} url = source.get("url") origin = None sha1 = hashes.get("sha1") if url: assert isinstance(url, str) origin = Origin(url=url) if sha1_git: assert isinstance(sha1_git, str) if not sha1_git_in_revisions(sha1_git=sha1_git, storage=storage): return None swh_id = "swh:1:rev:{sha1_git}".format(sha1_git=sha1_git) metadata_type = MetadataTargetType.REVISION elif sha1: assert isinstance(sha1, str) swh_id_sha1 = map_sha1_with_swhid(sha1=sha1, storage=storage) if not swh_id_sha1: return None assert isinstance(swh_id_sha1, str) swh_id = swh_id_sha1 metadata_type = MetadataTargetType.CONTENT else: raise WrongMetadata("Wrong metadata") - return True, [(swh_id, metadata_type, origin)] + return True, [ + map_row_data_with_metadata( + swh_id=swh_id, + type=metadata_type, + origin=origin, + metadata=metadata, + date=date, + format="clearlydefined-definition-json", + ) + ] -def map_row( - storage, row: tuple -) -> Union[ - Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]], - Tuple[bool, List[Tuple[str, MetadataTargetType, None]]], -]: +def get_type_of_tool(cd_path) -> ToolType: """ - 
Take row and storage as input and try to map that row, - if ID of row is invalid then raise exception, - if not able to map that row, then return None - else return status of that row and data to be written - in storage + Take cd_path as input if cd_path is invalid then raise exception, + else return tyoe of tool of that row """ - cd_path = row[0] list_cd_path = cd_path.split("/") - # For example: maven/mavencentral/cobol-parser/abc/0.4.0.json if list_cd_path[4] != "revision": raise RevisionNotFound( "Not a supported/known ID, A valid ID should have" '5th component as "revision".' ) # For example: maven/mavencentral/cobol-parser/revision/0.4.0.txt if not list_cd_path[-1].endswith(".json"): raise NoJsonExtension( 'Not a supported/known ID, A valid ID should end with ".json" extension.' ) - - metadata_string = gzip.decompress(row[1]).decode() - # if the row doesn't contain any information in metadata return None so it can be - # mapped later on - if metadata_string == "": - return None - # if the ID of row contains 9 components: # ////revision//tool//.json # then it is a harvest if len(list_cd_path) == 9: # npm/npmjs/@ngtools/webpack/revision/10.2.1/abc/scancode/3.2.2.json if list_cd_path[6] != "tool": raise ToolNotFound( 'Not a supported/known harvest ID, A valid harvest ID should have 7th\ component as "tool".' 
) tool = list_cd_path[7] # if the row contains an unknown tool - if tool not in ("scancode", "licensee", "clearlydefined"): + if tool not in ("scancode", "licensee", "clearlydefined", "fossology"): raise ToolNotSupported(f"Tool for this ID {cd_path} is not supported") - return map_harvest( - tool=tool, - metadata_string=metadata_string, - storage=storage, - ) - + return ToolType(tool) elif len(list_cd_path) == 6: - # if the ID of row contains 6 components: - # ////revision/.json - # then it is a defintion - return map_definition( - metadata_string=metadata_string, - storage=storage, - ) + return ToolType.DEFINITION # For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json raise InvalidComponents( "Not a supported/known ID, A valid ID should have 6 or 9 components." ) + + +def map_row( + storage, metadata: bytes, id: str, date: datetime +) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: + """ + Take row and storage as input and try to map that row, + if ID of row is invalid then raise exception, + if not able to map that row, then return None + else return status of that row and data to be written + in storage + """ + tool = get_type_of_tool(id).value + + # if the row doesn't contain any information in metadata return None so it can be + # mapped later on + metadata_string = gzip.decompress(metadata).decode() + if metadata_string == "": + return None + + if tool == "definition": + return map_definition( + metadata_string=metadata_string, storage=storage, date=date + ) + + else: + return map_harvest( + tool=tool, + metadata_string=metadata_string, + storage=storage, + date=date, + ) diff --git a/swh/clearlydefined/tests/data/clearlydefined_metadata.json b/swh/clearlydefined/tests/data/clearlydefined_metadata.json new file mode 100644 index 0000000..7588488 --- /dev/null +++ b/swh/clearlydefined/tests/data/clearlydefined_metadata.json @@ -0,0 +1,7 @@ +{ + "path": "package/LICENSE", + "hashes": { + "sha1": 
"61c2b3a30496d329e21af70dd2d7e097046d07b7", + "sha256": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b" + } +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/clearlydefined_metadata_2.json b/swh/clearlydefined/tests/data/clearlydefined_metadata_2.json new file mode 100644 index 0000000..3fe8093 --- /dev/null +++ b/swh/clearlydefined/tests/data/clearlydefined_metadata_2.json @@ -0,0 +1,7 @@ +{ + "path": "package/README.md", + "hashes": { + "sha1": "34973274ccef6ab4dfaaf86599792fa9c3fe4689", + "sha256": "60b9c916c43fba00e2d3ba5207b25bf28109e985c3f739f430bb2056423d5aa9" + } +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/licensee_metadata.json b/swh/clearlydefined/tests/data/licensee_metadata.json new file mode 100644 index 0000000..96cf7d2 --- /dev/null +++ b/swh/clearlydefined/tests/data/licensee_metadata.json @@ -0,0 +1,11 @@ +{ + "filename": "package/package.json", + "content": "{\n \"name\": \"@fluidframework/replay-driver\",\n \"version\": \"0.31.0\",\n \"description\": \"Document replay version of Socket.IO implementation\",\n \"homepage\": \"https://fluidframework.com\",\n \"repository\": \"https://github.com/microsoft/FluidFramework\",\n \"license\": \"MIT\",\n \"author\": \"Microsoft\",\n \"sideEffects\": false,\n \"main\": \"dist/index.js\",\n \"module\": \"lib/index.js\",\n \"types\": \"dist/index.d.ts\",\n \"scripts\": {\n \"build\": \"npm run build:genver && concurrently npm:build:compile npm:lint\",\n \"build:compile\": \"concurrently npm:tsc npm:build:esnext\",\n \"build:docs\": \"api-extractor run --local && copyfiles -u 1 ./_api-extractor-temp/doc-models/* ../../../_api-extractor-temp/\",\n \"build:esnext\": \"tsc --project ./tsconfig.esnext.json\",\n \"build:full\": \"npm run build\",\n \"build:full:compile\": \"npm run build:compile\",\n \"build:genver\": \"gen-version\",\n \"clean\": \"rimraf dist lib *.tsbuildinfo *.build.log\",\n \"eslint\": \"eslint --format stylish src\",\n 
\"eslint:fix\": \"eslint --format stylish src --fix\",\n \"lint\": \"npm run eslint\",\n \"lint:fix\": \"npm run eslint:fix\",\n \"tsc\": \"tsc\",\n \"tsfmt\": \"tsfmt --verify\",\n \"tsfmt:fix\": \"tsfmt --replace\"\n },\n \"dependencies\": {\n \"@fluidframework/common-definitions\": \"^0.19.1\",\n \"@fluidframework/common-utils\": \"^0.26.0\",\n \"@fluidframework/driver-definitions\": \"^0.31.0\",\n \"@fluidframework/driver-utils\": \"^0.31.0\",\n \"@fluidframework/protocol-definitions\": \"^0.1016.1\",\n \"@fluidframework/telemetry-utils\": \"^0.31.0\",\n \"assert\": \"^2.0.0\",\n \"debug\": \"^4.1.1\"\n },\n \"devDependencies\": {\n \"@fluidframework/build-common\": \"^0.19.2\",\n \"@fluidframework/eslint-config-fluid\": \"^0.21.0\",\n \"@microsoft/api-extractor\": \"^7.7.2\",\n \"@types/assert\": \"^1.5.1\",\n \"@types/debug\": \"^4.1.5\",\n \"@types/mocha\": \"^5.2.5\",\n \"@types/nock\": \"^9.3.0\",\n \"@types/node\": \"^10.17.24\",\n \"@typescript-eslint/eslint-plugin\": \"~4.2.0\",\n \"@typescript-eslint/parser\": \"~4.2.0\",\n \"concurrently\": \"^5.2.0\",\n \"copyfiles\": \"^2.1.0\",\n \"eslint\": \"~7.9.0\",\n \"eslint-plugin-eslint-comments\": \"~3.2.0\",\n \"eslint-plugin-import\": \"~2.22.0\",\n \"eslint-plugin-no-null\": \"~1.0.2\",\n \"eslint-plugin-prefer-arrow\": \"~1.2.2\",\n \"eslint-plugin-react\": \"~7.21.2\",\n \"eslint-plugin-unicorn\": \"~22.0.0\",\n \"mocha\": \"^8.1.1\",\n \"nock\": \"^10.0.1\",\n \"rimraf\": \"^2.6.2\",\n \"typescript\": \"~3.7.4\",\n \"typescript-formatter\": \"7.1.0\"\n }\n}\n", + "content_hash": "61c2b3a30496d329e21af70dd2d7e097046d07b7", + "content_normalized": null, + "matcher": { + "name": "npmbower", + "confidence": 90 + }, + "matched_license": "MIT" +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/scancode_metadata.json b/swh/clearlydefined/tests/data/scancode_metadata.json new file mode 100644 index 0000000..044d187 --- /dev/null +++ 
b/swh/clearlydefined/tests/data/scancode_metadata.json @@ -0,0 +1,87 @@ +{ + "path": "package/LICENSE", + "type": "file", + "name": "LICENSE", + "base_name": "LICENSE", + "extension": "", + "size": 1073, + "date": "1985-10-26", + "sha1": "34973274ccef6ab4dfaaf86599792fa9c3fe4689", + "md5": "dc2a37e472c366af2a7b8bd0f2ba5af4", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "licenses": [ + { + "key": "mit", + "score": 97.66, + "name": "MIT License", + "short_name": "MIT License", + "category": "Permissive", + "is_exception": false, + "owner": "MIT", + "homepage_url": "http://opensource.org/licenses/mit-license.php", + "text_url": "http://opensource.org/licenses/mit-license.php", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:mit", + "spdx_license_key": "MIT", + "spdx_url": "https://spdx.org/licenses/MIT", + "start_line": 1, + "end_line": 21, + "matched_rule": { + "identifier": "mit_160.RULE", + "license_expression": "mit", + "licenses": [ + "mit" + ], + "is_license_text": true, + "is_license_notice": false, + "is_license_reference": false, + "is_license_tag": false, + "matcher": "3-seq", + "rule_length": 167, + "matched_length": 167, + "match_coverage": 100, + "rule_relevance": 100 + }, + "matched_text": "The MIT License\n\nCopyright ([c]) [2017] [Google], [Inc].\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included 
in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE." + } + ], + "license_expressions": [ + "mit" + ], + "holders": [ + { + "value": "Google, Inc.", + "start_line": 3, + "end_line": 3 + } + ], + "copyrights": [ + { + "value": "Copyright (c) 2017 Google, Inc.", + "start_line": 3, + "end_line": 3 + } + ], + "authors": [], + "packages": [], + "emails": [], + "urls": [], + "is_legal": true, + "is_manifest": false, + "is_readme": false, + "is_top_level": true, + "is_key_file": true, + "is_generated": false, + "is_license_text": true, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/test_mapping_utils.py b/swh/clearlydefined/tests/test_mapping_utils.py index 36de923..d3797b5 100644 --- a/swh/clearlydefined/tests/test_mapping_utils.py +++ b/swh/clearlydefined/tests/test_mapping_utils.py @@ -1,433 +1,609 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.clearlydefined.mapping_utils import map_sha1_with_swhid -from swh.clearlydefined.mapping_utils import map_row -from swh.clearlydefined.mapping_utils import map_definition +from datetime import datetime, timezone +import gzip +import json +import os + +import attr +import pytest + from swh.clearlydefined.error import ( InvalidComponents, - 
WrongMetadata, NoJsonExtension, RevisionNotFound, - ToolNotSupported, ToolNotFound, + ToolNotSupported, + WrongMetadata, +) +from swh.clearlydefined.mapping_utils import ( + authority, + fetcher, + map_definition, + map_row, + map_sha1_with_swhid, ) - from swh.model import from_disk +from swh.model.hashutil import hash_to_bytes +from swh.model.identifiers import parse_swhid from swh.model.model import ( - MetadataTargetType, - Origin, Content, - Revision, - Person, - TimestampWithTimezone, - Timestamp, - RevisionType, Directory, DirectoryEntry, + MetadataTargetType, + Person, + RawExtrinsicMetadata, + Revision, + RevisionType, + Timestamp, + TimestampWithTimezone, ) -from swh.model.hashutil import hash_to_bytes - -import gzip -import os -from typing import Tuple -import pytest - content_data = [ Content.from_data(b"42\n"), Content.from_data(b"4242\n"), ] directory = Directory( id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"), entries=tuple( [ DirectoryEntry( name=b"foo", type="file", target=content_data[0].sha1_git, perms=from_disk.DentryPerms.content, ), ], ), ) revision_data = [ Revision( id=hash_to_bytes("4c66129b968ab8122964823d1d77677f50884cf6"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"nicolas@example.com", fullname=b"Nicolas Dandrimont ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset=120, negative_utc=False, ), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"stefano@example.com", fullname=b"St\xc3fano Zacchiroli ", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0), offset=120, negative_utc=False, ), parents=(), type=RevisionType.GIT, directory=directory.id, metadata={ "checksums": { "sha1": "tarball-sha1", "sha256": "tarball-sha256", }, "signed-off-by": "some-dude", }, extra_headers=( (b"gpgsig", b"test123"), (b"mergetag", b"foo\\bar"), (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"), ), synthetic=True, ), Revision( 
id=hash_to_bytes("3c66129b968ab8122964823d1d77677f50884cf6"), message=b"hello again", author=Person( name=b"Roberto Dicosmo", email=b"roberto@example.com", fullname=b"Roberto Dicosmo ", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset=-720, negative_utc=False, ), committer=Person( name=b"tony", email=b"ar@dumont.fr", fullname=b"tony ", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1123456789, microseconds=220000, ), offset=0, negative_utc=False, ), parents=(), type=RevisionType.GIT, directory=directory.id, metadata=None, extra_headers=(), synthetic=False, ), ] -def make_row(id: str, text: str) -> Tuple[str, bytes]: - """ - Take id and text as input and return a - row like a row present in - clearcode toolkit database - """ - row = (id, gzip.compress(text.encode())) - return row - - def file_data(file_name): with open(file_name) as file: data = file.read() return data def add_content_data(swh_storage): swh_storage.content_add(content_data) def add_revision_data(swh_storage): swh_storage.revision_add(revision_data) def test_mapping_sha1_with_swhID(swh_storage): add_content_data(swh_storage) sha1 = "34973274ccef6ab4dfaaf86599792fa9c3fe4689" assert "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" == map_sha1_with_swhid( sha1=sha1, storage=swh_storage ) def test_mapping_with_empty_sha1(swh_storage): add_content_data(swh_storage) sha1 = "" assert map_sha1_with_swhid(sha1=sha1, storage=swh_storage) is None def test_mapping_with_wrong_sha1(swh_storage): add_content_data(swh_storage) sha1 = "6ac599151a7aaa8ca5d38dc5bb61b49193a3cadc1ed33de5a57e4d1ecc53c846" assert map_sha1_with_swhid(sha1=sha1, storage=swh_storage) is None def test_map_row_for_definitions_with_sha1(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", - text=file_data(os.path.join(datadir, "definitions.json")), - ) expected = ( True, 
[ - ( - "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd", - MetadataTargetType.CONTENT, - Origin( - url="http://central.maven.org/maven2/za/co/absa/cobrix/" - "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" ), - ) + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-definition-json", + origin="http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/" + "0.4.0/cobol-parser-0.4.0-sources.jar", + metadata=json.dumps( + json.loads(file_data(os.path.join(datadir, "definitions.json"))) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "definitions.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_definitions_with_gitsha1(swh_storage, datadir): add_revision_data(swh_storage) - row = make_row( - id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", - text=file_data(os.path.join(datadir, "definitions_sha1git.json")), - ) expected = ( True, [ - ( - "swh:1:rev:4c66129b968ab8122964823d1d77677f50884cf6", - MetadataTargetType.REVISION, - Origin( - url="http://central.maven.org/maven2/za/co/absa/cobrix/" - "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + target=parse_swhid( + "swh:1:rev:4c66129b968ab8122964823d1d77677f50884cf6" ), - ) + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + 
format="clearlydefined-definition-json", + origin="http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/" + "0.4.0/cobol-parser-0.4.0-sources.jar", + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "definitions_sha1git.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "definitions_sha1git.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_scancode(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", - text=file_data(os.path.join(datadir, "scancode.json")), - ) expected = ( False, [ - ( - "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-scancode-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "scancode_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "scancode.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_scancode_true_mapping_status(swh_storage, datadir): 
add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", - text=file_data(os.path.join(datadir, "scancode_true.json")), - ) expected = ( True, [ - ( - "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-scancode-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "scancode_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/3.2.2.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "scancode_true.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_licensee(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/\ - 9.13.0.json", - text=file_data(os.path.join(datadir, "licensee.json")), - ) expected = ( False, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-licensee-json", + origin=None, + metadata=json.dumps( + json.loads( + 
file_data(os.path.join(datadir, "licensee_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/" + "9.13.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "licensee.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_licensee_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/\ - 9.13.0.json", - text=file_data(os.path.join(datadir, "licensee_true.json")), - ) expected = ( True, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-licensee-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "licensee_metadata.json")) + ) + ).encode("utf-8"), + ), ], ) - assert map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/" + "9.13.0.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "licensee_true.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_clearlydefined(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json", - text=file_data(os.path.join(datadir, 
"clearlydefined.json")), - ) expected = ( False, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "clearlydefined_metadata.json")) + ) + ).encode("utf-8"), + ), + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data( + os.path.join(datadir, "clearlydefined_metadata_2.json") + ) + ) + ).encode("utf-8"), + ), ], ) - map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/" + "1.3.4.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "clearlydefined.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_map_row_for_clearlydefined_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) - row = make_row( - id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json", - text=file_data(os.path.join(datadir, "clearlydefined_true.json")), - ) expected = ( True, [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - ) + RawExtrinsicMetadata( 
+ type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data(os.path.join(datadir, "clearlydefined_metadata.json")) + ) + ).encode("utf-8"), + ), + RawExtrinsicMetadata( + type=MetadataTargetType.CONTENT, + target=parse_swhid( + "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" + ), + discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="clearlydefined-harvest-clearlydefined-json", + origin=None, + metadata=json.dumps( + json.loads( + file_data( + os.path.join(datadir, "clearlydefined_metadata_2.json") + ) + ) + ).encode("utf-8"), + ), ], ) - map_row(storage=swh_storage, row=row) == expected + assert ( + map_row( + storage=swh_storage, + id="npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/" + "1.3.4.json", + metadata=gzip.compress( + file_data(os.path.join(datadir, "clearlydefined_true.json")).encode() + ), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) + == expected + ) def test_sha1git_not_in_revision(swh_storage, datadir): add_revision_data(swh_storage) assert ( map_definition( metadata_string=file_data( os.path.join(datadir, "definitions_not_mapped_sha1_git.json") ), storage=swh_storage, + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) is None ) def test_sha1_not_in_content(swh_storage, datadir): add_content_data(swh_storage) assert ( map_definition( metadata_string=file_data( os.path.join(datadir, "definitions_not_mapped.json") ), storage=swh_storage, + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) is None ) def 
test_map_definition_with_wrong_metadata(swh_storage, datadir):
     with pytest.raises(WrongMetadata):
         map_definition(
             metadata_string=file_data(os.path.join(datadir, "licensee.json")),
             storage=swh_storage,
+            date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc),
         )
 
 
 def test_map_row_with_invalid_ID(swh_storage):
-    row = make_row(
-        id="maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json", text="abc"
-    )
     with pytest.raises(InvalidComponents):
-        map_row(storage=swh_storage, row=row)
+        map_row(
+            storage=swh_storage,
+            id="maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json",
+            metadata=gzip.compress(" ".encode()),
+            date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc),
+        )
 
 
 def test_map_row_with_empty_metadata_string(swh_storage):
-    row = make_row(
+    assert map_row(
+        storage=swh_storage,
         id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json",
-        text="",
-    )
-    assert map_row(storage=swh_storage, row=row) is None
+        metadata=gzip.compress("".encode()),
+        date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc),
+    ) is None
 
 
 def test_map_row_with_invalid_ID_without_revision(swh_storage):
-    row = make_row(
-        id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/abc/0.4.0.json",
-        text="abc",
-    )
     with pytest.raises(RevisionNotFound):
-        map_row(storage=swh_storage, row=row)
+        map_row(
+            storage=swh_storage,
+            id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/abc/0.4.0.json",
+            metadata=gzip.compress("".encode()),
+            date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc),
+        )
 
 
 def test_map_row_with_invalid_ID_without_json_extension(swh_storage):
-    row = make_row(
-        id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.txt",
-        text="abc",
-    )
     with pytest.raises(NoJsonExtension):
-        map_row(storage=swh_storage, row=row)
+        map_row(
+            storage=swh_storage,
+            id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.txt",
+            metadata=gzip.compress("".encode()),
+            date=datetime(year=2021, month=2, day=6,
tzinfo=timezone.utc), + ) def test_map_row_with_invalid_ID_without_6_or_9_length(swh_storage): - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/3.2.2.json", text="abc" - ) with pytest.raises(InvalidComponents): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/3.2.2.json", + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) def test_map_row_with_invalid_tool(swh_storage): - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/abc/3.2.2.json", - text="abc", - ) with pytest.raises(ToolNotSupported): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/abc/3.2.2.json", + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + ) def test_map_row_with_invalid_harvest_ID(swh_storage): - row = make_row( - id="npm/npmjs/@ngtools/webpack/revision/10.2.1/abc/scancode/3.2.2.json", - text="abc", - ) with pytest.raises(ToolNotFound): - map_row(storage=swh_storage, row=row) + map_row( + storage=swh_storage, + id="npm/npmjs/@ngtools/webpack/revision/10.2.1/abc/scancode/3.2.2.json", + metadata=gzip.compress("".encode()), + date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + )