diff --git a/swh/clearlydefined/error.py b/swh/clearlydefined/error.py --- a/swh/clearlydefined/error.py +++ b/swh/clearlydefined/error.py @@ -22,14 +22,6 @@ pass -class WrongMetadata(Exception): - """ - Raise this when tried to process invalid metadata - """ - - pass - - class NoJsonExtension(Exception): """ Raise this when ID does not have .json extension at end diff --git a/swh/clearlydefined/mapping_utils.py b/swh/clearlydefined/mapping_utils.py --- a/swh/clearlydefined/mapping_utils.py +++ b/swh/clearlydefined/mapping_utils.py @@ -7,6 +7,7 @@ from enum import Enum import gzip import json +import re from typing import Any, Dict, List, Optional, Tuple from swh.clearlydefined.error import ( @@ -15,7 +16,6 @@ RevisionNotFound, ToolNotFound, ToolNotSupported, - WrongMetadata, ) from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.identifiers import parse_swhid @@ -30,8 +30,7 @@ class ToolType(Enum): - """The type of content pointed to by a snapshot branch. Usually a - revision or an alias.""" + """The type of a row""" DEFINITION = "definition" SCANCODE = "scancode" @@ -40,6 +39,14 @@ FOSSOLOGY = "fossology" +class MappingStatus(Enum): + """The type of mapping status of a row""" + + MAPPED = "mapped" + UNMAPPED = "unmapped" + IGNORE = "ignore" + + AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="https://clearlydefined.io/", @@ -54,6 +61,10 @@ ) +def is_sha1(s): + return bool(re.match("^[a-fA-F0-9]+$", s)) + + def map_row_data_with_metadata( swh_id: str, type: MetadataTargetType, @@ -191,7 +202,7 @@ def map_harvest( storage, tool: str, metadata_string: str, date: datetime -) -> Tuple[bool, List[RawExtrinsicMetadata]]: +) -> Tuple[MappingStatus, List[RawExtrinsicMetadata]]: """ Take tool, metadata_string and storage as input and try to map the sha1 of files with content, return status of @@ -217,12 +228,15 @@ map_sha1_and_add_in_data(storage, sha1, data, file, date, format_) and mapping_status ) - return mapping_status, data + status = MappingStatus.UNMAPPED + if mapping_status: + status = MappingStatus.MAPPED + return status, data def map_definition( storage, metadata_string: str, date: datetime -) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: +) -> Tuple[MappingStatus, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of defintion with content/ gitSha in revision @@ -236,31 +250,26 @@ source: Dict[str, str] = described.get("sourceLocation") or {} url = source.get("url") origin = None - sha1 = hashes.get("sha1") if url: assert isinstance(url, str) origin = Origin(url=url) + if not sha1_git: + sha1_git = source.get("revision") + if sha1_git: assert isinstance(sha1_git, str) + if len(sha1_git) != 40 and not is_sha1(sha1_git): + return MappingStatus.IGNORE, [] if not sha1_git_in_revisions(sha1_git=sha1_git, storage=storage): - return None + return MappingStatus.UNMAPPED, [] swh_id = "swh:1:rev:{sha1_git}".format(sha1_git=sha1_git) metadata_type = MetadataTargetType.REVISION - elif sha1: - assert isinstance(sha1, str) - swh_id_sha1 = map_sha1_with_swhid(sha1=sha1, storage=storage) - if not swh_id_sha1: - return None - assert isinstance(swh_id_sha1, str) - swh_id = swh_id_sha1 - metadata_type = MetadataTargetType.CONTENT - else: - raise WrongMetadata("Wrong metadata") + return MappingStatus.IGNORE, [] - return True, [ + return MappingStatus.MAPPED, [ map_row_data_with_metadata( swh_id=swh_id, type=metadata_type, @@ -314,7 +323,7 @@ def map_row( storage, metadata: bytes, id: str, date: datetime -) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: +) -> Tuple[MappingStatus, List[RawExtrinsicMetadata]]: """ Take row and storage as input and try to map that row, if ID of row is invalid then raise exception, @@ -328,7 +337,7 @@ # mapped later on metadata_string = gzip.decompress(metadata).decode() if metadata_string == "": - return None + return MappingStatus.UNMAPPED, [] if tool == "definition": return map_definition( diff --git a/swh/clearlydefined/orchestrator.py b/swh/clearlydefined/orchestrator.py --- a/swh/clearlydefined/orchestrator.py +++ b/swh/clearlydefined/orchestrator.py @@ -4,15 +4,16 @@ # See top-level LICENSE file for more information from datetime import datetime -from typing import Optional +from typing import List, Optional import attr -import psycopg2 import dateutil +import psycopg2 from swh.clearlydefined.mapping_utils import ( AUTHORITY, FETCHER, + MappingStatus, get_type_of_tool, map_row, ) @@ -81,32 +82,48 @@ return dateutil.parser.isoparse(date) -def orchestrate_row(storage: StorageInterface, cursor, connection, row: Row) -> bool: +def write_data_from_list( + storage: StorageInterface, metadata_list: List[RawExtrinsicMetadata] +): + """ + Take list of RawExtrinsicMetadata and + write in storage + """ + for data in metadata_list: + write_in_storage(storage=storage, metadata=data) + + +def orchestrate_row( + storage: StorageInterface, cursor, connection, row: Row +) -> Optional[bool]: """ Take storage, cursor, connection, row as input and if able to completely map that row then write data in storage, else store the ID in unmapped_data - table and return mapping_status of that row + table and return true if that row is fully mapped + false for partial or no mapping """ able_to_be_mapped = map_row( metadata=row.metadata, id=row.path, date=row.date, storage=storage ) - if not able_to_be_mapped: + + mapping_status, metadata_list = able_to_be_mapped + + if mapping_status == MappingStatus.IGNORE: + return None + + elif mapping_status == MappingStatus.UNMAPPED: # This is a case when no metadata of row is not able to be mapped write_in_not_mapped( cd_path=row.path, cursor=cursor, write_connection=connection ) + write_data_from_list(storage=storage, metadata_list=metadata_list) return False + else: # This is a case when partial metadata of that row is able to be mapped - mapping_status, metadata_list = able_to_be_mapped - if not mapping_status: - write_in_not_mapped( - cd_path=row.path, cursor=cursor, write_connection=connection - ) - for data in metadata_list: - write_in_storage(storage=storage, metadata=data) - return mapping_status + write_data_from_list(storage=storage, metadata_list=metadata_list) + return True def map_previously_unmapped_data(storage: StorageInterface, cursor, connection) -> None: diff --git a/swh/clearlydefined/tests/data/README.md b/swh/clearlydefined/tests/data/README.md --- a/swh/clearlydefined/tests/data/README.md +++ b/swh/clearlydefined/tests/data/README.md @@ -8,7 +8,7 @@ clearlydefined_not_mapped - mock metadata that will not be mapped in first orchestration -def_not_mapped - mock metadata that will not be mapped in first orchestration +def_not_mapped - mock metadata that will be ignored in orchestration definitions_not_mapped_sha1_git - mock metadata (getting False as mapping status, row type definition sha1git) diff --git a/swh/clearlydefined/tests/data/def_with_no_sha1_and_sha1git.json b/swh/clearlydefined/tests/data/def_with_no_sha1_and_sha1git.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/def_with_no_sha1_and_sha1git.json @@ -0,0 +1 @@ +{"described": {"releaseDate": "2015-12-20", "tools": ["scancode/3.2.2"], "toolScore": {"total": 100, "date": 30, "source": 70}, "sourceLocation": {"type": "git", "provider": "github", "namespace": "zzzsochi", "name": "aiotraversal", "revision": "f30be064983596f133e6b17ac7cf378bb582850e", "url": "https://github.com/zzzsochi/aiotraversal/tree/f30be064983596f133e6b17ac7cf378bb582850e"}, "score": {"total": 100, "date": 30, "source": 70}}, "coordinates": {"type": "git", "provider": "github", "namespace": "zzzsochi", "name": "aiotraversal", "revision": "f30be064983596f133e6b17ac7cf378bb582850e"}, "licensed": {"toolScore": {"total": 0, "declared": 0, "discovered": 0, "consistency": 0, "spdx": 0, "texts": 0}, "facets": {"core": {"attribution": {"unknown": 13}, "discovered": {"unknown": 13}, "files": 13}}, "score": {"total": 0, "declared": 0, "discovered": 0, "consistency": 0, "spdx": 0, "texts": 0}}, "_meta": {"schemaVersion": "1.6.1", "updated": "2019-05-11T07:15:08.694Z"}, "scores": {"effective": 50, "tool": 50}} \ No newline at end of file diff --git a/swh/clearlydefined/tests/test_mapping_utils.py b/swh/clearlydefined/tests/test_mapping_utils.py --- a/swh/clearlydefined/tests/test_mapping_utils.py +++ b/swh/clearlydefined/tests/test_mapping_utils.py @@ -16,11 +16,11 @@ RevisionNotFound, ToolNotFound, ToolNotSupported, - WrongMetadata, ) from swh.clearlydefined.mapping_utils import ( AUTHORITY, FETCHER, + MappingStatus, map_definition, map_row, map_sha1_with_swhid, @@ -174,34 +174,17 @@ assert map_sha1_with_swhid(sha1=sha1, storage=swh_storage) is None -def test_map_row_for_definitions_with_sha1(swh_storage, datadir): +def test_map_row_for_definitions_with_no_sha1_sha1git(swh_storage, datadir): add_content_data(swh_storage) - expected = ( - True, - [ - RawExtrinsicMetadata( - type=MetadataTargetType.CONTENT, - target=parse_swhid( - "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd" - ), - discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), - authority=AUTHORITY, - fetcher=FETCHER, - format="clearlydefined-definition-json", - origin="http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/" - "0.4.0/cobol-parser-0.4.0-sources.jar", - metadata=json.dumps( - json.loads(file_data(os.path.join(datadir, "definitions.json"))) - ).encode("utf-8"), - ), - ], - ) + expected = MappingStatus.UNMAPPED, [] assert ( map_row( storage=swh_storage, id="maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", metadata=gzip.compress( - file_data(os.path.join(datadir, "definitions.json")).encode() + file_data( + os.path.join(datadir, "def_with_no_sha1_and_sha1git.json") + ).encode() ), date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) @@ -212,7 +195,7 @@ def test_map_row_for_definitions_with_gitsha1(swh_storage, datadir): add_revision_data(swh_storage) expected = ( - True, + MappingStatus.MAPPED, [ RawExtrinsicMetadata( type=MetadataTargetType.REVISION, @@ -249,7 +232,7 @@ def test_map_row_for_scancode(swh_storage, datadir): add_content_data(swh_storage) expected = ( - False, + MappingStatus.UNMAPPED, [ RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, @@ -285,7 +268,7 @@ def test_map_row_for_scancode_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) expected = ( - True, + MappingStatus.MAPPED, [ RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, @@ -321,7 +304,7 @@ def test_map_row_for_licensee(swh_storage, datadir): add_content_data(swh_storage) expected = ( - False, + MappingStatus.UNMAPPED, [ RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, @@ -358,7 +341,7 @@ def test_map_row_for_licensee_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) expected = ( - True, + MappingStatus.MAPPED, [ RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, @@ -395,7 +378,7 @@ def test_map_row_for_clearlydefined(swh_storage, datadir): add_content_data(swh_storage) expected = ( - False, + MappingStatus.UNMAPPED, [ RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, @@ -450,7 +433,7 @@ def test_map_row_for_clearlydefined_true_mapping_status(swh_storage, datadir): add_content_data(swh_storage) expected = ( - True, + MappingStatus.MAPPED, [ RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, @@ -504,6 +487,7 @@ def test_sha1git_not_in_revision(swh_storage, datadir): add_revision_data(swh_storage) + expected = MappingStatus.UNMAPPED, [] assert ( map_definition( metadata_string=file_data( @@ -512,12 +496,13 @@ storage=swh_storage, date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) - is None + == expected ) def test_sha1_not_in_content(swh_storage, datadir): add_content_data(swh_storage) + expected = MappingStatus.IGNORE, [] assert ( map_definition( metadata_string=file_data( @@ -526,17 +511,21 @@ storage=swh_storage, date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) - is None + == expected ) -def test_map_definition_with_wrong_metadata(swh_storage, datadir): - with pytest.raises(WrongMetadata): +def test_map_definition_with_data_to_be_ignored(swh_storage, datadir): + add_content_data(swh_storage) + expected = MappingStatus.IGNORE, [] + assert ( map_definition( metadata_string=file_data(os.path.join(datadir, "licensee.json")), storage=swh_storage, date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), ) + == expected + ) def test_map_row_with_invalid_ID(swh_storage): diff --git a/swh/clearlydefined/tests/test_orchestrator.py b/swh/clearlydefined/tests/test_orchestrator.py --- a/swh/clearlydefined/tests/test_orchestrator.py +++ b/swh/clearlydefined/tests/test_orchestrator.py @@ -3,8 +3,7 @@ # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import datetime -from datetime import timezone +from datetime import datetime, timezone import gzip import os from typing import List, Optional, Tuple @@ -83,36 +82,36 @@ "npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/" "9.13.0.json", gzip_compress_data("licensee_true.json", datadir=datadir), - datetime(year=2021, month=2, day=3,tzinfo=timezone.utc), - datetime(year=2021, month=2, day=3,tzinfo=timezone.utc), + datetime(year=2021, month=2, day=3, tzinfo=timezone.utc), + datetime(year=2021, month=2, day=3, tzinfo=timezone.utc), "", ), ( "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json", gzip_compress_data("clearlydefined_true.json", datadir=datadir), - datetime(year=2021, month=2, day=4,tzinfo=timezone.utc), - datetime(year=2021, month=2, day=4,tzinfo=timezone.utc), + datetime(year=2021, month=2, day=4, tzinfo=timezone.utc), + datetime(year=2021, month=2, day=4, tzinfo=timezone.utc), "", ), ( "maven/mavencentral/za.co.absa.cobrix/cobol/revision/0.4.0.json", gzip_compress_data("def_not_mapped.json", datadir=datadir), - datetime(year=2021, month=2, day=5,tzinfo=timezone.utc), - datetime(year=2021, month=2, day=5,tzinfo=timezone.utc), + datetime(year=2021, month=2, day=5, tzinfo=timezone.utc), + datetime(year=2021, month=2, day=5, tzinfo=timezone.utc), "", ), ( "npm/npmjs/@pixi/mesh-extras/revision/5.3.6/tool/clearlydefined/1.3.4.json", gzip_compress_data("clearydefined_not_mapped.json", datadir=datadir), - datetime(year=2021, month=2, day=6,tzinfo=timezone.utc), - datetime(year=2021, month=2, day=6,tzinfo=timezone.utc), + datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), + datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), "", ), ( "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/fossology/1.3.4.json", gzip_compress_data(None, datadir=datadir), - datetime(year=2021, month=2, day=1,tzinfo=timezone.utc), - datetime(year=2021, month=2, day=1,tzinfo=timezone.utc), + datetime(year=2021, month=2, day=1, tzinfo=timezone.utc), + datetime(year=2021, month=2, day=1, tzinfo=timezone.utc), "", ), ] @@ -124,8 +123,8 @@ ( "maven/mavencentral/cobrix/cobol-parser/revision/0.4.0.json", gzip_compress_data(None, datadir=datadir), - datetime(year=2021, month=2, day=1,tzinfo=timezone.utc), - datetime(year=2021, month=2, day=8,tzinfo=timezone.utc), + datetime(year=2021, month=2, day=1, tzinfo=timezone.utc), + datetime(year=2021, month=2, day=8, tzinfo=timezone.utc), "", ), ] @@ -148,7 +147,7 @@ ) orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn) # Check how much data is unmapped after first orchestration - assert 2 == get_length_of_unmapped_data(connection=connection, cursor=cursor) + assert 1 == get_length_of_unmapped_data(connection=connection, cursor=cursor) assert datetime(2021, 2, 6, 0, 0, tzinfo=timezone.utc) == get_last_run_date( cursor=cursor )