diff --git a/swh/clearlydefined/mapping_utils.py b/swh/clearlydefined/mapping_utils.py --- a/swh/clearlydefined/mapping_utils.py +++ b/swh/clearlydefined/mapping_utils.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information import json -from typing import Any, Dict, Optional, Tuple, List, Iterator +from typing import Any, Dict, Optional, Tuple, List import gzip from enum import Enum from datetime import datetime @@ -44,27 +44,24 @@ def map_row_data_with_metadata( - data_list: List[Tuple[str, MetadataTargetType, Optional[Origin], Dict]], + data: Tuple[str, MetadataTargetType, Optional[Origin], Dict], date: datetime, -) -> Iterator[RawExtrinsicMetadata]: +) -> RawExtrinsicMetadata: """ Take and data_list as input and write data inside RawExtrensicMetadata table inside swh storage """ - - for data in data_list: - metadata = RawExtrinsicMetadata( - type=data[1], - target=parse_swhid(data[0]), - discovery_date=date, - authority=attr.evolve(authority, metadata=None), - fetcher=attr.evolve(fetcher, metadata=None), - format="json", - origin=data[2].url if isinstance(data[2], Origin) else None, - metadata=json.dumps(data[3]).encode("utf-8"), - ) - yield metadata + return RawExtrinsicMetadata( + type=data[1], + target=parse_swhid(data[0]), + discovery_date=date, + authority=attr.evolve(authority, metadata=None), + fetcher=attr.evolve(fetcher, metadata=None), + format="json", + origin=data[2].url if isinstance(data[2], Origin) else None, + metadata=json.dumps(data[3]).encode("utf-8"), + ) authority = MetadataAuthority( @@ -110,13 +107,22 @@ def map_sha1_and_add_in_data( - storage, sha1: Optional[str], data: list, file: Dict, mapping_status=True + storage, + sha1: Optional[str], + data: list, + file: Dict, + date: datetime, + mapping_status=True, ) -> bool: if sha1: assert isinstance(sha1, str) swh_id = map_sha1_with_swhid(storage=storage, sha1=sha1) if swh_id: - data.append((swh_id, MetadataTargetType.CONTENT, None, file)) + data.append( + map_row_data_with_metadata( + (swh_id, MetadataTargetType.CONTENT, None, file), date + ) + ) else: mapping_status = False return mapping_status @@ -124,7 +130,7 @@ def map_scancode( storage, metadata_string: str, date: datetime -) -> Tuple[bool, Iterator[RawExtrinsicMetadata]]: +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping @@ -136,18 +142,18 @@ content = metadata.get("content") or {} files = content.get("files") or {} mapping_status = True - data: list = [] + data: List[RawExtrinsicMetadata] = [] for file in files: sha1 = file.get("sha1") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data, file) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date) and mapping_status ) - return mapping_status, map_row_data_with_metadata(data_list=data, date=date) + return mapping_status, data def map_licensee( storage, metadata_string: str, date: datetime -) -> Tuple[bool, Iterator[RawExtrinsicMetadata]]: +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping @@ -165,14 +171,14 @@ for file in files: sha1 = file.get("content_hash") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data, file) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date) and mapping_status ) - return mapping_status, map_row_data_with_metadata(data_list=data, date=date) + return mapping_status, data def map_clearlydefined( storage, metadata_string: str, date: datetime -) -> Tuple[bool, Iterator[RawExtrinsicMetadata]]: +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take metadata_string and storage as input and try to map the sha1 of files with content, return mapping @@ -188,14 +194,14 @@ hashes = file.get("hashes") or {} sha1 = hashes.get("sha1") mapping_status = ( - map_sha1_and_add_in_data(storage, sha1, data, file) and mapping_status + map_sha1_and_add_in_data(storage, sha1, data, file, date) and mapping_status ) - return mapping_status, map_row_data_with_metadata(data_list=data, date=date) + return mapping_status, data def map_harvest( storage, tool: str, metadata_string: str, date: datetime -) -> Tuple[bool, Iterator[RawExtrinsicMetadata]]: +) -> Tuple[bool, List[RawExtrinsicMetadata]]: """ Take tool, metadata_string and storage as input and try to map the sha1 of files with content, return status of @@ -212,7 +218,7 @@ def map_definition( storage, metadata_string: str, date: datetime -) -> Optional[Tuple[bool, Iterator[RawExtrinsicMetadata]]]: +) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: """ Take metadata_string and storage as input and try to map the sha1 of defintion with content/ gitSha in revision @@ -250,9 +256,11 @@ else: raise WrongMetadata("Wrong metadata") - return True, map_row_data_with_metadata( - data_list=[(swh_id, metadata_type, origin, metadata)], date=date - ) + return True, [ + map_row_data_with_metadata( + data=(swh_id, metadata_type, origin, metadata), date=date + ) + ] def get_type_of_tool(cd_path) -> ToolType: @@ -295,9 +303,7 @@ ) -def map_row( - storage, row: tuple -) -> Optional[Tuple[bool, Iterator[RawExtrinsicMetadata]]]: +def map_row(storage, row: tuple) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: """ Take row and storage as input and try to map that row, if ID of row is invalid then raise exception, diff --git a/swh/clearlydefined/tests/test_mapping_utils.py b/swh/clearlydefined/tests/test_mapping_utils.py --- a/swh/clearlydefined/tests/test_mapping_utils.py +++ b/swh/clearlydefined/tests/test_mapping_utils.py @@ -219,9 +219,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_map_row_for_definitions_with_gitsha1(swh_storage, datadir): @@ -252,9 +250,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_map_row_for_scancode(swh_storage, datadir): @@ -284,9 +280,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_map_row_for_scancode_true_mapping_status(swh_storage, datadir): @@ -316,9 +310,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_map_row_for_licensee(swh_storage, datadir): @@ -349,9 +341,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_map_row_for_licensee_true_mapping_status(swh_storage, datadir): @@ -382,9 +372,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_map_row_for_clearlydefined(swh_storage, datadir): @@ -414,9 +402,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_map_row_for_clearlydefined_true_mapping_status(swh_storage, datadir): @@ -446,9 +432,7 @@ ), ], ) - status, data_list = map_row(storage=swh_storage, row=row) - assert status == expected[0] - assert list(data_list) == expected[1] + assert map_row(storage=swh_storage, row=row) == expected def test_sha1git_not_in_revision(swh_storage, datadir): @@ -546,60 +530,3 @@ ) with pytest.raises(ToolNotFound): map_row(storage=swh_storage, row=row) - - -def test_map_row_data_with_metadata(datadir): - data_list = [ - ( - "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa", - MetadataTargetType.CONTENT, - None, - json.loads(file_data(os.path.join(datadir, "licensee_metadata.json"))), - ), - ( - "swh:1:rev:4c66129b968ab8122964823d1d77677f50884cf6", - MetadataTargetType.REVISION, - Origin( - url="http://central.maven.org/maven2/za/co/absa/cobrix/" - "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" - ), - json.loads(file_data(os.path.join(datadir, "definitions_sha1git.json"))), - ), - ] - - metdata_list = list( - map_row_data_with_metadata( - data_list=data_list, - date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), - ) - ) - - expected = [ - RawExtrinsicMetadata( - type=MetadataTargetType.CONTENT, - target=parse_swhid("swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa"), - discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), - authority=attr.evolve(authority, metadata=None), - fetcher=attr.evolve(fetcher, metadata=None), - format="json", - origin=None, - metadata=json.dumps( - json.loads(file_data(os.path.join(datadir, "licensee_metadata.json"))) - ).encode("utf-8"), - ), - RawExtrinsicMetadata( - type=MetadataTargetType.REVISION, - target=parse_swhid("swh:1:rev:4c66129b968ab8122964823d1d77677f50884cf6"), - discovery_date=datetime(year=2021, month=2, day=6, tzinfo=timezone.utc), - authority=attr.evolve(authority, metadata=None), - fetcher=attr.evolve(fetcher, metadata=None), - format="json", - origin="http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser" - "/0.4.0/cobol-parser-0.4.0-sources.jar", - metadata=json.dumps( - json.loads(file_data(os.path.join(datadir, "definitions_sha1git.json"))) - ).encode("utf-8"), - ), - ] - - assert metdata_list == expected