Changeset View
Standalone View
swh/clearlydefined/mapping_utils.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | |||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | |||||||||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | |||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | |||||||||||
import json | import json | |||||||||||
from typing import Any, Dict, Optional, Tuple, List, Union | from typing import Any, Dict, Optional, Tuple, List, Union, Iterator | |||||||||||
import gzip | import gzip | |||||||||||
from enum import Enum | ||||||||||||
from datetime import datetime | ||||||||||||
from swh.model.model import ( | ||||||||||||
MetadataAuthority, | ||||||||||||
MetadataAuthorityType, | ||||||||||||
MetadataFetcher, | ||||||||||||
RawExtrinsicMetadata, | ||||||||||||
MetadataTargetType, | ||||||||||||
Origin, | ||||||||||||
) | ||||||||||||
import attr | ||||||||||||
vlorentz: Please import in the [[ https://www.python.org/dev/peps/pep-0008/#imports | order recommended… | ||||||||||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | |||||||||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | |||||||||||
from swh.model.model import MetadataTargetType, Origin | from swh.model.identifiers import parse_swhid | |||||||||||
from swh.clearlydefined.error import ( | from swh.clearlydefined.error import ( | |||||||||||
InvalidComponents, | InvalidComponents, | |||||||||||
WrongMetadata, | WrongMetadata, | |||||||||||
ToolNotFound, | ToolNotFound, | |||||||||||
NoJsonExtension, | NoJsonExtension, | |||||||||||
RevisionNotFound, | RevisionNotFound, | |||||||||||
ToolNotSupported, | ToolNotSupported, | |||||||||||
) | ) | |||||||||||
class ToolType(Enum):
    """The type of a ClearlyDefined row: either a package definition, or a
    harvest produced by one of the supported scanning tools."""

    DEFINITION = "definition"
    SCANCODE = "scancode"
    CLEARLYDEFINED = "clearlydefined"
    LICENSEE = "licensee"
    FOSSOLOGY = "fossology"
def map_row_data_with_metadata(
    data_list: Union[
        List[Tuple[str, MetadataTargetType, Optional[Origin], Dict]],
        List[Tuple[str, MetadataTargetType, None, Dict]],
    ],
    date: datetime,
) -> Iterator[RawExtrinsicMetadata]:
    """
    Build one RawExtrinsicMetadata object per entry of data_list,
    ready to be written into swh storage.

    Each entry is a 4-tuple (swhid_string, target_type, origin_or_None,
    metadata_dict). ``date`` is used as the discovery date of every row.
    """
    for data in data_list:
        swhid, target_type, origin, metadata_dict = data
        yield RawExtrinsicMetadata(
            type=target_type,
            target=parse_swhid(swhid),
            discovery_date=date,
            # authority/fetcher are module-level constants; strip their
            # metadata field as required by RawExtrinsicMetadata
            authority=attr.evolve(authority, metadata=None),
            fetcher=attr.evolve(fetcher, metadata=None),
            format="json",
            # origin is Optional[Origin]: check for None rather than
            # isinstance (the value is an Origin whenever it is not None)
            origin=origin.url if origin is not None else None,
            metadata=json.dumps(metadata_dict).encode("utf-8"),
        )
# Identity of the metadata provider (ClearlyDefined) and of this loader.
# NOTE(review): clearlydefined is not a deposit client — the authority type
# below was flagged in review as wrong; confirm the correct MetadataAuthorityType
# against the swh extrinsic-metadata specification before relying on it.
authority = MetadataAuthority(
    type=MetadataAuthorityType.DEPOSIT_CLIENT,
    url="https://clearlydefined.io/",
    metadata={},
)
# Fetcher identifying this package (swh-clearlydefined) as the producer of the rows.
fetcher = MetadataFetcher(
    name="swh-clearlydefined",
    version="0.0.1",
    metadata={},
)
def map_sha1_with_swhid(storage, sha1: str) -> Optional[str]: | def map_sha1_with_swhid(storage, sha1: str) -> Optional[str]: | |||||||||||
""" | """ | |||||||||||
Take sha1 and storage as input and give the corresponding | Take sha1 and storage as input and give the corresponding | |||||||||||
swhID for that sha1 | swhID for that sha1 | |||||||||||
""" | """ | |||||||||||
if not sha1: | if not sha1: | |||||||||||
return None | return None | |||||||||||
content = storage.content_get([hash_to_bytes(sha1)])[0] | content = storage.content_get([hash_to_bytes(sha1)])[0] | |||||||||||
Show All 12 Lines | def sha1_git_in_revisions(storage, sha1_git: str) -> bool: | |||||||||||
""" | """ | |||||||||||
sha1_git_bytes = hash_to_bytes(sha1_git) | sha1_git_bytes = hash_to_bytes(sha1_git) | |||||||||||
missing_revision = storage.revision_missing([sha1_git_bytes]) | missing_revision = storage.revision_missing([sha1_git_bytes]) | |||||||||||
if len(list(missing_revision)) == 0: | if len(list(missing_revision)) == 0: | |||||||||||
return True | return True | |||||||||||
return False | return False | |||||||||||
def map_sha1_and_add_in_data(
    storage, sha1: Optional[str], data: list, file: Dict, mapping_status=True
) -> bool:
    """
    Try to map ``sha1`` to a SWHID via ``storage``; on success append
    (swhid, MetadataTargetType.CONTENT, None, file) to ``data`` (mutated
    in place).

    Returns the mapping status: False when ``sha1`` was present but could
    not be mapped to an archived content, otherwise the ``mapping_status``
    value passed in (True by default). A falsy ``sha1`` is skipped and does
    not change the status.
    """
    if sha1:
        assert isinstance(sha1, str)
        swh_id = map_sha1_with_swhid(storage=storage, sha1=sha1)
        if swh_id:
            data.append((swh_id, MetadataTargetType.CONTENT, None, file))
        else:
            # sha1 present in the harvest but unknown to the archive
            mapping_status = False
    return mapping_status
def map_scancode(
    storage, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
    """
    Map every file of a scancode harvest onto archived contents.

    Parses ``metadata_string`` (JSON) and looks up each file's sha1 in
    ``storage``. Returns (mapping_status, data): mapping_status is True
    only when every sha1 could be mapped, and data holds the rows to be
    written in storage.
    """
    harvest = json.loads(metadata_string)
    scanned_files = (harvest.get("content") or {}).get("files") or {}
    rows: List[Tuple[str, MetadataTargetType, None, Dict]] = []
    status = True
    for file in scanned_files:
        mapped = map_sha1_and_add_in_data(storage, file.get("sha1"), rows, file)
        status = mapped and status
    return status, rows
def map_licensee(
    storage, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
    """
    Map every matched file of a licensee harvest onto archived contents.

    Parses ``metadata_string`` (JSON) and looks up each file's
    content_hash (sha1) in ``storage``. Returns (mapping_status, data):
    mapping_status is True only when every sha1 could be mapped, and data
    holds the rows to be written in storage.
    """
    harvest = json.loads(metadata_string)
    output = (harvest.get("licensee") or {}).get("output") or {}
    matched_files = (output.get("content") or {}).get("matched_files") or []
    rows: List[Tuple[str, MetadataTargetType, None, Dict]] = []
    status = True
    for file in matched_files:
        mapped = map_sha1_and_add_in_data(
            storage, file.get("content_hash"), rows, file
        )
        status = mapped and status
    return status, rows
def map_clearlydefined(
    storage, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
    """
    Map every file of a clearlydefined harvest onto archived contents.

    Parses ``metadata_string`` (JSON) and looks up each file's sha1 hash
    in ``storage``. Returns (mapping_status, data): mapping_status is True
    only when every sha1 could be mapped, and data holds the rows to be
    written in storage.
    """
    harvest = json.loads(metadata_string)
    rows: List[Tuple[str, MetadataTargetType, None, Dict]] = []
    status = True
    for file in harvest.get("files") or []:
        sha1 = (file.get("hashes") or {}).get("sha1")
        mapped = map_sha1_and_add_in_data(storage, sha1, rows, file)
        status = mapped and status
    return status, rows
def map_harvest(
    storage, tool: str, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
    """
    Dispatch a harvest to the mapper matching ``tool`` and run it.

    Returns the (mapping_status, data) pair produced by the tool-specific
    mapper; ``data`` is the list of rows to write in storage.
    """
    mappers = {
        "scancode": map_scancode,
        "licensee": map_licensee,
        "clearlydefined": map_clearlydefined,
    }
    mapper = mappers[tool]
    return mapper(storage=storage, metadata_string=metadata_string)
def map_definition( | def map_definition( | |||||||||||
storage, metadata_string: str | storage, metadata_string: str | |||||||||||
) -> Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]]: | ) -> Optional[ | |||||||||||
Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin], Dict]]] | ||||||||||||
]: | ||||||||||||
""" | """ | |||||||||||
Take metadata_string and storage as input and try to | Take metadata_string and storage as input and try to | |||||||||||
map the sha1 of defintion with content/ gitSha in revision | map the sha1 of defintion with content/ gitSha in revision | |||||||||||
return None if not able to map | return None if not able to map | |||||||||||
else return data to be written in storage | else return data to be written in storage | |||||||||||
""" | """ | |||||||||||
metadata: Dict[str, Dict[str, Optional[Dict]]] = json.loads(metadata_string) | metadata: Dict[str, Dict[str, Optional[Dict]]] = json.loads(metadata_string) | |||||||||||
described: Dict[str, Optional[Dict[str, Any]]] = metadata.get("described") or {} | described: Dict[str, Optional[Dict[str, Any]]] = metadata.get("described") or {} | |||||||||||
Show All 21 Lines | elif sha1: | |||||||||||
return None | return None | |||||||||||
assert isinstance(swh_id_sha1, str) | assert isinstance(swh_id_sha1, str) | |||||||||||
swh_id = swh_id_sha1 | swh_id = swh_id_sha1 | |||||||||||
metadata_type = MetadataTargetType.CONTENT | metadata_type = MetadataTargetType.CONTENT | |||||||||||
else: | else: | |||||||||||
raise WrongMetadata("Wrong metadata") | raise WrongMetadata("Wrong metadata") | |||||||||||
return True, [(swh_id, metadata_type, origin)] | return True, [(swh_id, metadata_type, origin, metadata)] | |||||||||||
def get_type_of_tool(cd_path: str) -> ToolType:
    """
    Return the ToolType encoded in a ClearlyDefined row ID.

    A valid ``cd_path`` is either a definition ID with 6 components
    (<package_manager>/<instance>/<namespace>/<name>/revision/<version>.json)
    or a harvest ID with 9 components
    (<package_manager>/<instance>/<namespace>/<name>/revision/<version>/tool/<tool_name>/<tool_version>.json).

    Raises RevisionNotFound, NoJsonExtension, ToolNotFound,
    ToolNotSupported or InvalidComponents when ``cd_path`` is not a
    supported/known ID.
    """
    list_cd_path = cd_path.split("/")
    # For example: maven/mavencentral/cobol-parser/abc/0.4.0.json
    if list_cd_path[4] != "revision":
        raise RevisionNotFound(
            "Not a supported/known ID, A valid ID should have "
            '5th component as "revision".'
        )
    # For example: maven/mavencentral/cobol-parser/revision/0.4.0.txt
    if not list_cd_path[-1].endswith(".json"):
        raise NoJsonExtension(
            'Not a supported/known ID, A valid ID should end with ".json" extension.'
        )
    # if the ID of row contains 9 components:
    # <package_manager>/<instance>/<namespace>/<name>/revision/<version>/tool/<tool_name>/<tool_version>.json
    # then it is a harvest
    if len(list_cd_path) == 9:
        # npm/npmjs/@ngtools/webpack/revision/10.2.1/abc/scancode/3.2.2.json
        if list_cd_path[6] != "tool":
            raise ToolNotFound(
                "Not a supported/known harvest ID, A valid harvest ID "
                'should have 7th component as "tool".'
            )
        tool = list_cd_path[7]
        # if the row contains an unknown tool
        if tool not in ("scancode", "licensee", "clearlydefined", "fossology"):
            raise ToolNotSupported(f"Tool for this ID {cd_path} is not supported")
        return ToolType(tool)
    elif len(list_cd_path) == 6:
        # <package_manager>/<instance>/<namespace>/<name>/revision/<version>.json
        # is a definition
        return ToolType.DEFINITION
    # For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json
    raise InvalidComponents(
        "Not a supported/known ID, A valid ID should have 6 or 9 components."
    )
def map_row(
    storage, row: tuple
) -> Optional[
    Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin], Dict]]]
]:
    """
    Map one ClearlyDefined row (cd_path, gzipped_metadata_bytes).

    Raises an exception (via get_type_of_tool) if the row ID is invalid;
    returns None when the row carries no metadata (so it can be mapped
    later on); otherwise returns (mapping_status, data) where data is the
    list of rows to be written in storage.
    """
    tool = get_type_of_tool(row[0]).value
    # if the row doesn't contain any information in metadata return None so
    # it can be mapped later on
    metadata_string = gzip.decompress(row[1]).decode()
    if metadata_string == "":
        return None
    if tool == "definition":
        return map_definition(
            metadata_string=metadata_string,
            storage=storage,
        )
    else:
        return map_harvest(
            tool=tool,
            metadata_string=metadata_string,
            storage=storage,
        )
Please import in the order recommended by PEP 8
isort should do it automatically for you before you commit; did you skip it?