Changeset View
Standalone View
swh/clearlydefined/mapping_utils.py
# Copyright (C) 2017-2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||||||||||
vlorentz: (also 2021 or 2020-2021 while we're at it) | |||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||||
from typing import Optional | import json | ||||||||||||
from typing import Any, Dict, Optional, Tuple, List, Union | |||||||||||||
import gzip | |||||||||||||
Done Inline Actions
vlorentz: | |||||||||||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||||||||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||||||||||
import psycopg2 | from swh.model.model import MetadataTargetType, Origin | ||||||||||||
from swh.clearlydefined.error import ( | |||||||||||||
InvalidComponents, | |||||||||||||
WrongMetadata, | |||||||||||||
ToolNotFound, | |||||||||||||
NoJsonExtension, | |||||||||||||
RevisionNotFound, | |||||||||||||
ToolNotSupported, | |||||||||||||
) | |||||||||||||
Done Inline Actions
vlorentz: | |||||||||||||
def map_sha1_with_swhid(sha1: str, dsn: str) -> Optional[str]: | |||||||||||||
Done Inline Actionsthe argument name should only be "storage", for consistency with the rest of the codebase. And we prefer it to be the first argument (same reason) vlorentz: the argument name should only be "storage", for consistency with the rest of the codebase. And… | |||||||||||||
def map_sha1_with_swhid(storage, sha1: str) -> Optional[str]:
    """
    Look up the content matching sha1 in storage and give back
    its swhID ("swh:1:cnt:<sha1_git>").
    Return None when sha1 is empty or when storage has no
    content for it
    """
    if not sha1:
        return None
    # Only one hash is requested, so only the first entry of the answer matters
    found = storage.content_get([hash_to_bytes(sha1)])[0]
    if not found:
        return None
    return "swh:1:cnt:{sha1_git}".format(sha1_git=hash_to_hex(found.sha1_git))
def sha1_git_in_revisions(storage, sha1_git: str) -> bool:
    """
    Tell whether storage has a revision whose id is sha1_git
    (given as a hex string)
    """
    revision_id = hash_to_bytes(sha1_git)
    # Nothing reported by revision_missing means the revision exists
    return not list(storage.revision_missing([revision_id]))
def map_sha1_and_add_in_data(
    storage, sha1: Optional[str], data: list, mapping_status=True
) -> bool:
    """
    Try to map sha1 to a content swhID and, on success, append
    (swhID, MetadataTargetType.CONTENT, None) to data.
    Return False when sha1 is given but cannot be mapped,
    otherwise return mapping_status unchanged (an empty or
    missing sha1 is skipped and does not affect the status)
    """
    if not sha1:
        return mapping_status
    assert isinstance(sha1, str)
    swh_id = map_sha1_with_swhid(storage=storage, sha1=sha1)
    if not swh_id:
        return False
    data.append((swh_id, MetadataTargetType.CONTENT, None))
    return mapping_status
Done Inline Actionsplease rename this. "flag" is synonymous with "boolean", which doesn't convey the meaning of this variable. (same comment below) vlorentz: please rename this. "flag" is synonymous with "boolean", which doesn't convey the meaning of… | |||||||||||||
def map_scancode(
    storage, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
    """
    Parse a scancode harvest (metadata_string) and try to map the
    "sha1" of every entry under content -> files with a content
    in storage.
    Return the mapping status of the harvest (True if every sha1
    could be mapped, False otherwise) and the data to be written
    in storage
    """
    harvest = json.loads(metadata_string)
    entries = (harvest.get("content") or {}).get("files") or []
    collected: list = []
    all_mapped = True
    for entry in entries:
        # Every entry is attempted, even after a first failure
        if not map_sha1_and_add_in_data(storage, entry.get("sha1"), collected):
            all_mapped = False
    return all_mapped, collected
def map_licensee(
    storage, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
    """
    Parse a licensee harvest (metadata_string) and try to map the
    "content_hash" of every entry under
    licensee -> output -> content -> matched_files with a content
    in storage.
    Return the mapping status of the harvest (True if every sha1
    could be mapped, False otherwise) and the data to be written
    in storage
    """
    harvest = json.loads(metadata_string)
    # Walk down the nested objects, treating any missing level as empty
    node = harvest.get("licensee") or {}
    for key in ("output", "content"):
        node = node.get(key) or {}
    matched_files = node.get("matched_files") or []
    collected: list = []
    all_mapped = True
    for entry in matched_files:
        # Every entry is attempted, even after a first failure
        if not map_sha1_and_add_in_data(
            storage, entry.get("content_hash"), collected
        ):
            all_mapped = False
    return all_mapped, collected
def map_clearlydefined(
    storage, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
    """
    Parse a clearlydefined harvest (metadata_string) and try to map
    the hashes -> "sha1" of every entry under "files" with a content
    in storage.
    Return the mapping status of the harvest (True if every sha1
    could be mapped, False otherwise) and the data to be written
    in storage
    """
    harvest = json.loads(metadata_string)
    collected: list = []
    all_mapped = True
    for entry in harvest.get("files") or []:
        sha1 = (entry.get("hashes") or {}).get("sha1")
        # Every entry is attempted, even after a first failure
        if not map_sha1_and_add_in_data(storage, sha1, collected):
            all_mapped = False
    return all_mapped, collected
def map_harvest(
    storage, tool: str, metadata_string: str
) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
    """
    Dispatch metadata_string to the mapper of the given tool
    ("scancode", "licensee" or "clearlydefined") and return what
    that mapper returns: the status of the harvest and the data to
    be written in storage.
    Any other tool name raises KeyError
    """
    mappers = {
        "scancode": map_scancode,
        "licensee": map_licensee,
        "clearlydefined": map_clearlydefined,
    }
    mapper = mappers[tool]
    return mapper(storage=storage, metadata_string=metadata_string)
def map_definition(
    storage, metadata_string: str
) -> Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]]:
    """
    Take metadata_string and storage as input and try to map the
    definition with a revision (through described -> hashes -> gitSha)
    or, failing that, with a content (through described -> hashes -> sha1).
    Return None if the hash is not found in storage, else return
    True and the data to be written in storage; the origin is built
    from described -> sourceLocation -> url and is None without a url.
    Raise WrongMetadata if the definition has neither gitSha nor sha1
    """
    metadata: Dict[str, Dict[str, Optional[Dict]]] = json.loads(metadata_string)
    described: Dict[str, Optional[Dict[str, Any]]] = metadata.get("described") or {}
    hashes: Dict[str, str] = described.get("hashes") or {}
    source: Dict[str, str] = described.get("sourceLocation") or {}

    origin = None
    url = source.get("url")
    if url:
        # url is expected to be either absent or a string: fail at runtime
        # instead of silently ignoring an unexpected data type
        assert isinstance(url, str)
        origin = Origin(url=url)

    sha1_git = hashes.get("gitSha")
    sha1 = hashes.get("sha1")
    if sha1_git:
        assert isinstance(sha1_git, str)
        if not sha1_git_in_revisions(sha1_git=sha1_git, storage=storage):
            return None
        target = (
            "swh:1:rev:{sha1_git}".format(sha1_git=sha1_git),
            MetadataTargetType.REVISION,
        )
    elif sha1:
        assert isinstance(sha1, str)
        swh_id_sha1 = map_sha1_with_swhid(sha1=sha1, storage=storage)
        if not swh_id_sha1:
            return None
        assert isinstance(swh_id_sha1, str)
        target = (swh_id_sha1, MetadataTargetType.CONTENT)
    else:
        # Reached when the metadata is wrongly formed (no usable hash at all)
        raise WrongMetadata("Wrong metadata")

    swh_id, metadata_type = target
    return True, [(swh_id, metadata_type, origin)]
def map_row(
    storage, row: tuple
) -> Union[
    Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]],
    Tuple[bool, List[Tuple[str, MetadataTargetType, None]]],
]:
    """
    Take row and storage as input and try to map that row.
    row[0] is the ID of the row (a "/" separated path) and row[1]
    its gzip-compressed metadata.

    If the ID of the row is invalid then raise an exception:
    RevisionNotFound if the 5th component is not "revision" (or the ID
    has fewer than 5 components), NoJsonExtension if the ID does not
    end with ".json", ToolNotFound if a harvest ID does not have "tool"
    as 7th component, ToolNotSupported if the tool of a harvest is
    unknown, InvalidComponents if the ID has neither 6 nor 9 components.

    If not able to map that row (or if it has no metadata yet), then
    return None, else return status of that row and data to be written
    in storage
    """
    cd_path = row[0]
    list_cd_path = cd_path.split("/")
    # For example: maven/mavencentral/cobol-parser/abc/0.4.0.json
    # The length is checked first so that an ID with fewer than 5 components
    # is reported as an invalid ID instead of escaping as a bare IndexError
    if len(list_cd_path) < 5 or list_cd_path[4] != "revision":
        raise RevisionNotFound(
            "Not a supported/known ID, A valid ID should have "
            '5th component as "revision".'
        )
    # For example: maven/mavencentral/cobol-parser/revision/0.4.0.txt
    if not list_cd_path[-1].endswith(".json"):
        raise NoJsonExtension(
            'Not a supported/known ID, A valid ID should end with ".json" extension.'
        )

    metadata_string = gzip.decompress(row[1]).decode()
    # if the row doesn't contain any information in metadata return None so it can be
    # mapped later on (some rows have no metadata at the time of the scan, it may
    # get fetched after a while)
    if metadata_string == "":
        return None

    # if the ID of row contains 9 components:
    # <package_manager>/<instance>/<namespace>/<name>/revision/<version>/tool/<tool_name>/<tool_version>.json
    # then it is a harvest
    if len(list_cd_path) == 9:
        # npm/npmjs/@ngtools/webpack/revision/10.2.1/abc/scancode/3.2.2.json
        if list_cd_path[6] != "tool":
            # Adjacent literals instead of a backslash continuation, which
            # would splice the next source line verbatim into the message
            raise ToolNotFound(
                "Not a supported/known harvest ID, A valid harvest ID should have "
                '7th component as "tool".'
            )
        tool = list_cd_path[7]
        # if the row contains an unknown tool, raise instead of ignoring it
        # silently: new tools may show up and must not go unnoticed
        if tool not in ("scancode", "licensee", "clearlydefined"):
            raise ToolNotSupported(f"Tool for this ID {cd_path} is not supported")
        return map_harvest(
            tool=tool,
            metadata_string=metadata_string,
            storage=storage,
        )
    elif len(list_cd_path) == 6:
        # if the ID of row contains 6 components:
        # <package_manager>/<instance>/<namespace>/<name>/revision/<version>.json
        # then it is a definition
        return map_definition(
            metadata_string=metadata_string,
            storage=storage,
        )
    # For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json
    raise InvalidComponents(
        "Not a supported/known ID, A valid ID should have 6 or 9 components."
    )
(also 2021 or 2020-2021 while we're at it)