Changeset View
Standalone View
swh/clearlydefined/mapping_utils.py
# Copyright (C) 2017-2021 The Software Heritage developers | # Copyright (C) 2017-2021 The Software Heritage developers | ||||||||||||
vlorentz: (also 2021 or 2020-2021 while we're at it) | |||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||||
import json | |||||||||||||
from typing import Any | |||||||||||||
from typing import Dict | |||||||||||||
from typing import Optional | from typing import Optional | ||||||||||||
Done Inline Actions
vlorentz: | |||||||||||||
from typing import Tuple | |||||||||||||
import gzip | |||||||||||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||||||||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||||||||||
from swh.model.model import MetadataTargetType | |||||||||||||
from swh.model.model import Origin | |||||||||||||
import psycopg2 | import psycopg2 | ||||||||||||
Done Inline Actions
vlorentz: | |||||||||||||
def map_sha1_with_swhid(sha1: str, dsn: str) -> Optional[str]: | def map_sha1_with_swhid(sha1: str, dsn: str) -> Optional[str]: | ||||||||||||
Done Inline Actionsthe argument name should only be "storage", for consistency with the rest of the codebase. And we prefer it to be the first argument (same reason) vlorentz: the argument name should only be "storage", for consistency with the rest of the codebase. And… | |||||||||||||
""" | """ | ||||||||||||
Take sha1 and dsn as input and give the corresponding | Take sha1 and dsn as input and give the corresponding | ||||||||||||
swhID for that sha1 | swhID for that sha1 | ||||||||||||
""" | """ | ||||||||||||
if not sha1: | if not sha1: | ||||||||||||
return None | return None | ||||||||||||
read_connection = psycopg2.connect(dsn=dsn) | read_connection = psycopg2.connect(dsn=dsn) | ||||||||||||
cur = read_connection.cursor() | cur = read_connection.cursor() | ||||||||||||
sha1 = hash_to_bytes(sha1) | sha1 = hash_to_bytes(sha1) | ||||||||||||
cur.execute("SELECT sha1_git FROM content where sha1= %s;", (sha1,)) | cur.execute("SELECT sha1_git FROM content where sha1= %s;", (sha1,)) | ||||||||||||
sha1_git_tuple_data = cur.fetchall() | sha1_git_tuple_data = cur.fetchall() | ||||||||||||
if len(sha1_git_tuple_data) == 0: | if len(sha1_git_tuple_data) == 0: | ||||||||||||
return None | return None | ||||||||||||
sha1_git = hash_to_hex(sha1_git_tuple_data[0][0]) | sha1_git = hash_to_hex(sha1_git_tuple_data[0][0]) | ||||||||||||
swh_id = "swh:1:cnt:{sha1_git}".format(sha1_git=sha1_git) | swh_id = "swh:1:cnt:{sha1_git}".format(sha1_git=sha1_git) | ||||||||||||
return swh_id | return swh_id | ||||||||||||
def sha1_git_in_revisions(sha1_git: str, dsn: str) -> bool: | |||||||||||||
""" | |||||||||||||
Take sha1_git and dsn as input and | |||||||||||||
tell whether that sha1_git exists in revision | |||||||||||||
table | |||||||||||||
""" | |||||||||||||
read_connection = psycopg2.connect(dsn=dsn) | |||||||||||||
cur = read_connection.cursor() | |||||||||||||
Done Inline Actionsuse storage.revision_missing vlorentz: use `storage.revision_missing` | |||||||||||||
sha1_git = hash_to_bytes(sha1_git) | |||||||||||||
cur.execute("SELECT id FROM revision WHERE id= %s;", (sha1_git,)) | |||||||||||||
rows = cur.fetchall() | |||||||||||||
if len(rows) == 1: | |||||||||||||
return True | |||||||||||||
else: | |||||||||||||
return False | |||||||||||||
def map_scancode(metadata_string: str, dsn: str) -> Tuple[bool, list]: | |||||||||||||
metadata = json.loads(metadata_string) | |||||||||||||
content = metadata.get("content") or {} | |||||||||||||
files = content.get("files") or {} | |||||||||||||
flag = True | |||||||||||||
data = [] | |||||||||||||
for file in files: | |||||||||||||
Done Inline Actionsplease rename this. "flag" is synonymous with "boolean", which doesn't convey the meaning of this variable. (same comment below) vlorentz: please rename this. "flag" is synonymous with "boolean", which doesn't convey the meaning of… | |||||||||||||
sha1 = file.get("sha1") | |||||||||||||
if sha1: | |||||||||||||
swh_id = map_sha1_with_swhid(sha1, dsn) | |||||||||||||
if swh_id: | |||||||||||||
data.append((swh_id, MetadataTargetType.CONTENT, None)) | |||||||||||||
else: | |||||||||||||
flag = False | |||||||||||||
return flag, data | |||||||||||||
def map_licensee(metadata_string: str, dsn: str) -> Tuple[bool, list]: | |||||||||||||
metadata = json.loads(metadata_string) | |||||||||||||
licensee = metadata.get("licensee") or {} | |||||||||||||
output = licensee.get("output") or {} | |||||||||||||
content = output.get("content") or {} | |||||||||||||
files = content.get("matched_files") or [] | |||||||||||||
flag = True | |||||||||||||
data = [] | |||||||||||||
for file in files: | |||||||||||||
sha1 = file.get("content_hash") | |||||||||||||
if sha1: | |||||||||||||
swh_id = map_sha1_with_swhid(sha1, dsn) | |||||||||||||
if swh_id: | |||||||||||||
data.append((swh_id, MetadataTargetType.CONTENT, None)) | |||||||||||||
else: | |||||||||||||
flag = False | |||||||||||||
return flag, data | |||||||||||||
def map_clearlydefined(metadata_string: str, dsn: str) -> Tuple[bool, list]: | |||||||||||||
metadata = json.loads(metadata_string) | |||||||||||||
files = metadata.get("files") or [] | |||||||||||||
flag = True | |||||||||||||
data = [] | |||||||||||||
for file in files: | |||||||||||||
hashes = file.get("hashes") or {} | |||||||||||||
sha1 = hashes.get("sha1") | |||||||||||||
if sha1: | |||||||||||||
swh_id = map_sha1_with_swhid(sha1, dsn) | |||||||||||||
if swh_id: | |||||||||||||
data.append((swh_id, MetadataTargetType.CONTENT, None)) | |||||||||||||
else: | |||||||||||||
flag = False | |||||||||||||
return flag, data | |||||||||||||
def map_harvest(tool: str, metadata_string: str, dsn: str) -> Tuple[bool, list]: | |||||||||||||
tools = { | |||||||||||||
"scancode": map_scancode, | |||||||||||||
"licensee": map_licensee, | |||||||||||||
"clearlydefined": map_clearlydefined, | |||||||||||||
} | |||||||||||||
if tool in tools: | |||||||||||||
return tools[tool](metadata_string, dsn) | |||||||||||||
return False, [] | |||||||||||||
def map_definition(metadata_string: str, dsn: str) -> Tuple[bool, list]: | |||||||||||||
metadata: Dict[str,Dict[str,Optional[Dict]]] = json.loads(metadata_string) | |||||||||||||
described: Dict[str, Optional[Dict[str, Any]]] = metadata.get("described") or {} | |||||||||||||
hashes: Dict[str,str] = described.get("hashes") or {} | |||||||||||||
sha1_git = hashes.get("gitSha") | |||||||||||||
source: Dict[str,str] = described.get("sourceLocation") or {} | |||||||||||||
url = source.get("url") | |||||||||||||
origin = None | |||||||||||||
if isinstance(url, str): | |||||||||||||
origin = Origin(url=url) | |||||||||||||
if sha1_git and isinstance(sha1_git, str): | |||||||||||||
if not sha1_git_in_revisions(sha1_git=sha1_git, dsn=dsn): | |||||||||||||
return False, [] | |||||||||||||
swh_id: Optional[str] = "swh:1:rev:{sha1_git}".format(sha1_git=sha1_git) | |||||||||||||
metadata_type = MetadataTargetType.REVISION | |||||||||||||
return True, [(swh_id, metadata_type, origin)] | |||||||||||||
sha1 = hashes.get("sha1") | |||||||||||||
if isinstance(sha1, str): | |||||||||||||
swh_id = map_sha1_with_swhid(sha1=sha1, dsn=dsn) | |||||||||||||
Done Inline Actionswhen does this happen in practice? vlorentz: when does this happen in practice? | |||||||||||||
Done Inline ActionsYes, you are right, this line won't get hit in practice TG1999: Yes, you are right, this line won't get hit in practice | |||||||||||||
if not swh_id: | |||||||||||||
return False, [] | |||||||||||||
metadata_type = MetadataTargetType.CONTENT | |||||||||||||
return True, [(swh_id, metadata_type, origin)] | |||||||||||||
return False, [] | |||||||||||||
Done Inline Actions
vlorentz: | |||||||||||||
def check_for_valid_ID(list_cd_path: list) -> bool: | |||||||||||||
if len(list_cd_path) < 6: | |||||||||||||
return False | |||||||||||||
if list_cd_path[4] != "revision": | |||||||||||||
return False | |||||||||||||
if not list_cd_path[-1].endswith(".json"): | |||||||||||||
return False | |||||||||||||
Done Inline Actionswhen is the url not a string? vlorentz: when is the url not a string? | |||||||||||||
Not Done Inline Actionssource.get("url") may return None, so for that I have used this line, and url should be str, not Optional[str] that will be returned by source.get("url") to prevent mypy error here TG1999: source.get("url") may return None, so for that I have used this line, and url should be str… | |||||||||||||
Not Done Inline ActionsIf the URL is expected to always be either None or a string, then test if it is None, and if it's not, assert it's a string. That way we get an error at runtime instead of silently ignoring the unexpected data type. vlorentz: If the URL is expected to always be either None or a string, then test if it is None, and if… | |||||||||||||
Done Inline ActionsGot it TG1999: Got it | |||||||||||||
return True | |||||||||||||
Done Inline ActionsWhen is sha1_git not a string? vlorentz: When is `sha1_git` not a string? | |||||||||||||
Not Done Inline ActionsSame reason for sourc.get("url") TG1999: Same reason for sourc.get("url") | |||||||||||||
def map_row(row: tuple, swh_dsn: str) -> Optional[Tuple[bool, list]]: | |||||||||||||
cd_path = row[0] | |||||||||||||
list_cd_path = cd_path.split("/") | |||||||||||||
metadata_string = gzip.decompress(row[1]).decode() | |||||||||||||
if not check_for_valid_ID(list_cd_path): | |||||||||||||
return None | |||||||||||||
# if the row doesn't contain any information in metadata return None | |||||||||||||
if metadata_string == "": | |||||||||||||
return None | |||||||||||||
# if the ID of row contains 9 components: | |||||||||||||
# <package_manager>/<instance>/<namespace>/<name>/revision/<version>/tool/<tool_name>/<tool_version>.json | |||||||||||||
# then it is a harvest | |||||||||||||
if len(list_cd_path) == 9: | |||||||||||||
if list_cd_path[6] != "tool": | |||||||||||||
Done Inline Actionsthe two return True, ... statements are exactly the same, you don't need to write them twice. Use an else instead, and return after the else block vlorentz: the two `return True, ...` statements are exactly the same, you don't need to write them twice. | |||||||||||||
return None | |||||||||||||
tool = list_cd_path[7] | |||||||||||||
if tool not in ("scancode", "licensee", "clearlydefined"): | |||||||||||||
return None | |||||||||||||
return map_harvest(tool=tool, metadata_string=metadata_string, dsn=swh_dsn) | |||||||||||||
Not Done Inline Actionswhen does this happen in practice? vlorentz: when does this happen in practice? | |||||||||||||
Done Inline ActionsYes you are right, it won't happen in pratcice TG1999: Yes you are right, it won't happen in pratcice | |||||||||||||
Done Inline ActionsActually this line got hit, when metadata is wrongly formed, should I raise an error for it ?? TG1999: Actually this line got hit, when metadata is wrongly formed, should I raise an error for it ?? | |||||||||||||
Not Done Inline Actionsdepends when/why it's triggered. If you know for sure that will happen, then you need to handle it because we don't want the script to crash in production. vlorentz: depends when/why it's triggered. If you know for sure that will happen, then you need to handle… | |||||||||||||
# if the ID of row contains 6 components: | |||||||||||||
# <package_manager>/<instance>/<namespace>/<name>/revision/<version>.json | |||||||||||||
# then it is a defintion | |||||||||||||
if len(list_cd_path) == 6: | |||||||||||||
return map_definition( | |||||||||||||
metadata_string=metadata_string, | |||||||||||||
dsn=swh_dsn, | |||||||||||||
Done Inline Actionswhen does this happen? vlorentz: when does this happen? | |||||||||||||
Not Done Inline ActionsThe case in test_map_row_with_invalid_ID_without_revision TG1999: The case in test_map_row_with_invalid_ID_without_revision | |||||||||||||
Not Done Inline ActionsCould you explain this in a comment (as well as the ones below)? vlorentz: Could you explain this in a comment (as well as the ones below)? | |||||||||||||
Done Inline ActionsSure TG1999: Sure | |||||||||||||
) | |||||||||||||
return None | |||||||||||||
Done Inline Actionsand this? vlorentz: and this? | |||||||||||||
Not Done Inline ActionsThe case in test_map_row_with_invalid_ID_without_json_extension TG1999: The case in test_map_row_with_invalid_ID_without_json_extension | |||||||||||||
Done Inline Actionsand this? vlorentz: and this? | |||||||||||||
Not Done Inline ActionsThe case in test_map_row_with_empty_metadata_string, I have seen in clearcode toolkit DB, some rows might not have metadata at time of scan, it may gradually fetch that data after a while. TG1999: The case in test_map_row_with_empty_metadata_string, I have seen in clearcode toolkit DB, some… | |||||||||||||
Done Inline Actions
vlorentz: | |||||||||||||
Done Inline Actionsshouldn't this be an error? vlorentz: shouldn't this be an error? | |||||||||||||
Not Done Inline ActionsIMO more tools can be used by clearcode toolkit in future and in that case we should save them for future mapping. TG1999: IMO more tools can be used by clearcode toolkit in future and in that case we should save them… | |||||||||||||
Done Inline Actionsthen we should either raise an error here so we don't ignore them silenty, and be generic enough to support new tools. vlorentz: then we should either raise an error here so we don't ignore them silenty, and be generic… | |||||||||||||
Done Inline ActionsSure TG1999: Sure |
(also 2021 or 2020-2021 while we're at it)