Changeset View
Changeset View
Standalone View
Standalone View
swh/clearlydefined/mapping_utils.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||||||
from datetime import datetime | from datetime import datetime | ||||||||||||||
from enum import Enum | from enum import Enum | ||||||||||||||
import gzip | import gzip | ||||||||||||||
import json | import json | ||||||||||||||
import re | |||||||||||||||
from typing import Any, Dict, List, Optional, Tuple | from typing import Any, Dict, List, Optional, Tuple | ||||||||||||||
from swh.clearlydefined.error import ( | from swh.clearlydefined.error import ( | ||||||||||||||
InvalidComponents, | InvalidComponents, | ||||||||||||||
NoJsonExtension, | NoJsonExtension, | ||||||||||||||
RevisionNotFound, | RevisionNotFound, | ||||||||||||||
ToolNotFound, | ToolNotFound, | ||||||||||||||
ToolNotSupported, | ToolNotSupported, | ||||||||||||||
WrongMetadata, | |||||||||||||||
) | ) | ||||||||||||||
from swh.model.hashutil import hash_to_bytes, hash_to_hex | from swh.model.hashutil import hash_to_bytes, hash_to_hex | ||||||||||||||
from swh.model.identifiers import parse_swhid | from swh.model.identifiers import parse_swhid | ||||||||||||||
from swh.model.model import ( | from swh.model.model import ( | ||||||||||||||
MetadataAuthority, | MetadataAuthority, | ||||||||||||||
MetadataAuthorityType, | MetadataAuthorityType, | ||||||||||||||
MetadataFetcher, | MetadataFetcher, | ||||||||||||||
MetadataTargetType, | MetadataTargetType, | ||||||||||||||
Origin, | Origin, | ||||||||||||||
RawExtrinsicMetadata, | RawExtrinsicMetadata, | ||||||||||||||
) | ) | ||||||||||||||
class ToolType(Enum): | class ToolType(Enum): | ||||||||||||||
"""The type of content pointed to by a snapshot branch. Usually a | """The type of a row""" | ||||||||||||||
revision or an alias.""" | |||||||||||||||
DEFINITION = "definition" | DEFINITION = "definition" | ||||||||||||||
SCANCODE = "scancode" | SCANCODE = "scancode" | ||||||||||||||
CLEARLYDEFINED = "clearlydefined" | CLEARLYDEFINED = "clearlydefined" | ||||||||||||||
LICENSEE = "licensee" | LICENSEE = "licensee" | ||||||||||||||
FOSSOLOGY = "fossology" | FOSSOLOGY = "fossology" | ||||||||||||||
class MappingStatus(Enum): | |||||||||||||||
"""The type of mapping status of a row""" | |||||||||||||||
MAPPED = "mapped" | |||||||||||||||
UNMAPPED = "unmapped" | |||||||||||||||
IGNORE = "ignore" | |||||||||||||||
AUTHORITY = MetadataAuthority( | AUTHORITY = MetadataAuthority( | ||||||||||||||
type=MetadataAuthorityType.REGISTRY, | type=MetadataAuthorityType.REGISTRY, | ||||||||||||||
url="https://clearlydefined.io/", | url="https://clearlydefined.io/", | ||||||||||||||
metadata=None, | metadata=None, | ||||||||||||||
) | ) | ||||||||||||||
FETCHER = MetadataFetcher( | FETCHER = MetadataFetcher( | ||||||||||||||
name="swh-clearlydefined", | name="swh-clearlydefined", | ||||||||||||||
version="0.0.1", | version="0.0.1", | ||||||||||||||
metadata=None, | metadata=None, | ||||||||||||||
) | ) | ||||||||||||||
def is_sha1(s): | |||||||||||||||
vlorentz · Unsubmitted · Not Done · Inline Actions
vlorentz: | |||||||||||||||
return bool(re.match("^[a-fA-F0-9]+$", s)) | |||||||||||||||
def map_row_data_with_metadata( | def map_row_data_with_metadata( | ||||||||||||||
swh_id: str, | swh_id: str, | ||||||||||||||
type: MetadataTargetType, | type: MetadataTargetType, | ||||||||||||||
origin: Optional[Origin], | origin: Optional[Origin], | ||||||||||||||
metadata: Dict, | metadata: Dict, | ||||||||||||||
date: datetime, | date: datetime, | ||||||||||||||
format: str, | format: str, | ||||||||||||||
) -> RawExtrinsicMetadata: | ) -> RawExtrinsicMetadata: | ||||||||||||||
▲ Show 20 Lines • Show All 121 Lines • ▼ Show 20 Lines | for file in files: | ||||||||||||||
sha1 = hashes.get("sha1") | sha1 = hashes.get("sha1") | ||||||||||||||
assert sha1 | assert sha1 | ||||||||||||||
files_with_sha1.append((sha1, file)) | files_with_sha1.append((sha1, file)) | ||||||||||||||
return files_with_sha1 | return files_with_sha1 | ||||||||||||||
def map_harvest( | def map_harvest( | ||||||||||||||
storage, tool: str, metadata_string: str, date: datetime | storage, tool: str, metadata_string: str, date: datetime | ||||||||||||||
) -> Tuple[bool, List[RawExtrinsicMetadata]]: | ) -> Tuple[MappingStatus, List[RawExtrinsicMetadata]]: | ||||||||||||||
""" | """ | ||||||||||||||
Take tool, metadata_string and storage as input and try to | Take tool, metadata_string and storage as input and try to | ||||||||||||||
map the sha1 of files with content, return status of | map the sha1 of files with content, return status of | ||||||||||||||
harvest and data to be written in storage | harvest and data to be written in storage | ||||||||||||||
""" | """ | ||||||||||||||
tools = { | tools = { | ||||||||||||||
"scancode": list_scancode_files, | "scancode": list_scancode_files, | ||||||||||||||
"licensee": list_licensee_files, | "licensee": list_licensee_files, | ||||||||||||||
Show All 9 Lines | ) -> Tuple[MappingStatus, List[RawExtrinsicMetadata]]: | ||||||||||||||
mapping_status = True | mapping_status = True | ||||||||||||||
data: List[RawExtrinsicMetadata] = [] | data: List[RawExtrinsicMetadata] = [] | ||||||||||||||
for (sha1, file) in tools[tool](metadata_string): | for (sha1, file) in tools[tool](metadata_string): | ||||||||||||||
mapping_status = ( | mapping_status = ( | ||||||||||||||
map_sha1_and_add_in_data(storage, sha1, data, file, date, format_) | map_sha1_and_add_in_data(storage, sha1, data, file, date, format_) | ||||||||||||||
and mapping_status | and mapping_status | ||||||||||||||
) | ) | ||||||||||||||
return mapping_status, data | status = MappingStatus.UNMAPPED | ||||||||||||||
if mapping_status: | |||||||||||||||
status = MappingStatus.MAPPED | |||||||||||||||
return status, data | |||||||||||||||
def map_definition( | def map_definition( | ||||||||||||||
storage, metadata_string: str, date: datetime | storage, metadata_string: str, date: datetime | ||||||||||||||
) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: | ) -> Tuple[MappingStatus, List[RawExtrinsicMetadata]]: | ||||||||||||||
""" | """ | ||||||||||||||
Take metadata_string and storage as input and try to | Take metadata_string and storage as input and try to | ||||||||||||||
map the sha1 of defintion with content/ gitSha in revision | map the sha1 of defintion with content/ gitSha in revision | ||||||||||||||
return None if not able to map | return None if not able to map | ||||||||||||||
else return data to be written in storage | else return data to be written in storage | ||||||||||||||
""" | """ | ||||||||||||||
metadata: Dict[str, Dict[str, Optional[Dict]]] = json.loads(metadata_string) | metadata: Dict[str, Dict[str, Optional[Dict]]] = json.loads(metadata_string) | ||||||||||||||
described: Dict[str, Optional[Dict[str, Any]]] = metadata.get("described") or {} | described: Dict[str, Optional[Dict[str, Any]]] = metadata.get("described") or {} | ||||||||||||||
hashes: Dict[str, str] = described.get("hashes") or {} | hashes: Dict[str, str] = described.get("hashes") or {} | ||||||||||||||
sha1_git = hashes.get("gitSha") | sha1_git = hashes.get("gitSha") | ||||||||||||||
source: Dict[str, str] = described.get("sourceLocation") or {} | source: Dict[str, str] = described.get("sourceLocation") or {} | ||||||||||||||
url = source.get("url") | url = source.get("url") | ||||||||||||||
origin = None | origin = None | ||||||||||||||
sha1 = hashes.get("sha1") | |||||||||||||||
if url: | if url: | ||||||||||||||
assert isinstance(url, str) | assert isinstance(url, str) | ||||||||||||||
origin = Origin(url=url) | origin = Origin(url=url) | ||||||||||||||
if not sha1_git: | |||||||||||||||
sha1_git = source.get("revision") | |||||||||||||||
if sha1_git: | if sha1_git: | ||||||||||||||
assert isinstance(sha1_git, str) | assert isinstance(sha1_git, str) | ||||||||||||||
if len(sha1_git) != 40 and not is_sha1(sha1_git): | |||||||||||||||
Not Done · Inline Actions · vlorentz: When does this happen? | |||||||||||||||
Done · Inline Actions · TG1999: Sometimes the revision field contains a tag such as 0.4.0. I don't want the code to throw an error in those cases, so I first check whether the value is valid hexadecimal before mapping it to a revision. | |||||||||||||||
Not Done · Inline Actions · vlorentz: Meh. `is_hex` is not going to be good enough; e.g. a branch named `beef` would pass. Instead, check that it is hexadecimal *and* the right length (a regexp would probably be faster than the current implementation of `is_hex`, btw). | |||||||||||||||
Done · Inline Actions · TG1999: Can you suggest a regexp for this? | |||||||||||||||
Not Done · Inline Actions · vlorentz: I could, but that's no fun. How would you do it? | |||||||||||||||
return MappingStatus.IGNORE, [] | |||||||||||||||
if not sha1_git_in_revisions(sha1_git=sha1_git, storage=storage): | if not sha1_git_in_revisions(sha1_git=sha1_git, storage=storage): | ||||||||||||||
return None | return MappingStatus.UNMAPPED, [] | ||||||||||||||
swh_id = "swh:1:rev:{sha1_git}".format(sha1_git=sha1_git) | swh_id = "swh:1:rev:{sha1_git}".format(sha1_git=sha1_git) | ||||||||||||||
metadata_type = MetadataTargetType.REVISION | metadata_type = MetadataTargetType.REVISION | ||||||||||||||
elif sha1: | |||||||||||||||
assert isinstance(sha1, str) | |||||||||||||||
swh_id_sha1 = map_sha1_with_swhid(sha1=sha1, storage=storage) | |||||||||||||||
if not swh_id_sha1: | |||||||||||||||
return None | |||||||||||||||
assert isinstance(swh_id_sha1, str) | |||||||||||||||
swh_id = swh_id_sha1 | |||||||||||||||
metadata_type = MetadataTargetType.CONTENT | |||||||||||||||
else: | else: | ||||||||||||||
raise WrongMetadata("Wrong metadata") | return MappingStatus.IGNORE, [] | ||||||||||||||
return True, [ | return MappingStatus.MAPPED, [ | ||||||||||||||
map_row_data_with_metadata( | map_row_data_with_metadata( | ||||||||||||||
swh_id=swh_id, | swh_id=swh_id, | ||||||||||||||
type=metadata_type, | type=metadata_type, | ||||||||||||||
origin=origin, | origin=origin, | ||||||||||||||
metadata=metadata, | metadata=metadata, | ||||||||||||||
date=date, | date=date, | ||||||||||||||
format="clearlydefined-definition-json", | format="clearlydefined-definition-json", | ||||||||||||||
) | ) | ||||||||||||||
Show All 37 Lines | def get_type_of_tool(cd_path) -> ToolType: | ||||||||||||||
# For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json | # For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json | ||||||||||||||
raise InvalidComponents( | raise InvalidComponents( | ||||||||||||||
"Not a supported/known ID, A valid ID should have 6 or 9 components." | "Not a supported/known ID, A valid ID should have 6 or 9 components." | ||||||||||||||
) | ) | ||||||||||||||
def map_row( | def map_row( | ||||||||||||||
storage, metadata: bytes, id: str, date: datetime | storage, metadata: bytes, id: str, date: datetime | ||||||||||||||
) -> Optional[Tuple[bool, List[RawExtrinsicMetadata]]]: | ) -> Tuple[MappingStatus, List[RawExtrinsicMetadata]]: | ||||||||||||||
""" | """ | ||||||||||||||
Take row and storage as input and try to map that row, | Take row and storage as input and try to map that row, | ||||||||||||||
if ID of row is invalid then raise exception, | if ID of row is invalid then raise exception, | ||||||||||||||
if not able to map that row, then return None | if not able to map that row, then return None | ||||||||||||||
else return status of that row and data to be written | else return status of that row and data to be written | ||||||||||||||
in storage | in storage | ||||||||||||||
""" | """ | ||||||||||||||
tool = get_type_of_tool(id).value | tool = get_type_of_tool(id).value | ||||||||||||||
# if the row doesn't contain any information in metadata return None so it can be | # if the row doesn't contain any information in metadata return None so it can be | ||||||||||||||
# mapped later on | # mapped later on | ||||||||||||||
metadata_string = gzip.decompress(metadata).decode() | metadata_string = gzip.decompress(metadata).decode() | ||||||||||||||
if metadata_string == "": | if metadata_string == "": | ||||||||||||||
return None | return MappingStatus.UNMAPPED, [] | ||||||||||||||
if tool == "definition": | if tool == "definition": | ||||||||||||||
return map_definition( | return map_definition( | ||||||||||||||
metadata_string=metadata_string, storage=storage, date=date | metadata_string=metadata_string, storage=storage, date=date | ||||||||||||||
) | ) | ||||||||||||||
else: | else: | ||||||||||||||
return map_harvest( | return map_harvest( | ||||||||||||||
tool=tool, | tool=tool, | ||||||||||||||
metadata_string=metadata_string, | metadata_string=metadata_string, | ||||||||||||||
storage=storage, | storage=storage, | ||||||||||||||
date=date, | date=date, | ||||||||||||||
) | ) |