def write_in_metadata_storage(metadata_type, origin, swh_id):
    """
    Persist a raw-extrinsic-metadata record targeting *swh_id*.

    Args:
        metadata_type: a ``MetadataTargetType`` member (CONTENT or REVISION).
        origin: the ``Origin`` the metadata was found at.
        swh_id: the SWHID string the metadata is about.

    NOTE(review): intentionally a stub in this changeset — the storage
    write is implemented in a follow-up; callers only rely on it being
    callable.
    """
    pass


def sha1_git_in_revisions(sha1_git: str, dsn: str) -> bool:
    """
    Tell whether *sha1_git* (hex string) exists in the ``revision``
    table of the database reachable through *dsn*.
    """
    read_connection = psycopg2.connect(dsn=dsn)
    try:
        cur = read_connection.cursor()
        cur.execute(
            "SELECT id FROM revision WHERE id= %s;",
            (hash_to_bytes(sha1_git),),
        )
        # id is the table key, so one matching row means "present".
        return cur.fetchone() is not None
    finally:
        # BUG FIX: the original leaked the connection on every call.
        read_connection.close()


def _all_sha1s_mapped(sha1s, dsn: str) -> bool:
    """
    Return True when every non-empty sha1 in *sha1s* resolves to an
    SWHID via ``map_sha1_with_swhid``; missing/empty sha1s are skipped,
    matching the original per-tool loops.  Short-circuits on the first
    failure instead of querying the DB for every remaining file.
    """
    return all(map_sha1_with_swhid(sha1, dsn) for sha1 in sha1s if sha1)


def map_scancode(metadata_string: str, dsn: str) -> bool:
    """
    Map a scancode harvest: True when every file sha1 it lists is
    already known to the archive, False otherwise.
    """
    metadata = json.loads(metadata_string)
    # NOTE(review): "files" is assumed to be a list of dicts with a
    # "sha1" key — confirm against real scancode harvests.
    files = (metadata.get("content") or {}).get("files") or {}
    return _all_sha1s_mapped((file.get("sha1") for file in files), dsn)


def map_licensee(metadata_string: str, dsn: str) -> bool:
    """
    Map a licensee harvest: True when every matched file's content hash
    is already known to the archive, False otherwise.
    """
    metadata = json.loads(metadata_string)
    licensee = metadata.get("licensee") or {}
    content = (licensee.get("output") or {}).get("content") or {}
    files = content.get("matched_files") or []
    return _all_sha1s_mapped((file.get("content_hash") for file in files), dsn)


def map_clearlydefined(metadata_string: str, dsn: str) -> bool:
    """
    Map a clearlydefined harvest: True when every file sha1 it lists is
    already known to the archive, False otherwise.
    """
    metadata = json.loads(metadata_string)
    files = metadata.get("files") or []
    return _all_sha1s_mapped(
        ((file.get("hashes") or {}).get("sha1") for file in files), dsn
    )


def map_harvest(tool, metadata_string, dsn) -> bool:
    """
    Dispatch *metadata_string* to the mapper for *tool*.

    Returns False for unknown tools rather than raising, so a single
    unrecognized harvest does not abort a whole mapping run.
    """
    tools = {
        "scancode": map_scancode,
        "licensee": map_licensee,
        "clearlydefined": map_clearlydefined,
    }
    mapper = tools.get(tool)
    if mapper is None:
        return False
    return mapper(metadata_string, dsn)


def map_definition(metadata_string: str, dsn: str) -> bool:
    """
    Map a clearlydefined *definition*: resolve its gitSha to a revision
    SWHID, or its sha1 to a content SWHID, and record the metadata.

    Returns True when the definition was mapped and written, False when
    the referenced object is not (yet) in the archive.
    """
    metadata = json.loads(metadata_string)
    described = metadata.get("described") or {}
    hashes = described.get("hashes") or {}
    source = described.get("sourceLocation") or {}
    origin = Origin(url=source.get("url"))

    sha1_git = hashes.get("gitSha")
    if sha1_git:
        if not sha1_git_in_revisions(sha1_git=sha1_git, dsn=dsn):
            return False
        write_in_metadata_storage(
            metadata_type=MetadataTargetType.REVISION,
            origin=origin,
            swh_id="swh:1:rev:{sha1_git}".format(sha1_git=sha1_git),
        )
        return True

    sha1 = hashes.get("sha1")
    if not sha1:
        # BUG FIX: the original passed a missing sha1 straight to
        # map_sha1_with_swhid, which would fail on None.
        return False
    swh_id = map_sha1_with_swhid(sha1=sha1, dsn=dsn)
    if not swh_id:
        # BUG FIX: the original wrote to metadata storage BEFORE this
        # check, recording metadata with swh_id=None on failed lookups.
        return False
    write_in_metadata_storage(
        metadata_type=MetadataTargetType.CONTENT, origin=origin, swh_id=swh_id
    )
    return True
def write_next_date(previous_date, new_date, clearcode_dsn):
    """Replace *previous_date* with *new_date* in the last_run_date table."""
    update_connection = psycopg2.connect(dsn=clearcode_dsn)
    try:
        cur = update_connection.cursor()
        cur.execute(
            "UPDATE last_run_date SET time= %s WHERE time= %s;",
            (new_date, previous_date),
        )
        update_connection.commit()
    finally:
        # BUG FIX: connections were never closed in the original.
        update_connection.close()


def get_last_run_date(clearcode_dsn):
    """
    Return the timestamp stored in the last_run_date table.

    NOTE(review): assumes the table holds exactly one row — confirm the
    schema; an empty table raises here (as it did in the original).
    """
    read_connection = psycopg2.connect(dsn=clearcode_dsn)
    try:
        cur = read_connection.cursor()
        cur.execute("SELECT * FROM last_run_date;")
        row = cur.fetchall()[0]
        return row[0]
    finally:
        read_connection.close()


def map_row(row, swh_dsn):
    """
    Map one clearcode_cditem row (path, gzipped content, ...) into the
    archive.

    Path depth distinguishes the record kind: 9 components is a tool
    harvest (component 7 names the tool), 6 components is a definition.
    Returns True when mapped, False otherwise.
    """
    cd_path = row[0]
    list_cd_path = cd_path.split("/")
    metadata_string = gzip.decompress(row[1]).decode()
    # Empty payloads carry nothing to map.
    if metadata_string == "":
        return False
    if len(list_cd_path) == 9:
        tool = list_cd_path[7]
        return map_harvest(tool=tool, metadata_string=metadata_string, dsn=swh_dsn)
    if len(list_cd_path) == 6:
        return map_definition(metadata_string=metadata_string, dsn=swh_dsn)
    # BUG FIX: the original fell through and implicitly returned None
    # for unexpected path shapes; be explicit so callers get a bool.
    return False


def map_previously_unmapped_data(clearcode_dsn, swh_dsn):
    """
    Retry every path recorded in unmapped_data; drop the entries that
    now map successfully (e.g. because the objects have since been
    archived).
    """
    read_connection = psycopg2.connect(dsn=clearcode_dsn)
    try:
        cur = read_connection.cursor()
        cur.execute("SELECT * FROM unmapped_data ;")
        rows = cur.fetchall()
        for row in rows:
            cd_path = row[0]
            cur.execute(
                "SELECT * FROM clearcode_cditem WHERE path=%s;", (cd_path,)
            )
            unmapped_row = cur.fetchall()[0]
            if map_row(row=unmapped_row, swh_dsn=swh_dsn):
                # BUG FIX: "DELETE * FROM ..." is not valid SQL.
                cur.execute(
                    "DELETE FROM unmapped_data WHERE path= %s;", (cd_path,)
                )
                read_connection.commit()
    finally:
        read_connection.close()


def write_in_not_mapped(cd_path, clearcode_dsn):
    """Record *cd_path* in unmapped_data so it is retried on later runs."""
    write_connection = psycopg2.connect(dsn=clearcode_dsn)
    try:
        cur = write_connection.cursor()
        # BUG FIX: the statement ended with ":" instead of ";", which
        # PostgreSQL rejects as a syntax error.
        cur.execute("INSERT INTO unmapped_data (path) VALUES (%s);", (cd_path,))
        write_connection.commit()
    finally:
        write_connection.close()


def read_from_clearcode_and_write_in_swh(clearcode_dsn, swh_dsn, date=None):
    """
    Walk clearcode_cditem newest-first and map each row into the
    archive, stopping at rows not modified since *date* (when given).
    Rows that fail to map are queued in unmapped_data for retry.
    """
    read_connection = psycopg2.connect(dsn=clearcode_dsn)
    try:
        cur = read_connection.cursor()
        cur.execute(
            "SELECT * FROM clearcode_cditem ORDER BY last_modified_date DESC;"
        )
        rows = cur.fetchall()
        if date:
            # Remember the newest modification time as the next run's
            # starting point.
            new_date = rows[0][2]
            write_next_date(
                previous_date=date, new_date=new_date, clearcode_dsn=clearcode_dsn
            )
        for row in rows:
            cd_path = row[0]
            # Rows are sorted newest-first, so the first already-seen
            # row means everything after it was processed previously.
            if date and row[2] <= date:
                return
            mapped = map_row(row=row, swh_dsn=swh_dsn)
            if not mapped:
                write_in_not_mapped(cd_path=cd_path, clearcode_dsn=clearcode_dsn)
    finally:
        read_connection.close()


def main(clearcode_dsn, swh_dsn):
    """Retry old failures, then map everything new since the last run."""
    map_previously_unmapped_data(clearcode_dsn=clearcode_dsn, swh_dsn=swh_dsn)
    date = get_last_run_date(clearcode_dsn=clearcode_dsn)
    read_from_clearcode_and_write_in_swh(
        clearcode_dsn=clearcode_dsn, swh_dsn=swh_dsn, date=date
    )