diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -1 +1 @@
-pytest_plugins = ["swh.storage.pytest_plugin"]
+pytest_plugins = ["swh.storage.pytest_plugin", "swh.clearlydefined.pytest_plugin"]
diff --git a/swh/clearlydefined/cli.py b/swh/clearlydefined/cli.py
--- a/swh/clearlydefined/cli.py
+++ b/swh/clearlydefined/cli.py
@@ -2,18 +2,23 @@
 from swh.core.cli import CONTEXT_SETTINGS
 from swh.core.cli import swh as swh_cli_group
+import json
+from swh.storage import get_storage
+from swh.clearlydefined.orchestrator import orchestrator
 
 
-@swh_cli_group.group(name="clearlydefined", context_settings=CONTEXT_SETTINGS)
+@swh_cli_group.command(name="clearlydefined", context_settings=CONTEXT_SETTINGS)
+@click.option(
+    "--config",
+    "-C",
+    default=None,
+    type=click.STRING,
+    help="SWH storage config, serialized as a JSON string.",
+)
+@click.option("--clearcode-dsn", default=None, type=click.STRING, help="Clearcode DSN.")
 @click.pass_context
-def clearlydefined_cli_group(ctx):
-    """Foo main command.
-    """
-
-
-@clearlydefined_cli_group.command()
-@click.option("--bar", help="Something")
-@click.pass_context
-def bar(ctx, bar):
-    """Do something."""
-    click.echo("bar")
+def run_orchestration(ctx, config, clearcode_dsn):
+    """Software Heritage Clearlydefined Metadata Fetcher"""
+    swh_storage_backend_config = json.loads(config)
+    storage = get_storage(**swh_storage_backend_config)
+    orchestrator(storage=storage, clearcode_dsn=clearcode_dsn)
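Note on the new CLI surface: the command takes the storage configuration as a JSON
string via -C/--config and the clearcode database DSN via --clearcode-dsn. A minimal
sketch of an invocation (the config and DSN values below are illustrative, not a
deployed setup):

    import json

    from click.testing import CliRunner

    from swh.clearlydefined.cli import run_orchestration

    # Any valid swh storage config can be serialized here; "memory" is an assumption
    # made for illustration, and the DSN must point at a reachable clearcode database.
    result = CliRunner().invoke(
        run_orchestration,
        ["-C", json.dumps({"cls": "memory"}), "--clearcode-dsn", "dbname=clearcode"],
    )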
diff --git a/swh/clearlydefined/mapping_utils.py b/swh/clearlydefined/mapping_utils.py
--- a/swh/clearlydefined/mapping_utils.py
+++ b/swh/clearlydefined/mapping_utils.py
@@ -50,13 +50,13 @@
 
 
 def map_sha1_and_add_in_data(
-    storage, sha1: Optional[str], data: list, mapping_status=True
+    storage, sha1: Optional[str], data: list, file: Dict, mapping_status=True
 ) -> bool:
     if sha1:
         assert isinstance(sha1, str)
         swh_id = map_sha1_with_swhid(storage=storage, sha1=sha1)
         if swh_id:
-            data.append((swh_id, MetadataTargetType.CONTENT, None))
+            data.append((swh_id, MetadataTargetType.CONTENT, None, file))
         else:
             mapping_status = False
     return mapping_status
@@ -64,7 +64,7 @@
 
 def map_scancode(
     storage, metadata_string: str
-) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
+) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
     """
     Take metadata_string and storage as input and try to map
     the sha1 of files with content, return mapping
@@ -80,14 +80,14 @@
     for file in files:
         sha1 = file.get("sha1")
         mapping_status = (
-            map_sha1_and_add_in_data(storage, sha1, data) and mapping_status
+            map_sha1_and_add_in_data(storage, sha1, data, file) and mapping_status
         )
     return mapping_status, data
 
 
 def map_licensee(
     storage, metadata_string: str
-) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
+) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
     """
     Take metadata_string and storage as input and try to map
     the sha1 of files with content, return mapping
@@ -105,14 +105,14 @@
     for file in files:
         sha1 = file.get("content_hash")
         mapping_status = (
-            map_sha1_and_add_in_data(storage, sha1, data) and mapping_status
+            map_sha1_and_add_in_data(storage, sha1, data, file) and mapping_status
         )
     return mapping_status, data
 
 
 def map_clearlydefined(
     storage, metadata_string: str
-) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
+) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
     """
     Take metadata_string and storage as input and try to map
     the sha1 of files with content, return mapping
@@ -128,14 +128,14 @@
         hashes = file.get("hashes") or {}
         sha1 = hashes.get("sha1")
         mapping_status = (
-            map_sha1_and_add_in_data(storage, sha1, data) and mapping_status
+            map_sha1_and_add_in_data(storage, sha1, data, file) and mapping_status
         )
     return mapping_status, data
 
 
 def map_harvest(
     storage, tool: str, metadata_string: str
-) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None]]]:
+) -> Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]]:
     """
     Take tool, metadata_string and storage as input and try
     to map the sha1 of files with content, return status of
@@ -152,7 +152,9 @@
 
 def map_definition(
     storage, metadata_string: str
-) -> Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]]:
+) -> Optional[
+    Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin], Dict]]]
+]:
     """
     Take metadata_string and storage as input and try to map
     the sha1 of defintion with content/ gitSha in revision
@@ -190,25 +192,15 @@
     else:
         raise WrongMetadata("Wrong metadata")
 
-    return True, [(swh_id, metadata_type, origin)]
+    return True, [(swh_id, metadata_type, origin, metadata)]
 
 
-def map_row(
-    storage, row: tuple
-) -> Union[
-    Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin]]]]],
-    Tuple[bool, List[Tuple[str, MetadataTargetType, None]]],
-]:
+def get_type_of_tool(cd_path) -> str:
     """
-    Take row and storage as input and try to map that row,
-    if ID of row is invalid then raise exception,
-    if not able to map that row, then return None
-    else return status of that row and data to be written
-    in storage
+    Take cd_path as input; if cd_path is invalid then raise an exception,
+    else return the type of tool of that row
     """
-    cd_path = row[0]
     list_cd_path = cd_path.split("/")
-    # For example: maven/mavencentral/cobol-parser/abc/0.4.0.json
     if list_cd_path[4] != "revision":
         raise RevisionNotFound(
@@ -220,13 +212,6 @@
         raise NoJsonExtension(
             'Not a supported/known ID, A valid ID should end with ".json" extension.'
         )
-
-    metadata_string = gzip.decompress(row[1]).decode()
-    # if the row doesn't contain any information in metadata return None so it can be
-    # mapped later on
-    if metadata_string == "":
-        return None
-
     # if the ID of row contains 9 components:
     # ////revision//tool//.json
     # then it is a harvest
@@ -239,23 +224,47 @@
         )
         tool = list_cd_path[7]
         # if the row contains an unknown tool
-        if tool not in ("scancode", "licensee", "clearlydefined"):
+        if tool not in ("scancode", "licensee", "clearlydefined", "fossology"):
             raise ToolNotSupported(f"Tool for this ID {cd_path} is not supported")
-        return map_harvest(
-            tool=tool,
+        return tool
+    elif len(list_cd_path) == 6:
+        return "definition"
+    # For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json
+    raise InvalidComponents(
+        "Not a supported/known ID, A valid ID should have 6 or 9 components."
+    )
+
+
+def map_row(
+    storage, row: tuple
+) -> Union[
+    Optional[Tuple[bool, List[Tuple[str, MetadataTargetType, Optional[Origin], Dict]]]],
+    Tuple[bool, List[Tuple[str, MetadataTargetType, None, Dict]]],
+]:
+    """
+    Take row and storage as input and try to map that row;
+    if the row cannot be mapped, return None,
+    else return the status of that row and the data to be
+    written in storage
+    """
+
+    tool = get_type_of_tool(row[0])
+
+    # if the row doesn't contain any information in metadata return None so it can be
+    # mapped later on
+    metadata_string = gzip.decompress(row[1]).decode()
+    if metadata_string == "":
+        return None
+
+    if tool == "definition":
+        return map_definition(
             metadata_string=metadata_string,
             storage=storage,
         )
-    elif len(list_cd_path) == 6:
-        # if the ID of row contains 6 components:
-        # ////revision/.json
-        # then it is a defintion
-        return map_definition(
+    else:
+        return map_harvest(
+            tool=tool,
             metadata_string=metadata_string,
             storage=storage,
         )
-    # For example: maven/mavencentral/cobol-parser/abc/revision/def/0.4.0.json
-    raise InvalidComponents(
-        "Not a supported/known ID, A valid ID should have 6 or 9 components."
-    )
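With this change every mapped entry carries the raw file metadata as a fourth element,
so the orchestrator can persist it alongside the SWHID. A minimal sketch of the new
return shape, assuming an in-memory storage and a made-up clearcode row (path and
sha1 below are invented for illustration):

    import gzip
    import json

    from swh.clearlydefined.mapping_utils import map_row
    from swh.storage import get_storage

    storage = get_storage("memory")  # assumption: any swh storage backend works
    path = "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json"
    metadata = {"files": [{"path": "package/LICENSE", "hashes": {"sha1": "0" * 40}}]}
    row = (path, gzip.compress(json.dumps(metadata).encode("utf-8")))
    mapped = map_row(storage=storage, row=row)
    # mapped is None when the row carries no metadata, otherwise
    # (mapping_status, [(swhid, MetadataTargetType.CONTENT, None, file_dict), ...])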
diff --git a/swh/clearlydefined/orchestrator.py b/swh/clearlydefined/orchestrator.py
new file mode 100644
--- /dev/null
+++ b/swh/clearlydefined/orchestrator.py
@@ -0,0 +1,229 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import psycopg2
+from typing import Optional, Tuple, Dict
+from datetime import datetime
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    Origin,
+    RawExtrinsicMetadata,
+    MetadataTargetType,
+)
+from swh.model.identifiers import parse_swhid
+import attr
+import json
+
+from swh.clearlydefined.mapping_utils import map_row
+from swh.clearlydefined.mapping_utils import get_type_of_tool
+
+
+def write_in_storage(
+    storage,
+    data: Tuple[str, MetadataTargetType, Optional[Origin], Dict],
+    date: datetime,
+) -> None:
+    """
+    Take storage and data as input and write the data
+    as a RawExtrinsicMetadata object into swh storage
+    """
+    metadata = RawExtrinsicMetadata(
+        type=data[1],
+        target=parse_swhid(data[0]),
+        discovery_date=date,
+        authority=attr.evolve(get_metadata_authority(), metadata=None),
+        fetcher=attr.evolve(get_metadata_fetcher(), metadata=None),
+        format="json",
+        origin=data[2].url if isinstance(data[2], Origin) else None,
+        metadata=json.dumps(data[3]).encode("utf-8"),
+    )
+    storage.raw_extrinsic_metadata_add([metadata])
+
+
+def get_metadata_authority() -> MetadataAuthority:
+    """
+    Return the MetadataAuthority used for clearlydefined metadata
+    """
+    return MetadataAuthority(
+        type=MetadataAuthorityType.DEPOSIT_CLIENT,
+        url="https://clearlydefined.io/",
+        metadata={},
+    )
+
+
+def get_metadata_fetcher() -> MetadataFetcher:
+    """
+    Return the MetadataFetcher for swh-clearlydefined
+    """
+    return MetadataFetcher(
+        name="swh-clearlydefined",
+        version="0.0.1",
+        metadata={},
+    )
+
+
+def init_tables(cursor, connection) -> None:
+    """
+    Take connection and cursor as input and initialize the
+    unmapped_data and last_run_date tables if they don't exist
+    """
+    cursor.execute(
+        """CREATE TABLE IF NOT EXISTS unmapped_data (path VARCHAR PRIMARY KEY);
+        CREATE TABLE IF NOT EXISTS last_run_date (time TIMESTAMPTZ);"""
+    )
+    connection.commit()
+
+
+def init_storage(storage) -> None:
+    """
+    Take storage as input and register the MetadataFetcher and MetadataAuthority
+    inside storage
+    """
+    metadata_authority = get_metadata_authority()
+    storage.metadata_authority_add([metadata_authority])
+    metadata_fetcher = get_metadata_fetcher()
+    storage.metadata_fetcher_add([metadata_fetcher])
+
+
+def write_next_date(cursor, update_connection, previous_date, new_date) -> None:
+    """
+    Take cursor, update_connection, previous_date, new_date as input;
+    if previous_date is None, insert new_date, else
+    update the stored previous_date to new_date
+    """
+    if not previous_date:
+        cursor.execute("INSERT INTO last_run_date VALUES(%s)", (new_date,))
+    else:
+        cursor.execute(
+            "UPDATE last_run_date SET time = %s WHERE time = %s;",
+            (
+                new_date,
+                previous_date,
+            ),
+        )
+    update_connection.commit()
+
+
+def get_last_run_date(cursor) -> Optional[datetime]:
+    """
+    Take cursor as input and return the last run date from which
+    new rows will be orchestrated; return None if this is the first
+    orchestration
+    """
+    cursor.execute("SELECT * FROM last_run_date;")
+    rows = cursor.fetchall()
+    if len(rows) < 1:
+        return None
+    date = rows[0][0]
+    return date
+
+
+def orchestor_row(storage, cursor, connection, row) -> None:
+    """
+    Take storage, cursor, connection, row as input
+    and orchestrate that row: write its mapped data into
+    swh storage, or record it as unmapped
+    """
+    mapped = map_row(row=row, storage=storage)
+    if not mapped:
+        write_in_not_mapped(cd_path=row[0], cursor=cursor, write_connection=connection)
+    else:
+        mapping_status, data_list = mapped
+        if not mapping_status:
+            write_in_not_mapped(
+                cd_path=row[0], cursor=cursor, write_connection=connection
+            )
+        for data in data_list:
+            write_in_storage(storage=storage, data=data, date=row[2])
+
+
+def map_previously_unmapped_data(storage, cursor, connection) -> None:
+    """
+    Take storage, cursor, connection as input and map previously
+    unmapped data
+    """
+    cursor.execute("SELECT * FROM unmapped_data;")
+    rows = cursor.fetchall()
+    for row in rows:
+        cd_path = row[0]
+        cursor.execute("DELETE FROM unmapped_data WHERE path=%s", (cd_path,))
+        connection.commit()
+        cursor.execute("SELECT * FROM clearcode_cditem WHERE path=%s;", (cd_path,))
+        unmapped_row = cursor.fetchall()[0]
+        orchestor_row(
+            storage=storage,
+            row=unmapped_row,
+            cursor=cursor,
+            connection=connection,
+        )
+
+
+def write_in_not_mapped(cursor, write_connection, cd_path) -> None:
+    """
+    Take cursor, write_connection, cd_path as input
+    and insert 'cd_path' into unmapped_data if it does not
+    exist there yet
+    """
+    cursor.execute("SELECT * FROM unmapped_data WHERE path=%s;", (cd_path,))
+    if len(cursor.fetchall()) == 1:
+        return
+    cursor.execute("INSERT INTO unmapped_data (path) VALUES (%s);", (cd_path,))
+    write_connection.commit()
+    return
+
+
+def read_from_clearcode_and_write_in_swh(
+    storage, cursor, connection, date: Optional[datetime]
+) -> None:
+    """
+    Take storage, cursor, connection, date as input,
+    read rows from the clearcode database (only those whose
+    last_modified_date precedes 'date', when a date is given)
+    and write them into swh storage
+    """
+    if date:
+        cursor.execute(
+            "SELECT * FROM clearcode_cditem "
+            "WHERE last_modified_date < %s "
+            "ORDER BY last_modified_date DESC;",
+            (date,),
+        )
+    else:
+        cursor.execute(
+            "SELECT * FROM clearcode_cditem ORDER BY last_modified_date DESC;"
+        )
+    rows = cursor.fetchall()
+    # nothing to process; keep the stored date unchanged
+    if not rows:
+        return
+    new_date = rows[0][2]
+    write_next_date(
+        cursor=cursor,
+        update_connection=connection,
+        previous_date=date,
+        new_date=new_date,
+    )
+    for row in rows:
+        tool = get_type_of_tool(row[0])
+        # fossology harvests cannot be mapped yet, so skip them
+        if tool == "fossology":
+            continue
+        orchestor_row(
+            storage=storage, cursor=cursor, connection=connection, row=row
+        )
+
+
+def orchestrator(storage, clearcode_dsn: str) -> None:
+    """
+    Take storage and clearcode_dsn as input
+    and incrementally write data from the clearcode database to
+    swh raw extrinsic metadata
+    """
+    connection = psycopg2.connect(dsn=clearcode_dsn)
+    cursor = connection.cursor()
+    init_tables(cursor=cursor, connection=connection)
+    init_storage(storage=storage)
+    map_previously_unmapped_data(storage=storage, cursor=cursor, connection=connection)
+    date = get_last_run_date(cursor=cursor)
+    read_from_clearcode_and_write_in_swh(
+        storage=storage, cursor=cursor, connection=connection, date=date
+    )
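For reference, wiring the orchestrator by hand only needs a storage instance and a
clearcode DSN; a hedged sketch, where both the in-memory storage and the DSN are
illustrative rather than a deployed configuration:

    from swh.clearlydefined.orchestrator import orchestrator
    from swh.storage import get_storage

    storage = get_storage("memory")  # assumption: stand-in for a real backend
    orchestrator(storage=storage, clearcode_dsn="dbname=clearcode user=postgres")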
diff --git a/swh/clearlydefined/pytest_plugin.py b/swh/clearlydefined/pytest_plugin.py
new file mode 100644
--- /dev/null
+++ b/swh/clearlydefined/pytest_plugin.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from os import environ, path
+
+import pytest
+
+import swh.clearlydefined
+
+from swh.core.db.pytest_plugin import postgresql_fact
+
+SQL_DIR = path.join(path.dirname(swh.clearlydefined.__file__), "sql")
+
+environ["LC_ALL"] = "C.UTF-8"
+
+
+swh_clearcode = postgresql_fact(
+    "postgresql_proc", db_name="clearcode", dump_files=path.join(SQL_DIR, "*.sql")
+)
+
+
+@pytest.fixture
+def clearcode_dsn(swh_clearcode):
+    """DSN of a test clearcode database, initialized with the schema
+    from swh/clearlydefined/sql
+
+    """
+    return swh_clearcode.dsn
diff --git a/swh/clearlydefined/sql/schema.sql b/swh/clearlydefined/sql/schema.sql
new file mode 100644
--- /dev/null
+++ b/swh/clearlydefined/sql/schema.sql
@@ -0,0 +1,12 @@
+CREATE TABLE clearcode_cditem(
+path VARCHAR(2048) PRIMARY KEY,
+content BYTEA NOT NULL,
+last_modified_date TIMESTAMPTZ NOT NULL,
+last_map_date TIMESTAMPTZ,
+map_error TEXT,
+uuid UUID NOT NULL
+);
+CREATE TABLE unmapped_data(
+path VARCHAR PRIMARY KEY);
+CREATE TABLE last_run_date (
+time TIMESTAMPTZ);
\ No newline at end of file
diff --git a/swh/clearlydefined/tests/data/clearlydefined_metadata.json b/swh/clearlydefined/tests/data/clearlydefined_metadata.json
new file mode 100644
--- /dev/null
+++ b/swh/clearlydefined/tests/data/clearlydefined_metadata.json
@@ -0,0 +1,7 @@
+{
+    "path": "package/LICENSE",
+    "hashes": {
+        "sha1": "61c2b3a30496d329e21af70dd2d7e097046d07b7",
+        "sha256": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b"
+    }
+}
\ No newline at end of file
diff --git a/swh/clearlydefined/tests/data/clearlydefined_true.json b/swh/clearlydefined/tests/data/clearlydefined_true.json
--- a/swh/clearlydefined/tests/data/clearlydefined_true.json
+++ b/swh/clearlydefined/tests/data/clearlydefined_true.json
@@ -54,13 +54,6 @@
           "sha1": "61c2b3a30496d329e21af70dd2d7e097046d07b7",
           "sha256": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b"
         }
-      },
-      {
-        "path": "package/README.md",
-        "hashes": {
-          "sha1": "34973274ccef6ab4dfaaf86599792fa9c3fe4689",
-          "sha256": "60b9c916c43fba00e2d3ba5207b25bf28109e985c3f739f430bb2056423d5aa9"
-        }
       }
     ],
     "package.json": {
diff --git a/swh/clearlydefined/tests/data/clearlydefined_true.json b/swh/clearlydefined/tests/data/clearydefined_not_mapped.json
copy from swh/clearlydefined/tests/data/clearlydefined_true.json
copy to swh/clearlydefined/tests/data/clearydefined_not_mapped.json
--- a/swh/clearlydefined/tests/data/clearlydefined_true.json
+++ b/swh/clearlydefined/tests/data/clearydefined_not_mapped.json
@@ -51,16 +51,9 @@
       {
         "path": "package/LICENSE",
         "hashes": {
-          "sha1": "61c2b3a30496d329e21af70dd2d7e097046d07b7",
+          "sha1": "385f736f1ad8f5743cf2681b154d314f9cf48db8",
           "sha256": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b"
         }
-      },
-      {
-        "path": "package/README.md",
-        "hashes": {
-          "sha1": "34973274ccef6ab4dfaaf86599792fa9c3fe4689",
-          "sha256": "60b9c916c43fba00e2d3ba5207b25bf28109e985c3f739f430bb2056423d5aa9"
-        }
       }
     ],
     "package.json": {
diff --git a/swh/clearlydefined/tests/data/def_not_mapped.json b/swh/clearlydefined/tests/data/def_not_mapped.json
new file mode 100644
--- /dev/null
+++ b/swh/clearlydefined/tests/data/def_not_mapped.json
@@ -0,0 +1,88 @@
+{
+    "described": {
+        "releaseDate": "2019-03-29",
+        "sourceLocation": {
+            "type": "sourcearchive",
+            "provider": "mavencentral",
+            "namespace": "za.co.absa.cobrix",
+            "name": "cobol-parser",
+            "revision": "0.4.0",
+            "url": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar"
+        },
+        "urls": {
+            "registry": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser",
+            "version": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0",
+            "download": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0/cobol-parser-0.4.0.jar"
+        },
+        "hashes": {
+            "sha1": "3e21cc4942a4234c9e5edd8a9cacd1670fe59f13",
+            "sha256": "2bf17e47907dc3dfa64fc17ae6ef71b54d96a79a740f3b7a618104d4281656f0"
+        },
+        "files": 261,
+        "tools": [
+            "clearlydefined/1.5.0",
+            "scancode/3.2.2"
+        ],
+        "toolScore": {
+            "total": 100,
+            "date": 30,
+            "source": 70
+        },
+        "score": {
+            "total": 100,
+            "date": 30,
+            "source": 70
+        }
+    },
+    "licensed": {
+        "declared": "Apache-2.0",
+        "toolScore": {
+            "total": 60,
+            "declared": 30,
+            "discovered": 0,
+            "consistency": 15,
+            "spdx": 15,
+            "texts": 0
+        },
+        "facets": {
+            "core": {
+                "attribution": {
+                    "unknown": 260,
+                    "parties": [
+                        "Copyright 2018-2019 ABSA Group Limited"
+                    ]
+                },
+                "discovered": {
+                    "unknown": 260,
+                    "expressions": [
+                        "Apache-2.0"
+                    ]
+                },
+                "files": 261
+            }
+        },
+        "score": {
+            "total": 60,
+            "declared": 30,
+            "discovered": 0,
+            "consistency": 15,
+            "spdx": 15,
+            "texts": 0
+        }
+    },
+    "coordinates": {
+        "type": "maven",
+        "provider": "mavencentral",
+        "namespace": "za.co.absa.cobrix",
+        "name": "cobol-parser",
+        "revision": "0.4.0"
+    },
+    "_meta": {
+        "schemaVersion": "1.6.1",
+        "updated": "2019-11-04T05:20:21.308Z"
+    },
+    "scores": {
+        "effective": 80,
+        "tool": 80
+    }
+}
\ No newline at end of file
\"api-extractor run --local && copyfiles -u 1 ./_api-extractor-temp/doc-models/* ../../../_api-extractor-temp/\",\n \"build:esnext\": \"tsc --project ./tsconfig.esnext.json\",\n \"build:full\": \"npm run build\",\n \"build:full:compile\": \"npm run build:compile\",\n \"build:genver\": \"gen-version\",\n \"clean\": \"rimraf dist lib *.tsbuildinfo *.build.log\",\n \"eslint\": \"eslint --format stylish src\",\n \"eslint:fix\": \"eslint --format stylish src --fix\",\n \"lint\": \"npm run eslint\",\n \"lint:fix\": \"npm run eslint:fix\",\n \"tsc\": \"tsc\",\n \"tsfmt\": \"tsfmt --verify\",\n \"tsfmt:fix\": \"tsfmt --replace\"\n },\n \"dependencies\": {\n \"@fluidframework/common-definitions\": \"^0.19.1\",\n \"@fluidframework/common-utils\": \"^0.26.0\",\n \"@fluidframework/driver-definitions\": \"^0.31.0\",\n \"@fluidframework/driver-utils\": \"^0.31.0\",\n \"@fluidframework/protocol-definitions\": \"^0.1016.1\",\n \"@fluidframework/telemetry-utils\": \"^0.31.0\",\n \"assert\": \"^2.0.0\",\n \"debug\": \"^4.1.1\"\n },\n \"devDependencies\": {\n \"@fluidframework/build-common\": \"^0.19.2\",\n \"@fluidframework/eslint-config-fluid\": \"^0.21.0\",\n \"@microsoft/api-extractor\": \"^7.7.2\",\n \"@types/assert\": \"^1.5.1\",\n \"@types/debug\": \"^4.1.5\",\n \"@types/mocha\": \"^5.2.5\",\n \"@types/nock\": \"^9.3.0\",\n \"@types/node\": \"^10.17.24\",\n \"@typescript-eslint/eslint-plugin\": \"~4.2.0\",\n \"@typescript-eslint/parser\": \"~4.2.0\",\n \"concurrently\": \"^5.2.0\",\n \"copyfiles\": \"^2.1.0\",\n \"eslint\": \"~7.9.0\",\n \"eslint-plugin-eslint-comments\": \"~3.2.0\",\n \"eslint-plugin-import\": \"~2.22.0\",\n \"eslint-plugin-no-null\": \"~1.0.2\",\n \"eslint-plugin-prefer-arrow\": \"~1.2.2\",\n \"eslint-plugin-react\": \"~7.21.2\",\n \"eslint-plugin-unicorn\": \"~22.0.0\",\n \"mocha\": \"^8.1.1\",\n \"nock\": \"^10.0.1\",\n \"rimraf\": \"^2.6.2\",\n \"typescript\": \"~3.7.4\",\n \"typescript-formatter\": \"7.1.0\"\n }\n}\n", + "content_hash": "61c2b3a30496d329e21af70dd2d7e097046d07b7", + "content_normalized": null, + "matcher": { + "name": "npmbower", + "confidence": 90 + }, + "matched_license": "MIT" +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/scancode_metadata.json b/swh/clearlydefined/tests/data/scancode_metadata.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/scancode_metadata.json @@ -0,0 +1,87 @@ +{ + "path": "package/LICENSE", + "type": "file", + "name": "LICENSE", + "base_name": "LICENSE", + "extension": "", + "size": 1073, + "date": "1985-10-26", + "sha1": "34973274ccef6ab4dfaaf86599792fa9c3fe4689", + "md5": "dc2a37e472c366af2a7b8bd0f2ba5af4", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "licenses": [ + { + "key": "mit", + "score": 97.66, + "name": "MIT License", + "short_name": "MIT License", + "category": "Permissive", + "is_exception": false, + "owner": "MIT", + "homepage_url": "http://opensource.org/licenses/mit-license.php", + "text_url": "http://opensource.org/licenses/mit-license.php", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:mit", + "spdx_license_key": "MIT", + "spdx_url": "https://spdx.org/licenses/MIT", + "start_line": 1, + "end_line": 21, + "matched_rule": { + "identifier": "mit_160.RULE", + "license_expression": "mit", + "licenses": [ + "mit" + ], + "is_license_text": true, + "is_license_notice": false, 
+ "is_license_reference": false, + "is_license_tag": false, + "matcher": "3-seq", + "rule_length": 167, + "matched_length": 167, + "match_coverage": 100, + "rule_relevance": 100 + }, + "matched_text": "The MIT License\n\nCopyright ([c]) [2017] [Google], [Inc].\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE." + } + ], + "license_expressions": [ + "mit" + ], + "holders": [ + { + "value": "Google, Inc.", + "start_line": 3, + "end_line": 3 + } + ], + "copyrights": [ + { + "value": "Copyright (c) 2017 Google, Inc.", + "start_line": 3, + "end_line": 3 + } + ], + "authors": [], + "packages": [], + "emails": [], + "urls": [], + "is_legal": true, + "is_manifest": false, + "is_readme": false, + "is_top_level": true, + "is_key_file": true, + "is_generated": false, + "is_license_text": true, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/test_cli.py b/swh/clearlydefined/tests/test_cli.py new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/test_cli.py @@ -0,0 +1,14 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.clearlydefined.cli import run_orchestration as cli +from click.testing import CliRunner +import json + + +def test_orchestration_from_cli(swh_storage_backend_config, clearcode_dsn): + runner = CliRunner() + result = runner.invoke(cli,[json.dumps(swh_storage_backend_config), clearcode_dsn]) + assert result.exit_code == 0 diff --git a/swh/clearlydefined/tests/test_mapping_utils.py b/swh/clearlydefined/tests/test_mapping_utils.py --- a/swh/clearlydefined/tests/test_mapping_utils.py +++ b/swh/clearlydefined/tests/test_mapping_utils.py @@ -34,6 +34,7 @@ import os from typing import Tuple import pytest +import json content_data = [ @@ -195,6 +196,7 @@ url="http://central.maven.org/maven2/za/co/absa/cobrix/" "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" ), + json.loads(file_data(os.path.join(datadir, "definitions.json"))), ) ], ) @@ -217,6 +219,9 @@ url="http://central.maven.org/maven2/za/co/absa/cobrix/" "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" ), + json.loads( + file_data(os.path.join(datadir, "definitions_sha1git.json")) + ), ) ], ) @@ -236,6 +241,7 @@ "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd", MetadataTargetType.CONTENT, None, + 
diff --git a/swh/clearlydefined/tests/test_mapping_utils.py b/swh/clearlydefined/tests/test_mapping_utils.py
--- a/swh/clearlydefined/tests/test_mapping_utils.py
+++ b/swh/clearlydefined/tests/test_mapping_utils.py
@@ -34,6 +34,7 @@
 import os
 from typing import Tuple
 
 import pytest
+import json
 
 
 content_data = [
@@ -195,6 +196,7 @@
                 url="http://central.maven.org/maven2/za/co/absa/cobrix/"
                 "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar"
             ),
+            json.loads(file_data(os.path.join(datadir, "definitions.json"))),
         )
     ],
 )
@@ -217,6 +219,9 @@
                 url="http://central.maven.org/maven2/za/co/absa/cobrix/"
                 "cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar"
             ),
+            json.loads(
+                file_data(os.path.join(datadir, "definitions_sha1git.json"))
+            ),
         )
     ],
 )
@@ -236,6 +241,7 @@
             "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd",
             MetadataTargetType.CONTENT,
             None,
+            json.loads(file_data(os.path.join(datadir, "scancode_metadata.json"))),
         )
     ],
 )
@@ -255,6 +261,7 @@
             "swh:1:cnt:d81cc0710eb6cf9efd5b920a8453e1e07157b6cd",
             MetadataTargetType.CONTENT,
             None,
+            json.loads(file_data(os.path.join(datadir, "scancode_metadata.json"))),
        )
     ],
 )
@@ -275,6 +282,7 @@
             "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa",
             MetadataTargetType.CONTENT,
             None,
+            json.loads(file_data(os.path.join(datadir, "licensee_metadata.json"))),
         )
     ],
 )
@@ -295,6 +303,7 @@
             "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa",
             MetadataTargetType.CONTENT,
             None,
+            json.loads(file_data(os.path.join(datadir, "licensee_metadata.json"))),
         )
     ],
 )
@@ -314,6 +323,9 @@
             "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa",
             MetadataTargetType.CONTENT,
             None,
+            json.loads(
+                file_data(os.path.join(datadir, "clearlydefined_metadata.json"))
+            ),
         )
     ],
 )
@@ -333,6 +345,9 @@
             "swh:1:cnt:36fade77193cb6d2bd826161a0979d64c28ab4fa",
             MetadataTargetType.CONTENT,
             None,
+            json.loads(
+                file_data(os.path.join(datadir, "clearlydefined_metadata.json"))
+            ),
         )
     ],
 )
diff --git a/swh/clearlydefined/tests/test_orchestrator.py b/swh/clearlydefined/tests/test_orchestrator.py
new file mode 100644
--- /dev/null
+++ b/swh/clearlydefined/tests/test_orchestrator.py
@@ -0,0 +1,165 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.clearlydefined.orchestrator import orchestrator
+from swh.clearlydefined.orchestrator import get_last_run_date
+import psycopg2
+from datetime import datetime
+from typing import Optional
+import gzip
+import uuid
+import os
+from swh.model.model import Content
+
+
+content_data = [
+    Content.from_data(b"42\n"),
+    Content.from_data(b"4242\n"),
+]
+
+
+def add_content_data(swh_storage):
+    swh_storage.content_add(content_data)
+
+
+def file_data(file_name: str) -> str:
+    with open(file_name) as file:
+        data = file.read()
+    return data
+
+
+def file_content(filename: Optional[str], datadir):
+    if not filename:
+        return gzip.compress("".encode("utf-8"), compresslevel=9)
+    else:
+        return gzip.compress(
+            file_data(os.path.join(datadir, filename)).encode("utf-8"), compresslevel=9
+        )
+
+
+def fill_rows_in_table(rows, cursor, connection):
+    for row in rows:
+        cursor.execute(
+            """INSERT INTO clearcode_cditem (path, content, last_modified_date,
+            last_map_date, map_error, uuid) VALUES (%s, %s, %s, %s, %s, %s);""",
+            (
+                row[0],
+                row[1],
+                row[2],
+                row[3],
+                row[4],
+                uuid.uuid4(),
+            ),
+        )
+    connection.commit()
+
+
+def fill_data_before_update_of_storage(connection, cursor, datadir):
+    rows = [
+        (
+            "maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json",
+            file_content("definitions.json", datadir=datadir),
+            datetime(year=2021, month=2, day=1),
+            datetime(year=2021, month=2, day=1),
+            "",
+        ),
+        (
+            "npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/" "3.2.2.json",
+            file_content("scancode_true.json", datadir=datadir),
+            datetime(year=2021, month=2, day=2),
+            datetime(year=2021, month=2, day=2),
+            "",
+        ),
+        (
+            "npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/"
+            "9.13.0.json",
+            file_content("licensee_true.json", datadir=datadir),
+            datetime(year=2021, month=2, day=3),
+            datetime(year=2021, month=2, day=3),
+            "",
+        ),
+        (
+            "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json",
+            file_content("clearlydefined_true.json", datadir=datadir),
+            datetime(year=2021, month=2, day=4),
+            datetime(year=2021, month=2, day=4),
+            "",
+        ),
+        (
+            "maven/mavencentral/za.co.absa.cobrix/cobol/revision/0.4.0.json",
+            file_content("def_not_mapped.json", datadir=datadir),
+            datetime(year=2021, month=2, day=5),
+            datetime(year=2021, month=2, day=5),
+            "",
+        ),
+        (
+            "npm/npmjs/@pixi/mesh-extras/revision/5.3.6/tool/clearlydefined/1.3.4.json",
+            file_content("clearydefined_not_mapped.json", datadir=datadir),
+            datetime(year=2021, month=2, day=6),
+            datetime(year=2021, month=2, day=6),
+            "",
+        ),
+        (
+            "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/fossology/1.3.4.json",
+            file_content(None, datadir=datadir),
+            datetime(year=2021, month=2, day=1),
+            datetime(year=2021, month=2, day=1),
+            "",
+        ),
+    ]
+    fill_rows_in_table(rows=rows, cursor=cursor, connection=connection)
+
+
+def fill_data_after_update_of_storage(connection, cursor, datadir):
+    rows = [
+        (
+            "maven/mavencentral/cobrix/cobol-parser/revision/0.4.0.json",
+            file_content(None, datadir=datadir),
+            datetime(year=2021, month=2, day=1),
+            datetime(year=2021, month=2, day=8),
+            "",
+        ),
+    ]
+    fill_rows_in_table(rows=rows, cursor=cursor, connection=connection)
+
+
+def get_length_of_unmapped_data(connection, cursor) -> int:
+    cursor.execute("SELECT * FROM unmapped_data")
+    rows = cursor.fetchall()
+    return len(rows)
+
+
+def test_orchestrator(swh_storage, clearcode_dsn, datadir):
+    connection = psycopg2.connect(dsn=clearcode_dsn)
+    cursor = connection.cursor()
+    add_content_data(swh_storage)
+    # Fill data in the clearcode database for the first orchestration
+    fill_data_before_update_of_storage(
+        connection=connection, cursor=cursor, datadir=datadir
+    )
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    # Check how much data is unmapped after the first orchestration
+    assert 2 == get_length_of_unmapped_data(connection=connection, cursor=cursor)
+    assert datetime(
+        2021, 2, 6, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)
+    ) == get_last_run_date(cursor=cursor)
+    content_data.extend(
+        [Content.from_data(b"424242\n"), Content.from_data(b"42424242\n")]
+    )
+    add_content_data(swh_storage)
+    # Run orchestration after updating swh storage and
+    # check how much data is unmapped after the second orchestration
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    assert 0 == get_length_of_unmapped_data(connection=connection, cursor=cursor)
+    fill_data_after_update_of_storage(
+        connection=connection, cursor=cursor, datadir=datadir
+    )
+    # Fill new data in the clearcode database and
+    # check how much data is unmapped after the third orchestration
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    assert 1 == get_length_of_unmapped_data(connection=connection, cursor=cursor)
+    # Check how much data is unmapped when the archive was not updated
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    assert 1 == get_length_of_unmapped_data(connection=connection, cursor=cursor)