diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -1,3 +1,56 @@ # swh-clearlydefined ClearlyDefined metadata Fetcher for Software Heritage + + +Installation of Clearcode Toolkit and running it +================================================ + +https://github.com/nexb/clearcode-toolkit#quick-start-using-a-database-storage + + +Setting up SWH-CLEARLYDEFINED +============================= + +* pip3 install -r requirements-swh.txt + + +Running of SWH-CLEARLYDEFINED metadata fetcher +============================================== + +* Create a config file (sample config file) +* Then pass command "swh clearlydefined [OPTIONS] fill_storage" +* OPTIONS - +- -C, --config-file + + Configuration file (default: /home/jenkins/.config/swh/global.yml) +- --clearcode-dsn + Sample DSN : "dbname=clearcode user=postgres host=127.0.0.1 port=32552 options=''" + +* Sample command looks like this: +swh clearlydefined -C /path/to/file --clearcode-dsn dbname=clearcode user=postgres host=127.0.0.1 port=32552 options='' fill_storage + +* Set a sample command like this on a cron tab that will fill data periodically + +Example to run this command weekly at 8:00 am on Sunday: + 0 8 * * 0 swh clearlydefined -C /path/to/file --clearcode-dsn "dbname=clearcode user=clearcode host=127.0.0.1 port=32552 options=''" fill_storage + + +Architecture +============ + +When the user runs the above command, it activates the orchestration process. + +Orchestration Process - Fetches data from the clearcode toolkit DB and then tries to map it with SWH Storage; the data which is able to be mapped (based on +mapping status) is written in the RawExtrinsicMetadata table of SWH Storage, and data that could not be mapped is stored in a state, so that it can +be mapped in the future (after SWH storage is updated). + +Mapping Process - Clearcode toolkit majorly contains two types of row data, one is definitions and second is harvest. 
Harvests can further be classified as 4 +types for now (more harvest tools can be used in future) Clearlydefined, Licensee, Scancode, Fossology. Definitions can contain sha1 or sha1git and if it +is able to be mapped we send mapping status true else false. Harvests of type Clearlydefined, Licensee, Scancode contain a list of sha1 data and if we are +able to map every sha1 from that list we send mapping status as true else false and since Harvests of Fossology don't contain any data that can be mapped +with SWH storage, we ignore it and neither try to map it nor store it in the state + +Mapping of Sha1 and Sha1git - Sha1 is tried to be mapped with "content" table, if it exists in "content" table then SWHID is made using the respective +sha1git of that sha1 like this "swh:cnt:(sha1git)" and if it contains sha1git, then it is mapped using "revision" table, if it exists in "revision" table +then SWHID is made like this "swh:rev:(sha1git)". diff --git a/conftest.py b/conftest.py --- a/conftest.py +++ b/conftest.py @@ -1 +1,31 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from os import environ, path + +import pytest + +import swh.clearlydefined + +from swh.core.db.pytest_plugin import postgresql_fact + +SQL_DIR = path.join(path.dirname(swh.clearlydefined.__file__), "sql") + +environ["LC_ALL"] = "C.UTF-8" pytest_plugins = ["swh.storage.pytest_plugin"] + +swh_clearcode = postgresql_fact( + "postgresql_proc", db_name="clearcode", dump_files=path.join(SQL_DIR, "*.sql") +) + + +@pytest.fixture +def clearcode_dsn(swh_clearcode): + """Basic pg storage configuration with no journal collaborator + (to avoid pulling optional dependency on clients of this fixture) + + """ + clearcode_dsn = swh_clearcode.dsn + return clearcode_dsn diff --git a/swh/clearlydefined/cli.py 
b/swh/clearlydefined/cli.py --- a/swh/clearlydefined/cli.py +++ b/swh/clearlydefined/cli.py @@ -1,19 +1,54 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os + import click +from swh.clearlydefined.orchestrator import orchestrator from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group +from swh.storage import get_storage @swh_cli_group.group(name="clearlydefined", context_settings=CONTEXT_SETTINGS) +@click.option( + "--config-file", + "-C", + default=None, + type=click.Path( + exists=True, + dir_okay=False, + ), + help="SWH storage config.", +) +@click.option("--clearcode-dsn", default=None, type=click.STRING, help="Clearcode DSN.") @click.pass_context -def clearlydefined_cli_group(ctx): - """Foo main command. - """ +def clearlydefined(ctx, config_file, clearcode_dsn): + """Software Heritage Clearlydefined Metadata Fetcher""" + from swh.core import config + + if config_file: + if not os.path.exists(config_file): + raise ValueError("%s does not exist" % config_file) + conf = config.read(config_file) + else: + conf = {} + + if "storage" not in conf: + ctx.fail("You must have a storage configured in your config file.") + + ctx.ensure_object(dict) + ctx.obj["config"] = conf + ctx.obj["dsn"] = clearcode_dsn -@clearlydefined_cli_group.command() -@click.option("--bar", help="Something") +@clearlydefined.command(name="fill_storage") @click.pass_context -def bar(ctx, bar): - """Do something.""" - click.echo("bar") +def run_orchestration(ctx): + print(ctx.obj["config"]["storage"]) + storage = get_storage(**ctx.obj["config"]["storage"]) + clearcode_dsn = ctx.obj["dsn"] + orchestrator(storage=storage, clearcode_dsn=clearcode_dsn) diff --git a/swh/clearlydefined/mapping_utils.py b/swh/clearlydefined/mapping_utils.py --- 
a/swh/clearlydefined/mapping_utils.py +++ b/swh/clearlydefined/mapping_utils.py @@ -43,14 +43,14 @@ AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="https://clearlydefined.io/", - metadata=None, + metadata={}, ) FETCHER = MetadataFetcher( name="swh-clearlydefined", version="0.0.1", - metadata=None, + metadata={}, ) diff --git a/swh/clearlydefined/orchestrator.py b/swh/clearlydefined/orchestrator.py new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/orchestrator.py @@ -0,0 +1,206 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import datetime +from typing import Optional + +import psycopg2 + +from swh.clearlydefined.mapping_utils import ( + AUTHORITY, + FETCHER, + get_type_of_tool, + map_row, +) +from swh.model.model import RawExtrinsicMetadata +from swh.storage.interface import StorageInterface + + +class Row: + def __init__(self, path, metadata, date): + self.path = path + self.metadata = metadata + self.date = date + + +def write_in_storage( + storage: StorageInterface, + metadata: RawExtrinsicMetadata, +) -> None: + """ + Take storage and metadata as input + and add metadata in storage + """ + storage.raw_extrinsic_metadata_add([metadata]) + + +def init_storage(storage: StorageInterface) -> None: + """ + Take storage as input and add MetadataFetcher, MetadataAuthority inside storage + """ + storage.metadata_authority_add([AUTHORITY]) + storage.metadata_fetcher_add([FETCHER]) + + +def write_next_date( + cursor, update_connection, previous_date: Optional[datetime], new_date: datetime +) -> None: + """ + Take cursor, update_connection, previous_date, new_date as input + and if it previous_date is None, then enter new_date, else + update the date stored in table with new_date + """ + if not previous_date: + 
cursor.execute( + """INSERT into clearcode_env (key, value) VALUES(%s,%s)""", + ("date", new_date), + ) + else: + cursor.execute( + """UPDATE clearcode_env SET value = %s WHERE key='date'""", + (new_date,), + ) + update_connection.commit() + + +def get_last_run_date(cursor) -> Optional[datetime]: + """ + Take cursor as input and get last run date from which + new rows will be orchestered, return None if it's first + orchestration + """ + cursor.execute("SELECT value FROM clearcode_env WHERE key='date';") + rows = cursor.fetchall() + if len(rows) < 1: + return None + date = rows[0][0] + return date + + +def orchestrate_row(storage: StorageInterface, cursor, connection, row: Row) -> bool: + """ + Take storage, cursor, connection, row as input + and if able to completely map that row then write + data in storage, else store the ID in state + """ + able_to_be_mapped = map_row( + metadata=row.metadata, id=row.path, date=row.date, storage=storage + ) + if not able_to_be_mapped: + # This is a case when no metadata of row is not able to be mapped + write_in_not_mapped( + cd_path=row.path, cursor=cursor, write_connection=connection + ) + return False + else: + # This is a case when partial metadata of that row is able to be mapped + mapping_status, metadata_list = able_to_be_mapped + if not mapping_status: + write_in_not_mapped( + cd_path=row.path, cursor=cursor, write_connection=connection + ) + for data in metadata_list: + write_in_storage(storage=storage, metadata=data) + return mapping_status + + +def map_previously_unmapped_data(storage: StorageInterface, cursor, connection) -> None: + """ + Take storage, cursor, connection as input and map previously + unmapped data + """ + cursor.execute("SELECT path FROM unmapped_data ;") + rows = cursor.fetchall() + for row in rows: + cd_path = row[0] + cursor.execute( + """SELECT path,content,last_modified_date FROM + clearcode_cditem WHERE path=%s;""", + (cd_path,), + ) + unmapped_row = cursor.fetchall()[0] + if orchestrate_row( 
+ storage=storage, + row=unmapped_row, + cursor=cursor, + connection=connection, + ): + cursor.execute("DELETE FROM unmapped_data WHERE path=%s", (cd_path,)) + connection.commit() + + +def write_in_not_mapped(cursor, write_connection, cd_path: str) -> None: + """ + Take cursor, write_connection, cd_path as input + and write 'cd_path' if 'cd_path' does not exists + inside unmapped_data + """ + cursor.execute( + "INSERT INTO unmapped_data (path) VALUES (%s) ON CONFLICT (path) DO NOTHING;", + (cd_path,), + ) + write_connection.commit() + return + + +def read_from_clearcode_and_write_in_swh( + storage: StorageInterface, cursor, connection, date: Optional[datetime] +) -> None: + """ + Take storage, cursor, connection, date as input + and read from clearcode database and write only + the data that is discovered after 'date' in swh storage. + 'date' is the last discovery date of the object that was + stored at the time of previous run. + """ + if date: + cursor.execute( + "SELECT path,content,last_modified_date FROM clearcode_cditem " + "WHERE last_modified_date < %s " + "ORDER BY last_modified_date DESC;", + (date,), + ) + else: + cursor.execute( + """SELECT path,content,last_modified_date FROM clearcode_cditem + ORDER BY last_modified_date DESC;""" + ) + rows = cursor.fetchall() + if len(rows) < 1: + return + new_date = rows[0][2] + write_next_date( + cursor=cursor, + update_connection=connection, + previous_date=date, + new_date=new_date, + ) + for row in rows: + tool = get_type_of_tool(row[0]).value + if tool == "fossology": + pass + else: + orchestrate_row( + storage=storage, + cursor=cursor, + connection=connection, + row=Row(path=row[0], metadata=row[1], date=row[2]), + ) + + +def orchestrator(storage: StorageInterface, clearcode_dsn: str) -> None: + """ + Take clearcode_dsn, swh_storage_backend_config as input + and write data periodically from clearcode database to + swh raw extrensic metadata + """ + connection = psycopg2.connect(dsn=clearcode_dsn) + cursor = 
connection.cursor() + init_storage(storage=storage) + map_previously_unmapped_data(storage=storage, cursor=cursor, connection=connection) + date = get_last_run_date(cursor=cursor) + read_from_clearcode_and_write_in_swh( + storage=storage, cursor=cursor, connection=connection, date=date + ) diff --git a/swh/clearlydefined/sql/30-schema.sql b/swh/clearlydefined/sql/30-schema.sql new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/sql/30-schema.sql @@ -0,0 +1,55 @@ +--- +--- SQL implementation of the Clearlydefined data +--- + +-- schema versions +create table dbversion +( + version int primary key, + release timestamptz, + description text +); + +comment on table dbversion is 'Details of current db version'; +comment on column dbversion.version is 'SQL schema version'; +comment on column dbversion.release is 'Version deployment timestamp'; +comment on column dbversion.description is 'Release description'; + +-- latest schema version +insert into dbversion(version, release, description) + values(1, now(), 'Work In Progress'); + +--schema clearcode_cditem +create table clearcode_cditem( + path varchar(2048) primary key, + content bytea not null, + last_modified_date timestamptz not null, + last_map_date timestamptz, + map_error text, + uuid uuid not null +); + +comment on table clearcode_cditem is 'Data of clearcode_toolkit'; +comment on column clearcode_cditem.path is 'ID'; +comment on column clearcode_cditem.content is 'Metadata content'; +comment on column clearcode_cditem.last_modified_date is 'Last date of updation'; +comment on column clearcode_cditem.last_map_date is 'Last date of mapping'; +comment on column clearcode_cditem.map_error is 'Mapping error'; +comment on column clearcode_cditem.uuid is 'UUID'; + +--schema unmapped_data +create table unmapped_data( + path varchar primary key +); +comment on table unmapped_data is 'Unmapped Data of clearcode_toolkit'; +comment on column unmapped_data.path is 'ID'; + +--schema clearcode_env +create table 
clearcode_env( + key text primary key, + value text +); + +comment on table clearcode_env is 'Environment variables of clearcode_toolkit'; +comment on column clearcode_env.key is 'Name of variable'; +comment on column clearcode_env.key is 'Value of variable'; diff --git a/swh/clearlydefined/tests/data/README.md b/swh/clearlydefined/tests/data/README.md new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/README.md @@ -0,0 +1,31 @@ +clearlydefined_metadata_2 - This file is used for matching mapped data from clearlydefined_true + +clearlydefined_metadata - This file is used for matching mapped data from clearlydefined_true + +clearlydefined_true - This file is used for feeding as input as a mock metadata (getting True as mapping status, row type clearlydefined) + +clearlydefined - This file is used for feeding as input as a mock metadata (getting False as mapping status, row type clearlydefined) + +clearlydefined_not_mapped - This file is used for testing orchestrator, feeding as input as a mock metadata that will not be mapped in first orchestration + +def_not_mapped - This file is used for testing orchestrator, feeding as input as a mock metadata that will not be mapped in first orchestration + +definitions_not_mapped_sha1_git - This file is used for feeding as input as a mock metadata (getting False as mapping status, row type definition sha1git) + +definitions_not_mapped - This file is used for feeding as input as a mock metadata (getting False as mapping status, row type definition sha1) + +definitions_sha1git - This file is used for feeding as input as a mock metadata (getting True as mapping status, row type definition sha1 git) + +definitions - This file is used for feeding as input as a mock metadata (getting True as mapping status, row type definition sha1) + +licensee_metadata - This file is used for matching mapped data from licensee_true + +licensee_true - This file is used for feeding as input as a mock metadata (getting True as 
mapping status, row type licensee) + +licensee - This file is used for feeding as input as a mock metadata (getting False as mapping status, row type licensee) + +scancode_metadata - This file is used for matching mapped data from scancode_true + +scancode_true - This file is used for feeding as input as a mock metadata (getting True as mapping status, row type scancode) + +scancode - This file is used for feeding as input as a mock metadata (getting False as mapping status, row type scancode) diff --git a/swh/clearlydefined/tests/data/clearydefined_not_mapped.json b/swh/clearlydefined/tests/data/clearydefined_not_mapped.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/clearydefined_not_mapped.json @@ -0,0 +1,212 @@ +{ + "_metadata": { + "type": "npm", + "url": "cd:/npm/npmjs/@pixi/mesh-extras/5.3.5", + "fetchedAt": "2020-12-18T10:26:42.827Z", + "links": { + "self": { + "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:clearlydefined:1.3.4", + "type": "resource" + }, + "siblings": { + "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:clearlydefined", + "type": "collection" + }, + "licensee": { + "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:licensee", + "type": "collection" + }, + "scancode": { + "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:scancode", + "type": "collection" + }, + "source": { + "href": "urn:git:github:pixijs:pixi.js:revision:b5353da2693f0112230cd2b1be581f9bff0ce2a1", + "type": "resource" + } + }, + "schemaVersion": "1.3.4", + "toolVersion": "1.1.4", + "processedAt": "2020-12-18T10:26:43.254Z" + }, + "attachments": [ + { + "path": "package/LICENSE", + "token": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b" + }, + { + "path": "package/package.json", + "token": "85b85a7807ba4fdcd78d622e40497bdbe5f1b3346cdda1fe2cb859224de3d598" + } + ], + "summaryInfo": { + "k": 317, + "count": 11, + "hashes": { + "sha1": "c7e6ec806b594c8d5520ca3ffa87bdad27374411", + 
"sha256": "2988c69d931c63cfbcef7b8f2b8e051ea91626bb942596823df83bf4b3e36841" + } + }, + "files": [ + { + "path": "package/LICENSE", + "hashes": { + "sha1": "385f736f1ad8f5743cf2681b154d314f9cf48db8", + "sha256": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b" + } + } + ], + "package.json": { + "name": "@pixi/mesh-extras", + "version": "5.3.5", + "main": "lib/mesh-extras.js", + "module": "lib/mesh-extras.es.js", + "bundle": "dist/mesh-extras.js", + "description": "Custom Mesh display objects, like Rope and SimplePlane", + "author": "Mat Groves", + "contributors": [ + "Matt Karl " + ], + "homepage": "http://pixijs.com/", + "bugs": "https://github.com/pixijs/pixi.js/issues", + "license": "MIT", + "repository": { + "type": "git", + "url": "https://github.com/pixijs/pixi.js.git" + }, + "publishConfig": { + "access": "public" + }, + "files": [ + "lib", + "dist" + ], + "dependencies": { + "@pixi/constants": "5.3.5", + "@pixi/core": "5.3.5", + "@pixi/math": "5.3.5", + "@pixi/mesh": "5.3.5", + "@pixi/utils": "5.3.5" + }, + "devDependencies": { + "@pixi/loaders": "5.3.5" + }, + "gitHead": "b5353da2693f0112230cd2b1be581f9bff0ce2a1" + }, + "registryData": { + "_id": "@pixi/mesh-extras", + "_rev": "37-99af22664aed7bcd0476efccf57088fc", + "name": "@pixi/mesh-extras", + "dist-tags": { + "latest": "5.3.5", + "next": "5.0.0-alpha.3", + "latest-5.1.x": "5.1.6", + "prerelease": "5.4.0-rc.3", + "latest-5.2.x": "5.2.5" + }, + "maintainers": [ + { + "name": "bigtimebuddy", + "email": "matt@mattkarl.com" + } + ], + "description": "Custom Mesh display objects, like Rope and SimplePlane", + "homepage": "http://pixijs.com/", + "repository": { + "type": "git", + "url": "git+https://github.com/pixijs/pixi.js.git" + }, + "contributors": [ + { + "name": "Matt Karl", + "email": "matt@mattkarl.com" + } + ], + "author": { + "name": "Mat Groves" + }, + "bugs": { + "url": "https://github.com/pixijs/pixi.js/issues" + }, + "license": "MIT", + "readme": "# @pixi/mesh-extras\n\n## 
Installation\n\n```bash\nnpm install @pixi/mesh-extras\n```\n\n## Usage\n\n```js\nimport { MeshRenderer } from '@pixi/mesh';\nimport { Renderer } from '@pixi/core';\nimport { Rope } from '@pixi/mesh-extras';\n\nRenderer.registerPlugin('mesh', MeshRenderer);\n\nconst rope = new Rope();\n```", + "readmeFilename": "README.md", + "manifest": { + "name": "@pixi/mesh-extras", + "version": "5.3.5", + "main": "lib/mesh-extras.js", + "module": "lib/mesh-extras.es.js", + "bundle": "dist/mesh-extras.js", + "description": "Custom Mesh display objects, like Rope and SimplePlane", + "author": { + "name": "Mat Groves" + }, + "contributors": [ + { + "name": "Matt Karl", + "email": "matt@mattkarl.com" + } + ], + "homepage": "http://pixijs.com/", + "bugs": { + "url": "https://github.com/pixijs/pixi.js/issues" + }, + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/pixijs/pixi.js.git" + }, + "publishConfig": { + "access": "public" + }, + "dependencies": { + "@pixi/constants": "5.3.5", + "@pixi/core": "5.3.5", + "@pixi/math": "5.3.5", + "@pixi/mesh": "5.3.5", + "@pixi/utils": "5.3.5" + }, + "devDependencies": { + "@pixi/loaders": "5.3.5" + }, + "gitHead": "b5353da2693f0112230cd2b1be581f9bff0ce2a1", + "_id": "@pixi/mesh-extras@5.3.5", + "_nodeVersion": "10.19.0", + "_npmVersion": "lerna/3.13.3/node@v10.19.0+x64 (darwin)", + "_npmUser": { + "name": "bigtimebuddy", + "email": "matt@mattkarl.com" + }, + "dist": { + "integrity": "sha512-47oHFkxUWQikB+7RT4ZcCecwc/GaKFSbZm6SL7dG12mQ+N5nGAZkWBIDw2d/Ai9PxMHo4ciUCjcS7el0SHkBjA==", + "shasum": "c7e6ec806b594c8d5520ca3ffa87bdad27374411", + "tarball": "https://registry.npmjs.org/@pixi/mesh-extras/-/mesh-extras-5.3.5.tgz", + "fileCount": 11, + "unpackedSize": 308318, + "npm-signature": "-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v3.0.13\r\nComment: 
https://openpgpjs.org\r\n\r\nwsFcBAEBCAAQBQJf26KUCRA9TVsSAnZWagAA/rAP/3UY+d5ycZ5QpSxO9Ke6\nrJEFC0jBfGRprbf4BGgvFil5HXjFX+s0xy5VB9AcSl+bPEDK6LIr6FZrCS85\nMBh4yRJXX+17mTfoSVaoLYA2UEl9fBhIGrryyObSF81TdAOhpKRmU5aPJ4hV\nORmdhetWLj4EOaHFt1pb0f62tVDrZdu8GU+TYqmU8ZpNoIfuw6iC8B79t1+R\njSxDqEUfVxCN3P/JTjakCP/oSqLOf1VSdUq/0wmyE8cdPLw+l8s20t5T7CQj\nCwe5jgVhL+Pg0z1rIQxJqRUzjGXZlnEdt+B5fIhHIcNbKQFtN/JMa1JcwjNA\nY9oAD8eHxGElK5gOtfFEYNqOlUPaeI0iupuaOK1gNVcGeKmvHZleE9dVsYrB\njM6d34F/mI6RPSw/ABOV1v7USWdxKe+f1paNSSMYbJNbehBYttDYxkXkgrLM\nynz7ru1EdnHACpJmcZVkZj/3hTZ0o7YDgRHpbLfIxAc2ETqr7ryN0y37uMAS\ns2kuaC89e6k7pd+91vEHQZtLKhMMtvivBYmdH2ZQWirU34kA9z+l5ux+NAoo\nWR9jfn/8+7S5SB4v0HFSzsgVbAxibuUaNHSvG39eBs9Au14uBdsasTwxaLwN\nRFwiTTJNS4fiCz165VCE08kI4uzrJAyqtblc9+z1gsmtWScpuDN7gJVrfsjx\nPSph\r\n=rg4i\r\n-----END PGP SIGNATURE-----\r\n" + }, + "directories": {}, + "maintainers": [ + { + "name": "bigtimebuddy", + "email": "matt@mattkarl.com" + } + ], + "_npmOperationalInternal": { + "host": "s3://npm-registry-packages", + "tmp": "tmp/mesh-extras_5.3.5_1608229524025_0.8091580391334361" + }, + "_hasShrinkwrap": false + }, + "releaseDate": "2020-12-17T18:25:24.267Z" + }, + "sourceInfo": { + "type": "git", + "provider": "github", + "namespace": "pixijs", + "name": "pixi.js", + "revision": "b5353da2693f0112230cd2b1be581f9bff0ce2a1", + "url": null, + "path": null + } +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/data/def_not_mapped.json b/swh/clearlydefined/tests/data/def_not_mapped.json new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/data/def_not_mapped.json @@ -0,0 +1,88 @@ +{ + "described": { + "releaseDate": "2019-03-29", + "sourceLocation": { + "type": "sourcearchive", + "provider": "mavencentral", + "namespace": "za.co.absa.cobrix", + "name": "cobol-parser", + "revision": "0.4.0", + "url": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar" + }, + "urls": { + "registry": 
"http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser", + "version": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0", + "download": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0/cobol-parser-0.4.0.jar" + }, + "hashes": { + "sha1": "3e21cc4942a4234c9e5edd8a9cacd1670fe59f13", + "sha256": "2bf17e47907dc3dfa64fc17ae6ef71b54d96a79a740f3b7a618104d4281656f0" + }, + "files": 261, + "tools": [ + "clearlydefined/1.5.0", + "scancode/3.2.2" + ], + "toolScore": { + "total": 100, + "date": 30, + "source": 70 + }, + "score": { + "total": 100, + "date": 30, + "source": 70 + } + }, + "licensed": { + "declared": "Apache-2.0", + "toolScore": { + "total": 60, + "declared": 30, + "discovered": 0, + "consistency": 15, + "spdx": 15, + "texts": 0 + }, + "facets": { + "core": { + "attribution": { + "unknown": 260, + "parties": [ + "Copyright 2018-2019 ABSA Group Limited" + ] + }, + "discovered": { + "unknown": 260, + "expressions": [ + "Apache-2.0" + ] + }, + "files": 261 + } + }, + "score": { + "total": 60, + "declared": 30, + "discovered": 0, + "consistency": 15, + "spdx": 15, + "texts": 0 + } + }, + "coordinates": { + "type": "maven", + "provider": "mavencentral", + "namespace": "za.co.absa.cobrix", + "name": "cobol-parser", + "revision": "0.4.0" + }, + "_meta": { + "schemaVersion": "1.6.1", + "updated": "2019-11-04T05:20:21.308Z" + }, + "scores": { + "effective": 80, + "tool": 80 + } +} \ No newline at end of file diff --git a/swh/clearlydefined/tests/test_cli.py b/swh/clearlydefined/tests/test_cli.py new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/test_cli.py @@ -0,0 +1,33 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import tempfile + +from click.testing import CliRunner +import yaml + +from 
swh.clearlydefined.cli import clearlydefined as cli + + +def test_orchestration_from_cli(swh_storage_backend_config, clearcode_dsn): + config = {"storage": swh_storage_backend_config} + with tempfile.NamedTemporaryFile("a", suffix=".yml") as config_fd: + yaml.dump(config, config_fd) + config_fd.seek(0) + runner = CliRunner() + result = runner.invoke( + cli, + ["-C", config_fd.name, "--clearcode-dsn", clearcode_dsn, "fill_storage"], + ) + assert result.exit_code == 0 + + +def test_cli_with_config_without_storage(swh_storage_backend_config, clearcode_dsn): + runner = CliRunner() + result = runner.invoke( + cli, + ["--clearcode-dsn", clearcode_dsn, "fill_storage"], + ) + assert result.exit_code == 2 diff --git a/swh/clearlydefined/tests/test_orchestrator.py b/swh/clearlydefined/tests/test_orchestrator.py new file mode 100644 --- /dev/null +++ b/swh/clearlydefined/tests/test_orchestrator.py @@ -0,0 +1,169 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import datetime +import gzip +import os +from typing import Optional, Tuple, List +import uuid + +import psycopg2 + +from swh.clearlydefined.orchestrator import get_last_run_date, orchestrator +from swh.model.model import Content + +content_data = [ + Content.from_data(b"42\n"), + Content.from_data(b"4242\n"), +] + + +def add_content_data(swh_storage): + swh_storage.content_add(content_data) + + +def file_data(file_name: str) -> str: + with open(file_name) as file: + return file.read() + + +def gzip_compress_data(filename: Optional[str], datadir) -> bytes: + """ + Take filename as input + and return gzip compressed + data for that filename + """ + if not filename: + return gzip.compress("".encode("utf-8"), compresslevel=9) + else: + return gzip.compress( + file_data(os.path.join(datadir, 
filename)).encode("utf-8"), compresslevel=9 + ) + + +def fill_rows_in_table( + rows: List[Tuple[str, bytes, datetime, datetime, str]], cursor, connection +): + """ + Take rows as input and store + those rows in clearcode_cditem table + """ + for row in rows: + cursor.execute( + """INSERT INTO clearcode_cditem (path, content, last_modified_date, + last_map_date, map_error, uuid) VALUES (%s, %s, %s, %s, %s, %s);""", + ( + *row, + uuid.uuid4(), + ), + ) + connection.commit() + + +def fill_data_before_updation_of_storage(connection, cursor, datadir): + rows = [ + ( + "maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json", + gzip_compress_data("definitions.json", datadir=datadir), + datetime(year=2021, month=2, day=1), + datetime(year=2021, month=2, day=1), + "", + ), + ( + "npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/" "3.2.2.json", + gzip_compress_data("scancode_true.json", datadir=datadir), + datetime(year=2021, month=2, day=2), + datetime(year=2021, month=2, day=2), + "", + ), + ( + "npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/" + "9.13.0.json", + gzip_compress_data("licensee_true.json", datadir=datadir), + datetime(year=2021, month=2, day=3), + datetime(year=2021, month=2, day=3), + "", + ), + ( + "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json", + gzip_compress_data("clearlydefined_true.json", datadir=datadir), + datetime(year=2021, month=2, day=4), + datetime(year=2021, month=2, day=4), + "", + ), + ( + "maven/mavencentral/za.co.absa.cobrix/cobol/revision/0.4.0.json", + gzip_compress_data("def_not_mapped.json", datadir=datadir), + datetime(year=2021, month=2, day=5), + datetime(year=2021, month=2, day=5), + "", + ), + ( + "npm/npmjs/@pixi/mesh-extras/revision/5.3.6/tool/clearlydefined/1.3.4.json", + gzip_compress_data("clearydefined_not_mapped.json", datadir=datadir), + datetime(year=2021, month=2, day=6), + datetime(year=2021, month=2, day=6), + "", + ), + ( + 
"npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/fossology/1.3.4.json", + gzip_compress_data(None, datadir=datadir), + datetime(year=2021, month=2, day=1), + datetime(year=2021, month=2, day=1), + "", + ), + ] + fill_rows_in_table(rows=rows, cursor=cursor, connection=connection) + + +def fill_data_after_updation_of_storage(connection, cursor, datadir): + rows = [ + ( + "maven/mavencentral/cobrix/cobol-parser/revision/0.4.0.json", + gzip_compress_data(None, datadir=datadir), + datetime(year=2021, month=2, day=1), + datetime(year=2021, month=2, day=8), + "", + ), + ] + fill_rows_in_table(rows=rows, cursor=cursor, connection=connection) + + +def get_length_of_unmapped_data(connection, cursor) -> int: + cursor.execute("SELECT COUNT(*) FROM unmapped_data") + count = cursor.fetchall()[0][0] + return count + + +def test_orchestrator(swh_storage, clearcode_dsn, datadir): + connection = psycopg2.connect(dsn=clearcode_dsn) + cursor = connection.cursor() + add_content_data(swh_storage) + # Fill data in clearcode database, for first time orchestration + fill_data_before_updation_of_storage( + connection=connection, cursor=cursor, datadir=datadir + ) + orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn) + # Check how much data is unmapped after first orchestration + assert 2 == get_length_of_unmapped_data(connection=connection, cursor=cursor) + assert "2021-02-06 00:00:00+00" == get_last_run_date(cursor=cursor) + content_data.extend( + [Content.from_data(b"424242\n"), Content.from_data(b"42424242\n")] + ) + add_content_data(swh_storage) + # Run orchestration after insertion in swh storage and + # check how much data is unmapped after second orchestration + orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn) + assert 0 == get_length_of_unmapped_data(connection=connection, cursor=cursor) + fill_data_after_updation_of_storage( + connection=connection, cursor=cursor, datadir=datadir + ) + # Fill new data in clearcode database and + # check how much data is 
unmapped after second orchestration + orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn) + assert 1 == get_length_of_unmapped_data(connection=connection, cursor=cursor) + # Check how much data is unmapped when archive was not updated + orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn) + assert 1 == get_length_of_unmapped_data(connection=connection, cursor=cursor)