Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/README.md b/README.md
index 2b4212f..3e497d0 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,56 @@
# swh-clearlydefined
ClearlyDefined metadata Fetcher for Software Heritage
+
+
+Installation of Clearcode Toolkit and running it
+================================================
+
+https://github.com/nexb/clearcode-toolkit#quick-start-using-a-database-storage
+
+
+Setting up SWH-CLEARLYDEFINED
+=============================
+
+* pip3 install -r requirements-swh.txt
+
+
+Running of SWH-CLEARLYDEFINED metadata fetcher
+==============================================
+
+* Create a config file (sample config file)
+* Then pass command "swh clearlydefined [OPTIONS] fill_storage"
+* OPTIONS -
+- -C, --config-file <config_file>
+
+ Configuration file (default: /home/jenkins/.config/swh/global.yml)
+- --clearcode-dsn
+ Sample DSN : "dbname=clearcode user=postgres host=127.0.0.1 port=32552 options=''"
+
+* Sample command looks like this:
+swh clearlydefined -C /path/to/file --clearcode-dsn dbname=clearcode user=postgres host=127.0.0.1 port=32552 options='' fill_storage
+
+* Set a sample command like this on a cron tab that will fill data periodically
+
+Example: run this command weekly at 8:00 am on Sunday:
+ 0 8 * * 0 swh clearlydefined -C /path/to/file --clearcode-dsn "dbname=clearcode user=clearcode host=127.0.0.1 port=32552 options=''" fill_storage
+
+
+Architecture
+============
+
+When user gives above command, it activates orchestration process.
+
+Orchestration Process - Fetches data from the clearcode toolkit DB and then tries to map it with SWH Storage. The data that can be mapped (based on
+mapping status) is written to the RawExtrinsicMetadata table of SWH Storage, and the data that cannot be mapped is stored in a state, so that it can
+be mapped in the future (after an update of SWH storage).
+
+Mapping Process - The clearcode toolkit mainly contains two types of row data: definitions and harvests. Harvests can further be classified into 4
+types for now (more harvest tools can be used in the future): Clearlydefined, Licensee, Scancode, Fossology. Definitions can contain a sha1 or a sha1git, and if it
+can be mapped we set the mapping status to true, else false. Harvests of type Clearlydefined, Licensee and Scancode contain a list of sha1 data, and if we are
+able to map every sha1 from that list we set the mapping status to true, else false. Since Fossology harvests don't contain any data that can be mapped
+with SWH storage, we ignore them and neither try to map them nor store them in the state.
+
+Mapping of Sha1 and Sha1git - A sha1 is mapped against the "content" table; if it exists in the "content" table, a SWHID is made using the respective
+sha1git of that sha1, like this: "swh:cnt:(sha1git)". If the row contains a sha1git, it is mapped using the "revision" table; if it exists in the "revision" table,
+a SWHID is made like this: "swh:rev:(sha1git)".
diff --git a/conftest.py b/conftest.py
index f12587a..810c682 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1 +1,30 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from os import environ, path
+
+import pytest
+
+import swh.clearlydefined
+from swh.core.db.pytest_plugin import postgresql_fact
+
+# Directory containing the SQL files that initialize the test database schema.
+SQL_DIR = path.join(path.dirname(swh.clearlydefined.__file__), "sql")
+
+# Force the C.UTF-8 locale for everything spawned during the test session.
+environ["LC_ALL"] = "C.UTF-8"
 pytest_plugins = ["swh.storage.pytest_plugin"]
+
+# Postgres fixture: an ephemeral "clearcode" database loaded with the
+# project's SQL schema files (swh/clearlydefined/sql/*.sql).
+swh_clearcode = postgresql_fact(
+    "postgresql_proc", db_name="clearcode", dump_files=path.join(SQL_DIR, "*.sql")
+)
+
+
+@pytest.fixture
+def clearcode_dsn(swh_clearcode):
+    """Return the DSN of the ephemeral "clearcode" test database
+    provisioned by the ``swh_clearcode`` fixture.
+
+    """
+    clearcode_dsn = swh_clearcode.dsn
+    return clearcode_dsn
diff --git a/swh/clearlydefined/cli.py b/swh/clearlydefined/cli.py
index 5e98570..56d3d77 100644
--- a/swh/clearlydefined/cli.py
+++ b/swh/clearlydefined/cli.py
@@ -1,19 +1,54 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
import click
+from swh.clearlydefined.orchestrator import orchestrator
from swh.core.cli import CONTEXT_SETTINGS
from swh.core.cli import swh as swh_cli_group
+from swh.storage import get_storage
@swh_cli_group.group(name="clearlydefined", context_settings=CONTEXT_SETTINGS)
+@click.option(
+    "--config-file",
+    "-C",
+    default=None,
+    type=click.Path(
+        exists=True,
+        dir_okay=False,
+    ),
+    help="SWH storage config.",
+)
+@click.option("--clearcode-dsn", default=None, type=click.STRING, help="Clearcode DSN.")
@click.pass_context
-def clearlydefined_cli_group(ctx):
-    """Foo main command.
-    """
+def clearlydefined(ctx, config_file, clearcode_dsn):
+    """Software Heritage Clearlydefined Metadata Fetcher"""
+    # Imported lazily so plain `swh --help` does not pay the import cost.
+    from swh.core import config
+
+    if config_file:
+        # NOTE(review): click.Path(exists=True) above already validates that
+        # the file exists, so this check looks redundant — confirm before
+        # removing.
+        if not os.path.exists(config_file):
+            raise ValueError("%s does not exist" % config_file)
+        conf = config.read(config_file)
+    else:
+        conf = {}
+
+    # A storage backend configuration is mandatory for every subcommand.
+    if "storage" not in conf:
+        ctx.fail("You must have a storage configured in your config file.")
+
+    # Stash the parsed config and the clearcode DSN for subcommands.
+    ctx.ensure_object(dict)
+    ctx.obj["config"] = conf
+    ctx.obj["dsn"] = clearcode_dsn
-@clearlydefined_cli_group.command()
-@click.option("--bar", help="Something")
+@clearlydefined.command(name="fill_storage")
 @click.pass_context
-def bar(ctx, bar):
-    """Do something."""
-    click.echo("bar")
+def run_orchestration(ctx):
+    """Fill SWH storage with metadata fetched from the clearcode database."""
+    # NOTE(review): leftover debug print — consider removing it or using the
+    # logging module instead.
+    print(ctx.obj["config"]["storage"])
+    storage = get_storage(**ctx.obj["config"]["storage"])
+    clearcode_dsn = ctx.obj["dsn"]
+    orchestrator(storage=storage, clearcode_dsn=clearcode_dsn)
diff --git a/swh/clearlydefined/orchestrator.py b/swh/clearlydefined/orchestrator.py
new file mode 100644
index 0000000..5c439dd
--- /dev/null
+++ b/swh/clearlydefined/orchestrator.py
@@ -0,0 +1,211 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+from typing import Optional
+
+import attr
+import psycopg2
+import dateutil
+
+from swh.clearlydefined.mapping_utils import (
+ AUTHORITY,
+ FETCHER,
+ get_type_of_tool,
+ map_row,
+)
+from swh.model.model import RawExtrinsicMetadata
+from swh.storage.interface import StorageInterface
+
+
+class Row:
+    """One row of the ``clearcode_cditem`` table: its path (the ID), its
+    metadata content, and its last modified date."""
+
+    def __init__(self, path, metadata, date):
+        self.path = path
+        self.metadata = metadata
+        self.date = date
+
+
+def write_in_storage(
+    storage: StorageInterface,
+    metadata: RawExtrinsicMetadata,
+) -> None:
+    """
+    Add the given RawExtrinsicMetadata object to the SWH storage.
+    """
+    storage.raw_extrinsic_metadata_add([metadata])
+
+
+def init_storage(storage: StorageInterface) -> None:
+    """
+    Take storage as input and add MetadataFetcher, MetadataAuthority inside storage
+    """
+    # attr.evolve produces copies of the module-level constants with their
+    # metadata field cleared before registration.
+    storage.metadata_authority_add([attr.evolve(AUTHORITY, metadata={})])
+    storage.metadata_fetcher_add([attr.evolve(FETCHER, metadata={})])
+
+
+def write_next_date(
+    cursor, update_connection, previous_date: Optional[datetime], new_date: datetime
+) -> None:
+    """
+    Take cursor, update_connection, previous_date, new_date as input;
+    if previous_date is None, insert new_date into the clearcode_env
+    table, else update the date stored there with new_date
+    """
+    if not previous_date:
+        # First run: no 'date' key exists yet.
+        cursor.execute(
+            """INSERT into clearcode_env (key, value) VALUES(%s,%s)""",
+            ("date", new_date),
+        )
+    else:
+        cursor.execute(
+            """UPDATE clearcode_env SET value = %s WHERE key='date'""",
+            (new_date,),
+        )
+    update_connection.commit()
+
+
+def get_last_run_date(cursor) -> Optional[datetime]:
+    """
+    Take cursor as input and get last run date from which
+    new rows will be orchestrated, return None if it's first
+    orchestration
+    """
+    cursor.execute("SELECT value FROM clearcode_env WHERE key='date';")
+    rows = cursor.fetchall()
+    if len(rows) < 1:
+        return None
+    date = rows[0][0]
+    # NOTE(review): only ``import dateutil`` appears at module level, but
+    # ``dateutil.parser`` is a submodule; importing it explicitly
+    # (``import dateutil.parser``) would be more robust — confirm.
+    return dateutil.parser.isoparse(date)
+
+
+def orchestrate_row(storage: StorageInterface, cursor, connection, row: Row) -> bool:
+    """
+    Take storage, cursor, connection, row as input
+    and if able to completely map that row then write
+    data in storage, else store the ID in unmapped_data
+    table and return mapping_status of that row
+    """
+    able_to_be_mapped = map_row(
+        metadata=row.metadata, id=row.path, date=row.date, storage=storage
+    )
+    if not able_to_be_mapped:
+        # Case where none of the row's metadata could be mapped: queue the
+        # row's path for a retry on a later run.
+        write_in_not_mapped(
+            cd_path=row.path, cursor=cursor, write_connection=connection
+        )
+        return False
+    else:
+        # Case where at least part of the row's metadata could be mapped;
+        # map_row returned a (mapping_status, metadata_list) pair.
+        mapping_status, metadata_list = able_to_be_mapped
+        if not mapping_status:
+            # Partial mapping: still queue the path for a retry.
+            write_in_not_mapped(
+                cd_path=row.path, cursor=cursor, write_connection=connection
+            )
+        for data in metadata_list:
+            write_in_storage(storage=storage, metadata=data)
+        return mapping_status
+
+
+def map_previously_unmapped_data(storage: StorageInterface, cursor, connection) -> None:
+    """
+    Take storage, cursor, connection as input and map previously
+    unmapped data
+    """
+    cursor.execute("SELECT path FROM unmapped_data ;")
+    rows = cursor.fetchall()
+    for row in rows:
+        cd_path = row[0]
+        # Re-fetch the full clearcode_cditem row for each queued path.
+        cursor.execute(
+            """SELECT path,content,last_modified_date FROM
+            clearcode_cditem WHERE path=%s;""",
+            (cd_path,),
+        )
+        unmapped_row = cursor.fetchall()[0]
+        if orchestrate_row(
+            storage=storage,
+            row=Row(
+                path=unmapped_row[0], metadata=unmapped_row[1], date=unmapped_row[2]
+            ),
+            cursor=cursor,
+            connection=connection,
+        ):
+            # Mapping succeeded this time: drop the path from the retry table.
+            cursor.execute("DELETE FROM unmapped_data WHERE path=%s", (cd_path,))
+            connection.commit()
+
+
+def write_in_not_mapped(cursor, write_connection, cd_path: str) -> None:
+    """
+    Take cursor, write_connection, cd_path as input
+    and write 'cd_path' if 'cd_path' does not exist
+    inside unmapped_data
+    """
+    # ON CONFLICT DO NOTHING makes repeated inserts of the same path a no-op.
+    cursor.execute(
+        "INSERT INTO unmapped_data (path) VALUES (%s) ON CONFLICT (path) DO NOTHING;",
+        (cd_path,),
+    )
+    write_connection.commit()
+    return
+
+
+def read_from_clearcode_and_write_in_swh(
+    storage: StorageInterface, cursor, connection, date: Optional[datetime]
+) -> None:
+    """
+    Take storage, cursor, connection, date as input
+    and read from clearcode database and write only
+    the data that is discovered after 'date' in swh storage.
+    'date' is the last discovery date of the object that was
+    stored at the time of previous run.
+    """
+    if date:
+        # NOTE(review): the docstring says data discovered *after* 'date'
+        # should be read, but this query selects last_modified_date < %s
+        # (strictly older rows). The test suite relies on '<' to pick up a
+        # later-inserted row whose last_modified_date predates 'date', so
+        # the docstring and the query disagree — confirm which incremental
+        # semantics is intended.
+        cursor.execute(
+            "SELECT path,content,last_modified_date FROM clearcode_cditem "
+            "WHERE last_modified_date < %s "
+            "ORDER BY last_modified_date DESC;",
+            (date,),
+        )
+    else:
+        # First run: process every row.
+        cursor.execute(
+            """SELECT path,content,last_modified_date FROM clearcode_cditem
+            ORDER BY last_modified_date DESC;"""
+        )
+    rows = cursor.fetchall()
+    if len(rows) < 1:
+        return
+    # Rows are sorted DESC, so the first row carries the newest
+    # last_modified_date; persist it as the watermark for the next run.
+    new_date = rows[0][2]
+    write_next_date(
+        cursor=cursor,
+        update_connection=connection,
+        previous_date=date,
+        new_date=new_date,
+    )
+    for row in rows:
+        tool = get_type_of_tool(row[0]).value
+        if tool == "fossology":
+            # Fossology harvests carry no mappable data: skip them entirely
+            # (neither mapped nor queued in unmapped_data).
+            pass
+        else:
+            orchestrate_row(
+                storage=storage,
+                cursor=cursor,
+                connection=connection,
+                row=Row(path=row[0], metadata=row[1], date=row[2]),
+            )
+
+
+def orchestrator(storage: StorageInterface, clearcode_dsn: str) -> None:
+    """
+    Take storage and clearcode_dsn as input and write data
+    from the clearcode database into SWH raw extrinsic metadata.
+    Meant to be run periodically (e.g. from a cron job).
+    """
+    connection = psycopg2.connect(dsn=clearcode_dsn)
+    cursor = connection.cursor()
+    # Register the authority/fetcher, retry previously unmapped rows, then
+    # process rows relative to the stored last-run date.
+    init_storage(storage=storage)
+    map_previously_unmapped_data(storage=storage, cursor=cursor, connection=connection)
+    date = get_last_run_date(cursor=cursor)
+    read_from_clearcode_and_write_in_swh(
+        storage=storage, cursor=cursor, connection=connection, date=date
+    )
diff --git a/swh/clearlydefined/sql/30-schema.sql b/swh/clearlydefined/sql/30-schema.sql
new file mode 100644
index 0000000..7e5a3d2
--- /dev/null
+++ b/swh/clearlydefined/sql/30-schema.sql
@@ -0,0 +1,50 @@
+---
+--- SQL implementation of the Clearlydefined data
+---
+
-- schema versions
+create table dbversion
+(
+ version int primary key,
+ release timestamptz,
+ description text
+);
+
+comment on table dbversion is 'Details of current db version';
+comment on column dbversion.version is 'SQL schema version';
+comment on column dbversion.release is 'Version deployment timestamp';
+comment on column dbversion.description is 'Release description';
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(1, now(), 'Work In Progress');
+
+--schema clearcode_cditem
+-- mirror of the clearcode toolkit item table (gzipped metadata blobs keyed
+-- by their clearlydefined path)
+create table clearcode_cditem(
+ path varchar(2048) primary key,
+ content bytea not null,
+ last_modified_date timestamptz not null,
+ last_map_date timestamptz,
+ map_error text,
+ uuid uuid not null
+);
+
+comment on table clearcode_cditem is 'Data of clearcode_toolkit';
+comment on column clearcode_cditem.path is 'ID';
+comment on column clearcode_cditem.content is 'Metadata content';
+
+--schema unmapped_data
+-- retry queue: paths of items that could not (yet) be mapped to SWH storage
+create table unmapped_data(
+ path varchar primary key
+);
+
+comment on table unmapped_data is 'Unmapped Data of clearcode_toolkit';
+comment on column unmapped_data.path is 'ID';
+
+--schema clearcode_env
+-- key/value store; the orchestrator keeps key 'date' = last run watermark
+create table clearcode_env(
+ key text primary key,
+ value text
+);
+
+comment on table clearcode_env is 'Stores key value pair';
diff --git a/swh/clearlydefined/tests/data/README.md b/swh/clearlydefined/tests/data/README.md
new file mode 100644
index 0000000..ddaf377
--- /dev/null
+++ b/swh/clearlydefined/tests/data/README.md
@@ -0,0 +1,31 @@
+clearlydefined_metadata_2 - matching mapped data from clearlydefined_true
+
+clearlydefined_metadata - matching mapped data from clearlydefined_true
+
+clearlydefined_true - mock metadata (getting True as mapping status, row type clearlydefined)
+
+clearlydefined - mock metadata (getting False as mapping status, row type clearlydefined)
+
+clearlydefined_not_mapped - mock metadata that will not be mapped in first orchestration
+
+def_not_mapped - mock metadata that will not be mapped in first orchestration
+
+definitions_not_mapped_sha1_git - mock metadata (getting False as mapping status, row type definition sha1git)
+
+definitions_not_mapped - mock metadata (getting False as mapping status, row type definition sha1)
+
+definitions_sha1git - mock metadata (getting True as mapping status, row type definition sha1 git)
+
+definitions - mock metadata (getting True as mapping status, row type definition sha1)
+
+licensee_metadata - matching mapped data from licensee_true
+
+licensee_true - mock metadata (getting True as mapping status, row type licensee)
+
+licensee - mock metadata (getting False as mapping status, row type licensee)
+
+scancode_metadata - matching mapped data from scancode_true
+
+scancode_true - mock metadata (getting True as mapping status, row type scancode)
+
+scancode - mock metadata (getting False as mapping status, row type scancode)
diff --git a/swh/clearlydefined/tests/data/clearydefined_not_mapped.json b/swh/clearlydefined/tests/data/clearydefined_not_mapped.json
new file mode 100644
index 0000000..3a16774
--- /dev/null
+++ b/swh/clearlydefined/tests/data/clearydefined_not_mapped.json
@@ -0,0 +1,212 @@
+{
+ "_metadata": {
+ "type": "npm",
+ "url": "cd:/npm/npmjs/@pixi/mesh-extras/5.3.5",
+ "fetchedAt": "2020-12-18T10:26:42.827Z",
+ "links": {
+ "self": {
+ "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:clearlydefined:1.3.4",
+ "type": "resource"
+ },
+ "siblings": {
+ "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:clearlydefined",
+ "type": "collection"
+ },
+ "licensee": {
+ "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:licensee",
+ "type": "collection"
+ },
+ "scancode": {
+ "href": "urn:npm:npmjs:@pixi:mesh-extras:revision:5.3.5:tool:scancode",
+ "type": "collection"
+ },
+ "source": {
+ "href": "urn:git:github:pixijs:pixi.js:revision:b5353da2693f0112230cd2b1be581f9bff0ce2a1",
+ "type": "resource"
+ }
+ },
+ "schemaVersion": "1.3.4",
+ "toolVersion": "1.1.4",
+ "processedAt": "2020-12-18T10:26:43.254Z"
+ },
+ "attachments": [
+ {
+ "path": "package/LICENSE",
+ "token": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b"
+ },
+ {
+ "path": "package/package.json",
+ "token": "85b85a7807ba4fdcd78d622e40497bdbe5f1b3346cdda1fe2cb859224de3d598"
+ }
+ ],
+ "summaryInfo": {
+ "k": 317,
+ "count": 11,
+ "hashes": {
+ "sha1": "c7e6ec806b594c8d5520ca3ffa87bdad27374411",
+ "sha256": "2988c69d931c63cfbcef7b8f2b8e051ea91626bb942596823df83bf4b3e36841"
+ }
+ },
+ "files": [
+ {
+ "path": "package/LICENSE",
+ "hashes": {
+ "sha1": "385f736f1ad8f5743cf2681b154d314f9cf48db8",
+ "sha256": "8a3c4ecc2f727e1b487daccf186b61457b60a5e1aa7103969fa9b0d8e3ba567b"
+ }
+ }
+ ],
+ "package.json": {
+ "name": "@pixi/mesh-extras",
+ "version": "5.3.5",
+ "main": "lib/mesh-extras.js",
+ "module": "lib/mesh-extras.es.js",
+ "bundle": "dist/mesh-extras.js",
+ "description": "Custom Mesh display objects, like Rope and SimplePlane",
+ "author": "Mat Groves",
+ "contributors": [
+ "Matt Karl <matt@mattkarl.com>"
+ ],
+ "homepage": "http://pixijs.com/",
+ "bugs": "https://github.com/pixijs/pixi.js/issues",
+ "license": "MIT",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/pixijs/pixi.js.git"
+ },
+ "publishConfig": {
+ "access": "public"
+ },
+ "files": [
+ "lib",
+ "dist"
+ ],
+ "dependencies": {
+ "@pixi/constants": "5.3.5",
+ "@pixi/core": "5.3.5",
+ "@pixi/math": "5.3.5",
+ "@pixi/mesh": "5.3.5",
+ "@pixi/utils": "5.3.5"
+ },
+ "devDependencies": {
+ "@pixi/loaders": "5.3.5"
+ },
+ "gitHead": "b5353da2693f0112230cd2b1be581f9bff0ce2a1"
+ },
+ "registryData": {
+ "_id": "@pixi/mesh-extras",
+ "_rev": "37-99af22664aed7bcd0476efccf57088fc",
+ "name": "@pixi/mesh-extras",
+ "dist-tags": {
+ "latest": "5.3.5",
+ "next": "5.0.0-alpha.3",
+ "latest-5.1.x": "5.1.6",
+ "prerelease": "5.4.0-rc.3",
+ "latest-5.2.x": "5.2.5"
+ },
+ "maintainers": [
+ {
+ "name": "bigtimebuddy",
+ "email": "matt@mattkarl.com"
+ }
+ ],
+ "description": "Custom Mesh display objects, like Rope and SimplePlane",
+ "homepage": "http://pixijs.com/",
+ "repository": {
+ "type": "git",
+ "url": "git+https://github.com/pixijs/pixi.js.git"
+ },
+ "contributors": [
+ {
+ "name": "Matt Karl",
+ "email": "matt@mattkarl.com"
+ }
+ ],
+ "author": {
+ "name": "Mat Groves"
+ },
+ "bugs": {
+ "url": "https://github.com/pixijs/pixi.js/issues"
+ },
+ "license": "MIT",
+ "readme": "# @pixi/mesh-extras\n\n## Installation\n\n```bash\nnpm install @pixi/mesh-extras\n```\n\n## Usage\n\n```js\nimport { MeshRenderer } from '@pixi/mesh';\nimport { Renderer } from '@pixi/core';\nimport { Rope } from '@pixi/mesh-extras';\n\nRenderer.registerPlugin('mesh', MeshRenderer);\n\nconst rope = new Rope();\n```",
+ "readmeFilename": "README.md",
+ "manifest": {
+ "name": "@pixi/mesh-extras",
+ "version": "5.3.5",
+ "main": "lib/mesh-extras.js",
+ "module": "lib/mesh-extras.es.js",
+ "bundle": "dist/mesh-extras.js",
+ "description": "Custom Mesh display objects, like Rope and SimplePlane",
+ "author": {
+ "name": "Mat Groves"
+ },
+ "contributors": [
+ {
+ "name": "Matt Karl",
+ "email": "matt@mattkarl.com"
+ }
+ ],
+ "homepage": "http://pixijs.com/",
+ "bugs": {
+ "url": "https://github.com/pixijs/pixi.js/issues"
+ },
+ "license": "MIT",
+ "repository": {
+ "type": "git",
+ "url": "git+https://github.com/pixijs/pixi.js.git"
+ },
+ "publishConfig": {
+ "access": "public"
+ },
+ "dependencies": {
+ "@pixi/constants": "5.3.5",
+ "@pixi/core": "5.3.5",
+ "@pixi/math": "5.3.5",
+ "@pixi/mesh": "5.3.5",
+ "@pixi/utils": "5.3.5"
+ },
+ "devDependencies": {
+ "@pixi/loaders": "5.3.5"
+ },
+ "gitHead": "b5353da2693f0112230cd2b1be581f9bff0ce2a1",
+ "_id": "@pixi/mesh-extras@5.3.5",
+ "_nodeVersion": "10.19.0",
+ "_npmVersion": "lerna/3.13.3/node@v10.19.0+x64 (darwin)",
+ "_npmUser": {
+ "name": "bigtimebuddy",
+ "email": "matt@mattkarl.com"
+ },
+ "dist": {
+ "integrity": "sha512-47oHFkxUWQikB+7RT4ZcCecwc/GaKFSbZm6SL7dG12mQ+N5nGAZkWBIDw2d/Ai9PxMHo4ciUCjcS7el0SHkBjA==",
+ "shasum": "c7e6ec806b594c8d5520ca3ffa87bdad27374411",
+ "tarball": "https://registry.npmjs.org/@pixi/mesh-extras/-/mesh-extras-5.3.5.tgz",
+ "fileCount": 11,
+ "unpackedSize": 308318,
+ "npm-signature": "-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v3.0.13\r\nComment: https://openpgpjs.org\r\n\r\nwsFcBAEBCAAQBQJf26KUCRA9TVsSAnZWagAA/rAP/3UY+d5ycZ5QpSxO9Ke6\nrJEFC0jBfGRprbf4BGgvFil5HXjFX+s0xy5VB9AcSl+bPEDK6LIr6FZrCS85\nMBh4yRJXX+17mTfoSVaoLYA2UEl9fBhIGrryyObSF81TdAOhpKRmU5aPJ4hV\nORmdhetWLj4EOaHFt1pb0f62tVDrZdu8GU+TYqmU8ZpNoIfuw6iC8B79t1+R\njSxDqEUfVxCN3P/JTjakCP/oSqLOf1VSdUq/0wmyE8cdPLw+l8s20t5T7CQj\nCwe5jgVhL+Pg0z1rIQxJqRUzjGXZlnEdt+B5fIhHIcNbKQFtN/JMa1JcwjNA\nY9oAD8eHxGElK5gOtfFEYNqOlUPaeI0iupuaOK1gNVcGeKmvHZleE9dVsYrB\njM6d34F/mI6RPSw/ABOV1v7USWdxKe+f1paNSSMYbJNbehBYttDYxkXkgrLM\nynz7ru1EdnHACpJmcZVkZj/3hTZ0o7YDgRHpbLfIxAc2ETqr7ryN0y37uMAS\ns2kuaC89e6k7pd+91vEHQZtLKhMMtvivBYmdH2ZQWirU34kA9z+l5ux+NAoo\nWR9jfn/8+7S5SB4v0HFSzsgVbAxibuUaNHSvG39eBs9Au14uBdsasTwxaLwN\nRFwiTTJNS4fiCz165VCE08kI4uzrJAyqtblc9+z1gsmtWScpuDN7gJVrfsjx\nPSph\r\n=rg4i\r\n-----END PGP SIGNATURE-----\r\n"
+ },
+ "directories": {},
+ "maintainers": [
+ {
+ "name": "bigtimebuddy",
+ "email": "matt@mattkarl.com"
+ }
+ ],
+ "_npmOperationalInternal": {
+ "host": "s3://npm-registry-packages",
+ "tmp": "tmp/mesh-extras_5.3.5_1608229524025_0.8091580391334361"
+ },
+ "_hasShrinkwrap": false
+ },
+ "releaseDate": "2020-12-17T18:25:24.267Z"
+ },
+ "sourceInfo": {
+ "type": "git",
+ "provider": "github",
+ "namespace": "pixijs",
+ "name": "pixi.js",
+ "revision": "b5353da2693f0112230cd2b1be581f9bff0ce2a1",
+ "url": null,
+ "path": null
+ }
+}
\ No newline at end of file
diff --git a/swh/clearlydefined/tests/data/def_not_mapped.json b/swh/clearlydefined/tests/data/def_not_mapped.json
new file mode 100644
index 0000000..cfca1de
--- /dev/null
+++ b/swh/clearlydefined/tests/data/def_not_mapped.json
@@ -0,0 +1,88 @@
+{
+ "described": {
+ "releaseDate": "2019-03-29",
+ "sourceLocation": {
+ "type": "sourcearchive",
+ "provider": "mavencentral",
+ "namespace": "za.co.absa.cobrix",
+ "name": "cobol-parser",
+ "revision": "0.4.0",
+ "url": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0/cobol-parser-0.4.0-sources.jar"
+ },
+ "urls": {
+ "registry": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser",
+ "version": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0",
+ "download": "http://central.maven.org/maven2/za/co/absa/cobrix/cobol-parser/0.4.0/cobol-parser-0.4.0.jar"
+ },
+ "hashes": {
+ "sha1": "3e21cc4942a4234c9e5edd8a9cacd1670fe59f13",
+ "sha256": "2bf17e47907dc3dfa64fc17ae6ef71b54d96a79a740f3b7a618104d4281656f0"
+ },
+ "files": 261,
+ "tools": [
+ "clearlydefined/1.5.0",
+ "scancode/3.2.2"
+ ],
+ "toolScore": {
+ "total": 100,
+ "date": 30,
+ "source": 70
+ },
+ "score": {
+ "total": 100,
+ "date": 30,
+ "source": 70
+ }
+ },
+ "licensed": {
+ "declared": "Apache-2.0",
+ "toolScore": {
+ "total": 60,
+ "declared": 30,
+ "discovered": 0,
+ "consistency": 15,
+ "spdx": 15,
+ "texts": 0
+ },
+ "facets": {
+ "core": {
+ "attribution": {
+ "unknown": 260,
+ "parties": [
+ "Copyright 2018-2019 ABSA Group Limited"
+ ]
+ },
+ "discovered": {
+ "unknown": 260,
+ "expressions": [
+ "Apache-2.0"
+ ]
+ },
+ "files": 261
+ }
+ },
+ "score": {
+ "total": 60,
+ "declared": 30,
+ "discovered": 0,
+ "consistency": 15,
+ "spdx": 15,
+ "texts": 0
+ }
+ },
+ "coordinates": {
+ "type": "maven",
+ "provider": "mavencentral",
+ "namespace": "za.co.absa.cobrix",
+ "name": "cobol-parser",
+ "revision": "0.4.0"
+ },
+ "_meta": {
+ "schemaVersion": "1.6.1",
+ "updated": "2019-11-04T05:20:21.308Z"
+ },
+ "scores": {
+ "effective": 80,
+ "tool": 80
+ }
+}
\ No newline at end of file
diff --git a/swh/clearlydefined/tests/test_cli.py b/swh/clearlydefined/tests/test_cli.py
new file mode 100644
index 0000000..08be5ea
--- /dev/null
+++ b/swh/clearlydefined/tests/test_cli.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import tempfile
+
+from click.testing import CliRunner
+import yaml
+
+from swh.clearlydefined.cli import clearlydefined as cli
+
+
+def test_orchestration_from_cli(swh_storage_backend_config, clearcode_dsn):
+    """Invoking ``swh clearlydefined ... fill_storage`` with a valid config
+    file and clearcode DSN should exit successfully."""
+    config = {"storage": swh_storage_backend_config}
+    with tempfile.NamedTemporaryFile("a", suffix=".yml") as config_fd:
+        yaml.dump(config, config_fd)
+        config_fd.seek(0)
+        runner = CliRunner()
+        result = runner.invoke(
+            cli,
+            ["-C", config_fd.name, "--clearcode-dsn", clearcode_dsn, "fill_storage"],
+        )
+        assert result.exit_code == 0
+
+
+def test_cli_with_config_without_storage(swh_storage_backend_config, clearcode_dsn):
+    """Without a config file there is no storage configured, so the CLI
+    must fail with a usage error (exit code 2)."""
+    runner = CliRunner()
+    result = runner.invoke(
+        cli,
+        ["--clearcode-dsn", clearcode_dsn, "fill_storage"],
+    )
+    assert result.exit_code == 2
diff --git a/swh/clearlydefined/tests/test_orchestrator.py b/swh/clearlydefined/tests/test_orchestrator.py
new file mode 100644
index 0000000..5523b50
--- /dev/null
+++ b/swh/clearlydefined/tests/test_orchestrator.py
@@ -0,0 +1,172 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+from datetime import timezone
+import gzip
+import os
+from typing import List, Optional, Tuple
+import uuid
+
+import psycopg2
+
+from swh.clearlydefined.orchestrator import get_last_run_date, orchestrator
+from swh.model.model import Content
+
+# Seed contents added to swh storage by add_content_data(); the test extends
+# this list between orchestration runs to simulate an archive update.
+content_data = [
+    Content.from_data(b"42\n"),
+    Content.from_data(b"4242\n"),
+]
+
+
+def add_content_data(swh_storage):
+    """Insert the module-level ``content_data`` objects into swh storage."""
+    swh_storage.content_add(content_data)
+
+
+def file_data(file_name: str) -> str:
+    """Return the text content of ``file_name``."""
+    with open(file_name) as file:
+        return file.read()
+
+
+def gzip_compress_data(filename: Optional[str], datadir) -> bytes:
+    """
+    Take filename as input
+    and return gzip compressed
+    data for that filename; a None filename yields
+    gzip-compressed empty content
+    """
+    if not filename:
+        return gzip.compress("".encode("utf-8"), compresslevel=9)
+    else:
+        return gzip.compress(
+            file_data(os.path.join(datadir, filename)).encode("utf-8"), compresslevel=9
+        )
+
+
+def fill_rows_in_table(
+    rows: List[Tuple[str, bytes, datetime, datetime, str]], cursor, connection
+):
+    """
+    Take rows as input and store
+    those rows in clearcode_cditem table
+    """
+    for row in rows:
+        # Each row supplies (path, content, last_modified_date, last_map_date,
+        # map_error); a fresh uuid is generated per insert.
+        cursor.execute(
+            """INSERT INTO clearcode_cditem (path, content, last_modified_date,
+            last_map_date, map_error, uuid) VALUES (%s, %s, %s, %s, %s, %s);""",
+            (
+                *row,
+                uuid.uuid4(),
+            ),
+        )
+        connection.commit()
+
+
+def fill_data_before_updation_of_storage(connection, cursor, datadir):
+    """Populate clearcode_cditem with the first batch of fixture rows:
+    mappable definitions/harvests, two rows expected to stay unmapped,
+    and one fossology row expected to be ignored."""
+    rows = [
+        (
+            "maven/mavencentral/za.co.absa.cobrix/cobol-parser/revision/0.4.0.json",
+            gzip_compress_data("definitions.json", datadir=datadir),
+            datetime(year=2021, month=2, day=1, tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=1, tzinfo=timezone.utc),
+            "",
+        ),
+        (
+            "npm/npmjs/@ngtools/webpack/revision/10.2.1/tool/scancode/" "3.2.2.json",
+            gzip_compress_data("scancode_true.json", datadir=datadir),
+            datetime(year=2021, month=2, day=2, tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=2, tzinfo=timezone.utc),
+            "",
+        ),
+        (
+            "npm/npmjs/@fluidframework/replay-driver/revision/0.31.0/tool/licensee/"
+            "9.13.0.json",
+            gzip_compress_data("licensee_true.json", datadir=datadir),
+            datetime(year=2021, month=2, day=3,tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=3,tzinfo=timezone.utc),
+            "",
+        ),
+        (
+            "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/clearlydefined/1.3.4.json",
+            gzip_compress_data("clearlydefined_true.json", datadir=datadir),
+            datetime(year=2021, month=2, day=4,tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=4,tzinfo=timezone.utc),
+            "",
+        ),
+        (
+            "maven/mavencentral/za.co.absa.cobrix/cobol/revision/0.4.0.json",
+            gzip_compress_data("def_not_mapped.json", datadir=datadir),
+            datetime(year=2021, month=2, day=5,tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=5,tzinfo=timezone.utc),
+            "",
+        ),
+        (
+            "npm/npmjs/@pixi/mesh-extras/revision/5.3.6/tool/clearlydefined/1.3.4.json",
+            gzip_compress_data("clearydefined_not_mapped.json", datadir=datadir),
+            datetime(year=2021, month=2, day=6,tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=6,tzinfo=timezone.utc),
+            "",
+        ),
+        (
+            # Fossology harvest with empty content: the orchestrator skips it.
+            "npm/npmjs/@pixi/mesh-extras/revision/5.3.5/tool/fossology/1.3.4.json",
+            gzip_compress_data(None, datadir=datadir),
+            datetime(year=2021, month=2, day=1,tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=1,tzinfo=timezone.utc),
+            "",
+        ),
+    ]
+    fill_rows_in_table(rows=rows, cursor=cursor, connection=connection)
+
+
+def fill_data_after_updation_of_storage(connection, cursor, datadir):
+    """Insert one additional (empty-content, hence unmappable) row,
+    simulating new clearcode data arriving after the previous runs."""
+    rows = [
+        (
+            "maven/mavencentral/cobrix/cobol-parser/revision/0.4.0.json",
+            gzip_compress_data(None, datadir=datadir),
+            datetime(year=2021, month=2, day=1,tzinfo=timezone.utc),
+            datetime(year=2021, month=2, day=8,tzinfo=timezone.utc),
+            "",
+        ),
+    ]
+    fill_rows_in_table(rows=rows, cursor=cursor, connection=connection)
+
+
+def get_length_of_unmapped_data(connection, cursor) -> int:
+    """Return the number of rows currently in the unmapped_data table."""
+    cursor.execute("SELECT COUNT(*) FROM unmapped_data")
+    count = cursor.fetchall()[0][0]
+    return count
+
+
+def test_orchestrator(swh_storage, clearcode_dsn, datadir):
+    """End-to-end orchestration: first run leaves 2 rows unmapped and
+    records the newest last_modified_date; after the archive is updated a
+    second run maps everything; a third run over newly inserted
+    (unmappable) data leaves exactly 1 row unmapped."""
+    connection = psycopg2.connect(dsn=clearcode_dsn)
+    cursor = connection.cursor()
+    add_content_data(swh_storage)
+    # Fill data in clearcode database, for first time orchestration
+    fill_data_before_updation_of_storage(
+        connection=connection, cursor=cursor, datadir=datadir
+    )
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    # Check how much data is unmapped after first orchestration
+    assert 2 == get_length_of_unmapped_data(connection=connection, cursor=cursor)
+    assert datetime(2021, 2, 6, 0, 0, tzinfo=timezone.utc) == get_last_run_date(
+        cursor=cursor
+    )
+    content_data.extend(
+        [Content.from_data(b"424242\n"), Content.from_data(b"42424242\n")]
+    )
+    add_content_data(swh_storage)
+    # Run orchestration after insertion in swh storage and
+    # check how much data is unmapped after second orchestration
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    assert 0 == get_length_of_unmapped_data(connection=connection, cursor=cursor)
+    fill_data_after_updation_of_storage(
+        connection=connection, cursor=cursor, datadir=datadir
+    )
+    # Fill new data in clearcode database and
+    # check how much data is unmapped after third orchestration
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    assert 1 == get_length_of_unmapped_data(connection=connection, cursor=cursor)
+    # Check how much data is unmapped when archive was not updated
+    orchestrator(storage=swh_storage, clearcode_dsn=clearcode_dsn)
+    assert 1 == get_length_of_unmapped_data(connection=connection, cursor=cursor)

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:23 PM (6 d, 6 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3347512

Event Timeline