# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from flask import Flask, request

from .db import Db
from .exceptions import LargePayloadExc

# Maximum number of SWHIDs accepted in a single /known request.
LIMIT = 1000


def create_app(db: Db):
    """Build the Flask backend for swh-scanner.

    The application implements the /known endpoint of the Software Heritage
    Web API on top of a local database.

    Args:
        db: the local database queried for each SWHID.

    Returns:
        The configured Flask application.
    """
    app = Flask(__name__)

    @app.errorhandler(LargePayloadExc)
    def payload_too_large(exc):
        # Report an oversized request as a client error instead of letting the
        # exception surface as a generic 500.
        return {"error": str(exc)}, 413

    @app.route("/api/1/known/", methods=["POST"])
    def known():
        # silent=True yields None (rather than raising) when the body is not
        # valid JSON, so every malformed payload is rejected in one place.
        swhids = request.get_json(silent=True)

        if not isinstance(swhids, list) or not all(
            isinstance(swhid, str) for swhid in swhids
        ):
            return {"error": "The request body must be a JSON list of SWHIDs"}, 400

        if len(swhids) > LIMIT:
            raise LargePayloadExc(
                f"The maximum number of SWHIDs this endpoint can receive is {LIMIT}"
            )

        cur = db.conn.cursor()
        try:
            res = {swhid: {"known": db.known(swhid, cur)} for swhid in swhids}
        finally:
            # Release the cursor even if a lookup fails.
            cur.close()

        return res

    return app


def run(host: str, port: int, db: Db, debug: bool = False):
    """Serve the local database.

    Args:
        host: interface the server binds to.
        port: TCP port the server listens on.
        db: the local database to expose.
        debug: enable Flask's debug mode. Off by default because the
            interactive debugger must not be exposed on a reachable server.
    """
    app = create_app(db)
    # The CLI declares the port option with a string default and no explicit
    # type, so coerce it here before handing it to the server.
    app.run(host, int(port), debug=debug)
top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import os import sys from typing import Any, Dict, Optional import click import yaml from swh.core import config from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group from .exceptions import DBError +# Config for the "serve" option +BACKEND_DEFAULT_PORT = 5011 + # All generic config code should reside in swh.core.config CONFIG_ENVVAR = "SWH_CONFIG_FILE" DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml") DEFAULT_CONFIG: Dict[str, Any] = { "web-api": { "url": "https://archive.softwareheritage.org/api/1/", "auth-token": None, } } CONFIG_FILE_HELP = f"""Configuration file: \b The CLI option or the environment variable will fail if invalid. CLI option is checked first. Then, environment variable {CONFIG_ENVVAR} is checked. Then, if cannot load the default path, a set of default values are used. Default config path is {DEFAULT_CONFIG_PATH}. Default config values are: \b {yaml.dump(DEFAULT_CONFIG)}""" SCANNER_HELP = f"""Software Heritage Scanner tools. 
def setup_config(ctx, api_url):
    """Return the scanner configuration, applying the API URL override.

    Args:
        ctx: the click context; ``ctx.obj["config"]`` holds the loaded
            configuration.
        api_url: optional Web API URL given on the command line. A trailing
            slash is appended when missing. A falsy value leaves the
            configuration untouched.

    Returns:
        The configuration to use, which is also stored back in
        ``ctx.obj["config"]``.
    """
    config = ctx.obj["config"]
    if not api_url:
        return config

    if not api_url.endswith("/"):
        api_url += "/"

    # Build new dicts instead of assigning into the existing ones: the loaded
    # configuration may be the module-level DEFAULT_CONFIG itself, and writing
    # into it would leak the override into every later use of the defaults.
    config = {**config, "web-api": {**config["web-api"], "url": api_url}}
    ctx.obj["config"] = config
    return config
@db.command("import")
@click.option(
    "-i",
    "--input",
    "input_file",
    metavar="INPUT_FILE",
    required=True,
    type=click.File("r"),
    help="A file containing SWHIDs",
)
@click.option(
    "-o",
    "--output",
    "output_file_db",
    metavar="OUTPUT_DB_FILE",
    required=True,
    show_default=True,
    help="The name of the generated sqlite database",
)
@click.option(
    "-s",
    "--chunk-size",
    "chunk_size",
    default="10000",
    metavar="SIZE",
    show_default=True,
    type=int,
    help="The chunk size ",
)
@click.pass_context
def import_(ctx, chunk_size, input_file, output_file_db):
    """Create SQLite database of known SWHIDs from a textual list of SWHIDs
    """
    # Imported lazily to keep CLI startup time under control.
    from .db import Db

    # The docstring above is the command's help text, so notes live here:
    # input_file is an open text file of SWHIDs, chunk_size is the number of
    # rows per insert batch, output_file_db is the path of the database.
    database = Db(output_file_db)
    try:
        database.create_from(input_file, chunk_size, database.conn.cursor())
    except DBError:
        # Close the connection before deleting the file: removing a database
        # that is still open leaks the handle and fails on some platforms.
        database.close()
        print("Failed to create database")
        os.remove(output_file_db)
        sys.exit(1)
    database.close()
class Db:
    """Local database interface.

    Wraps a SQLite connection to the file where the SWHIDs served by the local
    API are stored.
    """

    def __init__(self, db_file: Path):
        """Open a connection to ``db_file``."""
        self.db_file: Path = db_file
        # check_same_thread=False lets the connection be used from a thread
        # other than the one that created it.
        self.conn: sqlite3.Connection = sqlite3.connect(
            db_file, check_same_thread=False
        )

    def close(self):
        """Close the connection to the database."""
        self.conn.close()

    def create_table(self, cur: sqlite3.Cursor):
        """Create the table where the SWHIDs will be stored."""
        cur.execute("""CREATE TABLE IF NOT EXISTS swhids (swhid text PRIMARY KEY)""")

    def add(self, swhids: Iterable[str], chunk_size: int, cur: sqlite3.Cursor):
        """Insert the given SWHIDs, ``chunk_size`` rows per batch."""
        for chunk in grouper(swhids, chunk_size):
            cur.executemany(
                """INSERT INTO swhids VALUES (?)""", ((swhid,) for swhid in chunk),
            )

    def create_from(
        self, input_file: Iterable[str], chunk_size: int, cur: sqlite3.Cursor
    ):
        """Create a new database with the SWHIDs present inside the input file.

        Args:
            input_file: iterable of text lines, one SWHID per line.
            chunk_size: number of rows inserted per batch.
            cur: cursor used for the inserts; it is closed on success.

        Raises:
            DBError: if inserting or committing fails.
        """
        self.create_table(cur)
        # A set removes duplicates (the column is a primary key). Blank lines
        # are skipped so that an empty string is never stored as a SWHID.
        stripped = (line.strip() for line in input_file)
        swhids = {swhid for swhid in stripped if swhid}

        try:
            self.add(swhids, chunk_size, cur)
            cur.close()
            self.conn.commit()
        except Exception as err:
            raise DBError(f"Unable to populate the database: {err}") from err

    def known(self, swhid: str, cur: sqlite3.Cursor):
        """Check if a given SWHID is present or not inside the local database.

        The cursor is left open so the caller can reuse it for several lookups.
        """
        cur.execute("""SELECT 1 FROM swhids WHERE swhid=?""", (swhid,))
        return cur.fetchone() is not None
def test_backend_large_payload_exc(tmp_path, live_server, test_swhids_sample):
    """A request carrying more than LIMIT SWHIDs must not succeed."""
    tmp_dbfile = tmp_path / "tmp_db.sqlite"
    swhid = "swh:1:cnt:fa8eacf43d8646129ae8adfa1648f9307d999999"
    # One more SWHID than the backend accepts, derived from LIMIT so the test
    # keeps exercising the boundary if the limit changes.
    swhids = [swhid] * (LIMIT + 1)
    db = Db(tmp_dbfile)
    try:
        cur = db.conn.cursor()
        db.create_from(test_swhids_sample, LIMIT, cur)

        app = create_app(db)

        with app.test_client() as test_client:
            res = test_client.post("/api/1/known/", json=swhids)

        assert res.status_code != 200
    finally:
        # Release the SQLite file so the temporary directory can be removed.
        db.close()