diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py index 765cad1..c47a394 100644 --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -1,138 +1,197 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import os +import sys from typing import Any, Dict, Optional import click import yaml from swh.core import config from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group +from .exceptions import DBError + # All generic config code should reside in swh.core.config CONFIG_ENVVAR = "SWH_CONFIG_FILE" DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml") DEFAULT_CONFIG: Dict[str, Any] = { "web-api": { "url": "https://archive.softwareheritage.org/api/1/", "auth-token": None, } } CONFIG_FILE_HELP = f"""Configuration file: \b The CLI option or the environment variable will fail if invalid. CLI option is checked first. Then, environment variable {CONFIG_ENVVAR} is checked. Then, if cannot load the default path, a set of default values are used. Default config path is {DEFAULT_CONFIG_PATH}. Default config values are: \b {yaml.dump(DEFAULT_CONFIG)}""" SCANNER_HELP = f"""Software Heritage Scanner tools. 
def setup_config(ctx, api_url):
    """Return the configuration to use for the current command.

    Args:
        ctx: click context whose ``obj["config"]`` holds the configuration
            dictionary loaded by the ``scanner`` group.
        api_url: optional web API URL given on the command line; when set it
            overrides ``config["web-api"]["url"]``. A trailing "/" is appended
            if missing.

    Returns:
        The configuration dictionary. When ``api_url`` is falsy this is the
        very object stored on the context; otherwise it is a new dictionary
        carrying the override, which is also stored back on the context.
    """
    config = ctx.obj["config"]
    if not api_url:
        return config

    if not api_url.endswith("/"):
        api_url += "/"

    # Build new dictionaries instead of assigning into the existing ones: the
    # ``scanner`` group stores the module-level DEFAULT_CONFIG object itself on
    # the context when no configuration file is loaded, so an in-place
    # assignment would rewrite the shared defaults for the whole process.
    web_api = dict(config.get("web-api") or {})
    web_api["url"] = api_url
    config = dict(config)
    config["web-api"] = web_api

    # Keep the context in sync so later readers of ctx.obj see the override.
    ctx.obj["config"] = config
    return config
@db.command("import")
@click.option(
    "-s",
    "--chunk-size",
    "chunk_size",
    default=10000,
    metavar="SIZE",
    show_default=True,
    type=int,
    help="The chunk size ",
)
@click.option(
    "-i",
    "--input",
    "input_file",
    metavar="INPUT_FILE",
    required=True,
    type=click.File("r"),
    help="A file containing SWHIDs",
)
@click.option(
    "-o",
    "--output",
    "output_file_db",
    metavar="OUTPUT_DB_FILE",
    default="SWHID_DB.sqlite",
    show_default=True,
    help="The name of the generated sqlite database",
)
@click.pass_context
def import_(ctx, chunk_size, input_file, output_file_db):
    """Parse an input list of SWHID to generate a local sqlite database
    """
    # NOTE: the docstring above is the help text click shows to users, so
    # implementation details are documented in comments instead.
    #
    # Parameters (all filled in by click):
    #   chunk_size     -- number of SWHIDs handed to the database per batch
    #   input_file     -- opened text file with SWHIDs; required, since
    #                     there is nothing to import without it
    #   output_file_db -- path of the sqlite database file to write
    #
    # Exits with status 1 when the database layer reports a DBError.

    # Imported lazily to keep the CLI start-up time under control.
    from .db import Db

    # Remember whether the target was already there: a failed import must
    # never delete a file that existed before this command ran.
    preexisting = os.path.exists(output_file_db)

    database = Db(output_file_db)
    try:
        database.create_from(input_file, chunk_size, database.conn.cursor())
    except DBError:
        failed = True
    else:
        failed = False
    finally:
        # Always release the connection, and do it before any attempt to
        # remove the file it points to.
        database.close()

    if failed:
        click.echo("Failed to create database", err=True)
        if not preexisting and os.path.exists(output_file_db):
            os.remove(output_file_db)
        sys.exit(1)
class Db:
    """Local database interface.

    Wraps a sqlite3 connection to a database holding one table, ``swhids``,
    whose single ``swhid`` text column is the primary key.
    """

    def __init__(self, db_file: Path):
        """Open (creating it if needed) the sqlite database at ``db_file``."""
        self.db_file: Path = db_file
        # check_same_thread=False disables sqlite3's check that the connection
        # is only used from the thread that created it.
        self.conn: sqlite3.Connection = sqlite3.connect(
            db_file, check_same_thread=False
        )

    def close(self) -> None:
        """Close the connection to the database."""
        self.conn.close()

    def create_table(self, cur: sqlite3.Cursor) -> None:
        """Create the table where the SWHIDs will be stored, if missing."""
        cur.execute("""CREATE TABLE IF NOT EXISTS swhids (swhid text PRIMARY KEY)""")

    def add(
        self, swhids: Iterable[str], chunk_size: int, cur: sqlite3.Cursor
    ) -> None:
        """Insert the given SWHIDs in the database, ``chunk_size`` at a time.

        SWHIDs that are already stored (or repeated in ``swhids``) are skipped
        instead of aborting the insertion with a primary key violation.

        The caller is responsible for committing the transaction.
        """
        for chunk in grouper(swhids, chunk_size):
            cur.executemany(
                """INSERT OR IGNORE INTO swhids VALUES (?)""",
                ((swhid,) for swhid in chunk),
            )

    def create_from(
        self, input_file: TextIOWrapper, chunk_size: int, cur: sqlite3.Cursor
    ) -> None:
        """Fill the database with the SWHIDs listed in ``input_file``.

        The file is read one SWHID per line; surrounding whitespace is
        stripped and blank lines are ignored. ``cur`` is closed and the
        transaction committed on success.

        Raises:
            DBError: if sqlite3 fails while inserting or committing; the
                original error is attached as ``__cause__``.
        """
        self.create_table(cur)

        # Stream the file lazily: duplicates are discarded by the database
        # itself (see ``add``), so nothing has to be held in memory here.
        stripped_lines = (line.strip() for line in input_file)
        swhids = (swhid for swhid in stripped_lines if swhid)

        try:
            self.add(swhids, chunk_size, cur)
            cur.close()
            self.conn.commit()
        except sqlite3.Error as err:
            raise DBError(f"cannot store SWHIDs in {self.db_file}") from err

    def known(self, swhid: str, cur: sqlite3.Cursor) -> bool:
        """Check if a given SWHID is present or not inside the local database.

        Note that ``cur`` is closed before returning, so callers need a fresh
        cursor for every lookup.
        """
        cur.execute("""SELECT 1 FROM swhids WHERE swhid=?""", (swhid,))
        res = cur.fetchone()
        cur.close()

        return res is not None
class InvalidObjectType(TypeError):
    """Raised for an object type the scanner does not handle."""


class InvalidDirectoryPath(Exception):
    """Raised for an invalid directory path."""


class DBError(Exception):
    """Raised when an operation on the local SWHID database fails."""


class APIError(Exception):
    """Error reported for a failed web API request."""

    def __str__(self):
        """Return the exception message wrapped in double quotes.

        Delegates to ``Exception.__str__`` so that the conversion also works
        when the exception carries no argument or more than one, instead of
        failing inside the ``%`` formatting.
        """
        message = super().__str__()
        return f'"{message}"'


def error_response(reason: str, status_code: int, api_url: str):
    """Raise an APIError describing a failed request. Never returns.

    Args:
        reason: textual reason of the failure
        status_code: status code of the failed response
        api_url: URL that was requested

    Raises:
        APIError: always, with a message of the form
            ``<status_code> <reason>: '<api_url>'``.
    """
    error_msg = f"{status_code} {reason}: '{api_url}'"
    raise APIError(error_msg)
@pytest.fixture(scope="function")
def example_tree(temp_folder):
    """Fixture returning a Tree rooted at the root path of the
    "temp_folder" fixture.
    """
    root_path = temp_folder["root"]
    tree = Tree(root_path)
    # sanity check: the tree must report the path it was built from
    assert tree.path == root_path
    return tree
@pytest.fixture
def test_swhids_sample(tmp_path):
    """Create the "swhids_sample.txt" file, filled with the SWHIDs listed in
    ``present_swhids`` (one per line), and yield it opened for reading.

    The file handle is closed once the test using the fixture is over.
    """
    sample_path = tmp_path / "swhids_sample.txt"
    sample_path.write_text("\n".join(present_swhids))
    assert sample_path.exists()

    # Yield from inside the ``with`` block so the handle is released during
    # fixture teardown instead of being left open.
    with open(sample_path, "r") as sample_file:
        yield sample_file
@pytest.fixture(scope="function")
def swhids_input_file(tmp_path):
    """Write the SWHIDs of ``present_swhids``, one per line, to
    "input_file.txt" under ``tmp_path`` and return the path of that file.
    """
    target = Path(os.path.join(tmp_path, "input_file.txt"))
    content = "\n".join(present_swhids)

    with open(target, "w") as handle:
        handle.write(content)

    assert target.exists()
    return target
def test_root_bad(cli_runner, tmp_path):
    """Test no option no envvar bad default bad root:
    scanning a root path that does not exist must exit with an error"""
    missing_root = tmp_path / "missing"
    argv = ["scan", str(missing_root)]
    outcome = cli_runner.invoke(cli.scanner, argv)
    assert outcome.exit_code != 0
def test_db_option(cli_runner, swhids_input_file, tmp_path):
    """Test that "db import" succeeds and writes the requested database"""
    output_db = tmp_path / "test_db.sqlite"
    res = cli_runner.invoke(
        cli.scanner,
        [
            "db",
            "import",
            "--input",
            # command-line arguments are strings, not Path objects
            str(swhids_input_file),
            "--output",
            str(output_db),
        ],
    )
    assert res.exit_code == 0
    # a zero exit code alone does not prove the database was produced
    assert output_db.exists()