diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -39,13 +39,13 @@
 # Full sample:
 # https://forge.softwareheritage.org/diffusion/DCORE/browse/master/setup.py
 setup(
-    name='swh.',  # example: swh.loader.pypi
-    description='Software Heritage ',
+    name='swh.scanner',
+    description='Software Heritage code scanner',
     long_description=long_description,
     long_description_content_type='text/markdown',
     author='Software Heritage developers',
     author_email='swh-devel@inria.fr',
-    url='https://forge.softwareheritage.org/diffusion/',
+    url='https://forge.softwareheritage.org/diffusion/DTSCN/',
     packages=find_packages(),  # packages's modules
     install_requires=parse_requirements() + parse_requirements('swh'),
     tests_require=parse_requirements('test'),
@@ -55,7 +55,7 @@
     include_package_data=True,
     entry_points='''
         [swh.cli.subcommands]
-        =swh..cli:cli
+        scanner=swh.scanner.cli:scanner
     ''',
     classifiers=[
         "Programming Language :: Python :: 3",
@@ -67,6 +67,6 @@
     project_urls={
         'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
         'Funding': 'https://www.softwareheritage.org/donate',
-        'Source': 'https://forge.softwareheritage.org/source/swh-',
+        'Source': 'https://forge.softwareheritage.org/source/swh-scanner',
     },
 )
diff --git a/swh/foo/bar.py b/swh/foo/bar.py
deleted file mode 100644
--- a/swh/foo/bar.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (C) 2019 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
diff --git a/swh/foo/cli.py b/swh/foo/cli.py
deleted file mode 100644
--- a/swh/foo/cli.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import click
-
-from swh.core.cli import CONTEXT_SETTINGS
-
-
-@click.group(name='foo', context_settings=CONTEXT_SETTINGS)
-@click.pass_context
-def cli(ctx):
-    """Foo main command.
-    """
-
-
-@cli.command()
-@click.option('--bar', help='Something')
-@click.pass_context
-def bar(ctx, bar):
-    '''Do something.'''
-    click.echo('bar')
diff --git a/swh/foo/py.typed b/swh/foo/py.typed
deleted file mode 100644
--- a/swh/foo/py.typed
+++ /dev/null
@@ -1 +0,0 @@
-# Marker file for PEP 561.
diff --git a/swh/foo/__init__.py b/swh/scanner/__init__.py
rename from swh/foo/__init__.py
rename to swh/scanner/__init__.py
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/cli.py
@@ -0,0 +1,43 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+
+from swh.core.cli import CONTEXT_SETTINGS
+from swh.scanner.scanner import run
+
+
+@click.group(name='scanner', context_settings=CONTEXT_SETTINGS)
+@click.pass_context
+def scanner(ctx):
+    '''Software Heritage Scanner tools.'''
+    pass
+
+
+@scanner.command(name='scan')
+@click.argument('path', required=True,
+                type=click.Path(exists=True, file_okay=False))
+@click.option('--host', '-h', default='localhost',
+              metavar='IP', show_default=True,
+              help="web api endpoint ip")
+@click.option('--port', '-p', default='5080',
+              metavar='PORT', show_default=True,
+              help="web api endpoint port")
+@click.pass_context
+def scan(ctx, path, host, port):
+    '''Scan the directory PATH and print the entries found in the
+    archive, as a set of (path, persistent identifier) pairs.'''
+    # The result is a set; click.echo() prints its str() form
+    result = run(path, host, port)
+    click.echo(result)
+
+
+def main():
+    '''Entry point used when this module is run as a script.'''
+    return scanner(auto_envvar_prefix='SWH_SCANNER')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/scanner.py
@@ -0,0 +1,131 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+import os
+import json
+import itertools
+from pathlib import PosixPath
+
+from swh.model.cli import pid_of_file, pid_of_dir
+from swh.model.identifiers import (
+    parse_persistent_identifier,
+    DIRECTORY, CONTENT
+)
+
+
+def pids_discovery(pids, host, port):
+    """Ask the web API which of the given persistent identifiers are
+    known to the archive.
+
+    Args:
+        pids list(str): A list of persistent identifier
+        host(str): ip for the api request
+        port(str): port for the api request
+    Returns:
+        A dictionary with:
+        key(str): persistent identifier
+        value(dict):
+            value['known'] = True if pid is found
+            value['known'] = False if pid is not found
+    Raises:
+        requests.HTTPError: if the endpoint answers with an error status
+    """
+    endpoint = 'http://%s:%s/api/1/known/' % (host, port)
+    resp = requests.post(endpoint, json=pids)
+    # Fail on an HTTP error status rather than on its unexpected body
+    resp.raise_for_status()
+    return json.loads(resp.text)
+
+
+def get_sub_paths(path):
+    """Find the persistent identifier of the paths and files under
+    a given path.
+
+    Only the direct children of the given path are considered. Entries
+    that are neither a directory nor a regular file (e.g. broken
+    symbolic links) have no persistent identifier and are skipped.
+
+    Args:
+        path(PosixPath): the entry root
+
+    Yields:
+        tuple(path, pid): pairs of path and the relative persistent
+        identifier
+    """
+    def pid_of(path):
+        if path.is_dir():
+            return pid_of_dir(bytes(path))
+        elif path.is_file():
+            return pid_of_file(bytes(path))
+
+    # os.walk() yields nothing when path is not a readable directory; a
+    # bare next() would then raise StopIteration, which a generator
+    # turns into a RuntimeError (PEP 479).
+    top_level = next(os.walk(path), None)
+    if top_level is None:
+        return
+
+    dirpath, dnames, fnames = top_level
+    for node in itertools.chain(dnames, fnames):
+        sub_path = PosixPath(dirpath).joinpath(node)
+        pid = pid_of(sub_path)
+        if pid is not None:
+            yield (sub_path, pid)
+
+
+def parse_path(path, host, port):
+    """Check if the sub paths of the given path is present in the
+    archive or not.
+    Args:
+        path(PosixPath): The source path
+        host(str): ip for the api request
+        port(str): port for the api request
+    Yields:
+        a tuple with the path found, the persistent identifier
+        relative to the path and a boolean: False if not found,
+        True if found.
+    """
+    pid_map = dict(get_sub_paths(path))
+    if not pid_map:
+        # Nothing to look up, so no request is needed
+        return
+
+    parsed_pids = pids_discovery(list(pid_map.values()), host, port)
+
+    for sub_path, pid in pid_map.items():
+        yield (sub_path, pid, parsed_pids[pid]['known'])
+
+
+def run(root, host, port):
+    """Scan the given root
+
+    A directory known to the archive is reported as a whole, while an
+    unknown directory is scanned in turn.
+    Args:
+        root: the path to scan
+        host(str): ip for the api request
+        port(str): port for the api request
+    Returns:
+        A set containing pairs of the path discovered and the
+        relative persistent identifier
+    """
+    def _scan(root, host, port, accum):
+        next_paths = []
+        for path, pid, found in parse_path(root, host, port):
+            obj_type = parse_persistent_identifier(pid).object_type
+
+            if found and obj_type in (CONTENT, DIRECTORY):
+                accum.add((str(path), pid))
+            elif obj_type == DIRECTORY:
+                # Unknown directory: look for known entries inside it
+                next_paths.append(path)
+
+        for new_path in next_paths:
+            _scan(new_path, host, port, accum)
+
+        return accum
+
+    return _scan(root, host, port, set())
diff --git a/swh/foo/tests/__init__.py b/swh/scanner/tests/__init__.py
rename from swh/foo/tests/__init__.py
rename to swh/scanner/tests/__init__.py