diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,3 @@ # Add here internal Software Heritage dependencies, one per line. swh.core +swh.model diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1 +1,3 @@ pytest +swh.core[testing-core] +swh.model[testing] diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,6 @@ # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner +requests +aiohttp +dulwich diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -39,13 +39,13 @@ # Full sample: # https://forge.softwareheritage.org/diffusion/DCORE/browse/master/setup.py setup( - name='swh.', # example: swh.loader.pypi - description='Software Heritage ', + name='swh.scanner', + description='Software Heritage code scanner', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', - url='https://forge.softwareheritage.org/diffusion/', + url='https://forge.softwareheritage.org/diffusion/DTSCN/', packages=find_packages(), # packages's modules install_requires=parse_requirements() + parse_requirements('swh'), tests_require=parse_requirements('test'), @@ -55,7 +55,7 @@ include_package_data=True, entry_points=''' [swh.cli.subcommands] - =swh..cli:cli + scanner=swh.scanner.cli:scanner ''', classifiers=[ "Programming Language :: Python :: 3", @@ -67,6 +67,6 @@ project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', - 'Source': 'https://forge.softwareheritage.org/source/swh-', + 'Source': 'https://forge.softwareheritage.org/source/swh-scanner', }, ) diff --git a/swh/foo/bar.py b/swh/foo/bar.py deleted file mode 100644 
--- a/swh/foo/bar.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (C) 2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information diff --git a/swh/foo/cli.py b/swh/foo/cli.py deleted file mode 100644 --- a/swh/foo/cli.py +++ /dev/null @@ -1,18 +0,0 @@ -import click - -from swh.core.cli import CONTEXT_SETTINGS - - -@click.group(name='foo', context_settings=CONTEXT_SETTINGS) -@click.pass_context -def cli(ctx): - """Foo main command. - """ - - -@cli.command() -@click.option('--bar', help='Something') -@click.pass_context -def bar(ctx, bar): - '''Do something.''' - click.echo('bar') diff --git a/swh/foo/__init__.py b/swh/scanner/__init__.py rename from swh/foo/__init__.py rename to swh/scanner/__init__.py diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py new file mode 100644 --- /dev/null +++ b/swh/scanner/cli.py @@ -0,0 +1,56 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import click +import asyncio +import os +from pathlib import PosixPath +from urllib.parse import urlparse + +from .scanner import run +from .exceptions import InvalidPath +from .logger import setup_logger, log_counters +from .model import Tree + +from swh.core.cli import CONTEXT_SETTINGS + + +def parse_url(url): + if url.port == 80: + return 'https://' + url.hostname + else: + return url.geturl() + + +@click.command(context_settings=CONTEXT_SETTINGS) +@click.argument('path', required=True) +@click.option('--host', '-h', default='localhost', + metavar='IP', show_default=True, + help="web api endpoint ip") +@click.option('--port', '-p', default='', + metavar='PORT', show_default=True, + help="web api endpoint port") 
+@click.option('--debug/--no-debug', default=True, + help="enable debug") +@click.option('--verbose', '-v', is_flag=True, default=False, + help="show debug information") +def scanner(path, host, port, debug, verbose): + """Software Heritage tool to scan the source code of a project""" + if not os.path.exists(path): + raise InvalidPath(path) + + if debug: + setup_logger(bool(verbose)) + + url = parse_url(urlparse('https://%s:%s' % (host, port))) + source_tree = Tree(None, PosixPath(path)) + loop = asyncio.get_event_loop() + loop.run_until_complete(run(path, url, source_tree)) + source_tree.show() + log_counters() + + +if __name__ == '__main__': + scanner() diff --git a/swh/scanner/exceptions.py b/swh/scanner/exceptions.py new file mode 100644 --- /dev/null +++ b/swh/scanner/exceptions.py @@ -0,0 +1,8 @@ +class InvalidPath(Exception): + def __str__(self): + return 'the provided path is invalid: "%s"' % self.args + + +class APIError(Exception): + def __str__(self): + return 'API Error: "%s"' % self.args diff --git a/swh/scanner/logger.py b/swh/scanner/logger.py new file mode 100644 --- /dev/null +++ b/swh/scanner/logger.py @@ -0,0 +1,61 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging + + +logger = None + + +def init(**kwargs): + def decorate(func): + for k in kwargs: + setattr(func, k, kwargs[k]) + return func + return decorate + + +def setup_logger(verbose: bool) -> None: + global logger + console = logging.FileHandler('scan.log') + console.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(asctime)s | %(levelname)s: %(message)s') + console.setFormatter(formatter) + + logger = logging.getLogger('debug') + logger.addHandler(console) + if not verbose: + logger.propagate = False + + +@init(count=0) +def log_queries(n: int) -> None: + if logger 
is not None: + log_queries.count += n + + +def log_counters() -> None: + if logger is not None: + logger.info('number of queries: %s' % log_queries.count) + + +def error(*args) -> None: + if logger is not None: + logger.error(args) + + +def warning(*args) -> None: + if logger is not None: + logger.warning(args) + + +def info(*args) -> None: + if logger is not None: + logger.info(args) + + +def debug(*args): + if logger is not None: + logger.debug(args) diff --git a/swh/scanner/model.py b/swh/scanner/model.py new file mode 100644 --- /dev/null +++ b/swh/scanner/model.py @@ -0,0 +1,76 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from __future__ import annotations +from pathlib import PosixPath +from typing import Any, Dict +from enum import Enum + +from swh.model.identifiers import ( + DIRECTORY, CONTENT +) + + +class Color(Enum): + blue = '\033[94m' + green = '\033[92m' + red = '\033[91m' + end = '\033[0m' + + +def colorize(text: str, color: Color): + return color.value + text + Color.end.value + + +class Tree: + """Representation of a file system structure + """ + def __init__(self, father: Tree, path: PosixPath): + self.father = father + self.path = path + self.otype = DIRECTORY if path.is_dir() else CONTENT + self.pid = '' + self.children: Dict[PosixPath, Tree] = {} + + def addNode(self, path: PosixPath, pid: str = None) -> None: + """Recursively add a new node path + """ + relative_path = path.relative_to(self.path) + + if relative_path == PosixPath('.'): + if pid is not None: + self.pid = pid + return + + new_path = self.path.joinpath(relative_path.parts[0]) + if new_path not in self.children: + self.children[new_path] = Tree(self, new_path) + + self.children[new_path].addNode(path, pid) + + def show(self) -> None: + """Print all the tree""" + 
print(Color.blue.value+str(self.path)+Color.end.value) + self.printChildren() + + def printChildren(self, inc: int = 0) -> None: + for path, node in self.children.items(): + self.printNode(node, inc) + if node.children: + node.printChildren(inc+1) + + def printNode(self, node: Any, inc: int) -> None: + rel_path = str(node.path.relative_to(self.path)) + if node.otype == DIRECTORY: + if node.pid: + print('│ '*inc + colorize(rel_path, Color.blue) + '/') + else: + print('│ '*inc + colorize(rel_path, Color.red) + '/') + + if node.otype == CONTENT: + if node.pid: + print('│ '*inc + colorize(rel_path, Color.green)) + else: + print('│ '*inc + colorize(rel_path, Color.red)) diff --git a/swh/foo/py.typed b/swh/scanner/py.typed rename from swh/foo/py.typed rename to swh/scanner/py.typed diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py new file mode 100644 --- /dev/null +++ b/swh/scanner/scanner.py @@ -0,0 +1,146 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import itertools +import asyncio +import aiohttp +from typing import List, Dict, Tuple, Generator, Iterator +from pathlib import PosixPath + +from .logger import log_queries +from .exceptions import APIError +from .model import Tree + +from swh.model.cli import pid_of_file, pid_of_dir +from swh.model.identifiers import ( + parse_persistent_identifier, + DIRECTORY, CONTENT +) + + +async def pids_discovery( + pids: List[str], session: aiohttp.ClientSession, url: str, + ) -> Dict[str, Dict[str, bool]]: + """API Request to get information about the persistent identifiers given in + input. 
+ + Args: + pids: a list of persistent identifiers + + Returns: + A dictionary with: + key: persistent identifier searched + value: + value['known'] = True if the pid is found + value['known'] = False if the pid is not found + + """ + endpoint = url + '/api/1/known/' + chunk_size = 1000 + requests = [] + + log_queries(len(pids)) + + def get_chunk(pids): + for i in range(0, len(pids), chunk_size): + yield pids[i:i + chunk_size] + + async def make_request(pids): + async with session.post(endpoint, json=pids) as resp: + if resp.status != 200: + error_message = '%s with given values %s' % ( + await resp.text(), str(pids)) + raise APIError(error_message) + return await resp.json() + + if len(pids) > chunk_size: + for pids_chunk in get_chunk(pids): + requests.append(asyncio.create_task( + make_request(pids_chunk))) + + res = await asyncio.gather(*requests) + # concatenate list of dictionaries + return dict(itertools.chain.from_iterable(e.items() for e in res)) + else: + return await make_request(pids) + + +def get_subpaths( + path: PosixPath) -> Generator[Tuple[PosixPath, str], None, None]: + """Find the persistent identifier of the directories and files under a + given path. + + Args: + path: the root path + + Yields: + pairs of: path, the relative persistent identifier + + """ + def pid_of(path): + if path.is_dir(): + return pid_of_dir(bytes(path)) + elif path.is_file(): + return pid_of_file(bytes(path)) + + dirpath, dnames, fnames = next(os.walk(path)) + for node in itertools.chain(dnames, fnames): + sub_path = PosixPath(dirpath).joinpath(node) + yield (sub_path, pid_of(sub_path)) + + +async def parse_path( + path: PosixPath, session: aiohttp.ClientSession, url: str + ) -> Iterator[Tuple[str, str, bool]]: + """Check if the sub paths of the given path are present in the + archive or not. 
+ + Args: + path: the source path + url: url for the API request + + Returns: + a map containing tuples with: a subpath of the given path, + the pid of the subpath and the result of the api call + + """ + parsed_paths = dict(get_subpaths(path)) + parsed_pids = await pids_discovery( + list(parsed_paths.values()), session, url) + + def unpack(tup): + subpath, pid = tup + return (subpath, pid, parsed_pids[pid]['known']) + + return map(unpack, parsed_paths.items()) + + +async def run( + root: PosixPath, url: str, source_tree: Tree) -> None: + """Start scanning from the given root. + + It fills the source tree with the paths discovered. + + Args: + root: the root path to scan + url: url for the API request + + """ + async def _scan(root, session, url, source_tree): + for path, pid, found in await parse_path(root, session, url): + obj_type = parse_persistent_identifier(pid).object_type + + if obj_type == CONTENT: + source_tree.addNode(path, pid if found else None) + elif obj_type == DIRECTORY: + if found: + source_tree.addNode(path, pid) + else: + source_tree.addNode(path) + await _scan(path, session, url, source_tree) + + async with aiohttp.ClientSession() as session: + await _scan(root, session, url, source_tree) diff --git a/swh/foo/tests/__init__.py b/swh/scanner/tests/__init__.py rename from swh/foo/tests/__init__.py rename to swh/scanner/tests/__init__.py diff --git a/tox.ini b/tox.ini --- a/tox.ini +++ b/tox.ini @@ -8,8 +8,8 @@ pytest-cov commands = pytest --doctest-modules \ - {envsitepackagesdir}/swh/foo \ - --cov={envsitepackagesdir}/swh/foo \ + {envsitepackagesdir}/swh/scanner \ + --cov={envsitepackagesdir}/swh/scanner \ --cov-branch {posargs} [testenv:flake8] diff --git a/version.txt b/version.txt new file mode 100644 --- /dev/null +++ b/version.txt @@ -0,0 +1 @@ +v0.0.1-0-gdd03798 \ No newline at end of file