diff --git a/.gitignore b/.gitignore
new file mode 100644
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+version.txt
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,2 +1,3 @@
 # Add here internal Software Heritage dependencies, one per line.
 swh.core
+swh.model
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1 +1,3 @@
 pytest
+swh.core[testing-core]
+swh.model[testing]
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,6 @@
 # should match https://pypi.python.org/pypi names. For the full spec or
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
 vcversioner
+requests
+aiohttp
+dulwich
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -39,13 +39,13 @@
 # Full sample:
 # https://forge.softwareheritage.org/diffusion/DCORE/browse/master/setup.py
 setup(
-    name='swh.',  # example: swh.loader.pypi
-    description='Software Heritage ',
+    name='swh.scanner',
+    description='Software Heritage code scanner',
     long_description=long_description,
     long_description_content_type='text/markdown',
     author='Software Heritage developers',
     author_email='swh-devel@inria.fr',
-    url='https://forge.softwareheritage.org/diffusion/',
+    url='https://forge.softwareheritage.org/diffusion/DTSCN/',
     packages=find_packages(),  # packages's modules
     install_requires=parse_requirements() + parse_requirements('swh'),
     tests_require=parse_requirements('test'),
@@ -55,7 +55,7 @@
     include_package_data=True,
     entry_points='''
         [swh.cli.subcommands]
-        =swh..cli:cli
+        scanner=swh.scanner.cli:scanner
     ''',
     classifiers=[
         "Programming Language :: Python :: 3",
@@ -67,6 +67,6 @@
     project_urls={
         'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
         'Funding': 'https://www.softwareheritage.org/donate',
-        'Source': 'https://forge.softwareheritage.org/source/swh-',
+        'Source': 'https://forge.softwareheritage.org/source/swh-scanner',
     },
 )
diff --git a/swh/foo/cli.py b/swh/foo/cli.py
deleted file mode 100644
--- a/swh/foo/cli.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import click
-
-from swh.core.cli import CONTEXT_SETTINGS
-
-
-@click.group(name='foo', context_settings=CONTEXT_SETTINGS)
-@click.pass_context
-def cli(ctx):
-    """Foo main command.
-    """
-
-
-@cli.command()
-@click.option('--bar', help='Something')
-@click.pass_context
-def bar(ctx, bar):
-    '''Do something.'''
-    click.echo('bar')
diff --git a/swh/foo/__init__.py b/swh/scanner/__init__.py
rename from swh/foo/__init__.py
rename to swh/scanner/__init__.py
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/cli.py
@@ -0,0 +1,50 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+import asyncio
+from pathlib import PosixPath
+
+from .scanner import run
+from .model import Tree
+
+from swh.core.cli import CONTEXT_SETTINGS
+
+
+@click.group(name='scanner', context_settings=CONTEXT_SETTINGS)
+@click.pass_context
+def scanner(ctx):
+    '''Software Heritage Scanner tools.'''
+    pass
+
+
+def parse_url(url):
+    """Normalize an API URL: default to https and end with a slash."""
+    # str.startswith accepts a tuple: prepend a scheme only if none is given
+    if not url.startswith(('http://', 'https://')):
+        url = 'https://' + url
+    if not url.endswith('/'):
+        url += '/'
+    return url
+
+
+@scanner.command(name='scan')
+@click.argument('path', required=True, type=click.Path(exists=True))
+@click.option('--api-url', default='archive.softwareheritage.org/api/1',
+              metavar='API_URL', show_default=True,
+              help="url for the api request")
+@click.pass_context
+def scan(ctx, path, api_url):
+    """Scan a source code project to discover files and directories already
+    present in the archive"""
+    api_url = parse_url(api_url)
+    source_tree = Tree(PosixPath(path))
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(run(path, api_url, source_tree))
+    source_tree.show()
+
+
+if __name__ == '__main__':
+    scan()
diff --git a/swh/foo/bar.py b/swh/scanner/exceptions.py
rename from swh/foo/bar.py
rename to swh/scanner/exceptions.py
--- a/swh/foo/bar.py
+++ b/swh/scanner/exceptions.py
@@ -1,4 +1,11 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+
+
+class APIError(Exception):
+    """Raised when the archive API answers with a non-200 status."""
+    def __str__(self):
+        # join the args: '%s' % self.args breaks unless there is exactly one
+        return 'API Error: "%s"' % ', '.join(map(str, self.args))
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/model.py
@@ -0,0 +1,84 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from __future__ import annotations
+import sys
+from pathlib import PosixPath
+from typing import Any, Dict
+from enum import Enum
+
+from swh.model.identifiers import (
+    DIRECTORY, CONTENT
+)
+
+
+class Color(Enum):
+    blue = '\033[94m'
+    green = '\033[92m'
+    red = '\033[91m'
+    end = '\033[0m'
+
+
+def colorize(text: str, color: Color):
+    return color.value + text + Color.end.value
+
+
+class Tree:
+    """Representation of a file system structure
+    """
+    def __init__(self, path: PosixPath, father: Tree = None):
+        self.father = father
+        self.path = path
+        self.otype = DIRECTORY if path.is_dir() else CONTENT
+        self.pid = ''
+        self.children: Dict[PosixPath, Tree] = {}
+
+    def addNode(self, path: PosixPath, pid: str = None) -> None:
+        """Recursively add a new node path
+        """
+        relative_path = path.relative_to(self.path)
+
+        if relative_path == PosixPath('.'):
+            if pid is not None:
+                self.pid = pid
+            return
+
+        new_path = self.path.joinpath(relative_path.parts[0])
+        if new_path not in self.children:
+            self.children[new_path] = Tree(new_path, self)
+
+        self.children[new_path].addNode(path, pid)
+
+    def show(self) -> None:
+        """Print all the tree"""
+        isatty = sys.stdout.isatty()
+
+        print(colorize(str(self.path), Color.blue) if isatty
+              else str(self.path))
+        self.printChildren(isatty)
+
+    def printChildren(self, isatty: bool, inc: int = 0) -> None:
+        for node in self.children.values():
+            self.printNode(node, isatty, inc)
+            if node.children:
+                node.printChildren(isatty, inc+1)
+
+    def printNode(self, node: Any, isatty: bool, inc: int) -> None:
+        rel_path = str(node.path.relative_to(self.path))
+        print('│ '*inc, end='')
+        if node.otype == DIRECTORY:
+            if node.pid:
+                print(colorize(rel_path, Color.blue) if isatty else rel_path,
+                      end='')
+            else:
+                print(colorize(rel_path, Color.red) if isatty else rel_path,
+                      end='')
+            print('/')
+
+        elif node.otype == CONTENT:
+            if node.pid:
+                print(colorize(rel_path, Color.green) if isatty else rel_path)
+            else:
+                print(colorize(rel_path, Color.red) if isatty else rel_path)
diff --git a/swh/foo/py.typed b/swh/scanner/py.typed
rename from swh/foo/py.typed
rename to swh/scanner/py.typed
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/scanner.py
@@ -0,0 +1,162 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import itertools
+import asyncio
+import aiohttp
+from typing import List, Dict, Tuple, Iterator
+from pathlib import PosixPath
+
+from .exceptions import APIError
+from .model import Tree
+
+from swh.model.cli import pid_of_file, pid_of_dir
+from swh.model.identifiers import (
+    parse_persistent_identifier,
+    DIRECTORY, CONTENT
+)
+
+
+async def pids_discovery(
+        pids: List[str], session: aiohttp.ClientSession, api_url: str,
+        ) -> Dict[str, Dict[str, bool]]:
+    """API Request to get information about the persistent identifiers given in
+    input.
+
+    Args:
+        pids: a list of persistent identifiers
+        session: the aiohttp session used to send the requests
+        api_url: url for the API request
+
+    Returns:
+        A dictionary with:
+        key: persistent identifier searched
+        value:
+            value['known'] = True if the pid is found
+            value['known'] = False if the pid is not found
+
+    Raises:
+        APIError: if the API does not answer with a 200 status code
+
+    """
+    if not pids:
+        # nothing to look up (e.g. an empty directory): skip the request
+        return {}
+
+    endpoint = api_url + 'known/'
+    chunk_size = 1000
+    requests = []
+
+    def get_chunk(pids):
+        for i in range(0, len(pids), chunk_size):
+            yield pids[i:i + chunk_size]
+
+    async def make_request(pids):
+        async with session.post(endpoint, json=pids) as resp:
+            if resp.status != 200:
+                # resp.text is a coroutine method: await it to get the body
+                error_message = '%s with given values %s' % (
+                    await resp.text(), str(pids))
+                raise APIError(error_message)
+
+            return await resp.json()
+
+    if len(pids) > chunk_size:
+        for pids_chunk in get_chunk(pids):
+            requests.append(asyncio.create_task(
+                make_request(pids_chunk)))
+
+        res = await asyncio.gather(*requests)
+        # concatenate list of dictionaries
+        return dict(itertools.chain.from_iterable(e.items() for e in res))
+    else:
+        return await make_request(pids)
+
+
+def get_subpaths(
+        path: PosixPath) -> Iterator[Tuple[PosixPath, str]]:
+    """Find the persistent identifier of the directories and files under a
+    given path.
+
+    Args:
+        path: the root path
+
+    Yields:
+        pairs of: path, the relative persistent identifier
+
+    """
+    def pid_of(path):
+        if path.is_dir():
+            return pid_of_dir(bytes(path))
+        elif path.is_file():
+            return pid_of_file(bytes(path))
+
+    # os.walk yields nothing when path is not a directory: fall back to an
+    # empty listing instead of leaking StopIteration out of this generator
+    dirpath, dnames, fnames = next(os.walk(path), (path, [], []))
+    for node in itertools.chain(dnames, fnames):
+        sub_path = PosixPath(dirpath).joinpath(node)
+        pid = pid_of(sub_path)
+        # pid_of gives None for entries that are neither a directory nor a
+        # file (e.g. broken symlinks): they have no identifier to look up
+        if pid is not None:
+            yield (sub_path, pid)
+
+
+async def parse_path(
+        path: PosixPath, session: aiohttp.ClientSession, api_url: str
+        ) -> Iterator[Tuple[PosixPath, str, bool]]:
+    """Check if the sub paths of the given path are present in the
+    archive or not.
+
+    Args:
+        path: the source path
+        session: the aiohttp session used to send the requests
+        api_url: url for the API request
+
+    Returns:
+        a map containing tuples with: a subpath of the given path,
+        the pid of the subpath and the result of the api call
+
+    """
+    parsed_paths = dict(get_subpaths(path))
+    parsed_pids = await pids_discovery(
+        list(parsed_paths.values()), session, api_url)
+
+    def unpack(tup):
+        subpath, pid = tup
+        return (subpath, pid, parsed_pids[pid]['known'])
+
+    return map(unpack, parsed_paths.items())
+
+
+async def run(
+        root: PosixPath, api_url: str, source_tree: Tree) -> None:
+    """Start scanning from the given root.
+
+    It fills the source tree with the path discovered.
+
+    Args:
+        root: the root path to scan
+        api_url: url for the API request
+        source_tree: the tree filled with the scanned paths
+
+    """
+    async def _scan(root, session, api_url, source_tree):
+        for path, pid, found in await parse_path(root, session, api_url):
+            obj_type = parse_persistent_identifier(pid).object_type
+
+            if obj_type == CONTENT:
+                source_tree.addNode(path, pid if found else None)
+            elif obj_type == DIRECTORY:
+                if found:
+                    source_tree.addNode(path, pid)
+                else:
+                    source_tree.addNode(path)
+                    await _scan(path, session, api_url, source_tree)
+
+    async with aiohttp.ClientSession() as session:
+        await _scan(root, session, api_url, source_tree)
diff --git a/swh/foo/tests/__init__.py b/swh/scanner/tests/__init__.py
rename from swh/foo/tests/__init__.py
rename to swh/scanner/tests/__init__.py
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -8,8 +8,8 @@
   pytest-cov
 commands =
   pytest --doctest-modules \
-         {envsitepackagesdir}/swh/foo \
-         --cov={envsitepackagesdir}/swh/foo \
+         {envsitepackagesdir}/swh/scanner \
+         --cov={envsitepackagesdir}/swh/scanner \
          --cov-branch {posargs}
 
 [testenv:flake8]