diff --git a/swh/scanner/logger.py b/swh/scanner/logger.py deleted file mode 100644 index b45e7e3..0000000 --- a/swh/scanner/logger.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (C) 2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import logging - - -logger = None - - -def init(**kwargs): - def decorate(func): - for k in kwargs: - setattr(func, k, kwargs[k]) - return func - return decorate - - -def setup_logger(verbose: bool) -> None: - global logger - console = logging.FileHandler('scan.log') - console.setLevel(logging.DEBUG) - formatter = logging.Formatter('%(asctime)s | %(levelname)s: %(message)s') - console.setFormatter(formatter) - - logger = logging.getLogger('debug') - logger.addHandler(console) - if not verbose: - logger.propagate = False - - -@init(count=0) -def log_queries(n: int) -> None: - if logger is not None: - log_queries.count += n - - -def log_counters() -> None: - if logger is not None: - logger.info('number of queries: %s' % log_queries.count) - - -def error(*args) -> None: - if logger is not None: - logger.error(args) - - -def warning(*args) -> None: - if logger is not None: - logger.warning(args) - - -def info(*args) -> None: - if logger is not None: - logger.info(args) - - -def debug(*args): - if logger is not None: - logger.debug(args) diff --git a/swh/scanner/model.py b/swh/scanner/model.py index c46e618..c4a3d56 100644 --- a/swh/scanner/model.py +++ b/swh/scanner/model.py @@ -1,76 +1,84 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations +import sys from pathlib import PosixPath from typing import Any, Dict from enum import Enum from swh.model.identifiers import ( DIRECTORY, CONTENT ) class Color(Enum): blue = '\033[94m' green = '\033[92m' red = '\033[91m' end = '\033[0m' def colorize(text: str, color: Color): return color.value + text + Color.end.value class Tree: """Representation of a file system structure """ - def __init__(self, father: Tree, path: PosixPath): + def __init__(self, path: PosixPath, father: Tree = None): self.father = father self.path = path self.otype = DIRECTORY if path.is_dir() else CONTENT self.pid = '' self.children: Dict[PosixPath, Tree] = {} def addNode(self, path: PosixPath, pid: str = None) -> None: """Recursively add a new node path """ relative_path = path.relative_to(self.path) if relative_path == PosixPath('.'): if pid is not None: self.pid = pid return new_path = self.path.joinpath(relative_path.parts[0]) if new_path not in self.children: - self.children[new_path] = Tree(self, new_path) + self.children[new_path] = Tree(new_path, self) self.children[new_path].addNode(path, pid) def show(self) -> None: """Print all the tree""" - print(Color.blue.value+str(self.path)+Color.end.value) - self.printChildren() + isatty = sys.stdout.isatty() - def printChildren(self, inc: int = 0) -> None: + print(colorize(str(self.path), Color.blue) if isatty + else str(self.path)) + self.printChildren(isatty) + + def printChildren(self, isatty: bool, inc: int = 0) -> None: for path, node in self.children.items(): - self.printNode(node, inc) + self.printNode(node, isatty, inc) if node.children: - node.printChildren(inc+1) + node.printChildren(isatty, inc+1) - def printNode(self, node: Any, inc: int) -> None: + def printNode(self, node: Any, isatty: bool, inc: int) -> None: rel_path = str(node.path.relative_to(self.path)) + print('│ '*inc, end='') if node.otype == DIRECTORY: if node.pid: - print('│ '*inc + colorize(rel_path, Color.blue) + '/') + print(colorize(rel_path, Color.blue) if isatty else rel_path, + end='') else: - print('│ '*inc + colorize(rel_path, Color.red) + '/') + print(colorize(rel_path, Color.red) if isatty else rel_path, + end='') + print('/') - if node.otype == CONTENT: + elif node.otype == CONTENT: if node.pid: - print('│ '*inc + colorize(rel_path, Color.green)) + print(colorize(rel_path, Color.green) if isatty else rel_path) else: - print('│ '*inc + colorize(rel_path, Color.red)) + print(colorize(rel_path, Color.red) if isatty else rel_path) diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py index 11065e2..a8b7270 100644 --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -1,146 +1,145 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import itertools import asyncio import aiohttp -from typing import List, Dict, Tuple, Generator, Iterator +from typing import List, Dict, Tuple, Iterator from pathlib import PosixPath -from .logger import log_queries from .exceptions import APIError from .model import Tree from swh.model.cli import pid_of_file, pid_of_dir from swh.model.identifiers import ( parse_persistent_identifier, DIRECTORY, CONTENT ) async def pids_discovery( - pids: List[str], session: aiohttp.ClientSession, url: str, + pids: List[str], session: aiohttp.ClientSession, api_url: str, ) -> Dict[str, Dict[str, bool]]: """API Request to get information about the persistent identifiers given in input. Args: pids: a list of persistent identifier + api_url: url for the API request Returns: A dictionary with: key: persistent identifier searched value: value['known'] = True if the pid is found value['known'] = False if the pid is not found """ - endpoint = url + '/api/1/known/' + endpoint = api_url + 'known/' chunk_size = 1000 requests = [] - log_queries(len(pids)) - def get_chunk(pids): for i in range(0, len(pids), chunk_size): yield pids[i:i + chunk_size] async def make_request(pids): async with session.post(endpoint, json=pids) as resp: if resp.status != 200: error_message = '%s with given values %s' % ( resp.text, str(pids)) raise APIError(error_message) + return await resp.json() if len(pids) > chunk_size: for pids_chunk in get_chunk(pids): requests.append(asyncio.create_task( make_request(pids_chunk))) res = await asyncio.gather(*requests) # concatenate list of dictionaries return dict(itertools.chain.from_iterable(e.items() for e in res)) else: return await make_request(pids) def get_subpaths( - path: PosixPath) -> Generator[Tuple[PosixPath, str], None, None]: + path: PosixPath) -> Iterator[Tuple[PosixPath, str]]: """Find the persistent identifier of the directories and files under a given path. Args: path: the root path Yields: pairs of: path, the relative persistent identifier """ def pid_of(path): if path.is_dir(): return pid_of_dir(bytes(path)) elif path.is_file(): return pid_of_file(bytes(path)) dirpath, dnames, fnames = next(os.walk(path)) for node in itertools.chain(dnames, fnames): sub_path = PosixPath(dirpath).joinpath(node) yield (sub_path, pid_of(sub_path)) async def parse_path( - path: PosixPath, session: aiohttp.ClientSession, url: str + path: PosixPath, session: aiohttp.ClientSession, api_url: str ) -> Iterator[Tuple[str, str, bool]]: """Check if the sub paths of the given path are present in the archive or not. Args: path: the source path - url: url for the API request + api_url: url for the API request Returns: a map containing tuples with: a subpath of the given path, the pid of the subpath and the result of the api call """ parsed_paths = dict(get_subpaths(path)) parsed_pids = await pids_discovery( - list(parsed_paths.values()), session, url) + list(parsed_paths.values()), session, api_url) def unpack(tup): subpath, pid = tup return (subpath, pid, parsed_pids[pid]['known']) return map(unpack, parsed_paths.items()) async def run( - root: PosixPath, url: str, source_tree: Tree) -> None: + root: PosixPath, api_url: str, source_tree: Tree) -> None: """Start scanning from the given root. - It fill the source tree with the path discovered. + It fills the source tree with the path discovered. Args: root: the root path to scan - url: url for the API request + api_url: url for the API request """ - async def _scan(root, session, url, source_tree): - for path, pid, found in await parse_path(root, session, url): + async def _scan(root, session, api_url, source_tree): + for path, pid, found in await parse_path(root, session, api_url): obj_type = parse_persistent_identifier(pid).object_type if obj_type == CONTENT: source_tree.addNode(path, pid if found else None) elif obj_type == DIRECTORY: if found: source_tree.addNode(path, pid) else: source_tree.addNode(path) - await _scan(path, session, url, source_tree) + await _scan(path, session, api_url, source_tree) async with aiohttp.ClientSession() as session: - await _scan(root, session, url, source_tree) + await _scan(root, session, api_url, source_tree)