Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/scanner/logger.py b/swh/scanner/logger.py
deleted file mode 100644
index b45e7e3..0000000
--- a/swh/scanner/logger.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (C) 2020 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import logging
-
-
-logger = None
-
-
-def init(**kwargs):
- def decorate(func):
- for k in kwargs:
- setattr(func, k, kwargs[k])
- return func
- return decorate
-
-
-def setup_logger(verbose: bool) -> None:
- global logger
- console = logging.FileHandler('scan.log')
- console.setLevel(logging.DEBUG)
- formatter = logging.Formatter('%(asctime)s | %(levelname)s: %(message)s')
- console.setFormatter(formatter)
-
- logger = logging.getLogger('debug')
- logger.addHandler(console)
- if not verbose:
- logger.propagate = False
-
-
-@init(count=0)
-def log_queries(n: int) -> None:
- if logger is not None:
- log_queries.count += n
-
-
-def log_counters() -> None:
- if logger is not None:
- logger.info('number of queries: %s' % log_queries.count)
-
-
-def error(*args) -> None:
- if logger is not None:
- logger.error(args)
-
-
-def warning(*args) -> None:
- if logger is not None:
- logger.warning(args)
-
-
-def info(*args) -> None:
- if logger is not None:
- logger.info(args)
-
-
-def debug(*args):
- if logger is not None:
- logger.debug(args)
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
index c46e618..c4a3d56 100644
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -1,76 +1,84 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
+import sys
from pathlib import PosixPath
from typing import Any, Dict
from enum import Enum
from swh.model.identifiers import (
DIRECTORY, CONTENT
)
class Color(Enum):
blue = '\033[94m'
green = '\033[92m'
red = '\033[91m'
end = '\033[0m'
def colorize(text: str, color: Color):
return color.value + text + Color.end.value
class Tree:
"""Representation of a file system structure
"""
- def __init__(self, father: Tree, path: PosixPath):
+ def __init__(self, path: PosixPath, father: Tree = None):
self.father = father
self.path = path
self.otype = DIRECTORY if path.is_dir() else CONTENT
self.pid = ''
self.children: Dict[PosixPath, Tree] = {}
def addNode(self, path: PosixPath, pid: str = None) -> None:
"""Recursively add a new node path
"""
relative_path = path.relative_to(self.path)
if relative_path == PosixPath('.'):
if pid is not None:
self.pid = pid
return
new_path = self.path.joinpath(relative_path.parts[0])
if new_path not in self.children:
- self.children[new_path] = Tree(self, new_path)
+ self.children[new_path] = Tree(new_path, self)
self.children[new_path].addNode(path, pid)
def show(self) -> None:
"""Print all the tree"""
- print(Color.blue.value+str(self.path)+Color.end.value)
- self.printChildren()
+ isatty = sys.stdout.isatty()
- def printChildren(self, inc: int = 0) -> None:
+ print(colorize(str(self.path), Color.blue) if isatty
+ else str(self.path))
+ self.printChildren(isatty)
+
+ def printChildren(self, isatty: bool, inc: int = 0) -> None:
for path, node in self.children.items():
- self.printNode(node, inc)
+ self.printNode(node, isatty, inc)
if node.children:
- node.printChildren(inc+1)
+ node.printChildren(isatty, inc+1)
- def printNode(self, node: Any, inc: int) -> None:
+ def printNode(self, node: Any, isatty: bool, inc: int) -> None:
rel_path = str(node.path.relative_to(self.path))
+ print('│ '*inc, end='')
if node.otype == DIRECTORY:
if node.pid:
- print('│ '*inc + colorize(rel_path, Color.blue) + '/')
+ print(colorize(rel_path, Color.blue) if isatty else rel_path,
+ end='')
else:
- print('│ '*inc + colorize(rel_path, Color.red) + '/')
+ print(colorize(rel_path, Color.red) if isatty else rel_path,
+ end='')
+ print('/')
- if node.otype == CONTENT:
+ elif node.otype == CONTENT:
if node.pid:
- print('│ '*inc + colorize(rel_path, Color.green))
+ print(colorize(rel_path, Color.green) if isatty else rel_path)
else:
- print('│ '*inc + colorize(rel_path, Color.red))
+ print(colorize(rel_path, Color.red) if isatty else rel_path)
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
index 11065e2..a8b7270 100644
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -1,146 +1,145 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import itertools
import asyncio
import aiohttp
-from typing import List, Dict, Tuple, Generator, Iterator
+from typing import List, Dict, Tuple, Iterator
from pathlib import PosixPath
-from .logger import log_queries
from .exceptions import APIError
from .model import Tree
from swh.model.cli import pid_of_file, pid_of_dir
from swh.model.identifiers import (
parse_persistent_identifier,
DIRECTORY, CONTENT
)
async def pids_discovery(
- pids: List[str], session: aiohttp.ClientSession, url: str,
+ pids: List[str], session: aiohttp.ClientSession, api_url: str,
) -> Dict[str, Dict[str, bool]]:
"""API Request to get information about the persistent identifiers given in
input.
Args:
pids: a list of persistent identifier
+ api_url: url for the API request
Returns:
A dictionary with:
key: persistent identifier searched
value:
value['known'] = True if the pid is found
value['known'] = False if the pid is not found
"""
- endpoint = url + '/api/1/known/'
+ endpoint = api_url + 'known/'
chunk_size = 1000
requests = []
- log_queries(len(pids))
-
def get_chunk(pids):
for i in range(0, len(pids), chunk_size):
yield pids[i:i + chunk_size]
async def make_request(pids):
async with session.post(endpoint, json=pids) as resp:
if resp.status != 200:
error_message = '%s with given values %s' % (
resp.text, str(pids))
raise APIError(error_message)
+
return await resp.json()
if len(pids) > chunk_size:
for pids_chunk in get_chunk(pids):
requests.append(asyncio.create_task(
make_request(pids_chunk)))
res = await asyncio.gather(*requests)
# concatenate list of dictionaries
return dict(itertools.chain.from_iterable(e.items() for e in res))
else:
return await make_request(pids)
def get_subpaths(
- path: PosixPath) -> Generator[Tuple[PosixPath, str], None, None]:
+ path: PosixPath) -> Iterator[Tuple[PosixPath, str]]:
"""Find the persistent identifier of the directories and files under a
given path.
Args:
path: the root path
Yields:
pairs of: path, the relative persistent identifier
"""
def pid_of(path):
if path.is_dir():
return pid_of_dir(bytes(path))
elif path.is_file():
return pid_of_file(bytes(path))
dirpath, dnames, fnames = next(os.walk(path))
for node in itertools.chain(dnames, fnames):
sub_path = PosixPath(dirpath).joinpath(node)
yield (sub_path, pid_of(sub_path))
async def parse_path(
- path: PosixPath, session: aiohttp.ClientSession, url: str
+ path: PosixPath, session: aiohttp.ClientSession, api_url: str
) -> Iterator[Tuple[str, str, bool]]:
"""Check if the sub paths of the given path are present in the
archive or not.
Args:
path: the source path
- url: url for the API request
+ api_url: url for the API request
Returns:
a map containing tuples with: a subpath of the given path,
the pid of the subpath and the result of the api call
"""
parsed_paths = dict(get_subpaths(path))
parsed_pids = await pids_discovery(
- list(parsed_paths.values()), session, url)
+ list(parsed_paths.values()), session, api_url)
def unpack(tup):
subpath, pid = tup
return (subpath, pid, parsed_pids[pid]['known'])
return map(unpack, parsed_paths.items())
async def run(
- root: PosixPath, url: str, source_tree: Tree) -> None:
+ root: PosixPath, api_url: str, source_tree: Tree) -> None:
"""Start scanning from the given root.
- It fill the source tree with the path discovered.
+ It fills the source tree with the path discovered.
Args:
root: the root path to scan
- url: url for the API request
+ api_url: url for the API request
"""
- async def _scan(root, session, url, source_tree):
- for path, pid, found in await parse_path(root, session, url):
+ async def _scan(root, session, api_url, source_tree):
+ for path, pid, found in await parse_path(root, session, api_url):
obj_type = parse_persistent_identifier(pid).object_type
if obj_type == CONTENT:
source_tree.addNode(path, pid if found else None)
elif obj_type == DIRECTORY:
if found:
source_tree.addNode(path, pid)
else:
source_tree.addNode(path)
- await _scan(path, session, url, source_tree)
+ await _scan(path, session, api_url, source_tree)
async with aiohttp.ClientSession() as session:
- await _scan(root, session, url, source_tree)
+ await _scan(root, session, api_url, source_tree)

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 11:56 AM (3 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3273930

Event Timeline