diff --git a/benchmark.py b/benchmark.py
new file mode 100755
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from pathlib import Path
+import shutil
+from subprocess import DEVNULL, Popen, call
+import sys
+
+import click
+
+import swh.scanner.logger as log
+
+# algorithms available in 'swh benchmark' command options
+ALGOS = [
+    "stopngo",
+    "file_priority",
+    "directory_priority",
+    "random",
+    "algo_min",
+]
+
+
+def parse_repo(temp_path, port_range):
+    # The extracted archive is expected to contain a single top-level
+    # repository directory; benchmark every algorithm against every backend.
+    dirpath, dnames, _ = next(os.walk(temp_path))
+    extracted_repo_path = Path(dirpath).joinpath(dnames[0])
+
+    # NOTE(review): range() excludes the end port — "5000:5001" targets only
+    # port 5000; confirm this matches how run_backend.sh assigns ports.
+    start, end = port_range.split(":")
+    api_urls = [
+        f"http://localhost:{int(port)}/api/1/" for port in range(int(start), int(end))
+    ]
+
+    scenario_cmds = [
+        [
+            "swh",
+            "scanner",
+            "benchmark",
+            "-a",
+            algo,
+            "-u",
+            api_url,
+            "-x",
+            str(extracted_repo_path) + "/.git",
+            str(extracted_repo_path),
+        ]
+        for algo in ALGOS
+        for api_url in api_urls
+    ]
+
+    processes = [
+        Popen(cmd, stdout=sys.stdout, stderr=sys.stderr) for cmd in scenario_cmds
+    ]
+
+    for proc in processes:
+        proc.wait()
+
+    shutil.rmtree(extracted_repo_path)
+
+
+@click.command(
+    help="""Run multiple benchmarks from an input archive. The repository
+    will be unpacked in the provided temporary path. The port range
+    should be specified in order to run the benchmark using different
+    backends"""
+)
+@click.argument("repo_path", type=click.Path(exists=True), required=True)
+@click.argument("temp_path", type=click.Path(exists=True), required=True)
+@click.option(
+    "--port-range",
+    "-p",
+    metavar="PORT_RANGE",
+    show_default=True,
+    required=True,
+    help="The backend port range (e.g., 5000:5001)",
+)
+def main(repo_path, temp_path, port_range):
+    log.setup_logger()
+    try:
+        exit_code = call(["tar", "xvf", repo_path, "-C", temp_path], stdout=DEVNULL)
+        if exit_code == 0:
+            parse_repo(temp_path, port_range)
+        else:
+            raise IOError(
+                'Decompression of repo "%s" exited with code: %d'
+                % (repo_path, exit_code)
+            )
+    # IOError (== OSError) is a subclass of Exception, so the specific
+    # handler must come first or it is unreachable.
+    except IOError as ioerror:
+        log.error(ioerror)
+    except Exception as e:
+        log.error(e)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -28,3 +28,6 @@
 
 [mypy-plotly.*]
 ignore_missing_imports = True
+
+[mypy-git.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,5 @@
 dash
 dash_bootstrap_components
 flask
+GitPython
 dulwich
diff --git a/run_backend.sh b/run_backend.sh
new file mode 100755
--- /dev/null
+++ b/run_backend.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+# This script simply runs multiple scanner backends using the files provided from stdin.
+
+start_port=5000
+
+while IFS= read -r swhids_file;
+do
+    gunicorn "-b" 127.0.0.1:$start_port 'swh.scanner.backend:create_app("'$swhids_file'")' &
+    ((start_port++))
+done
diff --git a/run_benchmark.sh b/run_benchmark.sh
new file mode 100755
--- /dev/null
+++ b/run_benchmark.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# This script is used to benchmark multiple repositories (taken from stdin).
+# A temporary directory and a port range should be provided.
+
+temp_dir=$1
+port_range=$2
+
+if [ ! -d "$temp_dir" ]; then
+    echo "The provided temporary directory does not exist"
+    exit 1
+fi
+
+if [ "$port_range" == '' ]; then
+    echo "You should provide a valid port range (e.g., 5000:5003)"
+    exit 1
+fi
+
+while IFS= read -r repo;
+do
+    ./benchmark.py "$repo" "$temp_dir" -p "$port_range"
+done
diff --git a/swh/scanner/backend.py b/swh/scanner/backend.py
--- a/swh/scanner/backend.py
+++ b/swh/scanner/backend.py
@@ -3,6 +3,8 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from pathlib import Path
+
 from flask import Flask, request
 
 from .db import Db
@@ -11,10 +13,11 @@
 
 LIMIT = 1000
 
 
-def create_app(db: Db):
+def create_app(db_file: str):
     """Backend for swh-scanner, implementing the /known endpoint of the
     Software Heritage Web API"""
     app = Flask(__name__)
+    db = Db(Path(db_file))
 
     @app.route("/api/1/known/", methods=["POST"])
@@ -34,8 +37,13 @@
     return app
 
 
-def run(host: str, port: int, db: Db):
+def run(host: str, port: int, db_file: str):
     """Serve the local database
     """
-    app = create_app(db)
-    app.run(host, port, debug=True)
+    # from .db import Db
+
+    # db = Db(db_file)
+    # app = create_app(db)
+    # app.run(host, port, debug=False)
+    # db.close()
+    pass
diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/benchmark_algos.py
@@ -0,0 +1,385 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import itertools
+import json
+import os
+from pathlib import Path
+import random
+from typing import Dict, Iterable, List, Optional
+
+from git import Repo
+import requests
+
+from swh.model.from_disk import Content, Directory, accept_all_directories
+from swh.model.identifiers import CONTENT, DIRECTORY, swhid
+
+from .exceptions import APIError
+from .logger import error
+from .model import Status, Tree
+from .scanner import directory_filter, extract_regex_objs
+
+
+class Counter:
+    """Class to store the number of api calls and queries made to the
+    backend during the discovery process of the various algorithms.
+    """
+
+    def __init__(self):
+        self.api_count: int = 0
+        self.query_count: int = 0
+
+    def add_api(self, n: int):
+        self.api_count += n
+
+    def add_query(self, n: int):
+        self.query_count += n
+
+    def get_api(self):
+        return self.api_count
+
+    def get_query(self):
+        return self.query_count
+
+
+def query_swhids(
+    swhids: List[Tree], api_url: str, counter: Optional[Counter] = None
+) -> Dict[str, Dict[str, bool]]:
+    """
+    Returns:
+        A dictionary with:
+        key(str): persistent identifier
+        value(dict):
+            value['known'] = True if pid is found
+            value['known'] = False if pid is not found
+    """
+    endpoint = api_url + "known/"
+    chunk_size = 1000
+
+    if counter:
+        counter.add_query(len(swhids))
+
+    def make_request(swhids):
+        # do not shadow the imported swhid() identifier function
+        swhids = [tree_node.swhid for tree_node in swhids]
+        req = requests.post(endpoint, json=swhids)
+        if req.status_code != 200:
+            error_message = "%s with given values %s" % (req.text, str(swhids))
+            error(error_message)
+            raise APIError(error_message)
+        if counter:
+            counter.add_api(1)
+        resp = req.text
+        return json.loads(resp)
+
+    def get_chunk(swhids):
+        for i in range(0, len(swhids), chunk_size):
+            yield swhids[i : i + chunk_size]
+
+    if len(swhids) > chunk_size:
+        return dict(
+            itertools.chain.from_iterable(
+                make_request(swhids_chunk).items() for swhids_chunk in get_chunk(swhids)
+            )
+        )
+    else:
+        return make_request(swhids)
+
+
+def stopngo(source_tree: Tree, api_url: str, counter: Counter):
+    def set_children_known(node):
+        for child_node in node.iterate():
+            child_node.known = True
+
+    nodes = []
+    nodes.append(source_tree)
+
+    while len(nodes) > 0:
+        parsed_nodes = query_swhids(nodes, api_url, counter)
+        for node in nodes.copy():
+            nodes.remove(node)
+            node.known = parsed_nodes[node.swhid]["known"]
+            node.status = Status.queried
+            if node.otype == DIRECTORY:
+                if not node.known:
+                    nodes.extend(list(node.children.values()))
+                else:
+                    set_children_known(node)
+
+
+def set_father_status(node, known):
+    """
+    Recursively change father known and visited status of a given node
+    """
+    parent = node.father
+
+    if parent is None:
+        return
+    if parent.status != Status.unset:
+        return
+
+    parent.known = known
+    set_father_status(parent, known)
+
+
+def set_children_status(node, node_type, known, status: Status = Status.unset):
+    """
+    Set the known status of every descendant of the given type/status
+    """
+    for child_node in node.iterate():
+        if child_node.otype == node_type and child_node.status == status:
+            child_node.known = known
+
+
+def file_priority(source_tree: Tree, api_url: str, counter: Counter):
+    # get all the files
+    all_contents = list(
+        filter(lambda node: node.otype == CONTENT, source_tree.iterate_bfs())
+    )
+    all_contents.reverse()  # we check nodes from the deepest
+
+    # query the backend to get all file contents status
+    parsed_contents = query_swhids(all_contents, api_url, counter)
+    # set all the file contents status
+    for cnt in all_contents:
+        cnt.known = parsed_contents[cnt.swhid]["known"]
+        cnt.status = Status.queried
+        # set all the upstream directories of unknown file contents to unknown
+        if not cnt.known:
+            set_father_status(cnt, False)
+
+    # get all unset directories and check their status
+    # (update children directories accordingly)
+    unset_dirs = list(
+        filter(
+            lambda node: node.otype == DIRECTORY and node.status == Status.unset,
+            source_tree.iterate(),
+        )
+    )
+
+    if source_tree.status == Status.unset:
+        unset_dirs.append(source_tree)
+
+    # check unset directories
+    for dir_ in unset_dirs:
+        if dir_.status == Status.unset:
+            # update directory status
+            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
+            dir_.status = Status.queried
+            set_children_status(dir_, DIRECTORY, dir_.known)
+
+
+def directory_priority(source_tree: Tree, api_url: str, counter: Counter):
+    # get all directory contents that have at least one file content
+    # (has_contents is a method: it must be called, not truth-tested)
+    unset_dirs = list(
+        filter(
+            lambda dir_: dir_.otype == DIRECTORY and dir_.has_contents(),
+            source_tree.iterate_bfs(),
+        )
+    )
+    unset_dirs.reverse()
+    # insert root if it has contents
+    if source_tree.has_contents():
+        unset_dirs.append(source_tree)
+
+    for dir_ in unset_dirs:
+        # if the directory is known set all the downstream file contents to known
+        if dir_.status == Status.unset:
+            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
+            dir_.status = Status.queried
+            if dir_.known:
+                set_children_status(dir_, CONTENT, True)
+            else:
+                set_father_status(dir_, False)
+
+    # get remaining directories that have no file contents
+    unset_dirs_no_cnts = list(
+        filter(
+            lambda node: node.otype == DIRECTORY and not node.has_contents(),
+            source_tree.iterate_bfs(),
+        )
+    )
+    parsed_dirs_no_cnts = query_swhids(unset_dirs_no_cnts, api_url, counter)
+
+    # update status of directories that have no file contents
+    for dir_ in unset_dirs_no_cnts:
+        dir_.known = parsed_dirs_no_cnts[dir_.swhid]["known"]
+        dir_.status = Status.queried
+
+    # check unknown file contents
+    unset_files = list(
+        filter(
+            lambda node: node.otype == CONTENT and node.status == Status.unset,
+            source_tree.iterate(),
+        )
+    )
+    parsed_unset_files = query_swhids(unset_files, api_url, counter)
+
+    for file_ in unset_files:
+        file_.known = parsed_unset_files[file_.swhid]["known"]
+        file_.status = Status.queried
+
+
+def random_(
+    source_tree: Tree, api_url: str, counter: Counter, seed: Optional[int] = None
+):
+
+    if seed is not None:  # honour seed=0 as a valid seed
+        random.seed(seed)
+    # get all directory/file contents
+    all_nodes = [node for node in source_tree.iterate()] + [source_tree]
+    # shuffle contents
+    random.shuffle(all_nodes)
+
+    while len(all_nodes):
+        node = all_nodes.pop()
+
+        if node.status != Status.unset:
+            continue
+
+        node.known = query_swhids([node], api_url, counter)[node.swhid]["known"]
+        node.status = Status.queried
+        if node.otype == DIRECTORY and node.known:
+            for child_node in node.iterate():
+                child_node.known = True
+        elif node.otype == CONTENT and not node.known:
+            set_father_status(node, False)
+
+
+def algo_min(source_tree: Tree, api_url: str):
+    """
+    The minimal number of queries knowing the known/unknown status of every node
+    """
+
+    def remove_parents(node, nodes):
+        parent = node.father
+        if parent is None or parent not in nodes:
+            return
+        else:
+            nodes.remove(parent)
+            remove_parents(parent, nodes)
+
+    def remove_children(node, nodes):
+        for child_node in node.iterate():
+            nodes.remove(child_node)
+
+    all_nodes = [node for node in source_tree.iterate()]
+    all_nodes.insert(0, source_tree)
+
+    parsed_nodes = query_swhids(all_nodes, api_url)
+    for node in all_nodes:
+        node.known = parsed_nodes[node.swhid]["known"]
+
+    all_nodes_copy = all_nodes.copy()
+
+    for node in all_nodes:
+        if node.otype == CONTENT and not node.known:
+            remove_parents(node, all_nodes_copy)
+
+    for node in all_nodes_copy:
+        if node.otype == DIRECTORY and node.known:
+            remove_children(node, all_nodes_copy)
+
+    return len(all_nodes_copy)
+
+
+def get_swhids(paths: Iterable[Path], exclude_patterns):
+    def swhid_of(path):
+        if path.is_dir():
+            if exclude_patterns:
+
+                def dir_filter(dirpath, *args):
+                    return directory_filter(dirpath, exclude_patterns)
+
+            else:
+                dir_filter = accept_all_directories
+
+            obj = Directory.from_disk(
+                path=bytes(path), dir_filter=dir_filter
+            ).get_data()
+
+            return swhid(DIRECTORY, obj)
+        else:
+            obj = Content.from_file(path=bytes(path)).get_data()
+            return swhid(CONTENT, obj)
+
+    for path in paths:
+        yield str(path), swhid_of(path)
+
+
+def load_source(root, sre_patterns):
+    """
+    Load the source code inside the Tree data structure
+    """
+
+    def _scan(root_path, source_tree, sre_patterns):
+        dirpath, dnames, fnames = next(os.walk(root_path))
+        dirpath = Path(dirpath)
+
+        if fnames:
+            files = [dirpath.joinpath(fname) for fname in fnames]
+            parsed_file_swhids = dict(get_swhids(files, sre_patterns))
+
+            for path, swhid_ in parsed_file_swhids.items():
+                source_tree.add_node(Path(path), swhid_)
+
+        if dnames:
+            dirs = [dirpath.joinpath(dname) for dname in dnames]
+            parsed_dirs_swhids = dict(get_swhids(dirs, sre_patterns))
+
+            for path, swhid_ in parsed_dirs_swhids.items():
+                if not directory_filter(path, sre_patterns):
+                    continue
+                source_tree.add_node(Path(path), swhid_)
+                _scan(path, source_tree, sre_patterns)
+
+    source_tree = Tree(root)
+    root_swhid = dict(get_swhids([root], sre_patterns))
+    source_tree.swhid = root_swhid[str(root)]
+    _scan(root, source_tree, sre_patterns)
+    return source_tree
+
+
+def run(
+    root: str,
+    api_url: str,
+    exclude_patterns: Iterable[str],
+    algo: str,
+    seed: Optional[int] = None,
+):
+    sre_patterns = set()
+    if exclude_patterns:
+        sre_patterns = {
+            reg_obj for reg_obj in extract_regex_objs(Path(root), exclude_patterns)
+        }
+
+    repo = Repo(root)
+    repo_url = repo.remote("origin").url
+    revision = repo.git.rev_list("--count", "HEAD")
+    counter = Counter()
+    source_tree = load_source(Path(root), sre_patterns)
+
+    if algo == "random":
+        random_(source_tree, api_url, counter, seed)
+    elif algo == "algo_min":
+        min_queries = algo_min(source_tree, api_url)
+        min_result = (repo_url, revision, len(source_tree), algo, min_queries)
+        print(min_result)
+        return
+    elif algo == "stopngo":
+        stopngo(source_tree, api_url, counter)
+    elif algo == "file_priority":
+        file_priority(source_tree, api_url, counter)
+    elif algo == "directory_priority":
+        directory_priority(source_tree, api_url, counter)
+
+    result = (
+        repo_url,
+        revision,
+        len(source_tree),
+        algo,
+        counter.get_api(),
+        counter.get_query(),
+    )
+    print(result)
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -234,6 +234,58 @@
     db.close()
 
 
+@scanner.command()
+@click.argument("root_path", required=True, type=click.Path(exists=True))
+@click.option(
+    "-u",
+    "--api-url",
+    default=None,
+    metavar="API_URL",
+    show_default=True,
+    help="URL for the api request",
+)
+@click.option(
+    "--exclude",
+    "-x",
+    "patterns",
+    metavar="PATTERN",
+    multiple=True,
+    show_default=True,
+    help="Exclude directories using glob patterns \
+    (e.g., '*.git' to exclude all .git directories)",
+)
+@click.option(
+    "--algo",
+    "-a",
+    metavar="ALGO NAME",
+    show_default=True,
+    required=True,
+    help="Algorithm name",
+)
+# @click.option(
+#     "--seed",
+#     "-s",
+#     metavar="SEED",
+#     type=int,
+#     show_default=True,
+#     help="Seed for the random algorithm"
+# )
+@click.pass_context
+def benchmark(ctx, root_path, api_url, patterns, algo):
+    from swh.scanner.benchmark_algos import run
+    from swh.scanner.logger import error, setup_logger
+
+    setup_logger()
+    try:
+        run(root_path, api_url, patterns, algo)
+    except Exception as e:
+        error(
+            f'Repository: "{root_path}" using "{algo}" '
+            f'algorithm on "{api_url}" FAILED: {e}'
+        )
+        pass
+
+
 def main():
     return scanner(auto_envvar_prefix="SWH_SCANNER")
 
diff --git a/swh/scanner/logger.py b/swh/scanner/logger.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/logger.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+
+logger = None
+
+
+def setup_logger() -> None:
+    global logger
+    console = logging.FileHandler("scan.log")
+    console.setLevel(logging.NOTSET)
+    formatter = logging.Formatter("%(asctime)s | %(levelname)s: %(message)s")
+    console.setFormatter(formatter)
+
+    logger = logging.getLogger("debug")
+    logger.addHandler(console)
+    # logger.propagate = True
+
+
+def error(*args) -> None:
+    if logger is not None:
+        logger.error(*args)
+
+
+def warning(*args) -> None:
+    if logger is not None:
+        logger.warning(*args)
+
+
+def info(*args) -> None:
+    if logger is not None:
+        logger.info(*args)
+
+
+def debug(*args):
+    if logger is not None:
+        logger.debug(*args)
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -22,6 +22,8 @@
 class Color(Enum):
     blue = "\033[94m"
     green = "\033[92m"
+    yellow = "\033[93m"
+    magenta = "\033[95m"
     red = "\033[91m"
     end = "\033[0m"
 
@@ -30,6 +32,12 @@
     return color.value + text + Color.end.value
 
 
+class Status(Enum):
+    unset = 0
+    set = 1
+    queried = 2
+
+
 class Tree:
     """Representation of a file system structure
     """
@@ -40,23 +48,26 @@
         self.otype = DIRECTORY if path.is_dir() else CONTENT
         self.swhid = ""
         self.known = False
+        self.status = Status.unset
         self.children: Dict[Path, Tree] = {}
 
-    def add_node(self, path: Path, swhid: str, known: bool) -> None:
+    def __len__(self):
+        return sum(1 for node in self.iterate()) + 1  # the root node
+
+    def add_node(self, path: Path, swhid: str) -> None:
         """Recursively add a new path.
         """
         relative_path = path.relative_to(self.path)
 
         if relative_path == Path("."):
             self.swhid = swhid
-            self.known = known
             return
 
         new_path = self.path.joinpath(relative_path.parts[0])
         if new_path not in self.children:
             self.children[new_path] = Tree(new_path, self)
 
-        self.children[new_path].add_node(path, swhid, known)
+        self.children[new_path].add_node(path, swhid)
 
     def show(self, fmt) -> None:
         """Show tree in different formats"""
@@ -90,15 +101,28 @@
             end = "/" if node.otype == DIRECTORY else ""
 
             if isatty:
-                if not node.known:
-                    rel_path = colorize(rel_path, Color.red)
-                elif node.otype == DIRECTORY:
+                if node.status == Status.unset:
+                    rel_path = colorize(rel_path, Color.magenta)
+                elif node.status == Status.set and not node.known:
+                    rel_path = colorize(rel_path, Color.yellow)
+                elif node.status == Status.set and node.known:
                     rel_path = colorize(rel_path, Color.blue)
-                elif node.otype == CONTENT:
+                elif node.status == Status.queried and not node.known:
+                    rel_path = colorize(rel_path, Color.red)
+                elif node.status == Status.queried and node.known:
                     rel_path = colorize(rel_path, Color.green)
 
             print(f"{begin}{rel_path}{end}")
 
+    @property
+    def known(self):
+        return self._known
+
+    @known.setter
+    def known(self, value: bool):
+        self._known = value
+        self.status = Status.set
+
     @property
     def attributes(self) -> Dict[str, Dict[str, Any]]:
         """
@@ -158,6 +182,16 @@
             if child_node.otype == DIRECTORY:
                 yield from child_node.iterate()
 
+    def iterate_bfs(self) -> Iterator[Tree]:
+        """Get nodes in BFS order
+        """
+        # a list keeps the traversal deterministic (a set would not)
+        nodes = list(self.children.values())
+        for node in nodes:
+            yield node
+        for node in nodes:
+            if node.otype == DIRECTORY:
+                yield from node.iterate_bfs()
+
     def get_files_from_dir(self, dir_path: Path) -> List:
         """
         Retrieve files information about a specific directory path
@@ -249,3 +283,9 @@
             if child_node.otype == DIRECTORY:
                 return True
         return False
+
+    def has_contents(self) -> bool:
+        for _, child_node in self.children.items():
+            if child_node.otype == CONTENT:
+                return True
+        return False