diff --git a/benchmark.py b/benchmark.py
new file mode 100755
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import logging
+import os
+from pathlib import Path
+import subprocess
+import sys
+from tempfile import TemporaryDirectory
+from typing import Set
+
+import click
+
+# seed options for the "random" algorithm; each entry is a list of argv tokens
+SEED_OPTIONS = [["-s", "10"]]
+
+
+def get_scenario_cmd(algo, kb_url, kb_label, origin_info, extracted_repo_path):
+    return [
+        "swh",
+        "scanner",
+        "benchmark",
+        "--algo",
+        algo,
+        "--api-url",
+        kb_url,
+        "--backend-name",
+        kb_label,
+        "--origin",
+        origin_info["origin"],
+        "--commit",
+        origin_info["commit"],
+        "--exclude",
+        str(extracted_repo_path) + "/.git",
+        str(extracted_repo_path),
+    ]
+
+
+def run_experiments(
+    repo_path: str, temp_path: str, kb_state_file: str, algos: Set[str]
+):
+    """Spawn one process per experiment; an experiment is composed of: the
+    repository to scan, an algorithm to test and one of the known-backends
+    mapped in the "kb-state" file (given as input)
+    """
+    # the unpacked repository is expected to be the only directory in temp_path
+    dirpath, dnames, _ = next(os.walk(temp_path))
+    extracted_repo_path = Path(dirpath).joinpath(dnames[0])
+
+    # get all the backend identifiers and api URLs
+    backends = {}
+    with open(kb_state_file, "r") as kb_state_f:
+        for kb in kb_state_f.readlines():
+            if kb.startswith("#"):
+                continue
+            elems = kb.split()
+            backends[elems[0]] = elems[1]
+
+    # get the repository origin info: "<name>.tar.zst" is expected to live
+    # next to a "<name>.info.json" file
+    info_path = repo_path[:-7] + "info.json"
+    with open(info_path, "r") as json_file:
+        origin_info = json.load(json_file)
+
+    scenario_cmds = []
+
+    for algo in algos:
+        for kb_label, kb_url in backends.items():
+            if algo == "random":
+                for seed_opt in SEED_OPTIONS:
+                    random_cmd = get_scenario_cmd(
+                        algo, kb_url, kb_label, origin_info, str(extracted_repo_path)
+                    )
+                    scenario_cmds.append(random_cmd + seed_opt)
+            else:
+                scenario_cmds.append(
+                    get_scenario_cmd(
+                        algo, kb_url, kb_label, origin_info, str(extracted_repo_path)
+                    )
+                )
+
+    processes = [
+        subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
+        for cmd in scenario_cmds
+    ]
+
+    for proc in processes:
+        proc.wait()
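+
+
+# The "<name>.info.json" file is expected to provide at least the "origin"
+# and "commit" keys consumed by get_scenario_cmd; an illustrative example
+# (the values are hypothetical):
+#
+#   {"origin": "https://github.com/user/repo", "commit": "<40-hex sha1>"}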
+
+
+@click.command(
+    help="""Run multiple benchmarks from an input repository. The repository
+    will be unpacked in the provided temporary path and tested with
+    the input algorithms."""
+)
+@click.argument("repo_path", type=click.Path(exists=True), required=True)
+@click.argument("temp_path", type=click.Path(exists=True), required=True)
+@click.argument("kb_state", type=click.Path(exists=True), required=True)
+@click.option(
+    "-a",
+    "--algo",
+    "algos",
+    multiple=True,
+    required=True,
+    type=click.Choice(
+        ["stopngo", "file_priority", "directory_priority", "random", "algo_min"],
+        case_sensitive=False,
+    ),
+    metavar="ALGORITHM_NAME",
+    help="The algorithm name for the benchmark.",
+)
+def main(repo_path, temp_path, kb_state, algos):
+    logging.basicConfig(
+        filename="experiments.log",
+        format="%(asctime)s %(message)s",
+        datefmt="%m/%d/%Y %I:%M:%S %p",
+    )
+    try:
+        repo_id = Path(repo_path).parts[-1].split(".")[0]
+        with TemporaryDirectory(prefix=repo_id + "_", dir=temp_path) as tmp_dir:
+            subprocess.run(
+                ["tar", "xf", repo_path, "-C", tmp_dir, "--strip-components=1"],
+                check=True,
+                stdout=subprocess.DEVNULL,
+                stderr=sys.stderr,
+            )
+            run_experiments(repo_path, temp_path, kb_state, set(algos))
+    except IOError as ioerror:
+        logging.exception(ioerror)
+    except Exception as e:
+        logging.exception(e)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/run_backend.sh b/run_backend.sh
new file mode 100755
--- /dev/null
+++ b/run_backend.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+# This script runs multiple scanner backends using the backend information
+# provided on stdin.
+
+while IFS= read -r line;
+do
+    kb_info=($line)
+    if [[ ${kb_info[0]} = "#" ]]
+    then
+        continue
+    else
+        # ${kb_info[1]:7:14} strips the leading "http://" and keeps host:port
+        # (this assumes a 14-character host:port, e.g. 127.0.0.1:5001)
+        gunicorn "-b" ${kb_info[1]:7:14} 'swh.scanner.backend:create_app("'${kb_info[2]}'")' &
+    fi
+
+done
diff --git a/run_benchmark.sh b/run_benchmark.sh
new file mode 100755
--- /dev/null
+++ b/run_benchmark.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# This script runs the benchmarks for the repositories read from stdin.
+# You should provide:
+# - a temporary directory (where the repositories will be unpacked)
+# - a backend mapping file (kb_state) containing: the backend name, the
+#   backend api URL and the sqlite db file
+# - the algorithms to be executed
+#
+# USAGE EXAMPLE:
+# find /repositories/dir -name '*.tar.zst' | ./run_benchmark.sh /temporary/dir kb_state.txt stopngo file_priority
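+#
+# The kb_state file is expected to map one backend per line. The format below
+# is inferred from how benchmark.py and run_backend.sh parse it (backend
+# name, API URL, sqlite db path; lines starting with "#" are skipped and the
+# values are illustrative):
+#
+#   kb1 http://127.0.0.1:5001/api/1/ /path/to/kb1.sqlite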
-d "$temp_dir" ]; then + echo "You should provide a valid temporary directory path" + exit 1 +fi + +if [ "$kb_state" == '' ]; then + echo "You should provide the file with mapped knowledge bases" + exit 1 +fi + +for i in "${@:3}"; do + algos="$algos -a $i" +done + +# print headers +echo "repo_id,origin,commit_id,kb_state,repo_size,algorithm_name,kb_queries,swhids_queried" + +while IFS= read -r repo; +do + ./benchmark.py $repo $temp_dir $kb_state $algos +done diff --git a/swh/scanner/backend.py b/swh/scanner/backend.py --- a/swh/scanner/backend.py +++ b/swh/scanner/backend.py @@ -3,6 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from pathlib import Path + from flask import Flask, request from .db import Db @@ -11,10 +13,11 @@ LIMIT = 1000 -def create_app(db: Db): +def create_app(db_file: str): """Backend for swh-scanner, implementing the /known endpoint of the Software Heritage Web API""" app = Flask(__name__) + db = Db(Path(db_file)) @app.route("/api/1/known/", methods=["POST"]) def known(): @@ -34,8 +37,13 @@ return app -def run(host: str, port: int, db: Db): +def run(host: str, port: int, db_file: str): """Serve the local database """ - app = create_app(db) - app.run(host, port, debug=True) + # from .db import Db + + # db = Db(db_file) + # app = create_app(db) + # app.run(host, port, debug=False) + # db.close() + pass diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py new file mode 100644 --- /dev/null +++ b/swh/scanner/benchmark_algos.py @@ -0,0 +1,396 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import collections +import itertools +import json +import os +from pathlib import Path +import random +from typing import Dict, Iterable, List, Optional + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +from swh.model.from_disk import Content, Directory, accept_all_directories +from swh.model.identifiers import CONTENT, DIRECTORY, swhid + +from .exceptions import APIError +from .model import Status, Tree +from .scanner import directory_filter, extract_regex_objs + +session = requests.Session() +retries_rule = Retry(total=5, backoff_factor=1) +session.mount("http://", HTTPAdapter(max_retries=retries_rule)) + + +def query_swhids( + swhids: List[Tree], api_url: str, counter: Optional[collections.Counter] = None +) -> Dict[str, Dict[str, bool]]: + """ + Returns: + A dictionary with: + key(str): persistent identifier + value(dict): + value['known'] = True if pid is found + value['known'] = False if pid is not found + """ + endpoint = api_url + "known/" + chunk_size = 1000 + + if counter: + counter["queries"] += len(swhids) + + def make_request(swhids): + swhids = [swhid.swhid for swhid in swhids] + req = session.post(endpoint, json=swhids) + if req.status_code != 200: + error_message = "%s with given values %s" % (req.text, str(swhids)) + raise APIError(error_message) + if counter: + counter["api_calls"] += 1 + resp = req.text + return json.loads(resp) + + def get_chunk(swhids): + for i in range(0, len(swhids), chunk_size): + yield swhids[i : i + chunk_size] + + if len(swhids) > chunk_size: + return dict( + itertools.chain.from_iterable( + make_request(swhids_chunk).items() for swhids_chunk in get_chunk(swhids) + ) + 
+
+
+def stopngo(source_tree: Tree, api_url: str, counter: collections.Counter):
+    nodes = []
+    nodes.append(source_tree)
+
+    while len(nodes) > 0:
+        parsed_nodes = query_swhids(nodes, api_url, counter)
+        for node in nodes.copy():
+            nodes.remove(node)
+            node.known = parsed_nodes[node.swhid]["known"]
+            node.status = Status.queried
+            if node.otype == DIRECTORY:
+                if not node.known:
+                    nodes.extend(list(node.children.values()))
+                else:
+                    set_children_status(node, [CONTENT, DIRECTORY], True)
+
+
+def set_father_status(node, known):
+    """
+    Recursively set the known status of the unset ancestors of a given node
+    """
+    parent = node.father
+
+    if parent is None:
+        return
+    if parent.status != Status.unset:
+        return
+
+    parent.known = known
+    set_father_status(parent, known)
+
+
+def set_children_status(
+    node: Tree, node_types: Iterable[str], known: bool, status: Status = Status.unset
+):
+    """
+    Recursively set the status of the children of the provided node
+    """
+    for child_node in node.iterate():
+        if child_node.otype in node_types and child_node.status == status:
+            child_node.known = known
+
+
+def file_priority(source_tree: Tree, api_url: str, counter: collections.Counter):
+    # get all the files
+    all_contents = list(
+        filter(lambda node: node.otype == CONTENT, source_tree.iterate_bfs())
+    )
+    all_contents.reverse()  # we check nodes from the deepest
+
+    # query the backend to get the status of all file contents
+    parsed_contents = query_swhids(all_contents, api_url, counter)
+    # set the status of all file contents
+    for cnt in all_contents:
+        cnt.known = parsed_contents[cnt.swhid]["known"]
+        cnt.status = Status.queried
+        # set all the upstream directories of unknown file contents to unknown
+        if not cnt.known:
+            set_father_status(cnt, False)
+
+    # get all unset directories and check their status
+    # (update children directories accordingly)
+    unset_dirs = list(
+        filter(
+            lambda node: node.otype == DIRECTORY and node.status == Status.unset,
+            source_tree.iterate(),
+        )
+    )
+
+    if source_tree.status == Status.unset:
+        unset_dirs.append(source_tree)
+
+    # check the unset directories
+    for dir_ in unset_dirs:
+        if dir_.status == Status.unset:
+            # update the directory status
+            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
+            dir_.status = Status.queried
+            if dir_.known:
+                set_children_status(dir_, [DIRECTORY], True)
+
+
+def directory_priority(source_tree: Tree, api_url: str, counter: collections.Counter):
+    # get all the directories that have at least one file content
+    unset_dirs = list(
+        filter(
+            lambda dir_: dir_.otype == DIRECTORY and dir_.has_contents,
+            source_tree.iterate_bfs(),
+        )
+    )
+    unset_dirs.reverse()
+
+    for dir_ in unset_dirs:
+        # if the directory is known, set all the downstream file contents to known
+        if dir_.status == Status.unset:
+            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
+            dir_.status = Status.queried
+            if dir_.known:
+                set_children_status(dir_, [CONTENT], True)
+            else:
+                set_father_status(dir_, False)
+
+    # get the remaining directories that have no file contents
+    unset_dirs_no_cnts = list(
+        filter(
+            lambda node: node.otype == DIRECTORY and not node.has_contents,
+            source_tree.iterate_bfs(),
+        )
+    )
+    parsed_dirs_no_cnts = query_swhids(unset_dirs_no_cnts, api_url, counter)
+
+    # update the status of the directories that have no file contents
+    for dir_ in unset_dirs_no_cnts:
+        dir_.known = parsed_dirs_no_cnts[dir_.swhid]["known"]
+        dir_.status = Status.queried
+
+    # check the remaining unset file contents
+    unset_files = list(
+        filter(
+            lambda node: node.otype == CONTENT and node.status == Status.unset,
+            source_tree.iterate(),
+        )
+    )
+    parsed_unset_files = query_swhids(unset_files, api_url, counter)
+
+    for file_ in unset_files:
+        file_.known = parsed_unset_files[file_.swhid]["known"]
+        file_.status = Status.queried
+
+
+def random_(
+    source_tree: Tree,
+    api_url: str,
+    counter: collections.Counter,
+    seed: Optional[int] = None,
+):
+    if seed is not None:
+        random.seed(seed)
+    # get all directory/file contents
+    all_nodes = [node for node in source_tree.iterate()] + [source_tree]
+    # shuffle the contents
+    random.shuffle(all_nodes)
+
+    while len(all_nodes):
+        node = all_nodes.pop()
+
+        if node.status != Status.unset:
+            continue
+
+        node.known = query_swhids([node], api_url, counter)[node.swhid]["known"]
+        node.status = Status.queried
+        if node.otype == DIRECTORY and node.known:
+            for child_node in node.iterate():
+                child_node.known = True
+        elif node.otype == CONTENT and not node.known:
+            set_father_status(node, False)
+
+
+def algo_min(source_tree: Tree, api_url: str):
+    """
+    Compute the minimal number of queries, assuming the known/unknown status
+    of every node is already available
+    """
+
+    def remove_parents(node, nodes):
+        parent = node.father
+        if parent is None or parent not in nodes:
+            return
+        else:
+            nodes.remove(parent)
+            remove_parents(parent, nodes)
+
+    def remove_children(node, nodes):
+        for child_node in node.iterate():
+            if child_node in nodes:
+                nodes.remove(child_node)
+
+    all_nodes = [node for node in source_tree.iterate_bfs()]
+
+    parsed_nodes = query_swhids(all_nodes, api_url)
+    for node in all_nodes:
+        node.known = parsed_nodes[node.swhid]["known"]
+
+    all_nodes_copy = all_nodes.copy()
+
+    for node in all_nodes:
+        if node.otype == CONTENT and not node.known:
+            remove_parents(node, all_nodes_copy)
+
+    all_nodes.reverse()
+    for node in all_nodes:
+        if node.otype == DIRECTORY and not node.known:
+            remove_parents(node, all_nodes_copy)
+
+    # iterate over a snapshot, since remove_children mutates all_nodes_copy
+    for node in all_nodes_copy.copy():
+        if node.otype == DIRECTORY and node.known:
+            remove_children(node, all_nodes_copy)
+
+    return len(all_nodes_copy)
+
+
+def get_swhids(paths: Iterable[Path], exclude_patterns):
+    def swhid_of(path):
+        if path.is_dir():
+            if exclude_patterns:
+
+                def dir_filter(dirpath, *args):
+                    return directory_filter(dirpath, exclude_patterns)
+
+            else:
+                dir_filter = accept_all_directories
+
+            obj = Directory.from_disk(
+                path=bytes(path), dir_filter=dir_filter
+            ).get_data()
+
+            return swhid(DIRECTORY, obj)
+        else:
+            obj = Content.from_file(path=bytes(path)).get_data()
+            return swhid(CONTENT, obj)
+
+    for path in paths:
+        yield str(path), swhid_of(path)
+
+
+def load_source(root, sre_patterns):
+    """
+    Load the source code inside the Tree data structure
+    """
+
+    def _scan(root_path, source_tree, sre_patterns):
+        files = []
+        dirs = []
+        for elem in os.listdir(root_path):
+            cnt = Path(root_path).joinpath(elem)
+            if not os.path.islink(cnt):
+                if os.path.isfile(cnt):
+                    files.append(cnt)
+                elif os.path.isdir(cnt):
+                    dirs.append(cnt)
+
+        if files:
+            parsed_file_swhids = dict(get_swhids(files, sre_patterns))
+
+            for path, swhid_ in parsed_file_swhids.items():
+                source_tree.add_node(Path(path), swhid_)
+
+        if dirs:
+            parsed_dirs_swhids = dict(get_swhids(dirs, sre_patterns))
+
+            for path, swhid_ in parsed_dirs_swhids.items():
+                if not directory_filter(path, sre_patterns):
+                    continue
+                source_tree.add_node(Path(path), swhid_)
+                _scan(path, source_tree, sre_patterns)
+
+    source_tree = Tree(root)
+    root_swhid = dict(get_swhids([root], sre_patterns))
+    source_tree.swhid = root_swhid[str(root)]
+    _scan(root, source_tree, sre_patterns)
+    return source_tree
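+
+
+# Each call to run() prints a single CSV row matching the header echoed by
+# run_benchmark.sh (the example values are illustrative):
+#
+#   repo_id,origin,commit_id,kb_state,repo_size,algorithm_name,kb_queries,swhids_queried
+#   repo1,https://github.com/user/repo,<sha1>,kb1,1543,stopngo,12,876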
+
+
+def run(
+    root: str,
+    api_url: str,
+    backend_name: str,
+    exclude_patterns: Iterable[str],
+    algo: str,
+    origin: str,
+    commit: str,
+    seed: Optional[int] = None,
+):
+    sre_patterns = set()
+    if exclude_patterns:
+        sre_patterns = {
+            reg_obj for reg_obj in extract_regex_objs(Path(root), exclude_patterns)
+        }
+
+    # strip the temporary-directory suffix to recover the repository name
+    repo_id = Path(root).parts[-1].split("_")[0]
+    counter: collections.Counter = collections.Counter()
+    counter["api_calls"] = 0
+    counter["queries"] = 0
+    source_tree = load_source(Path(root), sre_patterns)
+
+    if algo == "random":
+        random_(source_tree, api_url, counter, seed)
+    elif algo == "algo_min":
+        min_queries = algo_min(source_tree, api_url)
+        min_result = (
+            repo_id,
+            origin,
+            commit,
+            backend_name,
+            len(source_tree),
+            algo,
+            -1,
+            min_queries,
+        )
+        print(*min_result, sep=",")
+        return
+    elif algo == "stopngo":
+        stopngo(source_tree, api_url, counter)
+    elif algo == "file_priority":
+        file_priority(source_tree, api_url, counter)
+    elif algo == "directory_priority":
+        directory_priority(source_tree, api_url, counter)
+    else:
+        raise Exception(f'Algorithm "{algo}" not found')
+
+    result = (
+        repo_id,
+        origin,
+        commit,
+        backend_name,
+        len(source_tree),
+        algo,
+        counter["api_calls"],
+        counter["queries"],
+    )
+
+    print(*result, sep=",")
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -234,6 +234,79 @@
     db.close()
 
 
+@scanner.command()
+@click.argument("root_path", required=True, type=click.Path(exists=True))
+@click.option(
+    "-u",
+    "--api-url",
+    default=None,
+    metavar="API_URL",
+    show_default=True,
+    required=True,
+    help="URL for the api request.",
+)
+@click.option(
+    "-n",
+    "--backend-name",
+    default=None,
+    metavar="BACKEND_NAME",
+    show_default=True,
+    required=True,
+    help="The backend name.",
+)
+@click.option(
+    "--origin",
+    "-o",
+    "origin_url",
+    metavar="ORIGIN_URL",
+    required=True,
+    help="Repository origin url.",
+)
+@click.option(
+    "--commit", "-c", metavar="COMMIT", required=True, help="Commit identifier.",
+)
+@click.option(
+    "--exclude",
+    "-x",
+    "patterns",
+    metavar="PATTERN",
+    multiple=True,
+    show_default=True,
+    help="Exclude directories using glob patterns \
+    (e.g., '*.git' to exclude all .git directories).",
+)
+@click.option(
+    "--algo", "-a", metavar="ALGO_NAME", required=True, help="Algorithm name.",
+)
+@click.option(
+    "--seed", "-s", metavar="SEED", type=int, help="Seed for the random algorithm."
+)
+@click.pass_context
+def benchmark(
+    ctx, root_path, api_url, backend_name, origin_url, commit, patterns, algo, seed
+):
+    """Run a single scanner benchmark experiment."""
+    from importlib import reload
+    import logging
+
+    from swh.scanner.benchmark_algos import run
+
+    # reload the logging module to avoid conflicts with benchmark.py's logging
+    reload(logging)
+    logging.basicConfig(
+        filename="experiments.log",
+        format="%(asctime)s %(message)s",
+        datefmt="%m/%d/%Y %I:%M:%S %p",
+    )
+
+    try:
+        run(root_path, api_url, backend_name, patterns, algo, origin_url, commit, seed)
+    except Exception as e:
+        logging.exception(
+            f'Repository: "{root_path}" using "{algo}" '
+            f'algorithm on "{api_url}" FAILED: {e}'
+        )
+
+
 def main():
     return scanner(auto_envvar_prefix="SWH_SCANNER")
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -22,6 +22,8 @@
 class Color(Enum):
     blue = "\033[94m"
     green = "\033[92m"
+    yellow = "\033[93m"
+    magenta = "\033[95m"
     red = "\033[91m"
     end = "\033[0m"
 
@@ -30,6 +32,12 @@
     return color.value + text + Color.end.value
 
 
+class Status(Enum):
+    unset = 0
+    set = 1
+    queried = 2
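+
+# Status lifecycle (illustrative; inferred from the Tree.known setter added
+# further below rather than spelled out here):
+#   node.status == Status.unset      the node was never inspected
+#   node.known = <bool>              flips status to Status.set (inferred value)
+#   node.status = Status.queried     set explicitly after a backend query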
"\033[93m" + magenta = "\033[95m" red = "\033[91m" end = "\033[0m" @@ -30,6 +32,12 @@ return color.value + text + Color.end.value +class Status(Enum): + unset = 0 + set = 1 + queried = 2 + + class Tree: """Representation of a file system structure """ @@ -40,23 +48,26 @@ self.otype = DIRECTORY if path.is_dir() else CONTENT self.swhid = "" self.known = False + self.status = Status.unset self.children: Dict[Path, Tree] = {} - def add_node(self, path: Path, swhid: str, known: bool) -> None: + def __len__(self): + return sum(1 for node in self.iterate()) + 1 # the root node + + def add_node(self, path: Path, swhid: str) -> None: """Recursively add a new path. """ relative_path = path.relative_to(self.path) if relative_path == Path("."): self.swhid = swhid - self.known = known return new_path = self.path.joinpath(relative_path.parts[0]) if new_path not in self.children: self.children[new_path] = Tree(new_path, self) - self.children[new_path].add_node(path, swhid, known) + self.children[new_path].add_node(path, swhid) def show(self, fmt) -> None: """Show tree in different formats""" @@ -90,15 +101,28 @@ end = "/" if node.otype == DIRECTORY else "" if isatty: - if not node.known: - rel_path = colorize(rel_path, Color.red) - elif node.otype == DIRECTORY: + if node.status == Status.unset: + rel_path = colorize(rel_path, Color.magenta) + elif node.status == Status.set and not node.known: + rel_path = colorize(rel_path, Color.yellow) + elif node.status == Status.set and node.known: rel_path = colorize(rel_path, Color.blue) - elif node.otype == CONTENT: + elif node.status == Status.queried and not node.known: + rel_path = colorize(rel_path, Color.red) + elif node.status == Status.queried and node.known: rel_path = colorize(rel_path, Color.green) print(f"{begin}{rel_path}{end}") + @property + def known(self): + return self._known + + @known.setter + def known(self, value: bool): + self._known = value + self.status = Status.set + @property def attributes(self) -> Dict[str, Dict[str, Any]]: """ @@ -158,6 +182,19 @@ if child_node.otype == DIRECTORY: yield from child_node.iterate() + def iterate_bfs(self) -> Iterator[Tree]: + """Get nodes in BFS order + """ + nodes = [] + nodes.append(self) + + while len(nodes) > 0: + for node in nodes.copy(): + yield node + nodes.remove(node) + if node.otype == DIRECTORY: + nodes.extend(list(node.children.values())) + def get_files_from_dir(self, dir_path: Path) -> List: """ Retrieve files information about a specific directory path @@ -249,3 +286,9 @@ if child_node.otype == DIRECTORY: return True return False + + def has_contents(self) -> bool: + for _, child_node in self.children.items(): + if child_node.otype == CONTENT: + return True + return False