diff --git a/benchmark.py b/benchmark.py --- a/benchmark.py +++ b/benchmark.py @@ -11,6 +11,7 @@ import shutil import subprocess import sys +from tempfile import TemporaryDirectory from typing import Set import click @@ -118,15 +119,16 @@ format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", ) - try: - subprocess.run( - ["tar", "xvf", repo_path, "-C", temp_path], - check=True, - stdout=subprocess.DEVNULL, - stderr=sys.stderr, - ) - run_experiments(repo_path, temp_path, kb_state, set(algos)) + repo_id = Path(repo_path).parts[-1].split(".")[0] + with TemporaryDirectory(prefix=repo_id + "_", dir=temp_path) as tmp_dir: + subprocess.run( + ["tar", "xf", repo_path, "-C", tmp_dir, "--strip-components=1"], + check=True, + stdout=subprocess.DEVNULL, + stderr=sys.stderr, + ) + run_experiments(repo_path, temp_path, kb_state, set(algos)) except Exception as e: logging.exception(e) except IOError as ioerror: diff --git a/run_benchmark.sh b/run_benchmark.sh --- a/run_benchmark.sh +++ b/run_benchmark.sh @@ -28,6 +28,9 @@ algos="$algos -a $i" done +# print headers +echo "repo_id,origin,commit_id,kb_state,repo_size,algorithm_name,kb_queries,swhids_queried" + while IFS= read -r repo; do ./benchmark.py $repo $temp_dir $kb_state $algos diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py --- a/swh/scanner/benchmark_algos.py +++ b/swh/scanner/benchmark_algos.py @@ -293,7 +293,7 @@ """ def _scan(root_path, source_tree, sre_patterns): - dirpath, dnames, fnames = next(os.walk(root_path)) + dirpath, dnames, fnames = next(os.walk(root_path, followlinks=False)) dirpath = Path(dirpath) if fnames: @@ -336,7 +336,8 @@ reg_obj for reg_obj in extract_regex_objs(Path(root), exclude_patterns) } - repo_id = Path(root).parts[-1] + # temporary directory prefix + repo_id = Path(root).parts[-1].split("_")[0] counter: collections.Counter = collections.Counter() counter["api_calls"] = 0 counter["queries"] = 0 @@ -356,9 +357,10 @@ backend_name, len(source_tree), algo, + -1, min_queries, ) - print(min_result) + print(*min_result, sep=",") return elif algo == "stopngo": stopngo(source_tree, api_url, counter) @@ -380,4 +382,4 @@ counter["queries"], ) - print(result) + print(*result, sep=",")