diff --git a/benchmark.py b/benchmark.py
new file mode 100755
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from pathlib import Path
+import shutil
+from subprocess import DEVNULL, Popen, call
+import sys
+
+import click
+
+import swh.scanner.logger as log
+
+# algorithms available in 'swh benchmark' command options
+ALGOS = [
+    "stopngo",
+    "file_priority",
+    "directory_priority",
+    "random",
+    "algo_min",
+]
+
+
+def parse_repo(temp_path, port_range):
+    # The extracted archive is expected to contain a single top-level
+    # repository directory; benchmark every algorithm against every backend.
+    dirpath, dnames, _ = next(os.walk(temp_path))
+    extracted_repo_path = Path(dirpath).joinpath(dnames[0])
+
+    # NOTE(review): range() excludes the end port — "5000:5001" targets only
+    # port 5000; confirm this matches how run_backend.sh assigns ports.
+    start, end = port_range.split(":")
+    api_urls = [
+        f"http://localhost:{int(port)}/api/1/" for port in range(int(start), int(end))
+    ]
+
+    scenario_cmds = [
+        [
+            "swh",
+            "scanner",
+            "benchmark",
+            "-a",
+            algo,
+            "-u",
+            api_url,
+            "-x",
+            str(extracted_repo_path) + "/.git",
+            str(extracted_repo_path),
+        ]
+        for algo in ALGOS
+        for api_url in api_urls
+    ]
+
+    processes = [
+        Popen(cmd, stdout=sys.stdout, stderr=sys.stderr) for cmd in scenario_cmds
+    ]
+
+    for proc in processes:
+        proc.wait()
+
+    shutil.rmtree(extracted_repo_path)
+
+
+@click.command(
+    help="""Run multiple benchmarks from an input archive. The repository
+    will be unpacked in the provided temporary path. The port range
+    should be specified in order to run the benchmark using different
+    backends"""
+)
+@click.argument("repo_path", type=click.Path(exists=True), required=True)
+@click.argument("temp_path", type=click.Path(exists=True), required=True)
+@click.option(
+    "--port-range",
+    "-p",
+    metavar="PORT_RANGE",
+    show_default=True,
+    required=True,
+    help="The backend port range (e.g., 5000:5001)",
+)
+def main(repo_path, temp_path, port_range):
+    log.setup_logger()
+    try:
+        exit_code = call(["tar", "xvf", repo_path, "-C", temp_path], stdout=DEVNULL)
+        if exit_code == 0:
+            parse_repo(temp_path, port_range)
+        else:
+            raise IOError(
+                'Decompression of repo "%s" exited with code: %d'
+                % (repo_path, exit_code)
+            )
+    # IOError (== OSError) is a subclass of Exception, so the specific
+    # handler must come first or it is unreachable.
+    except IOError as ioerror:
+        log.error(ioerror)
+    except Exception as e:
+        log.error(e)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -28,3 +28,6 @@
 
 [mypy-plotly.*]
 ignore_missing_imports = True
+
+[mypy-git.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,5 @@
 dash
 dash_bootstrap_components
 flask
+GitPython
 dulwich
diff --git a/run_backend.sh b/run_backend.sh
new file mode 100755
--- /dev/null
+++ b/run_backend.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+# This script simply runs multiple scanner backends using the files provided from stdin.
+
+start_port=5000
+
+while IFS= read -r swhids_file;
+do
+    gunicorn "-b" 127.0.0.1:$start_port 'swh.scanner.backend:create_app("'$swhids_file'")' &
+    ((start_port++))
+done
diff --git a/run_benchmark.sh b/run_benchmark.sh
new file mode 100755
--- /dev/null
+++ b/run_benchmark.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# This script is used to benchmark multiple repositories (taken from stdin).
+# A temporary directory and a port range should be provided.
+
+temp_dir=$1
+port_range=$2
+
+if [ ! -d "$temp_dir" ]; then
+    echo "The provided temporary directory does not exist"
+    exit 1
+fi
+
+if [ "$port_range" == '' ]; then
+    echo "You should provide a valid port range (e.g., 5000:5003)"
+    exit 1
+fi
+
+while IFS= read -r repo;
+do
+    ./benchmark.py "$repo" "$temp_dir" -p "$port_range"
+done
diff --git a/swh/scanner/backend.py b/swh/scanner/backend.py
--- a/swh/scanner/backend.py
+++ b/swh/scanner/backend.py
@@ -3,6 +3,8 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from pathlib import Path
+
 from flask import Flask, request
 
 from .db import Db
@@ -11,10 +13,11 @@
 
 LIMIT = 1000
 
 
-def create_app(db: Db):
+def create_app(db_file: str):
     """Backend for swh-scanner, implementing the /known endpoint of the
     Software Heritage Web API"""
     app = Flask(__name__)
+    db = Db(Path(db_file))
 
     @app.route("/api/1/known/", methods=["POST"])
@@ -34,8 +37,13 @@
     return app
 
 
-def run(host: str, port: int, db: Db):
+def run(host: str, port: int, db_file: str):
     """Serve the local database
     """
-    app = create_app(db)
-    app.run(host, port, debug=True)
+    # from .db import Db
+
+    # db = Db(db_file)
+    # app = create_app(db)
+    # app.run(host, port, debug=False)
+    # db.close()
+    pass
diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/benchmark_algos.py
@@ -0,0 +1,385 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import itertools
+import json
+import os
+from pathlib import Path
+import random
+from typing import Dict, Iterable, List, Optional
+
+from git import Repo
+import requests
+
+from swh.model.from_disk import Content, Directory, accept_all_directories
+from swh.model.identifiers import CONTENT, DIRECTORY, swhid
+
+from .exceptions import APIError
+from .logger import error
+from .model import Status, Tree
+from .scanner import directory_filter, extract_regex_objs
+
+
+class Counter:
+    """Class to store the number of api calls and queries made to the
+    backend during the discovery process of the various algorithms.
+    """
+
+    def __init__(self):
+        self.api_count: int = 0
+        self.query_count: int = 0
+
+    def add_api(self, n: int):
+        self.api_count += n
+
+    def add_query(self, n: int):
+        self.query_count += n
+
+    def get_api(self):
+        return self.api_count
+
+    def get_query(self):
+        return self.query_count
+
+
+def query_swhids(
+    swhids: List[Tree], api_url: str, counter: Optional[Counter] = None
+) -> Dict[str, Dict[str, bool]]:
+    """
+    Returns:
+        A dictionary with:
+        key(str): persistent identifier
+        value(dict):
+            value['known'] = True if pid is found
+            value['known'] = False if pid is not found
+    """
+    endpoint = api_url + "known/"
+    chunk_size = 1000
+
+    if counter:
+        counter.add_query(len(swhids))
+
+    def make_request(swhids):
+        # do not shadow the imported swhid() identifier function
+        swhids = [tree_node.swhid for tree_node in swhids]
+        req = requests.post(endpoint, json=swhids)
+        if req.status_code != 200:
+            error_message = "%s with given values %s" % (req.text, str(swhids))
+            error(error_message)
+            raise APIError(error_message)
+        if counter:
+            counter.add_api(1)
+        resp = req.text
+        return json.loads(resp)
+
+    def get_chunk(swhids):
+        for i in range(0, len(swhids), chunk_size):
+            yield swhids[i : i + chunk_size]
+
+    if len(swhids) > chunk_size:
+        return dict(
+            itertools.chain.from_iterable(
+                make_request(swhids_chunk).items() for swhids_chunk in get_chunk(swhids)
+            )
+        )
+    else:
+        return make_request(swhids)
+
+
+def stopngo(source_tree: Tree, api_url: str, counter: Counter):
+    def set_children_known(node):
+        for child_node in node.iterate():
+            child_node.known = True
+
+    nodes = []
+    nodes.append(source_tree)
+
+    while len(nodes) > 0:
+        parsed_nodes = query_swhids(nodes, api_url, counter)
+        for node in nodes.copy():
+            nodes.remove(node)
+            node.known = parsed_nodes[node.swhid]["known"]
+            node.status = Status.queried
+            if node.otype == DIRECTORY:
+                if not node.known:
+                    nodes.extend(list(node.children.values()))
+                else:
+                    set_children_known(node)
+
+
+def set_father_status(node, known):
+    """
+    Recursively change father known and visited status of a given node
+    """
+    parent = node.father
+
+    if parent is None:
+        return
+    if parent.status != Status.unset:
+        return
+
+    parent.known = known
+    set_father_status(parent, known)
+
+
+def set_children_status(node, node_type, known, status: Status = Status.unset):
+    """
+    Set the known status of every descendant of the given type/status
+    """
+    for child_node in node.iterate():
+        if child_node.otype == node_type and child_node.status == status:
+            child_node.known = known
+
+
+def file_priority(source_tree: Tree, api_url: str, counter: Counter):
+    # get all the files
+    all_contents = list(
+        filter(lambda node: node.otype == CONTENT, source_tree.iterate_bfs())
+    )
+    all_contents.reverse()  # we check nodes from the deepest
+
+    # query the backend to get all file contents status
+    parsed_contents = query_swhids(all_contents, api_url, counter)
+    # set all the file contents status
+    for cnt in all_contents:
+        cnt.known = parsed_contents[cnt.swhid]["known"]
+        cnt.status = Status.queried
+        # set all the upstream directories of unknown file contents to unknown
+        if not cnt.known:
+            set_father_status(cnt, False)
+
+    # get all unset directories and check their status
+    # (update children directories accordingly)
+    unset_dirs = list(
+        filter(
+            lambda node: node.otype == DIRECTORY and node.status == Status.unset,
+            source_tree.iterate(),
+        )
+    )
+
+    if source_tree.status == Status.unset:
+        unset_dirs.append(source_tree)
+
+    # check unset directories
+    for dir_ in unset_dirs:
+        if dir_.status == Status.unset:
+            # update directory status
+            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
+            dir_.status = Status.queried
+            set_children_status(dir_, DIRECTORY, dir_.known)
+
+
+def directory_priority(source_tree: Tree, api_url: str, counter: Counter):
+    # get all directory contents that have at least one file content
+    # (has_contents is a method: it must be called, not truth-tested)
+    unset_dirs = list(
+        filter(
+            lambda dir_: dir_.otype == DIRECTORY and dir_.has_contents(),
+            source_tree.iterate_bfs(),
+        )
+    )
+    unset_dirs.reverse()
+    # insert root if it has contents
+    if source_tree.has_contents():
+        unset_dirs.append(source_tree)
+
+    for dir_ in unset_dirs:
+        # if the directory is known set all the downstream file contents to known
+        if dir_.status == Status.unset:
+            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
+            dir_.status = Status.queried
+            if dir_.known:
+                set_children_status(dir_, CONTENT, True)
+            else:
+                set_father_status(dir_, False)
+
+    # get remaining directories that have no file contents
+    unset_dirs_no_cnts = list(
+        filter(
+            lambda node: node.otype == DIRECTORY and not node.has_contents(),
+            source_tree.iterate_bfs(),
+        )
+    )
+    parsed_dirs_no_cnts = query_swhids(unset_dirs_no_cnts, api_url, counter)
+
+    # update status of directories that have no file contents
+    for dir_ in unset_dirs_no_cnts:
+        dir_.known = parsed_dirs_no_cnts[dir_.swhid]["known"]
+        dir_.status = Status.queried
+
+    # check unknown file contents
+    unset_files = list(
+        filter(
+            lambda node: node.otype == CONTENT and node.status == Status.unset,
+            source_tree.iterate(),
+        )
+    )
+    parsed_unset_files = query_swhids(unset_files, api_url, counter)
+
+    for file_ in unset_files:
+        file_.known = parsed_unset_files[file_.swhid]["known"]
+        file_.status = Status.queried
+
+
+def random_(
+    source_tree: Tree, api_url: str, counter: Counter, seed: Optional[int] = None
+):
+
+    if seed is not None:  # honour seed=0 as a valid seed
+        random.seed(seed)
+    # get all directory/file contents
+    all_nodes = [node for node in source_tree.iterate()] + [source_tree]
+    # shuffle contents
+    random.shuffle(all_nodes)
+
+    while len(all_nodes):
+        node = all_nodes.pop()
+
+        if node.status != Status.unset:
+            continue
+
+        node.known = query_swhids([node], api_url, counter)[node.swhid]["known"]
+        node.status = Status.queried
+        if node.otype == DIRECTORY and node.known:
+            for child_node in node.iterate():
+                child_node.known = True
+        elif node.otype == CONTENT and not node.known:
+            set_father_status(node, False)
+
+
+def algo_min(source_tree: Tree, api_url: str):
+    """
+    The minimal number of queries knowing the known/unknown status of every node
+    """
+
+    def remove_parents(node, nodes):
+        parent = node.father
+        if parent is None or parent not in nodes:
+            return
+        else:
+            nodes.remove(parent)
+            remove_parents(parent, nodes)
+
+    def remove_children(node, nodes):
+        for child_node in node.iterate():
+            nodes.remove(child_node)
+
+    all_nodes = [node for node in source_tree.iterate()]
+    all_nodes.insert(0, source_tree)
+
+    parsed_nodes = query_swhids(all_nodes, api_url)
+    for node in all_nodes:
+        node.known = parsed_nodes[node.swhid]["known"]
+
+    all_nodes_copy = all_nodes.copy()
+
+    for node in all_nodes:
+        if node.otype == CONTENT and not node.known:
+            remove_parents(node, all_nodes_copy)
+
+    for node in all_nodes_copy:
+        if node.otype == DIRECTORY and node.known:
+            remove_children(node, all_nodes_copy)
+
+    return len(all_nodes_copy)
+
+
+def get_swhids(paths: Iterable[Path], exclude_patterns):
+    def swhid_of(path):
+        if path.is_dir():
+            if exclude_patterns:
+
+                def dir_filter(dirpath, *args):
+                    return directory_filter(dirpath, exclude_patterns)
+
+            else:
+                dir_filter = accept_all_directories
+
+            obj = Directory.from_disk(
+                path=bytes(path), dir_filter=dir_filter
+            ).get_data()
+
+            return swhid(DIRECTORY, obj)
+        else:
+            obj = Content.from_file(path=bytes(path)).get_data()
+            return swhid(CONTENT, obj)
+
+    for path in paths:
+        yield str(path), swhid_of(path)
+
+
+def load_source(root, sre_patterns):
+    """
+    Load the source code inside the Tree data structure
+    """
+
+    def _scan(root_path, source_tree, sre_patterns):
+        dirpath, dnames, fnames = next(os.walk(root_path))
+        dirpath = Path(dirpath)
+
+        if fnames:
+            files = [dirpath.joinpath(fname) for fname in fnames]
+            parsed_file_swhids = dict(get_swhids(files, sre_patterns))
+
+            for path, swhid_ in parsed_file_swhids.items():
+                source_tree.add_node(Path(path), swhid_)
+
+        if dnames:
+            dirs = [dirpath.joinpath(dname) for dname in dnames]
+            parsed_dirs_swhids = dict(get_swhids(dirs, sre_patterns))
+
+            for path, swhid_ in parsed_dirs_swhids.items():
+                if not directory_filter(path, sre_patterns):
+                    continue
+                source_tree.add_node(Path(path), swhid_)
+                _scan(path, source_tree, sre_patterns)
+
+    source_tree = Tree(root)
+    root_swhid = dict(get_swhids([root], sre_patterns))
+    source_tree.swhid = root_swhid[str(root)]
+    _scan(root, source_tree, sre_patterns)
+    return source_tree
+
+
+def run(
+    root: str,
+    api_url: str,
+    exclude_patterns: Iterable[str],
+    algo: str,
+    seed: Optional[int] = None,
+):
+    sre_patterns = set()
+    if exclude_patterns:
+        sre_patterns = {
+            reg_obj for reg_obj in extract_regex_objs(Path(root), exclude_patterns)
+        }
+
+    repo = Repo(root)
+    repo_url = repo.remote("origin").url
+    revision = repo.git.rev_list("--count", "HEAD")
+    counter = Counter()
+    source_tree = load_source(Path(root), sre_patterns)
+
+    if algo == "random":
+        random_(source_tree, api_url, counter, seed)
+    elif algo == "algo_min":
+        min_queries = algo_min(source_tree, api_url)
+        min_result = (repo_url, revision, len(source_tree), algo, min_queries)
+        print(min_result)
+        return
+    elif algo == "stopngo":
+        stopngo(source_tree, api_url, counter)
+    elif algo == "file_priority":
+        file_priority(source_tree, api_url, counter)
+    elif algo == "directory_priority":
+        directory_priority(source_tree, api_url, counter)
+
+    result = (
+        repo_url,
+        revision,
+        len(source_tree),
+        algo,
+        counter.get_api(),
+        counter.get_query(),
+    )
+    print(result)
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -234,6 +234,58 @@
     db.close()
 
 
+@scanner.command()
+@click.argument("root_path", required=True, type=click.Path(exists=True))
+@click.option(
+    "-u",
+    "--api-url",
+    default=None,
+    metavar="API_URL",
+    show_default=True,
+    help="URL for the api request",
+)
+@click.option(
+    "--exclude",
+    "-x",
+    "patterns",
+    metavar="PATTERN",
+    multiple=True,
+    show_default=True,
+    help="Exclude directories using glob patterns \
+    (e.g., '*.git' to exclude all .git directories)",
+)
+@click.option(
+    "--algo",
+    "-a",
+    metavar="ALGO NAME",
+    show_default=True,
+    required=True,
+    help="Algorithm name",
+)
+# @click.option(
+#     "--seed",
+#     "-s",
+#     metavar="SEED",
+#     type=int,
+#     show_default=True,
+#     help="Seed for the random algorithm"
+# )
+@click.pass_context
+def benchmark(ctx, root_path, api_url, patterns, algo):
+    from swh.scanner.benchmark_algos import run
+    from swh.scanner.logger import error, setup_logger
+
+    setup_logger()
+    try:
+        run(root_path, api_url, patterns, algo)
+    except Exception as e:
+        error(
+            f'Repository: "{root_path}" using "{algo}" '
+            f'algorithm on "{api_url}" FAILED: {e}'
+        )
+        pass
+
+
 def main():
     return scanner(auto_envvar_prefix="SWH_SCANNER")
 
diff --git a/swh/scanner/logger.py b/swh/scanner/logger.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/logger.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+
+logger = None
+
+
+def setup_logger() -> None:
+    global logger
+    console = logging.FileHandler("scan.log")
+    console.setLevel(logging.NOTSET)
+    formatter = logging.Formatter("%(asctime)s | %(levelname)s: %(message)s")
+    console.setFormatter(formatter)
+
+    logger = logging.getLogger("debug")
+    logger.addHandler(console)
+    # logger.propagate = True
+
+
+def error(*args) -> None:
+    if logger is not None:
+        logger.error(*args)
+
+
+def warning(*args) -> None:
+    if logger is not None:
+        logger.warning(*args)
+
+
+def info(*args) -> None:
+    if logger is not None:
+        logger.info(*args)
+
+
+def debug(*args):
+    if logger is not None:
+        logger.debug(*args)
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -22,6 +22,8 @@
 class Color(Enum):
     blue = "\033[94m"
     green = "\033[92m"
+    yellow = "\033[93m"
+    magenta = "\033[95m"
     red = "\033[91m"
     end = "\033[0m"
 
@@ -30,6 +32,12 @@
     return color.value + text + Color.end.value
 
 
+class Status(Enum):
+    unset = 0
+    set = 1
+    queried = 2
+
+
 class Tree:
     """Representation of a file system structure
     """
@@ -40,23 +48,26 @@
         self.otype = DIRECTORY if path.is_dir() else CONTENT
         self.swhid = ""
         self.known = False
+        self.status = Status.unset
         self.children: Dict[Path, Tree] = {}
 
-    def add_node(self, path: Path, swhid: str, known: bool) -> None:
+    def __len__(self):
+        return sum(1 for node in self.iterate()) + 1  # the root node
+
+    def add_node(self, path: Path, swhid: str) -> None:
         """Recursively add a new path.
         """
         relative_path = path.relative_to(self.path)
 
         if relative_path == Path("."):
             self.swhid = swhid
-            self.known = known
             return
 
         new_path = self.path.joinpath(relative_path.parts[0])
         if new_path not in self.children:
             self.children[new_path] = Tree(new_path, self)
 
-        self.children[new_path].add_node(path, swhid, known)
+        self.children[new_path].add_node(path, swhid)
 
     def show(self, fmt) -> None:
         """Show tree in different formats"""
@@ -90,15 +101,28 @@
             end = "/" if node.otype == DIRECTORY else ""
 
             if isatty:
-                if not node.known:
-                    rel_path = colorize(rel_path, Color.red)
-                elif node.otype == DIRECTORY:
+                if node.status == Status.unset:
+                    rel_path = colorize(rel_path, Color.magenta)
+                elif node.status == Status.set and not node.known:
+                    rel_path = colorize(rel_path, Color.yellow)
+                elif node.status == Status.set and node.known:
                     rel_path = colorize(rel_path, Color.blue)
-                elif node.otype == CONTENT:
+                elif node.status == Status.queried and not node.known:
+                    rel_path = colorize(rel_path, Color.red)
+                elif node.status == Status.queried and node.known:
                     rel_path = colorize(rel_path, Color.green)
 
             print(f"{begin}{rel_path}{end}")
 
+    @property
+    def known(self):
+        return self._known
+
+    @known.setter
+    def known(self, value: bool):
+        self._known = value
+        self.status = Status.set
+
     @property
     def attributes(self) -> Dict[str, Dict[str, Any]]:
         """
@@ -158,6 +182,16 @@
             if child_node.otype == DIRECTORY:
                 yield from child_node.iterate()
 
+    def iterate_bfs(self) -> Iterator[Tree]:
+        """Get nodes in BFS order
+        """
+        # a list keeps the traversal deterministic (a set would not)
+        nodes = list(self.children.values())
+        for node in nodes:
+            yield node
+        for node in nodes:
+            if node.otype == DIRECTORY:
+                yield from node.iterate_bfs()
+
     def get_files_from_dir(self, dir_path: Path) -> List:
         """
         Retrieve files information about a specific directory path
@@ -249,3 +283,9 @@
             if child_node.otype == DIRECTORY:
                 return True
         return False
+
+    def has_contents(self) -> bool:
+        for _, child_node in self.children.items():
+            if child_node.otype == CONTENT:
+                return True
+        return False