diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
index de262df..5f00d48 100644
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -1,267 +1,285 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
 import os
 from typing import Any, Dict, Optional
 
 import click
 from importlib_metadata import version
 import yaml
 
 from swh.core import config
 from swh.core.cli import CONTEXT_SETTINGS
 from swh.core.cli import swh as swh_cli_group
 
 from .exceptions import DBError
 
 # Config for the "serve" option
 BACKEND_DEFAULT_PORT = 5011
 
 # All generic config code should reside in swh.core.config
 CONFIG_ENVVAR = "SWH_CONFIG_FILE"
 DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml")
 
 DEFAULT_CONFIG: Dict[str, Any] = {
     "web-api": {
         "url": "https://archive.softwareheritage.org/api/1/",
         "auth-token": None,
     }
 }
 
 CONFIG_FILE_HELP = f"""Configuration file:
 
 \b
 The CLI option or the environment variable will fail if invalid.
 The CLI option is checked first.
 Then, the environment variable {CONFIG_ENVVAR} is checked.
 Then, if the default path cannot be loaded, a set of default values is used.
 Default config path is {DEFAULT_CONFIG_PATH}.
 Default config values are:
 
 \b
 {yaml.dump(DEFAULT_CONFIG)}"""
 SCANNER_HELP = f"""Software Heritage Scanner tools.
 
 {CONFIG_FILE_HELP}"""
 
 
 def setup_config(ctx, api_url):
     config = ctx.obj["config"]
     if api_url:
         if not api_url.endswith("/"):
             api_url += "/"
         config["web-api"]["url"] = api_url
 
     return config
 
 
 @swh_cli_group.group(
     name="scanner", context_settings=CONTEXT_SETTINGS, help=SCANNER_HELP,
 )
 @click.option(
     "-C",
     "--config-file",
     default=None,
     type=click.Path(exists=False, dir_okay=False, path_type=str),
     help="""YAML configuration file""",
 )
 @click.version_option(
     version=version("swh.scanner"), prog_name="swh.scanner",
 )
 @click.pass_context
 def scanner(ctx, config_file: Optional[str]):
     env_config_path = os.environ.get(CONFIG_ENVVAR)
 
     # read_raw_config does not fail if the file does not exist, so check it
     # beforehand while enforcing loading priority
     if config_file:
         if not config.config_exists(config_file):
             raise click.BadParameter(
                 f"File '{config_file}' cannot be opened.", param_hint="--config-file"
             )
     elif env_config_path:
         if not config.config_exists(env_config_path):
             raise click.BadParameter(
                 f"File '{env_config_path}' cannot be opened.", param_hint=CONFIG_ENVVAR
             )
         config_file = env_config_path
     elif config.config_exists(DEFAULT_CONFIG_PATH):
         config_file = DEFAULT_CONFIG_PATH
 
     conf = DEFAULT_CONFIG
     if config_file is not None:
         conf = config.read_raw_config(config.config_basepath(config_file))
         conf = config.merge_configs(DEFAULT_CONFIG, conf)
 
     ctx.ensure_object(dict)
     ctx.obj["config"] = conf
 
 
 @scanner.command(name="scan")
 @click.argument("root_path", required=True, type=click.Path(exists=True))
 @click.option(
     "-u",
     "--api-url",
     default=None,
     metavar="API_URL",
     show_default=True,
     help="URL for the API request",
 )
 @click.option(
     "--exclude",
     "-x",
     "patterns",
     metavar="PATTERN",
     multiple=True,
     help="Exclude directories using glob patterns \
     (e.g., ``*.git`` to exclude all .git directories)",
 )
 @click.option(
     "-f",
     "--output-format",
     "out_fmt",
     default="text",
     show_default=True,
     type=click.Choice(["text", "json", "ndjson", "sunburst"], case_sensitive=False),
     help="The output format",
 )
 @click.option(
     "-i", "--interactive", is_flag=True, help="Show the result in a dashboard"
 )
 @click.option(
     "-p",
     "--policy",
     default="auto",
     show_default=True,
     type=click.Choice(["auto", "bfs", "greedybfs", "filepriority", "dirpriority"]),
     help="The scan policy.",
 )
+@click.option(
+    "-e",
+    "--extra-info",
+    "extra_info",
+    multiple=True,
+    type=click.Choice(["origin"]),
+    help="Add selected additional information about known software artifacts.",
+)
 @click.pass_context
-def scan(ctx, root_path, api_url, patterns, out_fmt, interactive, policy):
+def scan(ctx, root_path, api_url, patterns, out_fmt, interactive, policy, extra_info):
     """Scan a source code project to discover files and directories already
     present in the archive.
 
     The source code project can be checked using different policies that can be set
-    using the -p/--policy option:
-
-    auto: it selects the best policy based on the source code, for codebase(s) with
-    less than 1000 file/dir contents all the nodes will be queried.
-
-    bfs: scan the source code in the BFS order, checking unknown directories only.
-
-    greedybfs: same as "bfs" policy, but lookup the status of source code artifacts in
-    chunks, in order to minimize the number of Web API round-trips with the archive.
-
-    filepriority: scan all the source code file contents, checking only unset
-    directories. (useful if the codebase contains a lot of source files)
-
-    dirpriority: scan all the source code directories and check only unknown
-    directory contents.
-    """
+    using the -p/--policy option:\n
+    \b
+    auto: select the best policy based on the source code; for codebases
+    with fewer than 1000 file/dir contents, all the nodes will be queried.
+
+    bfs: scan the source code in BFS order, checking unknown directories only.
+
+    \b
+    greedybfs: same as the "bfs" policy, but look up the status of source code
+    artifacts in chunks to minimize the number of Web API round-trips with the
+    archive.
+
+    \b
+    filepriority: scan all the source code file contents, checking only unset
+    directories. (useful if the codebase contains a lot of source files)
+
+    dirpriority: scan all the source code directories and check only unknown
+    directory contents.
+
+    Additional information about known software artifacts can be requested with
+    the -e/--extra-info option:\n
+    \b
+    origin: search the origin URL of each source code file/directory using the
+    in-memory compressed graph.
+    """
     import swh.scanner.scanner as scanner
 
     config = setup_config(ctx, api_url)
-    scanner.scan(config, root_path, patterns, out_fmt, interactive, policy)
+    extra_info = set(extra_info)
+    scanner.scan(config, root_path, patterns, out_fmt, interactive, policy, extra_info)
 
 
 @scanner.group("db", help="Manage local knowledge base for swh-scanner")
 @click.pass_context
 def db(ctx):
     pass
 
 
 @db.command("import")
 @click.option(
     "-i",
     "--input",
     "input_file",
     metavar="INPUT_FILE",
     required=True,
     type=click.File("r"),
     help="A file containing SWHIDs",
 )
 @click.option(
     "-o",
     "--output",
     "output_file_db",
     metavar="OUTPUT_DB_FILE",
     required=True,
     show_default=True,
     help="The name of the generated sqlite database",
 )
 @click.option(
     "-s",
     "--chunk-size",
     "chunk_size",
     default="10000",
     metavar="SIZE",
     show_default=True,
     type=int,
     help="The chunk size",
 )
 @click.pass_context
 def import_(ctx, chunk_size, input_file, output_file_db):
     """Create SQLite database of known SWHIDs from a textual list of SWHIDs"""
     from .db import Db
 
     db = Db(output_file_db)
     cur = db.conn.cursor()
     try:
         db.create_from(input_file, chunk_size, cur)
         db.close()
     except DBError as e:
         ctx.fail("Failed to import SWHIDs into database: {0}".format(e))
 
 
 @db.command("serve")
 @click.option(
     "-h",
     "--host",
     metavar="HOST",
     default="127.0.0.1",
     show_default=True,
     help="The host of the API server",
 )
 @click.option(
     "-p",
     "--port",
     metavar="PORT",
     default=f"{BACKEND_DEFAULT_PORT}",
     show_default=True,
     help="The port of the API server",
 )
 @click.option(
     "-f",
     "--db-file",
     "db_file",
     metavar="DB_FILE",
     default="SWHID_DB.sqlite",
     show_default=True,
     type=click.Path(exists=True),
     help="An sqlite database file (it can be generated with: 'swh scanner db import')",
 )
 @click.pass_context
 def serve(ctx, host, port, db_file):
     """Start an API service using the sqlite database generated with the
     "db import" option."""
     import swh.scanner.backend as backend
 
     from .db import Db
 
     db = Db(db_file)
     backend.run(host, port, db)
     db.close()
 
 
 def main():
     return scanner(auto_envvar_prefix="SWH_SCANNER")
 
 
 if __name__ == "__main__":
     main()
diff --git a/swh/scanner/client.py b/swh/scanner/client.py
new file mode 100644
index 0000000..d814b72
--- /dev/null
+++ b/swh/scanner/client.py
@@ -0,0 +1,98 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""
+Minimal async web client for the Software Heritage Web API.
+
+This module could be removed when
+`T2635 <https://forge.softwareheritage.org/T2635>`_ is implemented.
+"""
+
+import asyncio
+import itertools
+from typing import Any, Dict, List, Optional
+
+import aiohttp
+
+from swh.model.identifiers import CoreSWHID
+
+from .exceptions import error_response
+
+# Maximum number of SWHIDs that can be requested by a single call to the
+# Web API endpoint /known/
+QUERY_LIMIT = 1000
+
+KNOWN_EP = "known/"
+GRAPH_RANDOMWALK_EP = "graph/randomwalk/"
+
+
+class Client:
+    """Manage requests to the Software Heritage Web API."""
+
+    def __init__(self, api_url: str, session: aiohttp.ClientSession):
+        self.api_url = api_url
+        self.session = session
+
+    async def get_origin(self, swhid: CoreSWHID) -> Optional[Any]:
+        """Walk the compressed graph to discover the origin of a given swhid."""
+        endpoint = (
+            f"{self.api_url}{GRAPH_RANDOMWALK_EP}{str(swhid)}/ori/?direction="
+            f"backward&limit=-1&resolve_origins=true"
+        )
+        res = None
+        async with self.session.get(endpoint) as resp:
+            if resp.status == 200:
+                res = await resp.text()
+                res = res.rstrip()
+                return res
+            if resp.status != 404:
+                error_response(resp.reason, resp.status, endpoint)
+
+        return res
+
+    async def known(self, swhids: List[CoreSWHID]) -> Dict[str, Dict[str, bool]]:
+        """API request to get information about the SoftWare Heritage persistent
+        IDentifiers (SWHIDs) given in input.
+
+        Args:
+            swhids: a list of CoreSWHID instances
+
+        Returns:
+            A dictionary with:
+
+            key:
+                string SWHID searched
+            value:
+                value['known'] = True if the SWHID is found
+                value['known'] = False if the SWHID is not found
+
+        """
+        endpoint = self.api_url + KNOWN_EP
+        requests = []
+
+        def get_chunk(swhids):
+            for i in range(0, len(swhids), QUERY_LIMIT):
+                yield swhids[i : i + QUERY_LIMIT]
+
+        async def make_request(swhids):
+            swhids = [str(swhid) for swhid in swhids]
+            async with self.session.post(endpoint, json=swhids) as resp:
+                if resp.status != 200:
+                    error_response(resp.reason, resp.status, endpoint)
+
+                return await resp.json()
+
+        if len(swhids) > QUERY_LIMIT:
+            for swhids_chunk in get_chunk(swhids):
+                requests.append(asyncio.create_task(make_request(swhids_chunk)))
+
+            res = await asyncio.gather(*requests)
+            # concatenate list of dictionaries
+            return dict(itertools.chain.from_iterable(e.items() for e in res))
+        else:
+            return await make_request(swhids)
+ """ + + def __init__(self, api_url: str, session: aiohttp.ClientSession): + self.api_url = api_url + self.session = session + + async def get_origin(self, swhid: CoreSWHID) -> Optional[Any]: + """Walk the compressed graph to discover the origin of a given swhid + """ + endpoint = ( + f"{self.api_url}{GRAPH_RANDOMWALK_EP}{str(swhid)}/ori/?direction=" + f"backward&limit=-1&resolve_origins=true" + ) + res = None + async with self.session.get(endpoint) as resp: + if resp.status == 200: + res = await resp.text() + res = res.rstrip() + return res + if resp.status != 404: + error_response(resp.reason, resp.status, endpoint) + + return res + + async def known(self, swhids: List[CoreSWHID]) -> Dict[str, Dict[str, bool]]: + """API Request to get information about the SoftWare Heritage persistent + IDentifiers (SWHIDs) given in input. + + Args: + swhids: a list of CoreSWHID instances + api_url: url for the API request + + Returns: + A dictionary with: + + key: + string SWHID searched + value: + value['known'] = True if the SWHID is found + value['known'] = False if the SWHID is not found + + """ + endpoint = self.api_url + KNOWN_EP + requests = [] + + def get_chunk(swhids): + for i in range(0, len(swhids), QUERY_LIMIT): + yield swhids[i : i + QUERY_LIMIT] + + async def make_request(swhids): + swhids = [str(swhid) for swhid in swhids] + async with self.session.post(endpoint, json=swhids) as resp: + if resp.status != 200: + error_response(resp.reason, resp.status, endpoint) + + return await resp.json() + + if len(swhids) > QUERY_LIMIT: + for swhids_chunk in get_chunk(swhids): + requests.append(asyncio.create_task(make_request(swhids_chunk))) + + res = await asyncio.gather(*requests) + # concatenate list of dictionaries + return dict(itertools.chain.from_iterable(e.items() for e in res)) + else: + return await make_request(swhids) diff --git a/swh/scanner/data.py b/swh/scanner/data.py index 4db27f5..88ab170 100644 --- a/swh/scanner/data.py +++ b/swh/scanner/data.py @@ -1,107 +1,150 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple from swh.model.exceptions import ValidationError from swh.model.from_disk import Directory from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID +from .client import Client + +SUPPORTED_INFO = {"known", "origin"} + class MerkleNodeInfo(dict): """Store additional information about Merkle DAG nodes, using SWHIDs as keys""" def __setitem__(self, key, value): """The keys must be valid valid Software Heritage Persistent Identifiers while values must be dict. """ if not isinstance(key, CoreSWHID): raise ValidationError("keys must be valid SWHID(s)") if not isinstance(value, dict): raise ValidationError(f"values must be dict, not {type(value)}") super(MerkleNodeInfo, self).__setitem__(key, value) +def init_merkle_node_info(source_tree: Directory, data: MerkleNodeInfo, info: set): + """Populate the MerkleNodeInfo with the SWHIDs of the given source tree and the + attributes that will be stored. 
+ """ + if not info: + raise Exception("Data initialization requires node attributes values.") + nodes_info: Dict[str, Optional[str]] = {} + for ainfo in info: + if ainfo in SUPPORTED_INFO: + nodes_info[ainfo] = None + else: + raise Exception(f"Information {ainfo} is not supported.") + + for node in source_tree.iter_tree(): + data[node.swhid()] = nodes_info.copy() # type: ignore + + +async def add_origin(source_tree: Directory, data: MerkleNodeInfo, client: Client): + """Store origin information about software artifacts retrieved from the Software + Heritage graph service. + """ + queue = [] + queue.append(source_tree) + while queue: + for node in queue.copy(): + queue.remove(node) + node_ori = await client.get_origin(node.swhid()) + if node_ori: + data[node.swhid()]["origin"] = node_ori + if node.object_type == DIRECTORY: + for sub_node in node.iter_tree(): + data[sub_node.swhid()]["origin"] = node_ori # type: ignore + else: + if node.object_type == DIRECTORY: + children = [sub_node for sub_node in node.iter_tree()] + children.remove(node) + queue.extend(children) # type: ignore + + def get_directory_data( root_path: str, source_tree: Directory, nodes_data: MerkleNodeInfo, directory_data: Dict = {}, ) -> Dict[Path, dict]: """Get content information for each directory inside source_tree. Returns: A dictionary with a directory path as key and the relative contents information as values. """ def _get_directory_data( source_tree: Directory, nodes_data: MerkleNodeInfo, directory_data: Dict ): directories = list( filter( lambda n: n.object_type == DIRECTORY, map(lambda n: n[1], source_tree.items()), ) ) for node in directories: directory_info = directory_content(node, nodes_data) rel_path = Path(node.data["path"].decode()).relative_to(Path(root_path)) directory_data[rel_path] = directory_info if has_dirs(node): _get_directory_data(node, nodes_data, directory_data) _get_directory_data(source_tree, nodes_data, directory_data) return directory_data def directory_content(node: Directory, nodes_data: MerkleNodeInfo) -> Tuple[int, int]: """Count known contents inside the given directory. Returns: A tuple with the total number of contents inside the directory and the number of known contents. 
""" known_cnt = 0 node_contents = list( filter(lambda n: n.object_type == CONTENT, map(lambda n: n[1], node.items())) ) for sub_node in node_contents: if nodes_data[sub_node.swhid()]["known"]: known_cnt += 1 return (len(node_contents), known_cnt) def has_dirs(node: Directory) -> bool: """Check if the given directory has other directories inside.""" for _, sub_node in node.items(): if isinstance(sub_node, Directory): return True return False def get_content_from( node_path: bytes, source_tree: Directory, nodes_data: MerkleNodeInfo ) -> Dict[bytes, dict]: """Get content information from the given directory node.""" # root in model.from_disk.Directory should be accessed with b"" directory = source_tree[node_path if node_path != source_tree.data["path"] else b""] node_contents = list( filter( lambda n: n.object_type == CONTENT, map(lambda n: n[1], directory.items()) ) ) files_data = {} for node in node_contents: node_info = nodes_data[node.swhid()] node_info["swhid"] = str(node.swhid()) path_name = "path" if "path" in node.data.keys() else "data" files_data[node.data[path_name]] = node_info return files_data diff --git a/swh/scanner/exceptions.py b/swh/scanner/exceptions.py index b890a45..dd9eded 100644 --- a/swh/scanner/exceptions.py +++ b/swh/scanner/exceptions.py @@ -1,30 +1,32 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Optional + class InvalidObjectType(TypeError): pass class InvalidDirectoryPath(Exception): pass class LargePayloadExc(Exception): pass class DBError(Exception): pass class APIError(Exception): def __str__(self): return '"%s"' % self.args -def error_response(reason: str, status_code: int, api_url: str): +def error_response(reason: Optional[Any], status_code: int, api_url: str): error_msg = f"{status_code} {reason}: '{api_url}'" raise APIError(error_msg) diff --git a/swh/scanner/output.py b/swh/scanner/output.py index 6a2607a..5269d0f 100644 --- a/swh/scanner/output.py +++ b/swh/scanner/output.py @@ -1,108 +1,109 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from enum import Enum import json import os import sys from typing import Any import ndjson from swh.model.from_disk import Directory from .dashboard.dashboard import run_app from .data import MerkleNodeInfo, get_directory_data from .plot import generate_sunburst, offline_plot DEFAULT_OUTPUT = "text" class Color(Enum): BLUE = "\033[94m" GREEN = "\033[92m" RED = "\033[91m" END = "\033[0m" def colorize(text: str, color: Color): return color.value + text + Color.END.value class Output: def __init__( self, root_path: str, nodes_data: MerkleNodeInfo, source_tree: Directory ): self.root_path = root_path self.nodes_data = nodes_data self.source_tree = source_tree def show(self, mode=DEFAULT_OUTPUT): if mode == "text": isatty = sys.stdout.isatty() self.print_text(isatty) elif mode == "sunburst": directory_data = get_directory_data( self.root_path, self.source_tree, self.nodes_data ) sunburst_figure = generate_sunburst(directory_data, self.root_path) offline_plot(sunburst_figure) elif mode == "interactive": directory_data = get_directory_data( self.root_path, self.source_tree, self.nodes_data ) 
sunburst_figure = generate_sunburst(directory_data, self.root_path) run_app(sunburst_figure, self.source_tree, self.nodes_data) elif mode == "json": self.print_json() elif mode == "ndjson": self.print_ndjson() else: raise Exception(f"mode {mode} is not an output format") def get_path_name(self, node): return "path" if "path" in node.data.keys() else "data" def print_text(self, isatty: bool) -> None: def compute_level(node): node_path = str(node.data[self.get_path_name(node)]).split("/") source_path = str(self.source_tree.data["path"]).split("/") return len(node_path) - len(source_path) for node in self.source_tree.iter_tree(): self.print_node(node, isatty, compute_level(node)) def print_node(self, node: Any, isatty: bool, level: int) -> None: rel_path = os.path.basename(node.data[self.get_path_name(node)]) rel_path = rel_path.decode() begin = "│ " * level end = "/" if node.object_type == "directory" else "" if isatty: if not self.nodes_data[node.swhid()]["known"]: rel_path = colorize(rel_path, Color.RED) elif node.object_type == "directory": rel_path = colorize(rel_path, Color.BLUE) elif node.object_type == "content": rel_path = colorize(rel_path, Color.GREEN) print(f"{begin}{rel_path}{end}") def data_as_json(self): json = {} for node in self.source_tree.iter_tree(): - node_known = self.nodes_data[node.swhid()]["known"] rel_path = os.path.relpath( node.data[self.get_path_name(node)].decode(), self.source_tree.data["path"].decode(), ) - json[rel_path] = {"swhid": str(node.swhid()), "known": node_known} + json[rel_path] = {"swhid": str(node.swhid())} + for k, v in self.nodes_data[node.swhid()].items(): + json[rel_path][k] = v return json def print_json(self): print(json.dumps(self.data_as_json(), indent=4, sort_keys=True)) def print_ndjson(self): print(ndjson.dumps({k: v} for k, v in self.data_as_json().items())) diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py index 431b999..44cf053 100644 --- a/swh/scanner/policy.py +++ b/swh/scanner/policy.py @@ -1,332 +1,260 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc -import asyncio -import itertools -from typing import Dict, List, no_type_check - -import aiohttp +from typing import no_type_check from swh.core.utils import grouper from swh.model.from_disk import Directory -from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID +from swh.model.identifiers import CONTENT, DIRECTORY +from .client import QUERY_LIMIT, Client from .data import MerkleNodeInfo -from .exceptions import error_response - -# Maximum number of SWHIDs that can be requested by a single call to the -# Web API endpoint /known/ -QUERY_LIMIT = 1000 - - -async def swhids_discovery( - swhids: List[CoreSWHID], session: aiohttp.ClientSession, api_url: str, -) -> Dict[str, Dict[str, bool]]: - """API Request to get information about the SoftWare Heritage persistent - IDentifiers (SWHIDs) given in input. 
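With this change, `Output.data_as_json` no longer hardcodes the "known" attribute: every attribute stored in `MerkleNodeInfo` is copied into the per-path record. An illustrative record shape follows; the values are made up, and "origin" only appears when requested via --extra-info:

```python
# Illustrative shape of a per-path record produced by Output.data_as_json
# after this change: the "swhid" key plus every stored node attribute.
import json

record = {
    "foo/bar.py": {
        "swhid": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb",
        "known": True,
        "origin": "https://example.org/repo.git",  # made-up value
    }
}
print(json.dumps(record, indent=4, sort_keys=True))
```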
diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py
index 431b999..44cf053 100644
--- a/swh/scanner/policy.py
+++ b/swh/scanner/policy.py
@@ -1,332 +1,260 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import abc
-import asyncio
-import itertools
-from typing import Dict, List, no_type_check
-
-import aiohttp
+from typing import no_type_check
 
 from swh.core.utils import grouper
 from swh.model.from_disk import Directory
-from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID
+from swh.model.identifiers import CONTENT, DIRECTORY
 
+from .client import QUERY_LIMIT, Client
 from .data import MerkleNodeInfo
-from .exceptions import error_response
-
-# Maximum number of SWHIDs that can be requested by a single call to the
-# Web API endpoint /known/
-QUERY_LIMIT = 1000
-
-
-async def swhids_discovery(
-    swhids: List[CoreSWHID], session: aiohttp.ClientSession, api_url: str,
-) -> Dict[str, Dict[str, bool]]:
-    """API Request to get information about the SoftWare Heritage persistent
-    IDentifiers (SWHIDs) given in input.
-
-    Args:
-        swhids: a list of CoreSWHID instances
-        api_url: url for the API request
-
-    Returns:
-        A dictionary with:
-
-        key:
-            string SWHID searched
-        value:
-            value['known'] = True if the SWHID is found
-            value['known'] = False if the SWHID is not found
-
-    """
-    endpoint = api_url + "known/"
-    requests = []
-
-    def get_chunk(swhids):
-        for i in range(0, len(swhids), QUERY_LIMIT):
-            yield swhids[i : i + QUERY_LIMIT]
-
-    async def make_request(swhids):
-        swhids = [str(swhid) for swhid in swhids]
-        async with session.post(endpoint, json=swhids) as resp:
-            if resp.status != 200:
-                error_response(resp.reason, resp.status, endpoint)
-
-            return await resp.json()
-
-    if len(swhids) > QUERY_LIMIT:
-        for swhids_chunk in get_chunk(swhids):
-            requests.append(asyncio.create_task(make_request(swhids_chunk)))
-
-        res = await asyncio.gather(*requests)
-        # concatenate list of dictionaries
-        return dict(itertools.chain.from_iterable(e.items() for e in res))
-    else:
-        return await make_request(swhids)
 
 
 def source_size(source_tree: Directory):
     """Return the size of a source tree as the number of nodes it contains."""
     return sum(1 for n in source_tree.iter_tree(dedup=False))
 
 
 class Policy(metaclass=abc.ABCMeta):
 
     data: MerkleNodeInfo
     """information about contents and directories of the merkle tree"""
 
     source_tree: Directory
     """representation of a source code project directory in the merkle tree"""
 
     def __init__(self, source_tree: Directory, data: MerkleNodeInfo):
-        self.data = data
         self.source_tree = source_tree
-        for node in source_tree.iter_tree():
-            self.data[node.swhid()] = {"known": None}  # type: ignore
+        self.data = data
 
     @abc.abstractmethod
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         """Scan a source code project"""
         raise NotImplementedError("Must implement run method")
 
 
 class LazyBFS(Policy):
     """Read nodes in the merkle tree using the BFS algorithm.
     Look up only directories that are unknown; otherwise set all the downstream
    contents to known.
     """
 
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         queue = []
         queue.append(self.source_tree)
 
         while queue:
             swhids = [node.swhid() for node in queue]
-            swhids_res = await swhids_discovery(swhids, session, api_url)
+            swhids_res = await client.known(swhids)
             for node in queue.copy():
                 queue.remove(node)
                 self.data[node.swhid()]["known"] = swhids_res[str(node.swhid())][
                     "known"
                 ]
                 if node.object_type == DIRECTORY:
                     if not self.data[node.swhid()]["known"]:
                         children = [n[1] for n in list(node.items())]
                         queue.extend(children)
                     else:
                         for sub_node in node.iter_tree():
                             if sub_node == node:
                                 continue
                             self.data[sub_node.swhid()]["known"] = True  # type: ignore
 
 
 class GreedyBFS(Policy):
     """Query graph nodes in chunks (to maximize the Web API rate limit use) and set
     the downstream contents of known directories to known.
     """
 
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         ssize = source_size(self.source_tree)
         seen = []
 
-        async for nodes_chunk in self.get_nodes_chunks(session, api_url, ssize):
+        async for nodes_chunk in self.get_nodes_chunks(client, ssize):
             for node in nodes_chunk:
                 seen.append(node)
                 if len(seen) == ssize:
                     return
                 if node.object_type == DIRECTORY and self.data[node.swhid()]["known"]:
                     sub_nodes = [n for n in node.iter_tree(dedup=False)]
                     sub_nodes.remove(node)  # remove root node
                     for sub_node in sub_nodes:
                         seen.append(sub_node)
                         self.data[sub_node.swhid()]["known"] = True
 
     @no_type_check
-    async def get_nodes_chunks(
-        self, session: aiohttp.ClientSession, api_url: str, ssize: int
-    ):
+    async def get_nodes_chunks(self, client: Client, ssize: int):
         """Query chunks of QUERY_LIMIT nodes at once in order to fill the Web API
         rate limit. It queries all the nodes at once if the source code contains
         fewer than QUERY_LIMIT nodes.
         """
         nodes = self.source_tree.iter_tree(dedup=False)
         for nodes_chunk in grouper(nodes, QUERY_LIMIT):
             nodes_chunk = [n for n in nodes_chunk]
             swhids = [node.swhid() for node in nodes_chunk]
-            swhids_res = await swhids_discovery(swhids, session, api_url)
+            swhids_res = await client.known(swhids)
             for node in nodes_chunk:
                 swhid = node.swhid()
                 self.data[swhid]["known"] = swhids_res[str(swhid)]["known"]
             yield nodes_chunk
 
 
 class FilePriority(Policy):
     """Check the Merkle tree querying all the file contents and set all the upstream
     directories to unknown in the case a file content is unknown. Finally check all
     the directories whose status is still unknown and set all the sub-directories
     of known directories to known.
     """
 
     @no_type_check
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         # get all the files
         all_contents = list(
             filter(
                 lambda node: node.object_type == CONTENT, self.source_tree.iter_tree()
             )
         )
         all_contents.reverse()  # check deepest node first
 
         # query the backend to get all file contents status
         cnt_swhids = [node.swhid() for node in all_contents]
-        cnt_status_res = await swhids_discovery(cnt_swhids, session, api_url)
+        cnt_status_res = await client.known(cnt_swhids)
         # set all the file contents status
         for cnt in all_contents:
             self.data[cnt.swhid()]["known"] = cnt_status_res[str(cnt.swhid())]["known"]
             # set all the upstream directories of unknown file contents to unknown
             if not self.data[cnt.swhid()]["known"]:
                 parent = cnt.parents[0]
                 while parent:
                     self.data[parent.swhid()]["known"] = False
                     parent = parent.parents[0] if parent.parents else None
 
         # get all unset directories and check their status
         # (update children directories accordingly)
         unset_dirs = list(
             filter(
                 lambda node: node.object_type == DIRECTORY
                 and self.data[node.swhid()]["known"] is None,
                 self.source_tree.iter_tree(),
             )
         )
 
         # check unset directories
         for dir_ in unset_dirs:
             if self.data[dir_.swhid()]["known"] is None:
                 # update directory status
-                dir_status = await swhids_discovery([dir_.swhid()], session, api_url)
+                dir_status = await client.known([dir_.swhid()])
                 dir_known = dir_status[str(dir_.swhid())]["known"]
                 self.data[dir_.swhid()]["known"] = dir_known
                 if dir_known:
                     sub_dirs = list(
                         filter(
                             lambda n: n.object_type == DIRECTORY
                             and self.data[n.swhid()]["known"] is None,
                             dir_.iter_tree(),
                         )
                     )
                     for node in sub_dirs:
                         self.data[node.swhid()]["known"] = True
 
 
 class DirectoryPriority(Policy):
     """Check the Merkle tree querying all the directories that have at least one
     file content, and set all the upstream directories to unknown in the case a
     directory is unknown; otherwise set all the downstream contents to known.
     Finally check the status of empty directories and all the remaining file
     contents.
     """
 
     @no_type_check
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         # get all directory contents that have at least one file content
         unknown_dirs = list(
             filter(
                 lambda dir_: dir_.object_type == DIRECTORY and self.has_contents(dir_),
                 self.source_tree.iter_tree(),
             )
         )
         unknown_dirs.reverse()  # check deepest node first
 
         for dir_ in unknown_dirs:
             if self.data[dir_.swhid()]["known"] is None:
-                dir_status = await swhids_discovery([dir_.swhid()], session, api_url)
+                dir_status = await client.known([dir_.swhid()])
                 dir_known = dir_status[str(dir_.swhid())]["known"]
                 self.data[dir_.swhid()]["known"] = dir_known
                 # set all the downstream file contents to known
                 if dir_known:
                     for cnt in self.get_contents(dir_):
                         self.data[cnt.swhid()]["known"] = True
                 # otherwise set all the upstream directories to unknown
                 else:
                     parent = dir_.parents[0]
                     while parent:
                         self.data[parent.swhid()]["known"] = False
                         parent = parent.parents[0] if parent.parents else None
 
         # get remaining directories that have no file contents
         empty_dirs = list(
             filter(
                 lambda n: n.object_type == DIRECTORY
                 and not self.has_contents(n)
                 and self.data[n.swhid()]["known"] is None,
                 self.source_tree.iter_tree(),
             )
         )
         empty_dirs_swhids = [n.swhid() for n in empty_dirs]
-        empty_dir_status = await swhids_discovery(empty_dirs_swhids, session, api_url)
+        empty_dir_status = await client.known(empty_dirs_swhids)
 
         # update status of directories that have no file contents
         for dir_ in empty_dirs:
             self.data[dir_.swhid()]["known"] = empty_dir_status[str(dir_.swhid())][
                 "known"
             ]
 
         # check unknown file contents
         unknown_cnts = list(
             filter(
                 lambda n: n.object_type == CONTENT
                 and self.data[n.swhid()]["known"] is None,
                 self.source_tree.iter_tree(),
             )
         )
         unknown_cnts_swhids = [n.swhid() for n in unknown_cnts]
-        unknown_cnts_status = await swhids_discovery(
-            unknown_cnts_swhids, session, api_url
-        )
+        unknown_cnts_status = await client.known(unknown_cnts_swhids)
 
         for cnt in unknown_cnts:
             self.data[cnt.swhid()]["known"] = unknown_cnts_status[str(cnt.swhid())][
                 "known"
             ]
 
     def has_contents(self, directory: Directory):
         """Check if the directory given in input has contents"""
         for entry in directory.entries:
             if entry["type"] == "file":
                 return True
         return False
 
     def get_contents(self, dir_: Directory):
         """Get all the contents of a given directory"""
         for _, node in list(dir_.items()):
             if node.object_type == CONTENT:
                 yield node
 
 
 class QueryAll(Policy):
     """Check the status of every node in the Merkle tree."""
 
     @no_type_check
-    async def run(
-        self, session: aiohttp.ClientSession, api_url: str,
-    ):
+    async def run(self, client: Client):
         all_nodes = [node for node in self.source_tree.iter_tree()]
         all_swhids = [node.swhid() for node in all_nodes]
-        swhids_res = await swhids_discovery(all_swhids, session, api_url)
+        swhids_res = await client.known(all_swhids)
         for node in all_nodes:
             self.data[node.swhid()]["known"] = swhids_res[str(node.swhid())]["known"]
out.show("interactive") else: out.show(out_fmt) diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py index 42b8e21..12060dc 100644 --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -1,27 +1,32 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -correct_api_response = { +correct_known_api_response = { "swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112": {"known": False}, "swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140": {"known": True}, "swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904": {"known": True}, } +correct_origin_api_response = "https://bitbucket.org/chubbymaggie/bindead.git" + +sample_folder_root_swhid = "swh:1:dir:0a7b61ef5780b03aa274d11069564980246445ce" +fake_origin = {sample_folder_root_swhid: correct_origin_api_response} + present_swhids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] # these SWHIDs are considered known by the fake backend (scanner.test.flask_api) unknown_swhids = [ "swh:1:dir:fe8cd7076bef324eb8865f818ef08617879022ce", # root sample-folder-policy "swh:1:dir:0a7b61ef5780b03aa274d11069564980246445ce", # root sample-folder "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119", # toexclude/example.txt "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326" diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py index c521e63..42a8141 100644 --- a/swh/scanner/tests/flask_api.py +++ b/swh/scanner/tests/flask_api.py @@ -1,41 +1,48 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from flask import Flask, request +from flask import Flask, abort, request from swh.scanner.exceptions import LargePayloadExc from swh.scanner.policy import QUERY_LIMIT -from .data import unknown_swhids +from .data import fake_origin, unknown_swhids def create_app(tmp_requests): app = Flask(__name__) @app.route("/") def index(): return "SWH scanner API" @app.route("/known/", methods=["POST"]) def known(): swhids = request.get_json() with open(tmp_requests, "a") as f: for swhid in swhids: f.write(swhid + "\n") if len(swhids) > QUERY_LIMIT: raise LargePayloadExc( f"The maximum number of SWHIDs this endpoint can receive is " f"{QUERY_LIMIT}" ) res = {swhid: {"known": False} for swhid in swhids} for swhid in swhids: if swhid not in unknown_swhids: res[swhid]["known"] = True return res + @app.route("/graph/randomwalk//ori/", methods=["GET"]) + def randomwalk(swhid): + if swhid in fake_origin.keys(): + return fake_origin[swhid] + else: + abort(404) + return app diff --git a/swh/scanner/tests/test_client.py b/swh/scanner/tests/test_client.py new file mode 100644 index 0000000..6a85eec --- /dev/null +++ b/swh/scanner/tests/test_client.py @@ -0,0 +1,58 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# 
diff --git a/swh/scanner/tests/test_client.py b/swh/scanner/tests/test_client.py
new file mode 100644
index 0000000..6a85eec
--- /dev/null
+++ b/swh/scanner/tests/test_client.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+
+import pytest
+
+from swh.model.identifiers import CoreSWHID
+from swh.scanner.client import Client
+from swh.scanner.exceptions import APIError
+
+from .data import correct_known_api_response, correct_origin_api_response
+
+AIO_URL = "http://example.org/api/"
+KNOWN_URL = f"{AIO_URL}known/"
+ORIGIN_URL = f"{AIO_URL}graph/randomwalk/"
+
+
+def test_client_known_correct_api_request(mock_aioresponse, event_loop, aiosession):
+    mock_aioresponse.post(
+        KNOWN_URL,
+        status=200,
+        content_type="application/json",
+        body=json.dumps(correct_known_api_response),
+    )
+
+    client = Client(AIO_URL, aiosession)
+    actual_result = event_loop.run_until_complete(client.known([]))
+
+    assert correct_known_api_response == actual_result
+
+
+def test_client_known_raise_apierror(mock_aioresponse, event_loop, aiosession):
+    mock_aioresponse.post(KNOWN_URL, content_type="application/json", status=413)
+
+    client = Client(AIO_URL, aiosession)
+    with pytest.raises(APIError):
+        event_loop.run_until_complete(client.known([]))
+
+
+def test_client_get_origin_correct_api_request(
+    mock_aioresponse, event_loop, aiosession
+):
+    origin_url = (
+        f"{ORIGIN_URL}swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140/ori"
+        f"/?direction=backward&limit=-1&resolve_origins=true"
+    )
+    mock_aioresponse.get(
+        origin_url, status=200, body=correct_origin_api_response,
+    )
+
+    client = Client(AIO_URL, aiosession)
+    swhid = CoreSWHID.from_string("swh:1:dir:01fa282bb80be5907505d44b4692d3fa40fad140")
+    actual_result = event_loop.run_until_complete(client.get_origin(swhid))
+
+    assert correct_origin_api_response == actual_result
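For reference, a hypothetical end-to-end invocation of the new `scan` entry point: the config mirrors `DEFAULT_CONFIG` from cli.py, the project path is made up, and "origin" is the only extra-info value the CLI accepts at this point ("known" is added internally by `scan` itself):

```python
# Hypothetical end-to-end use of the new scan() signature. The config mirrors
# DEFAULT_CONFIG from cli.py; the project path is made up.
from swh.scanner.scanner import scan

config = {
    "web-api": {
        "url": "https://archive.softwareheritage.org/api/1/",
        "auth-token": None,
    }
}

scan(
    config,
    root_path="/tmp/my-project",  # hypothetical checkout to scan
    exclude_patterns=["*.git"],
    out_fmt="json",               # same choices as the -f CLI option
    interactive=False,
    policy="auto",
    extra_info={"origin"},        # scan() adds "known" by itself
)
```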
diff --git a/swh/scanner/tests/test_data.py b/swh/scanner/tests/test_data.py
index 4a29751..2925c3e 100644
--- a/swh/scanner/tests/test_data.py
+++ b/swh/scanner/tests/test_data.py
@@ -1,44 +1,73 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from pathlib import Path
 
+from flask import url_for
 import pytest
 
 from swh.model.exceptions import ValidationError
+from swh.scanner.client import Client
 from swh.scanner.data import (
     MerkleNodeInfo,
+    add_origin,
     directory_content,
     get_directory_data,
     has_dirs,
+    init_merkle_node_info,
 )
 
+from .data import fake_origin
+
 
 def test_merkle_node_data_wrong_args():
     nodes_data = MerkleNodeInfo()
 
     with pytest.raises(ValidationError):
         nodes_data["wrong key"] = {"known": True}
 
     with pytest.raises(ValidationError):
         nodes_data["swh:1:dir:17d207da3804cc60a77cba58e76c3b2f767cb112"] = "wrong value"
 
 
+def test_init_merkle_supported_node_info(source_tree):
+    nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known", "origin"})
+    for _, node_attrs in nodes_data.items():
+        assert "known" in node_attrs and "origin" in node_attrs
+
+
+def test_init_merkle_not_supported_node_info(source_tree):
+    nodes_data = MerkleNodeInfo()
+    with pytest.raises(Exception):
+        init_merkle_node_info(source_tree, nodes_data, {"unsupported_info"})
+
+
+def test_add_origin(event_loop, live_server, aiosession, source_tree, nodes_data):
+    api_url = url_for("index", _external=True)
+    init_merkle_node_info(source_tree, nodes_data, {"known", "origin"})
+    client = Client(api_url, aiosession)
+
+    event_loop.run_until_complete(add_origin(source_tree, nodes_data, client))
+    for node, attrs in nodes_data.items():
+        assert attrs["origin"] == fake_origin[str(source_tree.swhid())]
+
+
 def test_get_directory_data(source_tree, nodes_data):
     root = Path(source_tree.data["path"].decode())
     dirs_data = get_directory_data(root, source_tree, nodes_data)
 
     assert len(dirs_data) == 5
 
 
 def test_directory_content(source_tree, nodes_data):
     foo_dir = source_tree[b"foo"]
     foo_content = directory_content(foo_dir, nodes_data)
     assert foo_content[0] == 3
     assert foo_content[1] == 3
 
 
 def test_has_dirs(source_tree):
     assert has_dirs(source_tree)
diff --git a/swh/scanner/tests/test_policy.py b/swh/scanner/tests/test_policy.py
index a60873c..937408c 100644
--- a/swh/scanner/tests/test_policy.py
+++ b/swh/scanner/tests/test_policy.py
@@ -1,168 +1,148 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import json
 
 from flask import url_for
 import pytest
 
 from swh.model.identifiers import CONTENT, CoreSWHID, ObjectType
-from swh.scanner.data import MerkleNodeInfo
-from swh.scanner.exceptions import APIError
+from swh.scanner.client import Client
+from swh.scanner.data import MerkleNodeInfo, init_merkle_node_info
 from swh.scanner.policy import (
     DirectoryPriority,
     FilePriority,
     GreedyBFS,
     LazyBFS,
     source_size,
-    swhids_discovery,
 )
 
-from .data import correct_api_response
-
-aio_url = "http://example.org/api/known/"
-
-
-def test_scanner_correct_api_request(mock_aioresponse, event_loop, aiosession):
-    mock_aioresponse.post(
-        aio_url,
-        status=200,
-        content_type="application/json",
-        body=json.dumps(correct_api_response),
-    )
-
-    actual_result = event_loop.run_until_complete(
-        swhids_discovery([], aiosession, "http://example.org/api/")
-    )
-
-    assert correct_api_response == actual_result
-
-
-def test_scanner_raise_apierror(mock_aioresponse, event_loop, aiosession):
-    mock_aioresponse.post(aio_url, content_type="application/json", status=413)
-
-    with pytest.raises(APIError):
-        event_loop.run_until_complete(
-            swhids_discovery([], aiosession, "http://example.org/api/")
-        )
-
 
 def test_scanner_directory_priority_has_contents(source_tree):
     nodes_data = MerkleNodeInfo()
     policy = DirectoryPriority(source_tree, nodes_data)
     assert policy.has_contents(source_tree[b"/bar/barfoo"])
 
 
 def get_backend_swhids_order(tmp_requests):
     with open(tmp_requests, "r") as f:
         backend_swhids_order = f.readlines()
 
     return [x.strip() for x in backend_swhids_order]
 
 
 def test_lazybfs_policy(
     live_server, aiosession, event_loop, source_tree_policy, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
     policy = LazyBFS(source_tree_policy, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     assert (
         backend_swhids_requests[0]
         == "swh:1:dir:fe8cd7076bef324eb8865f818ef08617879022ce"
     )
 
     # the second request must contain 3 SWHIDs related to directories and one content
     dir_count, cnt_count = 0, 0
     for swhid in backend_swhids_requests[1:5]:
         if CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY:
             dir_count += 1
         else:
             cnt_count += 1
 
     assert dir_count == 3
     assert cnt_count == 1
 
     # the last swhid must be a content related to the unknown directory
     # "sample-folder-policy/toexclude"
     assert (
         backend_swhids_requests[5]
         == "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119"
     )
 
 
 def test_directory_priority_policy(
     live_server, aiosession, event_loop, source_tree_policy, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
     policy = DirectoryPriority(source_tree_policy, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     for swhid in backend_swhids_requests[0:4]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY
 
     for swhid in backend_swhids_requests[5:]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.CONTENT
 
 
 def test_file_priority_policy(
     live_server, aiosession, event_loop, source_tree_policy, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
     policy = FilePriority(source_tree_policy, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     for swhid in backend_swhids_requests[0:4]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.CONTENT
 
     for swhid in backend_swhids_requests[5:]:
         assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY
 
 
 def test_greedy_bfs_policy(
     live_server, event_loop, aiosession, big_source_tree, tmp_requests
 ):
     open(tmp_requests, "w").close()
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(big_source_tree, nodes_data, {"known"})
     policy = GreedyBFS(big_source_tree, nodes_data)
-    event_loop.run_until_complete(policy.run(aiosession, api_url))
+    client = Client(api_url, aiosession)
+    event_loop.run_until_complete(policy.run(client))
 
     backend_swhids_requests = get_backend_swhids_order(tmp_requests)
 
     last_swhid = backend_swhids_requests[-1]
     assert CoreSWHID.from_string(last_swhid).object_type == ObjectType.CONTENT
 
 
 @pytest.mark.asyncio
 async def test_greedy_bfs_get_nodes_chunks(live_server, aiosession, big_source_tree):
     api_url = url_for("index", _external=True)
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(big_source_tree, nodes_data, {"known"})
     policy = GreedyBFS(big_source_tree, nodes_data)
+    client = Client(api_url, aiosession)
     chunks = [
         n_chunk
         async for n_chunk in policy.get_nodes_chunks(
-            aiosession, api_url, source_size(big_source_tree)
+            client, source_size(big_source_tree)
         )
     ]
     assert len(chunks) == 2
     assert chunks[1][-1].object_type == CONTENT
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
index 903ed0b..42ae86c 100644
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -1,84 +1,96 @@
 # Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from flask import url_for
 import pytest
 
-from swh.scanner.data import MerkleNodeInfo
+from swh.scanner.data import MerkleNodeInfo, init_merkle_node_info
 from swh.scanner.policy import DirectoryPriority, FilePriority, LazyBFS, QueryAll
 from swh.scanner.scanner import get_policy_obj, run
 
 from .data import unknown_swhids
 
 
 @pytest.mark.options(debug=False)
 def test_app(app):
     assert not app.debug
 
 
 def test_get_policy_obj_auto(source_tree, nodes_data):
     assert isinstance(get_policy_obj(source_tree, nodes_data, "auto"), QueryAll)
 
 
 def test_get_policy_obj_bfs(big_source_tree, nodes_data):
     # check that the policy object is the LazyBFS if the source tree contains more than
     # 1000 nodes
     assert isinstance(get_policy_obj(big_source_tree, nodes_data, "auto"), LazyBFS)
 
 
 def test_scanner_result_bfs(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = LazyBFS(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True
 
 
 def test_scanner_result_file_priority(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = FilePriority(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True
 
 
 def test_scanner_result_directory_priority(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = DirectoryPriority(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True
 
 
 def test_scanner_result_query_all(live_server, event_loop, source_tree):
     api_url = url_for("index", _external=True)
     config = {"web-api": {"url": api_url, "auth-token": None}}
 
     nodes_data = MerkleNodeInfo()
+    init_merkle_node_info(source_tree, nodes_data, {"known"})
     policy = QueryAll(source_tree, nodes_data)
-    event_loop.run_until_complete(run(config, policy))
+    event_loop.run_until_complete(
+        run(config, policy, source_tree, nodes_data, {"known"})
+    )
     for node in source_tree.iter_tree():
         if str(node.swhid()) in unknown_swhids:
             assert nodes_data[node.swhid()]["known"] is False
         else:
             assert nodes_data[node.swhid()]["known"] is True