diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -2,118 +2,89 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import random
-import gzip
-import json
-import requests
-from pathlib import Path
-from collections import defaultdict
-
-from .models import GNUModel
+import logging
 
 from swh.scheduler import utils
 from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.gnu.models import GNUModel
+from swh.lister.gnu.tree import GNUTree
+
+
+logger = logging.getLogger(__name__)
+
 
 class GNULister(SimpleLister):
     MODEL = GNUModel
     LISTER_NAME = 'gnu'
-    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
-    BASE_URL = 'https://ftp.gnu.org'
     instance = 'gnu'
-    tarballs = defaultdict(dict)  # Dict of key with project name value the
-    # associated is list of tarballs of package to ingest from the gnu mirror
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
 
     def task_dict(self, origin_type, origin_url, **kwargs):
-        """
-        Return task format dict
+        """Return task format dict
 
         This is overridden from the lister_base as more information is
         needed for the ingestion task creation.
+
+        This creates tasks with args and kwargs set, for example:
+
+        .. code-block:: python
+
+            args: ['https://ftp.gnu.org/gnu/3dldf/']
+            kwargs: {
+                'tarballs': [{
+                    'archive': 'https://...',
+                    'time': 1071002600,
+                    'length': 128},
+                    ...
+                ]}
+
         """
+        tarballs = self.gnu_tree.artifacts[origin_url]
         return utils.create_task_dict(
             'load-%s' % origin_type,
             kwargs.get('policy', 'oneshot'),
-            kwargs.get('name'),
             origin_url,
-            tarballs=self.tarballs[kwargs.get('name')])
+            tarballs=tarballs)
 
     def safely_issue_request(self, identifier):
-        '''
-        Download and unzip tree.json.gz file and returns its content
-        in JSON format
-
-        File content in dictionary format
+        """Bypass this implementation; querying the gnu mirror is now
+        delegated to the GNUTree instance.
 
-        Args:
-            identifier: resource identifier (unused)
+        As an implementation detail, we cannot simply change the base
+        SimpleLister as other implementations still use it. This will be
+        part of another refactoring pass.
 
-        Returns:
-            Server response
-
-        '''
-        response = requests.get(self.TREE_URL,
-                                allow_redirects=True)
-        uncompressed_content = gzip.decompress(response.content)
-        return json.loads(uncompressed_content.decode('utf-8'))
+        """
+        return None
 
     def list_packages(self, response):
-        """
-        List the actual gnu origins with their names,url and the list
-        of all the tarball for a package from the response.
+        """List the actual gnu origins (package names) with their name, url
+        and last modification time.
 
         Args:
-            response : File structure of the website
-            in dictionary format
+            response: Unused
 
         Returns:
-            A list of all the packages with their names, url of their root
-            directory and the tarballs present for the particular package.
-            [
-                {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
-                 'tarballs':
-                    [
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
-                         'date': '1071002600'},
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
-                         'date': '1071078759'}}
-                    ]
-                },
-                {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
-                 'tarballs':
-                    [
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-                         'date': '1461357336'},
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
-                         'date': '1480991830'}
-                    ]
-            ]
+            List of packages, each with name, url and last modification time
+
+            .. code-block:: python
+
+                [
+                    {'name': '3dldf',
+                     'url': 'https://ftp.gnu.org/gnu/3dldf/',
+                     'time_modified': 1071002600},
+                    {'name': '8sync',
+                     'url': 'https://ftp.gnu.org/gnu/8sync/',
+                     'time_modified': 1480991830},
+                    ...
+                ]
+
         """
-        response = filter_directories(response)
-        packages = []
-        for directory in response:
-            content = directory['contents']
-            for repo in content:
-                if repo['type'] == 'directory':
-                    package_url = '%s/%s/%s/' % (self.BASE_URL,
-                                                 directory['name'],
-                                                 repo['name'])
-                    package_tarballs = find_tarballs(
-                        repo['contents'], package_url)
-                    if package_tarballs != []:
-                        repo_details = {
-                            'name': repo['name'],
-                            'url': package_url,
-                            'time_modified': repo['time'],
-                        }
-                        self.tarballs[repo['name']] = package_tarballs
-                        packages.append(repo_details)
-        random.shuffle(packages)
-        return packages
+        return list(self.gnu_tree.projects.values())
 
     def get_model_from_repo(self, repo):
         """Transform from repository representation to model
@@ -128,89 +99,3 @@
             'time_last_updated': int(repo['time_modified']),
             'origin_type': 'tar',
         }
-
-
-def find_tarballs(package_file_structure, url):
-    '''Recursively lists tarballs present in the folder and subfolders for a
-    particular package url.
-
-    Args
-        package_file_structure: File structure of the package root directory
-        url: URL of the corresponding package
-
-    Returns
-        List of tarball urls and their associated metadata (time, length).
-        For example:
-
-        [
-            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
-             'time': 1071002600,
-             'length': 543},
-            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
-             'time': 1071078759,
-             'length': 456},
-            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
-             'time': 1074278633,
-             'length': 251},
-            ...
-        ]
-
-    '''
-    tarballs = []
-    for single_file in package_file_structure:
-        filetype = single_file['type']
-        filename = single_file['name']
-        if filetype == 'file':
-            if file_extension_check(filename):
-                tarballs.append({
-                    'archive': url + filename,
-                    'time': int(single_file['time']),
-                    'length': int(single_file['size']),
-                })
-        # It will recursively check for tarballs in all sub-folders
-        elif filetype == 'directory':
-            tarballs_in_dir = find_tarballs(
-                single_file['contents'],
-                url + filename + '/')
-            tarballs.extend(tarballs_in_dir)
-
-    return tarballs
-
-
-def filter_directories(response):
-    '''
-    Keep only gnu and old-gnu folders from JSON
-    '''
-    final_response = []
-    file_system = response[0]['contents']
-    for directory in file_system:
-        if directory['name'] in ('gnu', 'old-gnu'):
-            final_response.append(directory)
-    return final_response
-
-
-def file_extension_check(file_name):
-    '''
-    Check for the extension of the file, if the file is of zip format of
-    .tar.x format, where x could be anything, then returns true.
-
-    Args:
-        file_name : name of the file for which the extensions is needs to
-        be checked.
-
-    Returns:
-        True or False
-
-    example
-        file_extension_check('abc.zip') will return True
-        file_extension_check('abc.tar.gz') will return True
-        file_extension_check('abc.tar.gz.sig') will return False
-
-    '''
-    file_suffixes = Path(file_name).suffixes
-    if len(file_suffixes) == 1 and file_suffixes[-1] == '.zip':
-        return True
-    elif len(file_suffixes) > 1:
-        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
-            return True
-    return False
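Note on the task format: the envelope wrapping these args/kwargs is built by
swh.scheduler.utils.create_task_dict, which this patch does not show. As a
hedged sketch only, assuming the usual 'type'/'policy'/'arguments' layout and
reusing the docstring's example values, the task built above would look
roughly like:

.. code-block:: python

    # Illustrative sketch -- the exact field layout comes from
    # swh.scheduler.utils.create_task_dict and is assumed here.
    {
        'type': 'load-tar',
        'policy': 'oneshot',
        'arguments': {
            'args': ['https://ftp.gnu.org/gnu/3dldf/'],
            'kwargs': {
                'tarballs': [{'archive': 'https://...',
                              'time': 1071002600,
                              'length': 128}],
            },
        },
    }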
diff --git a/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz b/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@
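The fixture above is a gzipped copy of the mirror's tree.json (binary payload
elided). For orientation, a minimal sketch of its decompressed structure,
inferred from the parsing code in tree.py below; names, times and sizes here
are invented:

.. code-block:: python

    # Assumed shape: a one-element list whose head carries the whole
    # filesystem under 'contents' (cf. load_raw_data(...)[0] below).
    [
        {'name': '.', 'type': 'directory',
         'contents': [
             {'name': 'gnu', 'type': 'directory',
              'contents': [
                  {'name': '3dldf', 'type': 'directory',
                   'time': '1071002600',
                   'contents': [
                       {'name': '3DLDF-1.1.3.tar.gz', 'type': 'file',
                        'time': '1071002600', 'size': '543'},
                   ]},
              ]},
             {'name': 'old-gnu', 'type': 'directory', 'contents': []},
         ]},
    ]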
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tree.py
+# Copyright (C) 2019 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import gzip
+import json
+import logging
+import requests
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+from urllib.parse import urlparse
+
+
+logger = logging.getLogger(__name__)
+
+
+def load_raw_data(url: str) -> List[Dict]:
+    """Load the raw json from the tree.json.gz
+
+    Args:
+        url: tree.json.gz url or local path
+
+    Returns:
+        The raw json list
+
+    """
+    if url.startswith('http://') or url.startswith('https://'):
+        response = requests.get(url, allow_redirects=True)
+        if not response.ok:
+            raise ValueError('Error during query to %s' % url)
+        raw = gzip.decompress(response.content)
+    else:
+        with gzip.open(url, 'r') as f:
+            raw = f.read()
+    raw_data = json.loads(raw.decode('utf-8'))
+    return raw_data
+
+
+class GNUTree:
+    """Gnu Tree's representation
+
+    """
+    def __init__(self, url: str):
+        self.url = url  # filepath or uri
+        u = urlparse(url)
+        self.base_url = '%s://%s' % (u.scheme, u.netloc)
+        # Interesting top level directories
+        self.top_level_directories = ['gnu', 'old-gnu']
+        # internal state
+        self._artifacts = {}  # type: Dict
+        self._projects = {}  # type: Dict
+
+    @property
+    def projects(self) -> Dict:
+        if not self._projects:
+            self._projects, self._artifacts = self._load()
+        return self._projects
+
+    @property
+    def artifacts(self) -> Dict:
+        if not self._artifacts:
+            self._projects, self._artifacts = self._load()
+        return self._artifacts
+
+    def _load(self) -> Tuple[Dict, Dict]:
+        """Compute projects and artifacts per project
+
+        Returns:
+            Tuple of a projects dict (key: project url, value: the
+            associated information) and an artifacts dict (key: project
+            url, value: the list of artifact info dicts)
+
+        """
+        projects = {}
+        artifacts = {}
+
+        raw_data = load_raw_data(self.url)[0]
+        for directory in raw_data['contents']:
+            if directory['name'] not in self.top_level_directories:
+                continue
+            infos = directory['contents']
+            for info in infos:
+                if info['type'] == 'directory':
+                    package_url = '%s/%s/%s/' % (
+                        self.base_url, directory['name'], info['name'])
+                    package_artifacts = find_artifacts(
+                        info['contents'], package_url)
+                    if package_artifacts != []:
+                        repo_details = {
+                            'name': info['name'],
+                            'url': package_url,
+                            'time_modified': info['time'],
+                        }
+                        artifacts[package_url] = package_artifacts
+                        projects[package_url] = repo_details
+
+        return projects, artifacts
+
+
+def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
+    """Recursively list artifacts present in the folder and subfolders for
+    a particular package url.
+
+    Args:
+        filesystem: File structure of the package root directory, as a
+            list of dicts, each describing a file or a directory
+            (keys: name, size, time, type).
+        url: URL of the corresponding package
+
+    Returns:
+        List of tarball urls and their associated metadata (time, length).
+        For example:
+
+        .. code-block:: python
+
+            [
+                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+                 'time': 1071002600,
+                 'length': 543},
+                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+                 'time': 1071078759,
+                 'length': 456},
+                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
+                 'time': 1074278633,
+                 'length': 251},
+                ...
+            ]
+
+    """
+    artifacts = []
+    for info_file in filesystem:
+        filetype = info_file['type']
+        filename = info_file['name']
+        if filetype == 'file':
+            if check_filename_is_archive(filename):
+                artifacts.append({
+                    'archive': url + filename,
+                    'time': int(info_file['time']),
+                    'length': int(info_file['size']),
+                })
+        # Recursively check for artifacts in all sub-folders
+        elif filetype == 'directory':
+            tarballs_in_dir = find_artifacts(
+                info_file['contents'],
+                url + filename + '/')
+            artifacts.extend(tarballs_in_dir)
+
+    return artifacts
+
+
+def check_filename_is_archive(filename: str) -> bool:
+    """Check whether filename is an archive: either a .zip file or a
+    .tar.x file, where x can be any (compression) extension.
+
+    Args:
+        filename: name of the file whose extension needs to be checked
+
+    Returns:
+        Whether filename is an archive or not
+
+    Example:
+
+    >>> check_filename_is_archive('abc.zip')
+    True
+    >>> check_filename_is_archive('abc.tar.gz')
+    True
+    >>> check_filename_is_archive('bac.tar')
+    True
+    >>> check_filename_is_archive('abc.tar.gz.sig')
+    False
+    >>> check_filename_is_archive('foobar.tar.')
+    False
+
+    """
+    file_suffixes = Path(filename).suffixes
+    logger.debug('Path(%s).suffixes: %s', filename, file_suffixes)
+    if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
+        return True
+    elif len(file_suffixes) > 1:
+        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
+            return True
+    return False
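Taken together, a minimal usage sketch of the new module. The project url is
the docstring's 3dldf example and assumes that package is present in the
tree; per load_raw_data above, a local tree.json.gz path would work as well:

.. code-block:: python

    from swh.lister.gnu.tree import GNUTree

    tree = GNUTree('https://ftp.gnu.org/tree.json.gz')

    # The first access to either property triggers the single _load() pass;
    # both dicts are keyed by the project url.
    project = tree.projects['https://ftp.gnu.org/gnu/3dldf/']
    print(project['name'], project['time_modified'])

    for artifact in tree.artifacts['https://ftp.gnu.org/gnu/3dldf/']:
        print(artifact['archive'], artifact['time'], artifact['length'])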