D2076.diff

diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -2,118 +2,89 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import random
-import gzip
-import json
-import requests
-from pathlib import Path
-from collections import defaultdict
-
-from .models import GNUModel
+import logging
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.gnu.models import GNUModel
+from swh.lister.gnu.tree import GNUTree
+
+
+logger = logging.getLogger(__name__)
+
class GNULister(SimpleLister):
MODEL = GNUModel
LISTER_NAME = 'gnu'
- TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
- BASE_URL = 'https://ftp.gnu.org'
instance = 'gnu'
- tarballs = defaultdict(dict) # Dict of key with project name value the
- # associated is list of tarballs of package to ingest from the gnu mirror
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
def task_dict(self, origin_type, origin_url, **kwargs):
- """
- Return task format dict
+ """Return task format dict
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
+
+ This creates tasks with args and kwargs set, for example:
+
+ .. code-block:: python
+
+ args: ['https://ftp.gnu.org/gnu/3dldf/']
+ kwargs: {
+ 'tarballs': [{
+ 'archive': 'https://...',
+ 'time': 1071002600,
+ 'length': 128},
+ ...
+ ]}
+
"""
+ tarballs = self.gnu_tree.artifacts[origin_url]
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'oneshot'),
- kwargs.get('name'),
origin_url,
- tarballs=self.tarballs[kwargs.get('name')])
+ tarballs=tarballs)
def safely_issue_request(self, identifier):
- '''
- Download and unzip tree.json.gz file and returns its content
- in JSON format
+ """Bypass the implementation. It's now the GNUTree which deals with
+ querying the gnu mirror.
- File content in dictionary format
+ As an implementation detail, we cannot simply change the base
+ SimpleLister as other implementations still use it. This will be part
+ of another refactoring pass.
- Args:
- identifier: resource identifier (unused)
-
- Returns:
- Server response
-
- '''
- response = requests.get(self.TREE_URL,
- allow_redirects=True)
- uncompressed_content = gzip.decompress(response.content)
- return json.loads(uncompressed_content.decode('utf-8'))
+ """
+ return None
def list_packages(self, response):
- """
- List the actual gnu origins with their names,url and the list
- of all the tarball for a package from the response.
+ """List the actual gnu origins (package name) with their name, url and
+ associated tarballs.
Args:
- response : File structure of the website
- in dictionary format
+ response: Unused
Returns:
- A list of all the packages with their names, url of their root
- directory and the tarballs present for the particular package.
- [
- {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
- 'tarballs':
- [
- {'archive':
- 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
- 'date': '1071002600'},
- {'archive':
- 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
- 'date': '1071078759'}}
- ]
- },
- {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
- 'tarballs':
- [
- {'archive':
- 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'date': '1461357336'},
- {'archive':
- 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
- 'date': '1480991830'}
- ]
- ]
+ List of package names, urls and last modification times
+
+ .. code-block:: python
+
+ [
+ {'name': '3dldf',
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/',
+ 'time_modified': 1071002600},
+ {'name': '8sync',
+ 'url': 'https://ftp.gnu.org/gnu/8sync/',
+ 'time_modified': 1480991830},
+ ...
+ ]
+
"""
- response = filter_directories(response)
- packages = []
- for directory in response:
- content = directory['contents']
- for repo in content:
- if repo['type'] == 'directory':
- package_url = '%s/%s/%s/' % (self.BASE_URL,
- directory['name'],
- repo['name'])
- package_tarballs = find_tarballs(
- repo['contents'], package_url)
- if package_tarballs != []:
- repo_details = {
- 'name': repo['name'],
- 'url': package_url,
- 'time_modified': repo['time'],
- }
- self.tarballs[repo['name']] = package_tarballs
- packages.append(repo_details)
- random.shuffle(packages)
- return packages
+ return list(self.gnu_tree.projects.values())
def get_model_from_repo(self, repo):
"""Transform from repository representation to model
@@ -128,89 +99,3 @@
'time_last_updated': int(repo['time_modified']),
'origin_type': 'tar',
}
-
-
-def find_tarballs(package_file_structure, url):
- '''Recursively lists tarballs present in the folder and subfolders for a
- particular package url.
-
- Args
- package_file_structure: File structure of the package root directory
- url: URL of the corresponding package
-
- Returns
- List of tarball urls and their associated metadata (time, length).
- For example:
-
- [
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
- 'time': 1071002600,
- 'length': 543},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
- 'time': 1071078759,
- 'length': 456},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
- 'time': 1074278633,
- 'length': 251},
- ...
- ]
-
- '''
- tarballs = []
- for single_file in package_file_structure:
- filetype = single_file['type']
- filename = single_file['name']
- if filetype == 'file':
- if file_extension_check(filename):
- tarballs.append({
- 'archive': url + filename,
- 'time': int(single_file['time']),
- 'length': int(single_file['size']),
- })
- # It will recursively check for tarballs in all sub-folders
- elif filetype == 'directory':
- tarballs_in_dir = find_tarballs(
- single_file['contents'],
- url + filename + '/')
- tarballs.extend(tarballs_in_dir)
-
- return tarballs
-
-
-def filter_directories(response):
- '''
- Keep only gnu and old-gnu folders from JSON
- '''
- final_response = []
- file_system = response[0]['contents']
- for directory in file_system:
- if directory['name'] in ('gnu', 'old-gnu'):
- final_response.append(directory)
- return final_response
-
-
-def file_extension_check(file_name):
- '''
- Check for the extension of the file, if the file is of zip format of
- .tar.x format, where x could be anything, then returns true.
-
- Args:
- file_name : name of the file for which the extensions is needs to
- be checked.
-
- Returns:
- True or False
-
- example
- file_extension_check('abc.zip') will return True
- file_extension_check('abc.tar.gz') will return True
- file_extension_check('abc.tar.gz.sig') will return False
-
- '''
- file_suffixes = Path(file_name).suffixes
- if len(file_suffixes) == 1 and file_suffixes[-1] == '.zip':
- return True
- elif len(file_suffixes) > 1:
- if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
- return True
- return False
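
For context, a minimal sketch (not part of the patch) of how the reworked lister assembles a task once GNUTree has parsed the mirror's tree.json.gz; the origin url is the example from the docstring above and the call mirrors what task_dict now does:

# Sketch only: GNUTree fetches and parses https://ftp.gnu.org/tree.json.gz
# lazily, on first access to .projects or .artifacts.
from swh.lister.gnu.tree import GNUTree
from swh.scheduler import utils

tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
origin_url = 'https://ftp.gnu.org/gnu/3dldf/'  # example origin from the docstring
task = utils.create_task_dict(
    'load-tar', 'oneshot', origin_url,
    tarballs=tree.artifacts[origin_url])
# task['arguments']['args']   -> ['https://ftp.gnu.org/gnu/3dldf/']
# task['arguments']['kwargs'] -> {'tarballs': [{'archive': ..., 'time': ..., 'length': ...}, ...]}
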
diff --git a/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz b/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py
--- a/swh/lister/gnu/tests/test_lister.py
+++ b/swh/lister/gnu/tests/test_lister.py
@@ -1,59 +1,41 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
-
-from swh.lister.gnu.lister import find_tarballs, filter_directories
-from swh.lister.gnu.lister import file_extension_check
-
-
-def test_filter_directories():
- f = open('swh/lister/gnu/tests/api_response.json')
- api_response = json.load(f)
- cleared_api_response = filter_directories(api_response)
- for directory in cleared_api_response:
- if directory['name'] not in ('gnu', 'old-gnu'):
- assert False
-
-
-def test_find_tarballs_small_sample():
- expected_tarballs = [
- {
- 'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
- 'time': 1495205979,
- 'length': 424081,
- },
- {
- 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
- 'time': 898422900,
- 'length': 1514448
- },
- {
- 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
- 'time': 869814000,
- 'length': 450164,
- },
- {
- 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
- 'time': 898422900,
- 'length': 514951,
- },
- ]
-
- file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json'))
- actual_tarballs = find_tarballs(file_structure, '/root/')
- assert actual_tarballs == expected_tarballs
-
-
-def test_find_tarballs():
- file_structure = json.load(open('swh/lister/gnu/tests/tree.json'))
- actual_tarballs = find_tarballs(file_structure, '/root/')
- assert len(actual_tarballs) == 42 + 3 # tar + zip
-
-
-def test_file_extension_check():
- assert file_extension_check('abc.xy.zip')
- assert file_extension_check('cvb.zip')
- assert file_extension_check('abc.tar.bz2')
- assert file_extension_check('abc') is False
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
+ lister = swh_listers['gnu']
+
+ lister.run()
+
+ r = lister.scheduler.search_tasks(task_type='load-tar')
+ assert len(r) == 383
+
+ for row in r:
+ assert row['type'] == 'load-tar'
+ # arguments check
+ args = row['arguments']['args']
+ assert len(args) == 1
+
+ url = args[0]
+ assert url.startswith('https://ftp.gnu.org')
+
+ url_suffix = url.split('https://ftp.gnu.org')[1]
+ assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
+
+ # kwargs
+ kwargs = row['arguments']['kwargs']
+ assert list(kwargs.keys()) == ['tarballs']
+
+ tarballs = kwargs['tarballs']
+ # check the tarball's structure
+ tarball = tarballs[0]
+ assert set(tarball.keys()) == set(['archive', 'length', 'time'])
+
+ assert row['policy'] == 'oneshot'
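
Put differently, each scheduler row checked above has roughly the following shape (an illustration only; the concrete archive, time and length values are taken from docstring examples elsewhere in this diff, not from the fixture data):

row = {
    'type': 'load-tar',
    'policy': 'oneshot',
    'arguments': {
        'args': ['https://ftp.gnu.org/gnu/3dldf/'],   # exactly one origin url
        'kwargs': {
            'tarballs': [
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
                 'time': 1071002600,
                 'length': 543},
            ],
        },
    },
}
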
diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/test_tree.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+
+import pytest
+
+from os import path
+from swh.lister.gnu.tree import (
+ GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
+)
+
+
+def test_load_raw_data_from_query(requests_mock_datadir):
+ actual_json = load_raw_data('https://ftp.gnu.org/tree.json.gz')
+ assert actual_json is not None
+ assert isinstance(actual_json, list)
+ assert len(actual_json) == 2
+
+
+def test_load_raw_data_from_query_failure(requests_mock_datadir):
+ nonexistent_url = 'https://ftp2.gnu.org/tree.unknown.gz'
+ with pytest.raises(ValueError, match='Error during query'):
+ load_raw_data(nonexistent_url)
+
+
+def test_load_raw_data_from_file(datadir):
+ filepath = path.join(datadir, 'ftp.gnu.org', 'tree.json.gz')
+ actual_json = load_raw_data(filepath)
+ assert actual_json is not None
+ assert isinstance(actual_json, list)
+ assert len(actual_json) == 2
+
+
+def test_load_raw_data_from_file_failure(datadir):
+ unknown_path = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz')
+ with pytest.raises(FileNotFoundError):
+ load_raw_data(unknown_path)
+
+
+def test_tree_json(requests_mock_datadir):
+ tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz')
+
+ assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == {
+ 'name': '8sync',
+ 'time_modified': '1489817408',
+ 'url': 'https://ftp.gnu.org/gnu/8sync/'
+ }
+
+ assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == {
+ 'name': '3dldf',
+ 'time_modified': '1386961236',
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/'
+ }
+
+ assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == {
+ 'name': 'a2ps',
+ 'time_modified': '1198900505',
+ 'url': 'https://ftp.gnu.org/gnu/a2ps/'
+ }
+
+ assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == {
+ 'name': 'xshogi',
+ 'time_modified': '1059822922',
+ 'url': 'https://ftp.gnu.org/old-gnu/xshogi/'
+ }
+
+ assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
+ {
+ 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
+ 'length': 90106,
+ 'time': 857980800
+ },
+ {
+ 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
+ 'length': 89625,
+ 'time': 860396400
+ }
+ ]
+
+
+def test_tree_json_failures(requests_mock_datadir):
+ url = 'https://unknown/tree.json.gz'
+ tree_json = GNUTree(url)
+
+ with pytest.raises(ValueError, match='Error during query to %s' % url):
+ tree_json.artifacts['https://ftp.gnu.org/gnu/3dldf/']
+
+ with pytest.raises(ValueError, match='Error during query to %s' % url):
+ tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/']
+
+
+def test_find_artifacts_small_sample():
+ expected_tarballs = [
+ {
+ 'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
+ 'time': 1495205979,
+ 'length': 424081,
+ },
+ {
+ 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
+ 'time': 898422900,
+ 'length': 1514448
+ },
+ {
+ 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
+ 'time': 869814000,
+ 'length': 450164,
+ },
+ {
+ 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
+ 'time': 898422900,
+ 'length': 514951,
+ },
+ ]
+
+ file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json'))
+ actual_tarballs = find_artifacts(file_structure, '/root/')
+ assert actual_tarballs == expected_tarballs
+
+
+def test_find_artifacts():
+ file_structure = json.load(open('swh/lister/gnu/tests/tree.json'))
+ actual_tarballs = find_artifacts(file_structure, '/root/')
+ assert len(actual_tarballs) == 42 + 3 # tar + zip
+
+
+def test_check_filename_is_archive():
+ for ext in ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']:
+ assert check_filename_is_archive(ext) is True
+
+ for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']:
+ assert check_filename_is_archive(ext) is False
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tree.py
@@ -0,0 +1,187 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import gzip
+import json
+import logging
+import requests
+
+from pathlib import Path
+from typing import Dict, Tuple, List
+from urllib.parse import urlparse
+
+
+logger = logging.getLogger(__name__)
+
+
+def load_raw_data(url: str) -> List[Dict]:
+ """Load the raw json from the tree.json.gz
+
+ Args:
+ url: Tree.json.gz url or path
+
+ Returns:
+ The raw json list
+
+ """
+ if url.startswith('http://') or url.startswith('https://'):
+ response = requests.get(url, allow_redirects=True)
+ if not response.ok:
+ raise ValueError('Error during query to %s' % url)
+ raw = gzip.decompress(response.content)
+ else:
+ with gzip.open(url, 'r') as f:
+ raw = f.read()
+ raw_data = json.loads(raw.decode('utf-8'))
+ return raw_data
+
+
+class GNUTree:
+ """Gnu Tree's representation
+
+ """
+ def __init__(self, url: str):
+ self.url = url # filepath or uri
+ u = urlparse(url)
+ self.base_url = '%s://%s' % (u.scheme, u.netloc)
+ # Interesting top level directories
+ self.top_level_directories = ['gnu', 'old-gnu']
+ # internal state
+ self._artifacts = {} # type: Dict
+ self._projects = {} # type: Dict
+
+ @property
+ def projects(self) -> Dict:
+ if not self._projects:
+ self._projects, self._artifacts = self._load()
+ return self._projects
+
+ @property
+ def artifacts(self) -> Dict:
+ if not self._artifacts:
+ self._projects, self._artifacts = self._load()
+ return self._artifacts
+
+ def _load(self) -> Tuple[Dict, Dict]:
+ """Compute projects and artifacts per project
+
+ Returns:
+ Tuple of a projects dict (keyed by project url, with the associated
+ project information as value) and an artifacts dict (keyed by
+ project url, with the list of artifact entries as value)
+
+ """
+ projects = {}
+ artifacts = {}
+
+ raw_data = load_raw_data(self.url)[0]
+ for directory in raw_data['contents']:
+ if directory['name'] not in self.top_level_directories:
+ continue
+ infos = directory['contents']
+ for info in infos:
+ if info['type'] == 'directory':
+ package_url = '%s/%s/%s/' % (
+ self.base_url, directory['name'], info['name'])
+ package_artifacts = find_artifacts(
+ info['contents'], package_url)
+ if package_artifacts != []:
+ repo_details = {
+ 'name': info['name'],
+ 'url': package_url,
+ 'time_modified': info['time'],
+ }
+ artifacts[package_url] = package_artifacts
+ projects[package_url] = repo_details
+
+ return projects, artifacts
+
+
+def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
+ """Recursively list artifacts present in the folder and subfolders for a
+ particular package url.
+
+ Args:
+
+ filesystem: File structure of the package root directory, as a
+ list of dicts, each describing a file or directory entry
+ (keys: name, size, time, type).
+ url: URL of the corresponding package
+
+ Returns:
+ List of tarball urls and their associated metadata (time, length).
+ For example:
+
+ .. code-block:: python
+
+ [
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+ 'time': 1071002600,
+ 'length': 543},
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+ 'time': 1071078759,
+ 'length': 456},
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
+ 'time': 1074278633,
+ 'length': 251},
+ ...
+ ]
+
+ """
+ artifacts = []
+ for info_file in filesystem:
+ filetype = info_file['type']
+ filename = info_file['name']
+ if filetype == 'file':
+ if check_filename_is_archive(filename):
+ artifacts.append({
+ 'archive': url + filename,
+ 'time': int(info_file['time']),
+ 'length': int(info_file['size']),
+ })
+ # Recursively check for artifacts in all sub-folders
+ elif filetype == 'directory':
+ tarballs_in_dir = find_artifacts(
+ info_file['contents'],
+ url + filename + '/')
+ artifacts.extend(tarballs_in_dir)
+
+ return artifacts
+
+
+def check_filename_is_archive(filename: str) -> bool:
+ """
+ Check the file extension: return True if the file is a .zip, a .tar,
+ or a .tar.x archive, where x can be any compression suffix.
+
+ Args:
+ filename: name of the file whose extension needs to be
+ checked.
+
+ Returns:
+ Whether filename is an archive or not
+
+ Example:
+
+ >>> check_filename_is_archive('abc.zip')
+ True
+ >>> check_filename_is_archive('abc.tar.gz')
+ True
+ >>> check_filename_is_archive('bac.tar')
+ True
+ >>> check_filename_is_archive('abc.tar.gz.sig')
+ False
+ >>> check_filename_is_archive('foobar.tar.')
+ False
+
+ """
+ file_suffixes = Path(filename).suffixes
+ logger.debug('Path(%s).suffixes: %s' % (filename, file_suffixes))
+ if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
+ return True
+ elif len(file_suffixes) > 1:
+ if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
+ return True
+ return False
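
As a usage sketch (not part of the patch), GNUTree also accepts a local path to a copy of tree.json.gz, which is what the file-based tests above rely on; the path below is hypothetical and the assertions reuse the doctest examples:

from swh.lister.gnu.tree import GNUTree, check_filename_is_archive

# Hypothetical local copy of the mirror's tree.json.gz
tree = GNUTree('/tmp/tree.json.gz')
projects = tree.projects    # first access parses the file via load_raw_data()
artifacts = tree.artifacts  # later accesses reuse the cached result

assert check_filename_is_archive('abc.tar.gz')
assert not check_filename_is_archive('abc.tar.gz.sig')
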
