Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066409
D2076.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
23 KB
Subscribers
None
D2076.diff
View Options
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -2,118 +2,89 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import random
-import gzip
-import json
-import requests
-from pathlib import Path
-from collections import defaultdict
-
-from .models import GNUModel
+import logging
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.gnu.models import GNUModel
+from swh.lister.gnu.tree import GNUTree
+
+
+logger = logging.getLogger(__name__)
+
class GNULister(SimpleLister):
MODEL = GNUModel
LISTER_NAME = 'gnu'
- TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
- BASE_URL = 'https://ftp.gnu.org'
instance = 'gnu'
- tarballs = defaultdict(dict) # Dict of key with project name value the
- # associated is list of tarballs of package to ingest from the gnu mirror
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
def task_dict(self, origin_type, origin_url, **kwargs):
- """
- Return task format dict
+ """Return task format dict
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
+
+ This creates tasks with args and kwargs set, for example:
+
+ .. code-block:: python
+
+ args: ['https://ftp.gnu.org/gnu/3dldf/']
+ kwargs: {
+ 'tarballs': [{
+ 'archive': 'https://...',
+ 'time': 1071002600,
+ 'length': 128},
+ ...
+ ]}
+
"""
+ tarballs = self.gnu_tree.artifacts[origin_url]
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'oneshot'),
- kwargs.get('name'),
origin_url,
- tarballs=self.tarballs[kwargs.get('name')])
+ tarballs=tarballs)
def safely_issue_request(self, identifier):
- '''
- Download and unzip tree.json.gz file and returns its content
- in JSON format
+ """Bypass the implementation. It's now the GNUTree which deals with
+ querying the gnu mirror.
- File content in dictionary format
+        As an implementation detail, we cannot simply change the base
+        SimpleLister as other implementations still use it. This shall be
+        part of another refactoring pass.
- Args:
- identifier: resource identifier (unused)
-
- Returns:
- Server response
-
- '''
- response = requests.get(self.TREE_URL,
- allow_redirects=True)
- uncompressed_content = gzip.decompress(response.content)
- return json.loads(uncompressed_content.decode('utf-8'))
+ """
+ return None
def list_packages(self, response):
- """
- List the actual gnu origins with their names,url and the list
- of all the tarball for a package from the response.
+ """List the actual gnu origins (package name) with their name, url and
+ associated tarballs.
Args:
- response : File structure of the website
- in dictionary format
+ response: Unused
Returns:
- A list of all the packages with their names, url of their root
- directory and the tarballs present for the particular package.
- [
- {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
- 'tarballs':
- [
- {'archive':
- 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
- 'date': '1071002600'},
- {'archive':
- 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
- 'date': '1071078759'}}
- ]
- },
- {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
- 'tarballs':
- [
- {'archive':
- 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'date': '1461357336'},
- {'archive':
- 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
- 'date': '1480991830'}
- ]
- ]
+ List of packages name, url, last modification time
+
+ .. code-block:: python
+
+ [
+ {'name': '3dldf',
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/',
+ 'time_modified': 1071002600},
+ {'name': '8sync',
+ 'url': 'https://ftp.gnu.org/gnu/8sync/',
+ 'time_modified': 1480991830},
+ ...
+ ]
+
"""
- response = filter_directories(response)
- packages = []
- for directory in response:
- content = directory['contents']
- for repo in content:
- if repo['type'] == 'directory':
- package_url = '%s/%s/%s/' % (self.BASE_URL,
- directory['name'],
- repo['name'])
- package_tarballs = find_tarballs(
- repo['contents'], package_url)
- if package_tarballs != []:
- repo_details = {
- 'name': repo['name'],
- 'url': package_url,
- 'time_modified': repo['time'],
- }
- self.tarballs[repo['name']] = package_tarballs
- packages.append(repo_details)
- random.shuffle(packages)
- return packages
+ return list(self.gnu_tree.projects.values())
def get_model_from_repo(self, repo):
"""Transform from repository representation to model
@@ -128,89 +99,3 @@
'time_last_updated': int(repo['time_modified']),
'origin_type': 'tar',
}
-
-
-def find_tarballs(package_file_structure, url):
- '''Recursively lists tarballs present in the folder and subfolders for a
- particular package url.
-
- Args
- package_file_structure: File structure of the package root directory
- url: URL of the corresponding package
-
- Returns
- List of tarball urls and their associated metadata (time, length).
- For example:
-
- [
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
- 'time': 1071002600,
- 'length': 543},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
- 'time': 1071078759,
- 'length': 456},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
- 'time': 1074278633,
- 'length': 251},
- ...
- ]
-
- '''
- tarballs = []
- for single_file in package_file_structure:
- filetype = single_file['type']
- filename = single_file['name']
- if filetype == 'file':
- if file_extension_check(filename):
- tarballs.append({
- 'archive': url + filename,
- 'time': int(single_file['time']),
- 'length': int(single_file['size']),
- })
- # It will recursively check for tarballs in all sub-folders
- elif filetype == 'directory':
- tarballs_in_dir = find_tarballs(
- single_file['contents'],
- url + filename + '/')
- tarballs.extend(tarballs_in_dir)
-
- return tarballs
-
-
-def filter_directories(response):
- '''
- Keep only gnu and old-gnu folders from JSON
- '''
- final_response = []
- file_system = response[0]['contents']
- for directory in file_system:
- if directory['name'] in ('gnu', 'old-gnu'):
- final_response.append(directory)
- return final_response
-
-
-def file_extension_check(file_name):
- '''
- Check for the extension of the file, if the file is of zip format of
- .tar.x format, where x could be anything, then returns true.
-
- Args:
- file_name : name of the file for which the extensions is needs to
- be checked.
-
- Returns:
- True or False
-
- example
- file_extension_check('abc.zip') will return True
- file_extension_check('abc.tar.gz') will return True
- file_extension_check('abc.tar.gz.sig') will return False
-
- '''
- file_suffixes = Path(file_name).suffixes
- if len(file_suffixes) == 1 and file_suffixes[-1] == '.zip':
- return True
- elif len(file_suffixes) > 1:
- if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
- return True
- return False
diff --git a/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz b/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py
--- a/swh/lister/gnu/tests/test_lister.py
+++ b/swh/lister/gnu/tests/test_lister.py
@@ -1,59 +1,41 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
-
-from swh.lister.gnu.lister import find_tarballs, filter_directories
-from swh.lister.gnu.lister import file_extension_check
-
-
-def test_filter_directories():
- f = open('swh/lister/gnu/tests/api_response.json')
- api_response = json.load(f)
- cleared_api_response = filter_directories(api_response)
- for directory in cleared_api_response:
- if directory['name'] not in ('gnu', 'old-gnu'):
- assert False
-
-
-def test_find_tarballs_small_sample():
- expected_tarballs = [
- {
- 'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
- 'time': 1495205979,
- 'length': 424081,
- },
- {
- 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
- 'time': 898422900,
- 'length': 1514448
- },
- {
- 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
- 'time': 869814000,
- 'length': 450164,
- },
- {
- 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
- 'time': 898422900,
- 'length': 514951,
- },
- ]
-
- file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json'))
- actual_tarballs = find_tarballs(file_structure, '/root/')
- assert actual_tarballs == expected_tarballs
-
-
-def test_find_tarballs():
- file_structure = json.load(open('swh/lister/gnu/tests/tree.json'))
- actual_tarballs = find_tarballs(file_structure, '/root/')
- assert len(actual_tarballs) == 42 + 3 # tar + zip
-
-
-def test_file_extension_check():
- assert file_extension_check('abc.xy.zip')
- assert file_extension_check('cvb.zip')
- assert file_extension_check('abc.tar.bz2')
- assert file_extension_check('abc') is False
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
+ lister = swh_listers['gnu']
+
+ lister.run()
+
+ r = lister.scheduler.search_tasks(task_type='load-tar')
+ assert len(r) == 383
+
+ for row in r:
+ assert row['type'] == 'load-tar'
+ # arguments check
+ args = row['arguments']['args']
+ assert len(args) == 1
+
+ url = args[0]
+ assert url.startswith('https://ftp.gnu.org')
+
+ url_suffix = url.split('https://ftp.gnu.org')[1]
+ assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
+
+ # kwargs
+ kwargs = row['arguments']['kwargs']
+ assert list(kwargs.keys()) == ['tarballs']
+
+ tarballs = kwargs['tarballs']
+ # check the tarball's structure
+ tarball = tarballs[0]
+ assert set(tarball.keys()) == set(['archive', 'length', 'time'])
+
+ assert row['policy'] == 'oneshot'
diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tests/test_tree.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+
+import pytest
+
+from os import path
+from swh.lister.gnu.tree import (
+ GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
+)
+
+
+def test_load_raw_data_from_query(requests_mock_datadir):
+ actual_json = load_raw_data('https://ftp.gnu.org/tree.json.gz')
+ assert actual_json is not None
+ assert isinstance(actual_json, list)
+ assert len(actual_json) == 2
+
+
+def test_load_raw_data_from_query_failure(requests_mock_datadir):
+ inexistant_url = 'https://ftp2.gnu.org/tree.unknown.gz'
+ with pytest.raises(ValueError, match='Error during query'):
+ load_raw_data(inexistant_url)
+
+
+def test_load_raw_data_from_file(datadir):
+ filepath = path.join(datadir, 'ftp.gnu.org', 'tree.json.gz')
+ actual_json = load_raw_data(filepath)
+ assert actual_json is not None
+ assert isinstance(actual_json, list)
+ assert len(actual_json) == 2
+
+
+def test_load_raw_data_from_file_failure(datadir):
+ unknown_path = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz')
+ with pytest.raises(FileNotFoundError):
+ load_raw_data(unknown_path)
+
+
+def test_tree_json(requests_mock_datadir):
+ tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz')
+
+ assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == {
+ 'name': '8sync',
+ 'time_modified': '1489817408',
+ 'url': 'https://ftp.gnu.org/gnu/8sync/'
+ }
+
+ assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == {
+ 'name': '3dldf',
+ 'time_modified': '1386961236',
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/'
+ }
+
+ assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == {
+ 'name': 'a2ps',
+ 'time_modified': '1198900505',
+ 'url': 'https://ftp.gnu.org/gnu/a2ps/'
+ }
+
+ assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == {
+ 'name': 'xshogi',
+ 'time_modified': '1059822922',
+ 'url': 'https://ftp.gnu.org/old-gnu/xshogi/'
+ }
+
+ assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
+ {
+ 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
+ 'length': 90106,
+ 'time': 857980800
+ },
+ {
+ 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
+ 'length': 89625,
+ 'time': 860396400
+ }
+ ]
+
+
+def test_tree_json_failures(requests_mock_datadir):
+ url = 'https://unknown/tree.json.gz'
+ tree_json = GNUTree(url)
+
+ with pytest.raises(ValueError, match='Error during query to %s' % url):
+ tree_json.artifacts['https://ftp.gnu.org/gnu/3dldf/']
+
+ with pytest.raises(ValueError, match='Error during query to %s' % url):
+ tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/']
+
+
+def test_find_artifacts_small_sample():
+ expected_tarballs = [
+ {
+ 'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
+ 'time': 1495205979,
+ 'length': 424081,
+ },
+ {
+ 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
+ 'time': 898422900,
+ 'length': 1514448
+ },
+ {
+ 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
+ 'time': 869814000,
+ 'length': 450164,
+ },
+ {
+ 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
+ 'time': 898422900,
+ 'length': 514951,
+ },
+ ]
+
+ file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json'))
+ actual_tarballs = find_artifacts(file_structure, '/root/')
+ assert actual_tarballs == expected_tarballs
+
+
+def test_find_artifacts():
+ file_structure = json.load(open('swh/lister/gnu/tests/tree.json'))
+ actual_tarballs = find_artifacts(file_structure, '/root/')
+ assert len(actual_tarballs) == 42 + 3 # tar + zip
+
+
+def test_check_filename_is_archive():
+ for ext in ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']:
+ assert check_filename_is_archive(ext) is True
+
+ for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']:
+ assert check_filename_is_archive(ext) is False
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tree.py
@@ -0,0 +1,187 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import gzip
+import json
+import logging
+import requests
+
+from pathlib import Path
+from typing import Dict, Tuple, List
+from urllib.parse import urlparse
+
+
+logger = logging.getLogger(__name__)
+
+
+def load_raw_data(url: str) -> List[Dict]:
+ """Load the raw json from the tree.json.gz
+
+ Args:
+ url: Tree.json.gz url or path
+
+ Returns:
+ The raw json list
+
+ """
+ if url.startswith('http://') or url.startswith('https://'):
+ response = requests.get(url, allow_redirects=True)
+ if not response.ok:
+ raise ValueError('Error during query to %s' % url)
+ raw = gzip.decompress(response.content)
+ else:
+ with gzip.open(url, 'r') as f:
+ raw = f.read()
+ raw_data = json.loads(raw.decode('utf-8'))
+ return raw_data
+
+
+class GNUTree:
+ """Gnu Tree's representation
+
+ """
+ def __init__(self, url: str):
+ self.url = url # filepath or uri
+ u = urlparse(url)
+ self.base_url = '%s://%s' % (u.scheme, u.netloc)
+ # Interesting top level directories
+ self.top_level_directories = ['gnu', 'old-gnu']
+ # internal state
+ self._artifacts = {} # type: Dict
+ self._projects = {} # type: Dict
+
+ @property
+ def projects(self) -> Dict:
+ if not self._projects:
+ self._projects, self._artifacts = self._load()
+ return self._projects
+
+ @property
+ def artifacts(self) -> Dict:
+ if not self._artifacts:
+ self._projects, self._artifacts = self._load()
+ return self._artifacts
+
+ def _load(self) -> Tuple[Dict, Dict]:
+ """Compute projects and artifacts per project
+
+ Returns:
+ Tuple of dict projects (key project url, value the associated
+ information) and a dict artifacts (key project url, value the
+ info_file list)
+
+ """
+ projects = {}
+ artifacts = {}
+
+ raw_data = load_raw_data(self.url)[0]
+ for directory in raw_data['contents']:
+ if directory['name'] not in self.top_level_directories:
+ continue
+ infos = directory['contents']
+ for info in infos:
+ if info['type'] == 'directory':
+ package_url = '%s/%s/%s/' % (
+ self.base_url, directory['name'], info['name'])
+ package_artifacts = find_artifacts(
+ info['contents'], package_url)
+ if package_artifacts != []:
+ repo_details = {
+ 'name': info['name'],
+ 'url': package_url,
+ 'time_modified': info['time'],
+ }
+ artifacts[package_url] = package_artifacts
+ projects[package_url] = repo_details
+
+ return projects, artifacts
+
+
+def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
+ """Recursively list artifacts present in the folder and subfolders for a
+ particular package url.
+
+ Args:
+
+        filesystem: File structure of the package root directory. This is a
+            list of dicts, each representing either file or directory
+            information (keys: name, size, time, type).
+ url: URL of the corresponding package
+
+ Returns
+ List of tarball urls and their associated metadata (time, length).
+ For example:
+
+ .. code-block:: python
+
+ [
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+ 'time': 1071002600,
+ 'length': 543},
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+ 'time': 1071078759,
+ 'length': 456},
+ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
+ 'time': 1074278633,
+ 'length': 251},
+ ...
+ ]
+
+ """
+ artifacts = []
+ for info_file in filesystem:
+ filetype = info_file['type']
+ filename = info_file['name']
+ if filetype == 'file':
+ if check_filename_is_archive(filename):
+ artifacts.append({
+ 'archive': url + filename,
+ 'time': int(info_file['time']),
+ 'length': int(info_file['size']),
+ })
+ # It will recursively check for artifacts in all sub-folders
+ elif filetype == 'directory':
+ tarballs_in_dir = find_artifacts(
+ info_file['contents'],
+ url + filename + '/')
+ artifacts.extend(tarballs_in_dir)
+
+ return artifacts
+
+
+def check_filename_is_archive(filename: str) -> bool:
+ """
+    Check the file's extension: if the file is in zip format or in
+    .tar.x format, where x could be anything, then return True.
+
+ Args:
+        filename: name of the file whose extension needs to
+        be checked.
+
+ Returns:
+ Whether filename is an archive or not
+
+ Example:
+
+ >>> check_filename_is_archive('abc.zip')
+ True
+ >>> check_filename_is_archive('abc.tar.gz')
+ True
+ >>> check_filename_is_archive('bac.tar')
+ True
+ >>> check_filename_is_archive('abc.tar.gz.sig')
+ False
+ >>> check_filename_is_archive('foobar.tar.')
+ False
+
+ """
+ file_suffixes = Path(filename).suffixes
+ logger.debug('Path(%s).suffixed: %s' % (filename, file_suffixes))
+ if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
+ return True
+ elif len(file_suffixes) > 1:
+ if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
+ return True
+ return False
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 8:04 AM (8 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218338
Attached To
D2076: gnu: Separate listing and parsing logic and add integration test
Event Timeline
Log In to Comment