# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import random
import gzip
import json
import requests

from pathlib import Path
from collections import defaultdict

from .models import GNUModel

from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister


class GNULister(SimpleLister):
    """Lister of the packages found in the ``gnu`` and ``old-gnu`` folders
    of the GNU mirror, based on the mirror's ``tree.json.gz`` listing.

    """
    MODEL = GNUModel
    LISTER_NAME = 'gnu'
    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
    BASE_URL = 'https://ftp.gnu.org'
    instance = 'gnu'
    # Maps a project name to the list of tarballs of that package to ingest
    # from the gnu mirror. It is filled by list_packages and read back by
    # task_dict.
    # NOTE(review): this is a class attribute, hence shared by every
    # instance, and its default factory is `dict` although the stored values
    # are lists -- confirm both are intended.
    tarballs = defaultdict(dict)

    def task_dict(self, origin_type, origin_url, **kwargs):
        """Return task format dict

        This is overridden from the lister_base as more information is
        needed for the ingestion task creation.

        Args:
            origin_type: origin type, used to build the 'load-<type>' task
                type
            origin_url: url of the origin to load
            **kwargs: 'policy' (defaults to 'oneshot') and 'name' (project
                name, also used to look up the tarballs of the project)

        Returns:
            The dict built by swh.scheduler.utils.create_task_dict

        """
        name = kwargs.get('name')
        return utils.create_task_dict(
            'load-%s' % origin_type,
            kwargs.get('policy', 'oneshot'),
            name,
            origin_url,
            # defaultdict: an unknown name yields (and stores) an empty dict
            tarballs=self.tarballs[name])

    def safely_issue_request(self, identifier):
        """Download the tree.json.gz file, uncompress it and parse its
        content as JSON.

        Args:
            identifier: resource identifier (unused)

        Returns:
            The decoded JSON content of the file

        Raises:
            requests.HTTPError: when the server answers with an error status

        """
        response = requests.get(self.TREE_URL, allow_redirects=True)
        # Fail early on 4xx/5xx answers: an error page is not gzip data and
        # would otherwise only surface as an obscure decompression error.
        response.raise_for_status()
        uncompressed_content = gzip.decompress(response.content)
        return json.loads(uncompressed_content.decode('utf-8'))

    def list_packages(self, response):
        """List the actual gnu origins (name, url, last modification time)
        from the response.

        Only the sub-directories of the `gnu` and `old-gnu` folders holding
        at least one tarball are listed. As a side effect, the tarballs
        found for each listed package are stored in `self.tarballs`, keyed
        by package name.

        Args:
            response: File structure of the website in dictionary format

        Returns:
            A list, in random order, of all the packages with their name,
            the url of their root directory and the modification time of
            that directory as found in the listing. For example:

            [
                {'name': '3dldf',
                 'url': 'https://ftp.gnu.org/gnu/3dldf/',
                 'time_modified': '1071002600'},
                {'name': '8sync',
                 'url': 'https://ftp.gnu.org/gnu/8sync/',
                 'time_modified': '1480991830'},
                ...
            ]

        """
        packages = []
        for directory in filter_directories(response):
            for repo in directory['contents']:
                if repo['type'] != 'directory':
                    continue
                package_url = '%s/%s/%s/' % (
                    self.BASE_URL, directory['name'], repo['name'])
                package_tarballs = find_tarballs(
                    repo['contents'], package_url)
                if not package_tarballs:
                    # nothing to ingest for that package
                    continue
                self.tarballs[repo['name']] = package_tarballs
                packages.append({
                    'name': repo['name'],
                    'url': package_url,
                    'time_modified': repo['time'],
                })
        random.shuffle(packages)
        return packages

    def get_model_from_repo(self, repo):
        """Transform from repository representation to model

        Args:
            repo: one package dict as returned by list_packages

        Returns:
            The dict of model fields for that package

        """
        return {
            'uid': repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['url'],
            'origin_url': repo['url'],
            'time_last_updated': int(repo['time_modified']),
            'origin_type': 'tar',
        }

    def transport_response_simplified(self, response):
        """Transform response to list for model manipulation

        """
        return [self.get_model_from_repo(repo) for repo in response]
# Suffixes of detached signature files: such files are never archives, even
# when the name of the file they sign ends with '.tar'.
_SIGNATURE_SUFFIXES = ('.sig', '.asc')


def find_tarballs(package_file_structure, url):
    '''Recursively lists tarballs present in the folder and subfolders for a
    particular package url.

    Args
        package_file_structure: File structure of the package root directory
        url: URL of the corresponding package. It is used as a plain prefix
            of the file names, so it must end with a '/'.

    Returns
        List of tarball urls and their associated metadata (time, length),
        in listing order (the content of a subfolder is listed where that
        subfolder appears). For example:

        [
            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
             'time': 1071002600,
             'length': 543},
            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
             'time': 1071078759,
             'length': 456},
            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
             'time': 1074278633,
             'length': 251},
            ...
        ]

    '''
    tarballs = []
    for entry in package_file_structure:
        entry_type = entry['type']
        entry_name = entry['name']
        if entry_type == 'file':
            if file_extension_check(entry_name):
                tarballs.append({
                    'archive': url + entry_name,
                    'time': int(entry['time']),
                    'length': int(entry['size']),
                })
        elif entry_type == 'directory':
            # Recursively check for tarballs in the sub-folder; a directory
            # entry without a 'contents' key is treated as empty.
            tarballs.extend(find_tarballs(
                entry.get('contents', []),
                url + entry_name + '/'))
        # any other entry type (e.g. 'link') is ignored
    return tarballs


def filter_directories(response):
    '''Keep only gnu and old-gnu folders from JSON

    Args:
        response: decoded JSON listing; the folders are looked up in the
            'contents' of its first element

    Returns:
        List of the directory entries named 'gnu' or 'old-gnu' (empty when
        the response itself is empty)

    '''
    if not response:
        return []
    return [directory for directory in response[0]['contents']
            if directory['name'] in ('gnu', 'old-gnu')]


def file_extension_check(file_name):
    '''Check for the extension of the file: if the file is of zip format or
    of .tar.x format, where x could be anything but a signature extension
    (.sig, .asc), then returns True.

    Args:
        file_name: name of the file for which the extension needs to be
            checked.

    Returns:
        True or False

    example
        file_extension_check('abc.zip') will return True
        file_extension_check('abc.tar.gz') will return True
        file_extension_check('abc.tar.gz.sig') will return False
        file_extension_check('abc.tar.sig') will return False

    '''
    file_suffixes = Path(file_name).suffixes
    if not file_suffixes:
        return False
    last_suffix = file_suffixes[-1]
    if last_suffix == '.zip':
        return True
    if last_suffix in _SIGNATURE_SUFFIXES:
        # signature of a plain .tar file, not an archive
        return False
    return len(file_suffixes) > 1 and file_suffixes[-2] == '.tar'
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json

from pathlib import Path

from swh.lister.gnu.lister import find_tarballs, filter_directories
from swh.lister.gnu.lister import file_extension_check


# The fixtures live next to this test module; resolving them from here keeps
# the tests independent from the current working directory.
DATA_DIR = Path(__file__).parent


def _load_fixture(filename):
    """Load and decode the JSON fixture `filename`, closing the file."""
    with open(str(DATA_DIR / filename)) as f:
        return json.load(f)


def test_filter_directories():
    """Only the gnu and old-gnu folders are kept."""
    api_response = _load_fixture('api_response.json')
    cleared_api_response = filter_directories(api_response)
    assert all(directory['name'] in ('gnu', 'old-gnu')
               for directory in cleared_api_response)


def test_find_tarballs_small_sample():
    """Tarballs are listed with their metadata, in listing order."""
    expected_tarballs = [
        {
            'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
            'time': 1495205979,
            'length': 424081,
        },
        {
            'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip',  # noqa
            'time': 898422900,
            'length': 1514448
        },
        {
            'archive': '/root/xboard/xboard-3.6.2.tar.gz',  # noqa
            'time': 869814000,
            'length': 450164,
        },
        {
            'archive': '/root/xboard/xboard-4.0.0.tar.gz',  # noqa
            'time': 898422900,
            'length': 514951,
        },
    ]

    file_structure = _load_fixture('tree.min.json')
    actual_tarballs = find_tarballs(file_structure, '/root/')
    assert actual_tarballs == expected_tarballs


def test_find_tarballs():
    """All the archives of the full sample are found."""
    file_structure = _load_fixture('tree.json')
    actual_tarballs = find_tarballs(file_structure, '/root/')
    assert len(actual_tarballs) == 42 + 3  # tar + zip


def test_file_extension_check():
    """zip and .tar.x files are accepted, anything else is rejected."""
    assert file_extension_check('abc.xy.zip')
    assert file_extension_check('cvb.zip')
    assert file_extension_check('abc.tar.bz2')
    assert file_extension_check('abc.tar.gz.sig') is False
    assert file_extension_check('abc') is False
{"type":"file","name":"artanis-0.2.3.tar.bz2","size":439269,"time":"1520284021"}, {"type":"file","name":"artanis-0.2.3.tar.gz","size":526293,"time":"1520284007"}, {"type":"file","name":"artanis-0.2.4.tar.bz2","size":426626,"time":"1521742071"}, {"type":"file","name":"artanis-0.2.4.tar.bz2.sig","size":833,"time":"1521742074"}, {"type":"file","name":"artanis-0.2.4.tar.gz","size":508420,"time":"1521742057"}, {"type":"file","name":"artanis-0.2.5.tar.bz2","size":440350,"time":"1525717261"}, {"type":"file","name":"artanis-0.2.5.tar.gz","size":518316,"time":"1525717246"}, {"type":"file","name":"artanis-0.3.1.tar.bz2","size":448329,"time":"1546205569"}, {"type":"file","name":"artanis-0.3.1.tar.gz","size":535098,"time":"1546205555"}, {"type":"file","name":"artanis-0.3.tar.bz2","size":452609,"time":"1546205025"}, {"type":"file","name":"artanis-0.3.tar.bz2.sig","size":833,"time":"1546205027"}, {"type":"file","name":"artanis-0.3.tar.gz","size":550938,"time":"1546205012"}, {"type":"link","name":"artanis-latest.12-f39e-dirty.tar.bz2","target":"artanis-0.2.12-f39e-dirty.tar.bz2","size":33,"time":"1494994512","contents":[]}, {"type":"link","name":"artanis-latest.12-f39e-dirty.tar.gz","target":"artanis-0.2.12-f39e-dirty.tar.gz","size":32,"time":"1494994519","contents":[]}, {"type":"link","name":"artanis-latest.tar.bz2","target":"artanis-0.3.1.tar.bz2","size":21,"time":"1546205705","contents":[]}, {"type":"link","name":"artanis-latest.tar.gz","target":"artanis-0.3.1.tar.gz","size":20,"time":"1546205703","contents":[]}, {"type":"link","name":"artanis-latest.tar.gz.sig","target":"artanis-0.3.1.tar.gz.sig","size":24,"time":"1546205703","contents":[]} ]}, {"type":"directory","name":"xboard","size":4096,"time":"1254860068","contents":[ {"type":"directory","name":"winboard","size":4096,"time":"1181795103","contents":[ {"type":"file","name":"README","size":107,"time":"1070058107"}, {"type":"file","name":"README.sig","size":65,"time":"1070058115"}, 
{"type":"file","name":"winboard-4_0_0-src.zip","size":1514448,"time":"898422900"}, {"type":"file","name":"winboard-4_0_0.exe","size":1652037,"time":"898422900"}, {"type":"file","name":"winboard-4_0_2-src.zip","size":1482621,"time":"920018269"}, {"type":"file","name":"winboard-4_0_3-src.zip","size":1499275,"time":"936750503"}, {"type":"file","name":"winboard-4_0_4-src.tar.gz","size":1753506,"time":"944290190"}, {"type":"file","name":"winboard-4_0_5-src.tar.gz","size":1752189,"time":"944600462"}, {"type":"file","name":"winboard-4_0_6-src.tar.gz","size":1761396,"time":"952156231"}, {"type":"file","name":"winboard-4_0_6.README","size":1592,"time":"952156231"}, {"type":"file","name":"winboard-4_0_7-src.tar.gz","size":1764000,"time":"952313061"}, {"type":"file","name":"winboard-4_1_0-src.tar.gz","size":1902251,"time":"969299378"}, {"type":"file","name":"winboard-4_2_0beta-src.tar.gz","size":2000471,"time":"977027031"}, {"type":"file","name":"winboard-4_2_0beta.README","size":3048,"time":"977033442"}, {"type":"file","name":"winboard-4_2_0beta.exe","size":2292716,"time":"977027033"}, {"type":"file","name":"winboard-4_2_1-src.tar.gz","size":2090945,"time":"981323331"}, {"type":"file","name":"winboard-4_2_2-src.tar.gz","size":2025689,"time":"981570576"}, {"type":"file","name":"winboard-4_2_3-src.tar.gz","size":2001746,"time":"982656672"}, {"type":"file","name":"winboard-4_2_4-src.tar.gz","size":2388388,"time":"1007952574"}, {"type":"file","name":"winboard-4_2_5-src.tar.gz","size":1962754,"time":"1008502483"}, {"type":"file","name":"winboard-4_2_6-src.tar.gz","size":1982333,"time":"1012641285"}, {"type":"file","name":"winboard-4_2_7.exe.sig","size":65,"time":"1070057687"}, {"type":"file","name":"winboard-4_2_7b.exe","size":6213290,"time":"1181794790"} ]}, {"type":"file","name":"xboard-3.6.2.tar.gz","size":450164,"time":"869814000"}, {"type":"file","name":"xboard-4.0.0.tar.gz","size":514951,"time":"898422900"}, 
{"type":"file","name":"xboard-4.0.2.tar.gz","size":564856,"time":"920018202"}, {"type":"file","name":"xboard-4.0.3.tar.gz","size":577351,"time":"936750512"}, {"type":"file","name":"xboard-4.0.4.tar.gz","size":575421,"time":"944290148"}, {"type":"file","name":"xboard-4.0.5.tar.gz","size":576300,"time":"944599461"}, {"type":"file","name":"xboard-4.0.6.README","size":1592,"time":"952156235"}, {"type":"file","name":"xboard-4.0.6.tar.gz","size":579076,"time":"952156235"}, {"type":"file","name":"xboard-4.0.7.README","size":1721,"time":"952313082"}, {"type":"file","name":"xboard-4.0.7.tar.gz","size":578350,"time":"952313085"}, {"type":"file","name":"xboard-4.1.0.tar.gz","size":1069507,"time":"969299287"}, {"type":"file","name":"xboard-4.2.0beta.README","size":3048,"time":"977027107"}, {"type":"file","name":"xboard-4.2.0beta.tar.gz","size":1093901,"time":"977027108"}, {"type":"file","name":"xboard-4.2.1.tar.gz","size":1097200,"time":"981323501"}, {"type":"file","name":"xboard-4.2.2.tar.gz","size":1097682,"time":"981562809"}, {"type":"file","name":"xboard-4.2.3.tar.gz","size":1100059,"time":"982657006"}, {"type":"file","name":"xboard-4.2.4.tar.gz","size":1034728,"time":"1007952745"}, {"type":"file","name":"xboard-4.2.5.tar.gz","size":1055502,"time":"1008466945"}, {"type":"file","name":"xboard-4.2.6.tar.gz","size":1057625,"time":"1012641715"}, {"type":"file","name":"xboard-4.2.7.tar.gz","size":1318110,"time":"1070057764"} ]} -] \ No newline at end of file +] diff --git a/swh/lister/gnu/tests/tree.min.json b/swh/lister/gnu/tests/tree.min.json new file mode 100644 index 0000000..f742ebb --- /dev/null +++ b/swh/lister/gnu/tests/tree.min.json @@ -0,0 +1,16 @@ +[ + {"type":"directory","name":"artanis","size":4096,"time":"1546205705","contents":[ + {"type":"file","name":"artanis-0.2.1.tar.bz2","size":424081,"time":"1495205979"}, + {"type":"file","name":"artanis-0.2.1.tar.bz2.sig","size":833,"time":"1495205982"} + ]}, + 
{"type":"directory","name":"xboard","size":4096,"time":"1254860068","contents":[ + {"type":"directory","name":"winboard","size":4096,"time":"1181795103","contents":[ + {"type":"file","name":"README","size":107,"time":"1070058107"}, + {"type":"file","name":"README.sig","size":65,"time":"1070058115"}, + {"type":"file","name":"winboard-4_0_0-src.zip","size":1514448,"time":"898422900"}, + {"type":"file","name":"winboard-4_0_0.exe","size":1652037,"time":"898422900"} + ]}, + {"type":"file","name":"xboard-3.6.2.tar.gz","size":450164,"time":"869814000"}, + {"type":"file","name":"xboard-4.0.0.tar.gz","size":514951,"time":"898422900"} + ]} +]