diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
index 04a20aa..e674cc4 100644
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -1,72 +1,71 @@
 # Copyright (C) 2017-2019 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from urllib import parse
 import logging

 import iso8601

 from swh.lister.bitbucket.models import BitBucketModel
 from swh.lister.core.indexing_lister import SWHIndexingHttpLister

 logger = logging.getLogger(__name__)


 class BitBucketLister(SWHIndexingHttpLister):
     PATH_TEMPLATE = '/repositories?after=%s'
     MODEL = BitBucketModel
     LISTER_NAME = 'bitbucket'
     instance = 'bitbucket'

     def get_model_from_repo(self, repo):
         return {
             'uid': repo['uuid'],
             'indexable': repo['created_on'],
             'name': repo['name'],
             'full_name': repo['full_name'],
             'html_url': repo['links']['html']['href'],
             'origin_url': repo['links']['clone'][0]['href'],
             'origin_type': repo['scm'],
-            'description': repo['description']
         }

     def get_next_target_from_response(self, response):
         body = response.json()
         if 'next' in body:
             return parse.unquote(body['next'].split('after=')[1])
         else:
             return None

     def transport_response_simplified(self, response):
         repos = response.json()['values']
         return [self.get_model_from_repo(repo) for repo in repos]

     def request_uri(self, identifier):
         return super().request_uri(identifier or '1970-01-01')

     def is_within_bounds(self, inner, lower=None, upper=None):
         # values are expected to be str dates
         try:
             inner = iso8601.parse_date(inner)
             if lower:
                 lower = iso8601.parse_date(lower)
             if upper:
                 upper = iso8601.parse_date(upper)
             if lower is None and upper is None:
                 return True
             elif lower is None:
                 ret = inner <= upper
             elif upper is None:
                 ret = inner >= lower
             else:
                 ret = lower <= inner <= upper
         except Exception as e:
             logger.error(str(e) + ': %s, %s, %s' %
                          (('inner=%s%s' % (type(inner), inner)),
                           ('lower=%s%s' % (type(lower), lower)),
                           ('upper=%s%s' % (type(upper), upper)))
                          )
             raise
         return ret
diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py
index 15a1441..d0c61c5 100644
--- a/swh/lister/core/models.py
+++ b/swh/lister/core/models.py
@@ -1,50 +1,48 @@
 # Copyright (C) 2015-2017 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import abc
 from datetime import datetime

 from sqlalchemy import Column, DateTime, Integer, String
 from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta

 from .abstractattribute import AbstractAttribute

 SQLBase = declarative_base()


 class ABCSQLMeta(abc.ABCMeta, DeclarativeMeta):
     pass


 class ModelBase(SQLBase, metaclass=ABCSQLMeta):
     """a common repository"""
     __abstract__ = True
     __tablename__ = AbstractAttribute

     uid = AbstractAttribute('Column(<uid_type>, primary_key=True)')

     name = Column(String, index=True)
     full_name = Column(String, index=True)
     html_url = Column(String)
     origin_url = Column(String)
     origin_type = Column(String)
-    description = Column(String)

     last_seen = Column(DateTime, nullable=False)

     task_id = Column(Integer)
-    origin_id = Column(Integer)

     def __init__(self, **kw):
         kw['last_seen'] = datetime.now()
         super().__init__(**kw)


 class IndexingModelBase(ModelBase, metaclass=ABCSQLMeta):
     __abstract__ = True
     __tablename__ = AbstractAttribute

     # The value used for sorting, segmenting, or api query paging,
     # because uids aren't always sequential.
     indexable = AbstractAttribute('Column(<indexable_type>, index=True)')
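For context on the BitBucket lister's `is_within_bounds` override above: the BitBucket API pages on `created_on` dates, so bounds are compared as parsed ISO 8601 timestamps. A minimal standalone sketch of that comparison, assuming only the `iso8601` package (the date values below are made up):

    import iso8601

    inner = iso8601.parse_date('2017-05-03T12:00:00+00:00')
    lower = iso8601.parse_date('2017-01-01T00:00:00+00:00')
    upper = iso8601.parse_date('2018-01-01T00:00:00+00:00')
    # Same check the lister performs once all three values are parsed
    assert lower <= inner <= upper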
diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py
index 5b93b64..a63260f 100644
--- a/swh/lister/core/tests/test_lister.py
+++ b/swh/lister/core/tests/test_lister.py
@@ -1,234 +1,234 @@
 # Copyright (C) 2017-2018 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import abc
 import time
 from unittest import TestCase
 from unittest.mock import Mock, patch

 import requests_mock
 from sqlalchemy import create_engine
 from testing.postgresql import Postgresql

 from swh.lister.core.abstractattribute import AbstractAttribute


 def noop(*args, **kwargs):
     pass


 @requests_mock.Mocker()
 class HttpListerTesterBase(abc.ABC):
     """Base testing class for subclasses of

            swh.lister.core.indexing_lister.SWHIndexingHttpLister
            swh.lister.core.page_by_page_lister.PageByPageHttpLister

     See swh.lister.github.tests.test_gh_lister for an example of how
     to customize for a specific listing service.
     """
     Lister = AbstractAttribute('The lister class to test')
     test_re = AbstractAttribute('Compiled regex matching the server url. Must'
                                 ' capture the index value.')
     lister_subdir = AbstractAttribute('bitbucket, github, etc.')
     good_api_response_file = AbstractAttribute('Example good response body')
     bad_api_response_file = AbstractAttribute('Example bad response body')
     first_index = AbstractAttribute('First index in good_api_response')
     entries_per_page = AbstractAttribute('Number of results in good response')
     LISTER_NAME = 'fake-lister'

     # May need to override this if the headers are used for something
     def response_headers(self, request):
         return {}

     # May need to override this if the server uses non-standard rate limiting
     # method.
     # Please keep the requested retry delay reasonably low.
     def mock_rate_quota(self, n, request, context):
         self.rate_limit += 1
         context.status_code = 429
         context.headers['Retry-After'] = '1'
         return '{"error":"dummy"}'

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.rate_limit = 1
         self.response = None
         self.fl = None
         self.helper = None
         if self.__class__ != HttpListerTesterBase:
             self.run = TestCase.run.__get__(self, self.__class__)
         else:
             self.run = noop

     def request_index(self, request):
         m = self.test_re.search(request.path_url)
         if m and (len(m.groups()) > 0):
             return m.group(1)
         else:
             return None

     def mock_response(self, request, context):
         self.fl.reset_backoff()
         self.rate_limit = 1
         context.status_code = 200
         custom_headers = self.response_headers(request)
         context.headers.update(custom_headers)
         if self.request_index(request) == str(self.first_index):
             with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
                                                   self.good_api_response_file),
                       'r', encoding='utf-8') as r:
                 return r.read()
         else:
             with open('swh/lister/%s/tests/%s' % (self.lister_subdir,
                                                   self.bad_api_response_file),
                       'r', encoding='utf-8') as r:
                 return r.read()

     def mock_limit_n_response(self, n, request, context):
         self.fl.reset_backoff()
         if self.rate_limit <= n:
             return self.mock_rate_quota(n, request, context)
         else:
             return self.mock_response(request, context)

     def mock_limit_once_response(self, request, context):
         return self.mock_limit_n_response(1, request, context)

     def mock_limit_twice_response(self, request, context):
         return self.mock_limit_n_response(2, request, context)

     def get_fl(self, override_config=None):
         """Retrieve an instance of fake lister (fl).
""" if override_config or self.fl is None: self.fl = self.Lister(api_baseurl='https://fakeurl', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() return self.fl def get_api_response(self): fl = self.get_fl() if self.response is None: self.response = fl.safely_issue_request(self.first_index) return self.response def test_is_within_bounds(self, http_mocker): fl = self.get_fl() self.assertFalse(fl.is_within_bounds(1, 2, 3)) self.assertTrue(fl.is_within_bounds(2, 1, 3)) self.assertTrue(fl.is_within_bounds(1, 1, 1)) self.assertTrue(fl.is_within_bounds(1, None, None)) self.assertTrue(fl.is_within_bounds(1, None, 2)) self.assertTrue(fl.is_within_bounds(1, 0, None)) self.assertTrue(fl.is_within_bounds("b", "a", "c")) self.assertFalse(fl.is_within_bounds("a", "b", "c")) self.assertTrue(fl.is_within_bounds("a", None, "c")) self.assertTrue(fl.is_within_bounds("a", None, None)) self.assertTrue(fl.is_within_bounds("b", "a", None)) self.assertFalse(fl.is_within_bounds("a", "b", None)) self.assertTrue(fl.is_within_bounds("aa:02", "aa:01", "aa:03")) self.assertFalse(fl.is_within_bounds("aa:12", None, "aa:03")) with self.assertRaises(TypeError): fl.is_within_bounds(1.0, "b", None) with self.assertRaises(TypeError): fl.is_within_bounds("A:B", "A::B", None) def test_api_request(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_limit_twice_response) with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock: self.get_api_response() self.assertEqual(sleepmock.call_count, 2) def test_repos_list(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) li = self.get_fl().transport_response_simplified( self.get_api_response() ) self.assertIsInstance(li, list) self.assertEqual(len(li), self.entries_per_page) def test_model_map(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() li = fl.transport_response_simplified(self.get_api_response()) di = li[0] self.assertIsInstance(di, dict) pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] for k in pubs: - if k not in ['last_seen', 'task_id', 'origin_id', 'id']: + if k not in ['last_seen', 'task_id', 'id']: self.assertIn(k, di) def disable_scheduler(self, fl): fl.schedule_missing_tasks = Mock(return_value=None) def disable_db(self, fl): fl.winnow_models = Mock(return_value=[]) fl.db_inject_repo = Mock(return_value=fl.MODEL()) fl.disable_deleted_repo_tasks = Mock(return_value=None) def test_fetch_none_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=1, max_bound=1) # stores no results def test_fetch_one_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=self.first_index, max_bound=self.first_index) def test_fetch_multiple_pages_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=self.first_index) def init_db(self, db, model): engine = create_engine(db.url()) model.metadata.create_all(engine) class HttpListerTester(HttpListerTesterBase, abc.ABC): last_index = AbstractAttribute('Last index in good_api_response') @requests_mock.Mocker() def test_fetch_multiple_pages_yesdb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) initdb_args = Postgresql.DEFAULT_SETTINGS['initdb_args'] initdb_args = ' 
diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py
index 4277e14..73eeac9 100644
--- a/swh/lister/cran/lister.py
+++ b/swh/lister/cran/lister.py
@@ -1,119 +1,118 @@
 # Copyright (C) 2019 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import subprocess
 import json
 import logging
 import pkg_resources

 from swh.lister.cran.models import CRANModel

 from swh.scheduler.utils import create_task_dict
 from swh.core import utils
 from swh.lister.core.simple_lister import SimpleLister


 class CRANLister(SimpleLister):
     MODEL = CRANModel
     LISTER_NAME = 'cran'
     instance = 'cran'

     def task_dict(self, origin_type, origin_url, **kwargs):
         """Return task format dict

         This is overridden from the lister_base as more information is
         needed for the ingestion task creation.

         """
         return create_task_dict(
             'load-%s' % origin_type, 'recurring',
             kwargs.get('name'), origin_url, kwargs.get('version'),
             project_metadata=kwargs.get('description'))

     def r_script_request(self):
         """Runs the R script, which uses R's built-in API to return a json
         response containing data about all the R packages.

         Returns:
             List of dictionaries, for example:

             [
                 {'Package': 'A3',
                  'Version': '1.0.0',
                  'Title': 'Accurate, Adaptable, and Accessible Error Metrics
                            for Predictive\nModels',
                  'Description': 'Supplies tools for tabulating and analyzing
                                  the results of predictive models. The methods
                                  employed are ... '
                 },
                 {'Package': 'abbyyR',
                  'Version': '0.5.4',
                  'Title': 'Access to Abbyy Optical Character Recognition (OCR)
                            API',
                  'Description': 'Get text from images of text using Abbyy
                                  Cloud Optical Character\n ...'
                 },
                 ...
             ]
         """
         file_path = pkg_resources.resource_filename('swh.lister.cran',
                                                     'list_all_packages.R')
         response = subprocess.run(file_path, stdout=subprocess.PIPE,
                                   shell=False)
         return json.loads(response.stdout)

     def get_model_from_repo(self, repo):
         """Transform from repository representation to model

         """
         project_url = 'https://cran.r-project.org/src/contrib' \
             '/%(Package)s_%(Version)s.tar.gz' % repo
         return {
             'uid': repo["Package"],
             'name': repo["Package"],
             'full_name': repo["Title"],
             'version': repo["Version"],
             'html_url': project_url,
             'origin_url': project_url,
             'origin_type': 'cran',
-            'description': repo["Description"]
         }

     def transport_response_simplified(self, response):
         """Transform response to list for model manipulation

         """
         return [self.get_model_from_repo(repo) for repo in response]

     def ingest_data(self, identifier, checks=False):
         """Rework the base ingest_data.
            Request server endpoint which gives all in one go.
            Simplify and filter response list of repositories.
            Inject repo information into local db.
            Queue loader tasks for linked repositories.

         Args:
             identifier: Resource identifier (unused)
             checks (bool): Additional checks required (unused)

         """
         response = self.r_script_request()
         if not response:
             return response, []

         models_list = self.transport_response_simplified(response)
         models_list = self.filter_before_inject(models_list)
         all_injected = []
         for models in utils.grouper(models_list, n=10000):
             models = list(models)
             logging.debug('models: %s' % len(models))
             # inject into local db
             injected = self.inject_repo_data_into_db(models)
             # queue workers
             self.create_missing_origins_and_tasks(models, injected)
             all_injected.append(injected)
             # flush
             self.db_session.commit()
             self.db_session = self.mk_session()

         return response, all_injected
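The CRAN lister shells out to an R script rather than calling an HTTP endpoint. A standalone sketch of the same subprocess pattern, assuming a hypothetical path to an executable script that prints a JSON array of package dicts on stdout:

    import json
    import subprocess

    # Run the script directly (no shell) and capture its stdout
    result = subprocess.run('/path/to/list_all_packages.R',
                            stdout=subprocess.PIPE, shell=False)
    packages = json.loads(result.stdout)
    print(packages[0]['Package'], packages[0]['Version'])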
diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py
index 245afde..866d0e5 100644
--- a/swh/lister/github/lister.py
+++ b/swh/lister/github/lister.py
@@ -1,53 +1,52 @@
 # Copyright (C) 2017-2019 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import re
 import time

 from swh.lister.core.indexing_lister import SWHIndexingHttpLister
 from swh.lister.github.models import GitHubModel


 class GitHubLister(SWHIndexingHttpLister):
     PATH_TEMPLATE = '/repositories?since=%d'
     MODEL = GitHubModel
     API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)')
     LISTER_NAME = 'github'
     instance = 'github'  # There is only 1 instance of such lister

     def get_model_from_repo(self, repo):
         return {
             'uid': repo['id'],
             'indexable': repo['id'],
             'name': repo['name'],
             'full_name': repo['full_name'],
             'html_url': repo['html_url'],
             'origin_url': repo['html_url'],
             'origin_type': 'git',
-            'description': repo['description'],
             'fork': repo['fork'],
         }

     def transport_quota_check(self, response):
         reqs_remaining = int(response.headers['X-RateLimit-Remaining'])
         if response.status_code == 403 and reqs_remaining == 0:
             reset_at = int(response.headers['X-RateLimit-Reset'])
             delay = min(reset_at - time.time(), 3600)
             return True, delay
         else:
             return False, 0

     def get_next_target_from_response(self, response):
         if 'next' in response.links:
             next_url = response.links['next']['url']
             return int(self.API_URL_INDEX_RE.match(next_url).group(1))
         else:
             return None

     def transport_response_simplified(self, response):
         repos = response.json()
         return [self.get_model_from_repo(repo) for repo in repos]

     def request_headers(self):
         return {'Accept': 'application/vnd.github.v3+json'}
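`transport_quota_check` above derives its back-off delay from GitHub's rate-limit headers. A worked sketch of the same arithmetic with a stubbed response object (the header values are fabricated for illustration):

    import time

    class StubResponse:
        status_code = 403
        headers = {'X-RateLimit-Remaining': '0',
                   'X-RateLimit-Reset': str(int(time.time()) + 120)}

    response = StubResponse()
    reqs_remaining = int(response.headers['X-RateLimit-Remaining'])
    if response.status_code == 403 and reqs_remaining == 0:
        # Wait until the quota resets, but never more than an hour
        delay = min(int(response.headers['X-RateLimit-Reset']) - time.time(),
                    3600)
        print('rate limited, backing off for %.0f seconds' % delay)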
""" return utils.create_task_dict( 'load-%s' % origin_type, 'recurring', kwargs.get('name'), origin_url, tarballs=self.tarballs[kwargs.get('name')]) def get_file(self): ''' Download and unzip tree.json.gz file and returns its content in JSON format Returns File content in dictionary format ''' response = requests.get(self.TREE_URL, allow_redirects=True) uncompressed_content = gzip.decompress(response.content) return json.loads(uncompressed_content.decode('utf-8')) def safely_issue_request(self, identifier): ''' Make network request to download the file which has file structure of the GNU website. Args: identifier: resource identifier Returns: Server response ''' return self.get_file() def list_packages(self, response): """ List the actual gnu origins with their names,url and the list of all the tarball for a package from the response. Args: response : File structure of the website in dictionary format Returns: A list of all the packages with their names, url of their root directory and the tarballs present for the particular package. [ {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/', 'tarballs': [ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', 'date': '1071002600'}, {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', 'date': '1071078759'}} ] }, {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/', 'tarballs': [ {'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz', 'date': '1461357336'}, {'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz', 'date': '1480991830'} ] ] """ response = filter_directories(response) packages = [] for directory in response: content = directory['contents'] for repo in content: if repo['type'] == 'directory': package_url = '%s/%s/%s/' % (self.BASE_URL, directory['name'], repo['name']) package_tarballs = find_tarballs( repo['contents'], package_url) if package_tarballs != []: repo_details = { 'name': repo['name'], 'url': package_url, 'time_modified': repo['time'], } self.tarballs[repo['name']] = package_tarballs packages.append(repo_details) random.shuffle(packages) return packages def get_model_from_repo(self, repo): """Transform from repository representation to model """ return { 'uid': repo['name'], 'name': repo['name'], 'full_name': repo['name'], 'html_url': repo['url'], 'origin_url': repo['url'], 'time_last_updated': repo['time_modified'], 'origin_type': 'gnu', - 'description': None, } def transport_response_simplified(self, response): """Transform response to list for model manipulation """ return [self.get_model_from_repo(repo) for repo in response] def find_tarballs(package_file_structure, url): ''' Recursively lists all the tarball present in the folder and subfolders for a particular package url. Args package_file_structure : File structure of the package root directory url : URL of the corresponding package Returns List of all the tarball urls and the last their time of update example- For a package called 3dldf [ {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', 'date': '1071002600'} {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', 'date': '1071078759'} {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz', 'date': '1074278633'} ... 
diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py
index d686851..bd8c4a6 100644
--- a/swh/lister/npm/lister.py
+++ b/swh/lister/npm/lister.py
@@ -1,158 +1,157 @@
 # Copyright (C) 2018-2019 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from urllib.parse import quote

 from swh.lister.core.indexing_lister import SWHIndexingHttpLister
 from swh.lister.npm.models import NpmModel
 from swh.scheduler.utils import create_task_dict


 class NpmListerBase(SWHIndexingHttpLister):
     """List packages available in the npm registry in a paginated way

     """
     MODEL = NpmModel
     LISTER_NAME = 'npm'
     instance = 'npm'

     def __init__(self, api_baseurl='https://replicate.npmjs.com',
                  per_page=1000, override_config=None):
         super().__init__(api_baseurl=api_baseurl,
                          override_config=override_config)
         self.per_page = per_page + 1
         self.PATH_TEMPLATE += '&limit=%s' % self.per_page

     @property
     def ADDITIONAL_CONFIG(self):
         """(Override) Add extra configuration

         """
         default_config = super().ADDITIONAL_CONFIG
         default_config['loading_task_policy'] = ('str', 'recurring')
         return default_config

     def get_model_from_repo(self, repo_name):
         """(Override) Transform from npm package name to model

         """
         package_url, package_metadata_url = self._compute_urls(repo_name)
         return {
             'uid': repo_name,
             'indexable': repo_name,
             'name': repo_name,
             'full_name': repo_name,
             'html_url': package_metadata_url,
             'origin_url': package_url,
             'origin_type': 'npm',
-            'description': None
         }

     def task_dict(self, origin_type, origin_url, **kwargs):
         """(Override) Return task dict for loading a npm package into the
         archive.

         This is overridden from the lister_base as more information is
         needed for the ingestion task creation.

         """
         task_type = 'load-%s' % origin_type
         task_policy = self.config['loading_task_policy']
         package_name = kwargs.get('name')
         package_metadata_url = kwargs.get('html_url')
         return create_task_dict(task_type, task_policy,
                                 package_name, origin_url,
                                 package_metadata_url=package_metadata_url)

     def request_headers(self):
         """(Override) Set requests headers to send when querying the npm
         registry.

         """
         return {'User-Agent': 'Software Heritage npm lister',
                 'Accept': 'application/json'}

     def _compute_urls(self, repo_name):
         """Return a tuple (package_url, package_metadata_url)

         """
         return (
             'https://www.npmjs.com/package/%s' % repo_name,
             # package metadata url needs to be escaped otherwise some requests
             # may fail (for instance when a package name contains '/')
             '%s/%s' % (self.api_baseurl, quote(repo_name, safe=''))
         )

     def string_pattern_check(self, inner, lower, upper=None):
         """(Override) Inhibit the effect of that method as packages indices
         correspond to package names and thus do not respect any kind of
         fixed length string pattern.

         """
         pass


 class NpmLister(NpmListerBase):
     """List all packages available in the npm registry in a paginated way

     """
     PATH_TEMPLATE = '/_all_docs?startkey="%s"'

     def get_next_target_from_response(self, response):
         """(Override) Get next npm package name to continue the listing

         """
         repos = response.json()['rows']
         return repos[-1]['id'] if len(repos) == self.per_page else None

     def transport_response_simplified(self, response):
         """(Override) Transform npm registry response to list for model
         manipulation.

         """
         repos = response.json()['rows']
         if len(repos) == self.per_page:
             repos = repos[:-1]
         return [self.get_model_from_repo(repo['id']) for repo in repos]


 class NpmIncrementalLister(NpmListerBase):
     """List packages in the npm registry, updated since a specific
     update_seq value of the underlying CouchDB database, in a paginated
     way.

     """
     PATH_TEMPLATE = '/_changes?since=%s'

     @property
     def CONFIG_BASE_FILENAME(self):  # noqa: N802
         return 'lister_npm_incremental'

     def get_next_target_from_response(self, response):
         """(Override) Get next npm package name to continue the listing

         """
         repos = response.json()['results']
         return repos[-1]['seq'] if len(repos) == self.per_page else None

     def transport_response_simplified(self, response):
         """(Override) Transform npm registry response to list for model
         manipulation.

         """
         repos = response.json()['results']
         if len(repos) == self.per_page:
             repos = repos[:-1]
         return [self.get_model_from_repo(repo['id']) for repo in repos]

     def filter_before_inject(self, models_list):
         """(Override) Filter out documents in the CouchDB database not
         related to a npm package.

         """
         models_filtered = []
         for model in models_list:
             package_name = model['name']
             # document related to CouchDB internals
             if package_name.startswith('_design/'):
                 continue
             models_filtered.append(model)
         return models_filtered

     def disable_deleted_repo_tasks(self, start, end, keep_these):
         """(Override) Disable the processing performed by that method as it
         is not relevant in this incremental lister context and it raises an
         exception due to a different index type (int instead of str).

         """
         pass
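The npm listers request one row more than the user-facing page size (`self.per_page = per_page + 1`); a full result therefore means another page exists, and the surplus row's id doubles as the next start key. A small sketch of that logic in isolation, where `rows` stands in for `response.json()['rows']` and the package names are invented:

    def next_start_key(rows, requested_size):
        # requested_size is the user-facing page size; one extra row
        # was asked for, so a full result signals "there is a next page"
        return rows[-1]['id'] if len(rows) == requested_size + 1 else None

    rows = [{'id': 'left-pad'}, {'id': 'lodash'}, {'id': 'react'}]
    assert next_start_key(rows, requested_size=2) == 'react'  # more pages
    assert next_start_key(rows, requested_size=5) is None     # last page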
""" task_type = 'load-%s' % origin_type task_policy = self.config['loading_task_policy'] package_name = kwargs.get('name') package_metadata_url = kwargs.get('html_url') return create_task_dict(task_type, task_policy, package_name, origin_url, package_metadata_url=package_metadata_url) def request_headers(self): """(Override) Set requests headers to send when querying the npm registry """ return {'User-Agent': 'Software Heritage npm lister', 'Accept': 'application/json'} def _compute_urls(self, repo_name): """Return a tuple (package_url, package_metadata_url) """ return ( 'https://www.npmjs.com/package/%s' % repo_name, # package metadata url needs to be escaped otherwise some requests # may fail (for instance when a package name contains '/') '%s/%s' % (self.api_baseurl, quote(repo_name, safe='')) ) def string_pattern_check(self, inner, lower, upper=None): """ (Override) Inhibit the effect of that method as packages indices correspond to package names and thus do not respect any kind of fixed length string pattern """ pass class NpmLister(NpmListerBase): """List all packages available in the npm registry in a paginated way """ PATH_TEMPLATE = '/_all_docs?startkey="%s"' def get_next_target_from_response(self, response): """(Override) Get next npm package name to continue the listing """ repos = response.json()['rows'] return repos[-1]['id'] if len(repos) == self.per_page else None def transport_response_simplified(self, response): """(Override) Transform npm registry response to list for model manipulation """ repos = response.json()['rows'] if len(repos) == self.per_page: repos = repos[:-1] return [self.get_model_from_repo(repo['id']) for repo in repos] class NpmIncrementalLister(NpmListerBase): """List packages in the npm registry, updated since a specific update_seq value of the underlying CouchDB database, in a paginated way """ PATH_TEMPLATE = '/_changes?since=%s' @property def CONFIG_BASE_FILENAME(self): # noqa: N802 return 'lister_npm_incremental' def get_next_target_from_response(self, response): """(Override) Get next npm package name to continue the listing """ repos = response.json()['results'] return repos[-1]['seq'] if len(repos) == self.per_page else None def transport_response_simplified(self, response): """(Override) Transform npm registry response to list for model manipulation """ repos = response.json()['results'] if len(repos) == self.per_page: repos = repos[:-1] return [self.get_model_from_repo(repo['id']) for repo in repos] def filter_before_inject(self, models_list): """(Override) Filter out documents in the CouchDB database not related to a npm package """ models_filtered = [] for model in models_list: package_name = model['name'] # document related to CouchDB internals if package_name.startswith('_design/'): continue models_filtered.append(model) return models_filtered def disable_deleted_repo_tasks(self, start, end, keep_these): """(Override) Disable the processing performed by that method as it is not relevant in this incremental lister context and it raises and exception due to a different index type (int instead of str) """ pass diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py index dd30308..e5efc4c 100644 --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -1,179 +1,178 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import urllib.parse from 
     def get_model_from_repo(self, repo):
         url = get_repo_url(repo['attachments']['uris']['uris'])
         if url is None:
             return None
         return {
             'uid': self.forge_url + str(repo['id']),
             'indexable': repo['id'],
             'name': repo['fields']['shortName'],
             'full_name': repo['fields']['name'],
             'html_url': url,
             'origin_url': url,
-            'description': None,
             'origin_type': repo['fields']['vcs'],
             'instance': self.instance,
         }

     def get_next_target_from_response(self, response):
         body = response.json()['result']['cursor']
         if body['after'] != 'null':
             return body['after']
         else:
             return None

     def transport_response_simplified(self, response):
         repos = response.json()
         if repos['result'] is None:
             raise ValueError(
                 'Problem during information fetch: %s' % repos['error_code'])
         repos = repos['result']['data']
         return [self.get_model_from_repo(repo) for repo in repos]

     def filter_before_inject(self, models_list):
         """
         (Overrides) SWHIndexingLister.filter_before_inject
         Bounds query results by this Lister's set max_index.
         """
         models_list = [m for m in models_list if m is not None]
         return super().filter_before_inject(models_list)

     def _bootstrap_repositories_listing(self):
         """
         Method called when no min_bound value has been provided to the
         lister. Its purpose is to:

             1. get the first repository data hosted on the Phabricator
                instance
             2. inject them into the lister database
             3. return the first repository index to start the listing
                after that value

         Returns:
             int: The first repository index
         """
         params = '&order=oldest&limit=1'
         response = self.safely_issue_request(params)
         models_list = self.transport_response_simplified(response)
         self.max_index = models_list[0]['indexable']
         models_list = self.filter_before_inject(models_list)
         injected = self.inject_repo_data_into_db(models_list)
         self.schedule_missing_tasks(models_list, injected)
         return self.max_index

     def run(self, min_bound=None, max_bound=None):
         """
         (Override) Run the lister on the specified Phabricator instance

         Args:
             min_bound (int): Optional repository index to start the listing
                 after it
             max_bound (int): Optional repository index to stop the listing
                 after it
         """
         # initial call to the lister, we need to bootstrap it in that case
         if min_bound is None:
             min_bound = self._bootstrap_repositories_listing()
         super().run(min_bound, max_bound)


 def get_repo_url(attachments):
     """
     Return url for a hosted repository from its uris attachments according
     to the following priority lists:
     * protocol: https > http
     * identifier: shortname > callsign > id
     """
     processed_urls = defaultdict(dict)
     for uri in attachments:
         protocol = uri['fields']['builtin']['protocol']
         url = uri['fields']['uri']['effective']
         identifier = uri['fields']['builtin']['identifier']
         if protocol in ('http', 'https'):
             processed_urls[protocol][identifier] = url
         elif protocol is None:
             for protocol in ('https', 'http'):
                 if url.startswith(protocol):
                     processed_urls[protocol]['undefined'] = url
                     break
     for protocol in ['https', 'http']:
         for identifier in ['shortname', 'callsign', 'id', 'undefined']:
             if (protocol in processed_urls and
                     identifier in processed_urls[protocol]):
                 return processed_urls[protocol][identifier]
     return None
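A worked example of `get_repo_url`'s priority rules (https beats http, shortname beats callsign beats id), with made-up uri attachments shaped like the Phabricator API payload:

    from swh.lister.phabricator.lister import get_repo_url

    uris = [
        {'fields': {'builtin': {'protocol': 'http',
                                'identifier': 'callsign'},
                    'uri': {'effective': 'http://forge.example/source/PROJ/'}}},
        {'fields': {'builtin': {'protocol': 'https',
                                'identifier': 'shortname'},
                    'uri': {'effective': 'https://forge.example/source/proj.git'}}},
    ]
    # https + shortname wins over http + callsign
    assert get_repo_url(uris) == 'https://forge.example/source/proj.git'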
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
index fedffd0..f45a3a6 100644
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -1,77 +1,76 @@
 # Copyright (C) 2018-2019 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import random
 import xmltodict

 from .models import PyPIModel

 from swh.scheduler import utils
 from swh.lister.core.simple_lister import SimpleLister
 from swh.lister.core.lister_transports import ListerOnePageApiTransport


 class PyPILister(ListerOnePageApiTransport, SimpleLister):
     MODEL = PyPIModel
     LISTER_NAME = 'pypi'
     PAGE = 'https://pypi.org/simple/'
     instance = 'pypi'  # As of today only the main pypi.org is used

     def __init__(self, override_config=None):
         ListerOnePageApiTransport.__init__(self)
         SimpleLister.__init__(self, override_config=override_config)

     def task_dict(self, origin_type, origin_url, **kwargs):
         """(Override) Return task format dict

         This is overridden from the lister_base as more information is
         needed for the ingestion task creation.

         """
         _type = 'load-%s' % origin_type
         _policy = 'recurring'
         project_name = kwargs.get('name')
         project_metadata_url = kwargs.get('html_url')
         return utils.create_task_dict(
             _type, _policy,
             project_name, origin_url,
             project_metadata_url=project_metadata_url)

     def list_packages(self, response):
         """(Override) List the actual pypi origins from the response.
""" result = xmltodict.parse(response.content) _packages = [p['#text'] for p in result['html']['body']['a']] random.shuffle(_packages) return _packages def _compute_urls(self, repo_name): """Returns a tuple (project_url, project_metadata_url) """ return ( 'https://pypi.org/project/%s/' % repo_name, 'https://pypi.org/pypi/%s/json' % repo_name ) def get_model_from_repo(self, repo_name): """(Override) Transform from repository representation to model """ project_url, project_url_meta = self._compute_urls(repo_name) return { 'uid': repo_name, 'name': repo_name, 'full_name': repo_name, 'html_url': project_url_meta, 'origin_url': project_url, 'origin_type': 'pypi', - 'description': None, } def transport_response_simplified(self, response): """(Override) Transform response to list for model manipulation """ return [self.get_model_from_repo(repo_name) for repo_name in response]