D4992.diff
No OneTemporary
Actions

Size

64 KB

Subscribers

None

D4992.diff
View Options

	diff --git a/conftest.py b/conftest.py
	--- a/conftest.py
	+++ b/conftest.py
	@@ -1,10 +1,10 @@
	-# Copyright (C) 2020 The Software Heritage developers
	+# Copyright (C) 2020-2021 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	import os

	-pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.lister.pytest_plugin"]
	+pytest_plugins = ["swh.scheduler.pytest_plugin"]

	os.environ["LC_ALL"] = "C.UTF-8"
	diff --git a/mypy.ini b/mypy.ini
	--- a/mypy.ini
	+++ b/mypy.ini
	@@ -2,10 +2,6 @@
	namespace_packages = True
	warn_unused_ignores = True

	-# support for sqlalchemy magic: see https://github.com/dropbox/sqlalchemy-stubs
	-plugins = sqlmypy
	-
	-
	# 3rd party libraries without stubs (yet)

	[mypy-bs4.*]
	@@ -38,9 +34,6 @@
	[mypy-requests_mock.*]
	ignore_missing_imports = True

	-[mypy-testing.postgresql.*]
	-ignore_missing_imports = True
	-
	[mypy-urllib3.util.*]
	ignore_missing_imports = True

	diff --git a/requirements-test.txt b/requirements-test.txt
	--- a/requirements-test.txt
	+++ b/requirements-test.txt
	@@ -1,5 +1,3 @@
	pytest
	pytest-mock
	requests_mock
	-sqlalchemy-stubs
	-testing.postgresql
	diff --git a/requirements.txt b/requirements.txt
	--- a/requirements.txt
	+++ b/requirements.txt
	@@ -1,5 +1,3 @@
	-SQLAlchemy
	-arrow
	python_debian
	requests
	setuptools
	diff --git a/swh/lister/cli.py b/swh/lister/cli.py
	--- a/swh/lister/cli.py
	+++ b/swh/lister/cli.py
	@@ -1,4 +1,4 @@
	-# Copyright (C) 2018-2020 The Software Heritage developers
	+# Copyright (C) 2018-2021 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information
	@@ -14,31 +14,11 @@

	from swh.core.cli import CONTEXT_SETTINGS
	from swh.core.cli import swh as swh_cli_group
	-from swh.lister import LISTERS, SUPPORTED_LISTERS, get_lister
	+from swh.lister import SUPPORTED_LISTERS, get_lister

	logger = logging.getLogger(__name__)


	-# the key in this dict is the suffix used to match new task-type to be added.
	-# For example for a task which function name is "list_gitlab_full', the default
	-# value used when inserting a new task-type in the scheduler db will be the one
	-# under the 'full' key below (because it matches xxx_full).
	-DEFAULT_TASK_TYPE = {
	- "full": { # for tasks like 'list_xxx_full()'
	- "default_interval": "90 days",
	- "min_interval": "90 days",
	- "max_interval": "90 days",
	- "backoff_factor": 1,
	- },
	- "*": { # value if not suffix matches
	- "default_interval": "1 day",
	- "min_interval": "1 day",
	- "max_interval": "1 day",
	- "backoff_factor": 1,
	- },
	-}
	-
	-
	@swh_cli_group.group(name="lister", context_settings=CONTEXT_SETTINGS)
	@click.option(
	"--config-file",
	@@ -47,15 +27,8 @@
	type=click.Path(exists=True, dir_okay=False,),
	help="Configuration file.",
	)
	-@click.option(
	- "--db-url",
	- "-d",
	- default=None,
	- help="SQLAlchemy DB URL; see "
	- "<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>",
	-) # noqa
	@click.pass_context
	-def lister(ctx, config_file, db_url):
	+def lister(ctx, config_file):
	"""Software Heritage Lister tools."""
	from swh.core import config

	@@ -64,51 +37,8 @@
	if not config_file:
	config_file = os.environ.get("SWH_CONFIG_FILENAME")
	conf = config.read(config_file)
	- if db_url:
	- conf["lister"] = {"cls": "local", "args": {"db": db_url}}
	- ctx.obj["config"] = conf
	-
	-
	-@lister.command(name="db-init", context_settings=CONTEXT_SETTINGS)
	-@click.option(
	- "--drop-tables",
	- "-D",
	- is_flag=True,
	- default=False,
	- help="Drop tables before creating the database schema",
	-)
	-@click.pass_context
	-def db_init(ctx, drop_tables):
	- """Initialize the database model for given listers.

	- """
	- from sqlalchemy import create_engine
	-
	- from swh.lister.core.models import initialize
	-
	- cfg = ctx.obj["config"]
	- lister_cfg = cfg["lister"]
	- if lister_cfg["cls"] != "local":
	- click.echo("A local lister configuration is required")
	- ctx.exit(1)
	-
	- db_url = lister_cfg["args"]["db"]
	- db_engine = create_engine(db_url)
	-
	- registry = {}
	- for lister, entrypoint in LISTERS.items():
	- logger.info("Loading lister %s", lister)
	- registry[lister] = entrypoint.load()()
	-
	- logger.info("Initializing database")
	- initialize(db_engine, drop_tables)
	-
	- for lister, entrypoint in LISTERS.items():
	- registry_entry = registry[lister]
	- init_hook = registry_entry.get("init")
	- if callable(init_hook):
	- logger.info("Calling init hook for %s", lister)
	- init_hook(db_engine)
	+ ctx.obj["config"] = conf


	@lister.command(
	@@ -122,17 +52,9 @@
	@click.option(
	"--lister", "-l", help="Lister to run", type=click.Choice(SUPPORTED_LISTERS)
	)
	-@click.option(
	- "--priority",
	- "-p",
	- default="high",
	- type=click.Choice(["high", "medium", "low"]),
	- help="Task priority for the listed repositories to ingest",
	-)
	-@click.option("--legacy", help="Allow unported lister to run with such flag")
	@click.argument("options", nargs=-1)
	@click.pass_context
	-def run(ctx, lister, priority, options, legacy):
	+def run(ctx, lister, options):
	from swh.scheduler.cli.utils import parse_options

	config = deepcopy(ctx.obj["config"])
	@@ -140,10 +62,6 @@
	if options:
	config.update(parse_options(options)[1])

	- if legacy:
	- config["priority"] = priority
	- config["policy"] = "oneshot"
	-
	get_lister(lister, **config).run()


	diff --git a/swh/lister/core/__init__.py b/swh/lister/core/__init__.py
	deleted file mode 100644
	diff --git a/swh/lister/core/abstractattribute.py b/swh/lister/core/abstractattribute.py
	deleted file mode 100644
	--- a/swh/lister/core/abstractattribute.py
	+++ /dev/null
	@@ -1,28 +0,0 @@
	-# Copyright (C) 2017 the Software Heritage developers
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-
	-class AbstractAttribute:
	- """AbstractAttributes in a base class must be overridden by the subclass.
	-
	- It's like the :func:`abc.abstractmethod` decorator, but for things that
	- are explicitly attributes/properties, not methods, without the need for
	- empty method def boilerplate. Like abc.abstractmethod, the class containing
	- AbstractAttributes must inherit from :class:`abc.ABC` or use the
	- :class:`abc.ABCMeta` metaclass.
	-
	- Usage example::
	-
	- import abc
	- class ClassContainingAnAbstractAttribute(abc.ABC):
	- foo: Union[AbstractAttribute, Any] = \
	- AbstractAttribute('docstring for foo')
	-
	- """
	-
	- __isabstractmethod__ = True
	-
	- def __init__(self, docstring=None):
	- if docstring is not None:
	- self.__doc__ = "AbstractAttribute: " + docstring
	diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py
	deleted file mode 100644
	--- a/swh/lister/core/lister_base.py
	+++ /dev/null
	@@ -1,508 +0,0 @@
	-# Copyright (C) 2015-2020 the Software Heritage developers
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import abc
	-import datetime
	-import gzip
	-import json
	-import logging
	-import os
	-import re
	-import time
	-from typing import Any, Dict, List, Optional, Type, Union
	-
	-from requests import Response
	-from sqlalchemy import create_engine, func
	-from sqlalchemy.orm import sessionmaker
	-
	-from swh.core import config
	-from swh.core.utils import grouper
	-from swh.scheduler import get_scheduler, utils
	-
	-from .abstractattribute import AbstractAttribute
	-
	-logger = logging.getLogger(__name__)
	-
	-
	-def utcnow():
	- return datetime.datetime.now(tz=datetime.timezone.utc)
	-
	-
	-class FetchError(RuntimeError):
	- def __init__(self, response):
	- self.response = response
	-
	- def __str__(self):
	- return repr(self.response)
	-
	-
	-DEFAULT_CONFIG = {
	- "scheduler": {"cls": "memory"},
	- "lister": {"cls": "local", "args": {"db": "postgresql:///lister",},},
	- "credentials": {},
	- "cache_responses": False,
	-}
	-
	-
	-class ListerBase(abc.ABC):
	- """Lister core base class.
	- Generally a source code hosting service provides an API endpoint
	- for listing the set of stored repositories. A Lister is the discovery
	- service responsible for finding this list, all at once or sequentially
	- by parts, and queueing local tasks to fetch and ingest the referenced
	- repositories.
	-
	- The core method in this class is ingest_data. Any subclasses should be
	- calling this method one or more times to fetch and ingest data from API
	- endpoints. See swh.lister.core.lister_base.IndexingLister for
	- example usage.
	-
	- This class cannot be instantiated. Any instantiable Lister descending
	- from ListerBase must provide at least the required overrides.
	- (see member docstrings for details):
	-
	- Required Overrides:
	- MODEL
	- def transport_request
	- def transport_response_to_string
	- def transport_response_simplified
	- def transport_quota_check
	-
	- Optional Overrides:
	- def filter_before_inject
	- def is_within_bounds
	- """
	-
	- MODEL = AbstractAttribute(
	- "Subclass type (not instance) of swh.lister.core.models.ModelBase "
	- "customized for a specific service."
	- ) # type: Union[AbstractAttribute, Type[Any]]
	- LISTER_NAME = AbstractAttribute(
	- "Lister's name"
	- ) # type: Union[AbstractAttribute, str]
	-
	- def transport_request(self, identifier):
	- """Given a target endpoint identifier to query, try once to request it.
	-
	- Implementation of this method determines the network request protocol.
	-
	- Args:
	- identifier (string): unique identifier for an endpoint query.
	- e.g. If the service indexes lists of repositories by date and
	- time of creation, this might be that as a formatted string. Or
	- it might be an integer UID. Or it might be nothing.
	- It depends on what the service needs.
	- Returns:
	- the entire request response
	- Raises:
	- Will catch internal transport-dependent connection exceptions and
	- raise swh.lister.core.lister_base.FetchError instead. Other
	- non-connection exceptions should propagate unchanged.
	- """
	- pass
	-
	- def transport_response_to_string(self, response):
	- """Convert the server response into a formatted string for logging.
	-
	- Implementation of this method depends on the shape of the network
	- response object returned by the transport_request method.
	-
	- Args:
	- response: the server response
	- Returns:
	- a pretty string of the response
	- """
	- pass
	-
	- def transport_response_simplified(self, response):
	- """Convert the server response into list of a dict for each repo in the
	- response, mapping columns in the lister's MODEL class to repo data.
	-
	- Implementation of this method depends on the server API spec and the
	- shape of the network response object returned by the transport_request
	- method.
	-
	- Args:
	- response: response object from the server.
	- Returns:
	- list of repo MODEL dicts
	- ( eg. [{'uid': r['id'], etc.} for r in response.json()] )
	- """
	- pass
	-
	- def transport_quota_check(self, response):
	- """Check server response to see if we're hitting request rate limits.
	-
	- Implementation of this method depends on the server communication
	- protocol and API spec and the shape of the network response object
	- returned by the transport_request method.
	-
	- Args:
	- response (session response): complete API query response
	- Returns:
	- 1) must retry request? True/False
	- 2) seconds to delay if True
	- """
	- pass
	-
	- def filter_before_inject(self, models_list: List[Dict]) -> List[Dict]:
	- """Filter models_list entries prior to injection in the db.
	- This is ran directly after `transport_response_simplified`.
	-
	- Default implementation is to have no filtering.
	-
	- Args:
	- models_list: list of dicts returned by
	- transport_response_simplified.
	- Returns:
	- models_list with entries changed according to custom logic.
	-
	- """
	- return models_list
	-
	- def do_additional_checks(self, models_list: List[Dict]) -> List[Dict]:
	- """Execute some additional checks on the model list (after the
	- filtering).
	-
	- Default implementation is to run no check at all and to return
	- the input as is.
	-
	- Args:
	- models_list: list of dicts returned by
	- transport_response_simplified.
	-
	- Returns:
	- models_list with entries if checks ok, False otherwise
	-
	- """
	- return models_list
	-
	- def is_within_bounds(
	- self, inner: int, lower: Optional[int] = None, upper: Optional[int] = None
	- ) -> bool:
	- """See if a sortable value is inside the range [lower,upper].
	-
	- MAY BE OVERRIDDEN, for example if the server indexable* key is
	- technically sortable but not automatically so.
	-
	- * - ( see: swh.lister.core.indexing_lister.IndexingLister )
	-
	- Args:
	- inner (sortable type): the value being checked
	- lower (sortable type): optional lower bound
	- upper (sortable type): optional upper bound
	- Returns:
	- whether inner is confined by the optional lower and upper bounds
	- """
	- try:
	- if lower is None and upper is None:
	- return True
	- elif lower is None:
	- ret = inner <= upper # type: ignore
	- elif upper is None:
	- ret = inner >= lower
	- else:
	- ret = lower <= inner <= upper
	-
	- self.string_pattern_check(inner, lower, upper)
	- except Exception as e:
	- logger.error(
	- str(e)
	- + ": %s, %s, %s"
	- % (
	- ("inner=%s%s" % (type(inner), inner)),
	- ("lower=%s%s" % (type(lower), lower)),
	- ("upper=%s%s" % (type(upper), upper)),
	- )
	- )
	- raise
	-
	- return ret
	-
	- # You probably don't need to override anything below this line.
	-
	- INITIAL_BACKOFF = 10
	- MAX_RETRIES = 7
	- CONN_SLEEP = 10
	-
	- def __init__(self, override_config=None):
	- self.backoff = self.INITIAL_BACKOFF
	- self.config = config.load_from_envvar(DEFAULT_CONFIG)
	- if self.config["cache_responses"]:
	- cache_dir = self.config.get(
	- "cache_dir", f"~/.cache/swh/lister/{self.LISTER_NAME}"
	- )
	- self.config["cache_dir"] = os.path.expanduser(cache_dir)
	- config.prepare_folders(self.config, "cache_dir")
	-
	- if override_config:
	- self.config.update(override_config)
	-
	- logger.debug("%s CONFIG=%s" % (self, self.config))
	- self.scheduler = get_scheduler(**self.config["scheduler"])
	- self.db_engine = create_engine(self.config["lister"]["args"]["db"])
	- self.mk_session = sessionmaker(bind=self.db_engine)
	- self.db_session = self.mk_session()
	-
	- def reset_backoff(self):
	- """Reset exponential backoff timeout to initial level."""
	- self.backoff = self.INITIAL_BACKOFF
	-
	- def back_off(self) -> int:
	- """Get next exponential backoff timeout."""
	- ret = self.backoff
	- self.backoff *= 10
	- return ret
	-
	- def safely_issue_request(self, identifier: int) -> Optional[Response]:
	- """Make network request with retries, rate quotas, and response logs.
	-
	- Protocol is handled by the implementation of the transport_request
	- method.
	-
	- Args:
	- identifier: resource identifier
	- Returns:
	- server response
	- """
	- retries_left = self.MAX_RETRIES
	- do_cache = self.config["cache_responses"]
	- r = None
	- while retries_left > 0:
	- try:
	- r = self.transport_request(identifier)
	- except FetchError:
	- # network-level connection error, try again
	- logger.warning(
	- "connection error on %s: sleep for %d seconds"
	- % (identifier, self.CONN_SLEEP)
	- )
	- time.sleep(self.CONN_SLEEP)
	- retries_left -= 1
	- continue
	-
	- if do_cache:
	- self.save_response(r)
	-
	- # detect throttling
	- must_retry, delay = self.transport_quota_check(r)
	- if must_retry:
	- logger.warning(
	- "rate limited on %s: sleep for %f seconds" % (identifier, delay)
	- )
	- time.sleep(delay)
	- else: # request ok
	- break
	-
	- retries_left -= 1
	-
	- if not retries_left:
	- logger.warning("giving up on %s: max retries exceeded" % identifier)
	-
	- return r
	-
	- def db_query_equal(self, key: Any, value: Any):
	- """Look in the db for a row with key == value
	-
	- Args:
	- key: column key to look at
	- value: value to look for in that column
	- Returns:
	- sqlalchemy.ext.declarative.declarative_base object
	- with the given key == value
	- """
	- if isinstance(key, str):
	- key = self.MODEL.__dict__[key]
	- return self.db_session.query(self.MODEL).filter(key == value).first()
	-
	- def winnow_models(self, mlist, key, to_remove):
	- """Given a list of models, remove any with <key> matching
	- some member of a list of values.
	-
	- Args:
	- mlist (list of model rows): the initial list of models
	- key (column): the column to filter on
	- to_remove (list): if anything in mlist has column <key> equal to
	- one of the values in to_remove, it will be removed from the
	- result
	- Returns:
	- A list of model rows starting from mlist minus any matching rows
	- """
	- if isinstance(key, str):
	- key = self.MODEL.__dict__[key]
	-
	- if to_remove:
	- return mlist.filter(~key.in_(to_remove)).all()
	- else:
	- return mlist.all()
	-
	- def db_num_entries(self):
	- """Return the known number of entries in the lister db"""
	- return self.db_session.query(func.count("*")).select_from(self.MODEL).scalar()
	-
	- def db_inject_repo(self, model_dict):
	- """Add/update a new repo to the db and mark it last_seen now.
	-
	- Args:
	- model_dict: dictionary mapping model keys to values
	-
	- Returns:
	- new or updated sqlalchemy.ext.declarative.declarative_base
	- object associated with the injection
	-
	- """
	- sql_repo = self.db_query_equal("uid", model_dict["uid"])
	-
	- if not sql_repo:
	- sql_repo = self.MODEL(**model_dict)
	- self.db_session.add(sql_repo)
	- else:
	- for k in model_dict:
	- setattr(sql_repo, k, model_dict[k])
	- sql_repo.last_seen = utcnow()
	-
	- return sql_repo
	-
	- def task_dict(self, origin_type: str, origin_url: str, **kwargs) -> Dict[str, Any]:
	- """Return special dict format for the tasks list
	-
	- Args:
	- origin_type (string)
	- origin_url (string)
	- Returns:
	- the same information in a different form
	- """
	- logger.debug("origin-url: %s, type: %s", origin_url, origin_type)
	- _type = "load-%s" % origin_type
	- _policy = kwargs.get("policy", "recurring")
	- priority = kwargs.get("priority")
	- kw = {"priority": priority} if priority else {}
	- return utils.create_task_dict(_type, _policy, url=origin_url, **kw)
	-
	- def string_pattern_check(self, a, b, c=None):
	- """When comparing indexable types in is_within_bounds, complex strings
	- may not be allowed to differ in basic structure. If they do, it
	- could be a sign of not understanding the data well. For instance,
	- an ISO 8601 time string cannot be compared against its urlencoded
	- equivalent, but this is an easy mistake to accidentally make. This
	- method acts as a friendly sanity check.
	-
	- Args:
	- a (string): inner component of the is_within_bounds method
	- b (string): lower component of the is_within_bounds method
	- c (string): upper component of the is_within_bounds method
	- Returns:
	- nothing
	- Raises:
	- TypeError if strings a, b, and c don't conform to the same basic
	- pattern.
	- """
	- if isinstance(a, str):
	- a_pattern = re.sub("[a-zA-Z0-9]", "[a-zA-Z0-9]", re.escape(a))
	- if (
	- isinstance(b, str)
	- and (re.match(a_pattern, b) is None)
	- or isinstance(c, str)
	- and (re.match(a_pattern, c) is None)
	- ):
	- logger.debug(a_pattern)
	- raise TypeError("incomparable string patterns detected")
	-
	- def inject_repo_data_into_db(self, models_list: List[Dict]) -> Dict:
	- """Inject data into the db.
	-
	- Args:
	- models_list: list of dicts mapping keys from the db model
	- for each repo to be injected
	- Returns:
	- dict of uid:sql_repo pairs
	-
	- """
	- injected_repos = {}
	- for m in models_list:
	- injected_repos[m["uid"]] = self.db_inject_repo(m)
	- return injected_repos
	-
	- def schedule_missing_tasks(
	- self, models_list: List[Dict], injected_repos: Dict
	- ) -> None:
	- """Schedule any newly created db entries that do not have been
	- scheduled yet.
	-
	- Args:
	- models_list: List of dicts mapping keys in the db model
	- for each repo
	- injected_repos: Dict of uid:sql_repo pairs that have just
	- been created
	-
	- Returns:
	- Nothing. (Note that it Modifies injected_repos to set the new
	- task_id).
	-
	- """
	- tasks = {}
	-
	- def _task_key(m):
	- return "%s-%s" % (m["type"], json.dumps(m["arguments"], sort_keys=True))
	-
	- for m in models_list:
	- ir = injected_repos[m["uid"]]
	- if not ir.task_id:
	- # Patching the model instance to add the policy/priority task
	- # scheduling
	- if "policy" in self.config:
	- m["policy"] = self.config["policy"]
	- if "priority" in self.config:
	- m["priority"] = self.config["priority"]
	- task_dict = self.task_dict(**m)
	- task_dict.setdefault("retries_left", 3)
	- tasks[_task_key(task_dict)] = (ir, m, task_dict)
	-
	- gen_tasks = (task_dicts for (_, _, task_dicts) in tasks.values())
	- for grouped_tasks in grouper(gen_tasks, n=1000):
	- new_tasks = self.scheduler.create_tasks(list(grouped_tasks))
	- for task in new_tasks:
	- ir, m, _ = tasks[_task_key(task)]
	- ir.task_id = task["id"]
	-
	- def ingest_data(self, identifier: int, checks: bool = False):
	- """The core data fetch sequence. Request server endpoint. Simplify and
	- filter response list of repositories. Inject repo information into
	- local db. Queue loader tasks for linked repositories.
	-
	- Args:
	- identifier: Resource identifier.
	- checks (bool): Additional checks required
	- """
	- # Request (partial?) list of repositories info
	- response = self.safely_issue_request(identifier)
	- if not response:
	- return response, []
	- models_list = self.transport_response_simplified(response)
	- models_list = self.filter_before_inject(models_list)
	- if checks:
	- models_list = self.do_additional_checks(models_list)
	- if not models_list:
	- return response, []
	- # inject into local db
	- injected = self.inject_repo_data_into_db(models_list)
	- # queue workers
	- self.schedule_missing_tasks(models_list, injected)
	- return response, injected
	-
	- def save_response(self, response):
	- """Log the response from a server request to a cache dir.
	-
	- Args:
	- response: full server response
	- cache_dir: system path for cache dir
	- Returns:
	- nothing
	- """
	- datepath = utcnow().isoformat()
	-
	- fname = os.path.join(self.config["cache_dir"], datepath + ".gz",)
	-
	- with gzip.open(fname, "w") as f:
	- f.write(bytes(self.transport_response_to_string(response), "UTF-8"))
	diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py
	deleted file mode 100644
	--- a/swh/lister/core/lister_transports.py
	+++ /dev/null
	@@ -1,233 +0,0 @@
	-# Copyright (C) 2017-2018 the Software Heritage developers
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import abc
	-from datetime import datetime
	-from email.utils import parsedate
	-import logging
	-from pprint import pformat
	-import random
	-from typing import Any, Dict, List, Optional, Union
	-
	-import requests
	-from requests import Response
	-import xmltodict
	-
	-from swh.lister import USER_AGENT_TEMPLATE, __version__
	-
	-from .abstractattribute import AbstractAttribute
	-from .lister_base import FetchError
	-
	-logger = logging.getLogger(__name__)
	-
	-
	-class ListerHttpTransport(abc.ABC):
	- """Use the Requests library for making Lister endpoint requests.
	-
	- To be used in conjunction with ListerBase or a subclass of it.
	- """
	-
	- DEFAULT_URL = None # type: Optional[str]
	- PATH_TEMPLATE = AbstractAttribute(
	- "string containing a python string format pattern that produces"
	- " the API endpoint path for listing stored repositories when given"
	- ' an index, e.g., "/repositories?after=%s". To be implemented in'
	- " the API-specific class inheriting this."
	- ) # type: Union[AbstractAttribute, Optional[str]]
	-
	- EXPECTED_STATUS_CODES = (200, 429, 403, 404)
	-
	- def request_headers(self) -> Dict[str, Any]:
	- """Returns dictionary of any request headers needed by the server.
	-
	- MAY BE OVERRIDDEN if request headers are needed.
	- """
	- return {"User-Agent": USER_AGENT_TEMPLATE % self.lister_version}
	-
	- def request_instance_credentials(self) -> List[Dict[str, Any]]:
	- """Returns dictionary of any credentials configuration needed by the
	- forge instance to list.
	-
	- The 'credentials' configuration is expected to be a dict of multiple
	- levels. The first level is the lister's name, the second is the
	- lister's instance name, which value is expected to be a list of
	- credential structures (typically a couple username/password).
	-
	- For example::
	-
	- credentials:
	- github: # github lister
	- github: # has only one instance (so far)
	- - username: some
	- password: somekey
	- - username: one
	- password: onekey
	- - ...
	- gitlab: # gitlab lister
	- riseup: # has many instances
	- - username: someone
	- password: ...
	- - ...
	- gitlab:
	- - username: someone
	- password: ...
	- - ...
	-
	- Returns:
	- list of credential dicts for the current lister.
	-
	- """
	- all_creds = self.config.get("credentials") # type: ignore
	- if not all_creds:
	- return []
	- lister_creds = all_creds.get(self.LISTER_NAME, {}) # type: ignore
	- creds = lister_creds.get(self.instance, []) # type: ignore
	- return creds
	-
	- def request_uri(self, identifier: str) -> str:
	- """Get the full request URI given the transport_request identifier.
	-
	- MAY BE OVERRIDDEN if something more complex than the PATH_TEMPLATE is
	- required.
	- """
	- path = self.PATH_TEMPLATE % identifier # type: ignore
	- return self.url + path
	-
	- def request_params(self, identifier: str) -> Dict[str, Any]:
	- """Get the full parameters passed to requests given the
	- transport_request identifier.
	-
	- This uses credentials if any are provided (see
	- request_instance_credentials).
	-
	- MAY BE OVERRIDDEN if something more complex than the request headers
	- is needed.
	-
	- """
	- params = {}
	- params["headers"] = self.request_headers() or {}
	- creds = self.request_instance_credentials()
	- if not creds:
	- return params
	- auth = random.choice(creds) if creds else None
	- if auth:
	- params["auth"] = (
	- auth["username"], # type: ignore
	- auth["password"],
	- )
	- return params
	-
	- def transport_quota_check(self, response):
	- """Implements ListerBase.transport_quota_check with standard 429
	- code check for HTTP with Requests library.
	-
	- MAY BE OVERRIDDEN if the server notifies about rate limits in a
	- non-standard way that doesn't use HTTP 429 and the Retry-After
	- response header. ( https://tools.ietf.org/html/rfc6585#section-4 )
	-
	- """
	- if response.status_code == 429: # HTTP too many requests
	- retry_after = response.headers.get("Retry-After", self.back_off())
	- try:
	- # might be seconds
	- return True, float(retry_after)
	- except Exception:
	- # might be http-date
	- at_date = datetime(*parsedate(retry_after)[:6])
	- from_now = (at_date - datetime.today()).total_seconds() + 5
	- return True, max(0, from_now)
	- else: # response ok
	- self.reset_backoff()
	- return False, 0
	-
	- def __init__(self, url=None):
	- if not url:
	- url = self.config.get("url")
	- if not url:
	- url = self.DEFAULT_URL
	- if not url:
	- raise NameError("HTTP Lister Transport requires an url.")
	- self.url = url # eg. 'https://api.github.com'
	- self.session = requests.Session()
	- self.lister_version = __version__
	-
	- def _transport_action(self, identifier: str, method: str = "get") -> Response:
	- """Permit to ask information to the api prior to actually executing
	- query.
	-
	- """
	- path = self.request_uri(identifier)
	- params = self.request_params(identifier)
	-
	- logger.debug("path: %s", path)
	- logger.debug("params: %s", params)
	- logger.debug("method: %s", method)
	- try:
	- if method == "head":
	- response = self.session.head(path, **params)
	- else:
	- response = self.session.get(path, **params)
	- except requests.exceptions.ConnectionError as e:
	- logger.warning("Failed to fetch %s: %s", path, e)
	- raise FetchError(e)
	- else:
	- if response.status_code not in self.EXPECTED_STATUS_CODES:
	- raise FetchError(response)
	- return response
	-
	- def transport_head(self, identifier: str) -> Response:
	- """Retrieve head information on api.
	-
	- """
	- return self._transport_action(identifier, method="head")
	-
	- def transport_request(self, identifier: str) -> Response:
	- """Implements ListerBase.transport_request for HTTP using Requests.
	-
	- Retrieve get information on api.
	-
	- """
	- return self._transport_action(identifier)
	-
	- def transport_response_to_string(self, response: Response) -> str:
	- """Implements ListerBase.transport_response_to_string for HTTP given
	- Requests responses.
	- """
	- s = pformat(response.request.path_url)
	- s += "\n#\n" + pformat(response.request.headers)
	- s += "\n#\n" + pformat(response.status_code)
	- s += "\n#\n" + pformat(response.headers)
	- s += "\n#\n"
	- try: # json?
	- s += pformat(response.json())
	- except Exception: # not json
	- try: # xml?
	- s += pformat(xmltodict.parse(response.text))
	- except Exception: # not xml
	- s += pformat(response.text)
	- return s
	-
	-
	-class ListerOnePageApiTransport(ListerHttpTransport):
	- """Leverage requests library to retrieve basic html page and parse
	- result.
	-
	- To be used in conjunction with ListerBase or a subclass of it.
	-
	- """
	-
	- PAGE = AbstractAttribute(
	- "URL of the API's unique page to retrieve and parse " "for information"
	- ) # type: Union[AbstractAttribute, str]
	- PATH_TEMPLATE = None # we do not use it
	-
	- def __init__(self, url=None):
	- self.session = requests.Session()
	- self.lister_version = __version__
	-
	- def request_uri(self, _):
	- """Get the full request URI given the transport_request identifier.
	-
	- """
	- return self.PAGE
	diff --git a/swh/lister/core/models.py b/swh/lister/core/models.py
	deleted file mode 100644
	--- a/swh/lister/core/models.py
	+++ /dev/null
	@@ -1,77 +0,0 @@
	-# Copyright (C) 2015-2019 the Software Heritage developers
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import abc
	-from datetime import datetime
	-import logging
	-from typing import Type, Union
	-
	-from sqlalchemy import Column, DateTime, Integer, String
	-from sqlalchemy.ext.declarative import DeclarativeMeta, declarative_base
	-
	-from .abstractattribute import AbstractAttribute
	-
	-SQLBase = declarative_base()
	-
	-
	-logger = logging.getLogger(__name__)
	-
	-
	-class ABCSQLMeta(abc.ABCMeta, DeclarativeMeta):
	- pass
	-
	-
	-class ModelBase(SQLBase, metaclass=ABCSQLMeta):
	- """a common repository"""
	-
	- __abstract__ = True
	- __tablename__ = AbstractAttribute # type: Union[Type[AbstractAttribute], str]
	-
	- uid = AbstractAttribute(
	- "Column(<uid_type>, primary_key=True)"
	- ) # type: Union[AbstractAttribute, Column]
	-
	- name = Column(String, index=True)
	- full_name = Column(String, index=True)
	- html_url = Column(String)
	- origin_url = Column(String)
	- origin_type = Column(String)
	-
	- last_seen = Column(DateTime, nullable=False)
	-
	- task_id = Column(Integer)
	-
	- def __init__(self, **kw):
	- kw["last_seen"] = datetime.now()
	- super().__init__(**kw)
	-
	-
	-class IndexingModelBase(ModelBase, metaclass=ABCSQLMeta):
	- __abstract__ = True
	- __tablename__ = AbstractAttribute # type: Union[Type[AbstractAttribute], str]
	-
	- # The value used for sorting, segmenting, or api query paging,
	- # because uids aren't always sequential.
	- indexable = AbstractAttribute(
	- "Column(<indexable_type>, index=True)"
	- ) # type: Union[AbstractAttribute, Column]
	-
	-
	-def initialize(db_engine, drop_tables=False, **kwargs):
	- """Default database initialization function for a lister.
	-
	- Typically called from the lister's initialization hook.
	-
	- Args:
	- models (list): list of SQLAlchemy tables/models to drop/create.
	- db_engine (): the SQLAlchemy DB engine.
	- drop_tables (bool): if True, tables will be dropped before
	- (re)creating them.
	- """
	- if drop_tables:
	- logger.info("Dropping tables")
	- SQLBase.metadata.drop_all(db_engine, checkfirst=True)
	-
	- logger.info("Creating tables")
	- SQLBase.metadata.create_all(db_engine, checkfirst=True)
	diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py
	deleted file mode 100644
	--- a/swh/lister/core/simple_lister.py
	+++ /dev/null
	@@ -1,96 +0,0 @@
	-# Copyright (C) 2018-2019 The Software Heritage developers
	-# See the AUTHORS file at the top-level directory of this distribution
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import logging
	-from typing import Any, List
	-
	-from swh.core import utils
	-
	-from .lister_base import ListerBase
	-
	-logger = logging.getLogger(__name__)
	-
	-
	-class SimpleLister(ListerBase):
	- """Lister* intermediate class for any service that follows the simple,
	- 'list in oneshot information' pattern.
	-
	- - Client sends a request to list repositories in oneshot
	-
	- - Client receives structured (json/xml/etc) response with
	- information and stores those in db
	-
	- """
	-
	- flush_packet_db = 2
	- """Number of iterations in-between write flushes of lister repositories to
	- db (see fn:`ingest_data`).
	- """
	-
	- def list_packages(self, response: Any) -> List[Any]:
	- """Listing packages method.
	-
	- """
	- pass
	-
	- def ingest_data(self, identifier, checks=False):
	- """Rework the base ingest_data.
	- Request server endpoint which gives all in one go.
	-
	- Simplify and filter response list of repositories. Inject
	- repo information into local db. Queue loader tasks for
	- linked repositories.
	-
	- Args:
	- identifier: Resource identifier (unused)
	- checks (bool): Additional checks required (unused)
	-
	- """
	- response = self.safely_issue_request(identifier)
	- response = self.list_packages(response)
	- if not response:
	- return response, []
	- models_list = self.transport_response_simplified(response)
	- models_list = self.filter_before_inject(models_list)
	- all_injected = []
	- for i, models in enumerate(utils.grouper(models_list, n=100), start=1):
	- models = list(models)
	- logging.debug("models: %s" % len(models))
	- # inject into local db
	- injected = self.inject_repo_data_into_db(models)
	- # queue workers
	- self.schedule_missing_tasks(models, injected)
	- all_injected.append(injected)
	- if (i % self.flush_packet_db) == 0:
	- logger.debug("Flushing updates at index %s", i)
	- self.db_session.commit()
	- self.db_session = self.mk_session()
	-
	- return response, all_injected
	-
	- def transport_response_simplified(self, response):
	- """Transform response to list for model manipulation
	-
	- """
	- return [self.get_model_from_repo(repo_name) for repo_name in response]
	-
	- def run(self):
	- """Query the server which answers in one query. Stores the
	- information, dropping actual redundant information we
	- already have.
	-
	- Returns:
	- nothing
	-
	- """
	- dump_not_used_identifier = 0
	- response, injected_repos = self.ingest_data(dump_not_used_identifier)
	- if not response and not injected_repos:
	- logging.info("No response from api server, stopping")
	- status = "uneventful"
	- else:
	- status = "eventful"
	-
	- return {"status": status}
	diff --git a/swh/lister/core/tests/__init__.py b/swh/lister/core/tests/__init__.py
	deleted file mode 100644
	diff --git a/swh/lister/core/tests/test_abstractattribute.py b/swh/lister/core/tests/test_abstractattribute.py
	deleted file mode 100644
	--- a/swh/lister/core/tests/test_abstractattribute.py
	+++ /dev/null
	@@ -1,64 +0,0 @@
	-# Copyright (C) 2017 the Software Heritage developers
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import abc
	-from typing import Any
	-import unittest
	-
	-from swh.lister.core.abstractattribute import AbstractAttribute
	-
	-
	-class BaseClass(abc.ABC):
	- v1 = AbstractAttribute # type: Any
	- v2 = AbstractAttribute() # type: Any
	- v3 = AbstractAttribute("changed docstring") # type: Any
	- v4 = "qux"
	-
	-
	-class BadSubclass1(BaseClass):
	- pass
	-
	-
	-class BadSubclass2(BaseClass):
	- v1 = "foo"
	- v2 = "bar"
	-
	-
	-class BadSubclass3(BaseClass):
	- v2 = "bar"
	- v3 = "baz"
	-
	-
	-class GoodSubclass(BaseClass):
	- v1 = "foo"
	- v2 = "bar"
	- v3 = "baz"
	-
	-
	-class TestAbstractAttributes(unittest.TestCase):
	- def test_aa(self):
	- with self.assertRaises(TypeError):
	- BaseClass()
	-
	- with self.assertRaises(TypeError):
	- BadSubclass1()
	-
	- with self.assertRaises(TypeError):
	- BadSubclass2()
	-
	- with self.assertRaises(TypeError):
	- BadSubclass3()
	-
	- self.assertIsInstance(GoodSubclass(), GoodSubclass)
	- gsc = GoodSubclass()
	-
	- self.assertEqual(gsc.v1, "foo")
	- self.assertEqual(gsc.v2, "bar")
	- self.assertEqual(gsc.v3, "baz")
	- self.assertEqual(gsc.v4, "qux")
	-
	- def test_aa_docstrings(self):
	- self.assertEqual(BaseClass.v1.__doc__, AbstractAttribute.__doc__)
	- self.assertEqual(BaseClass.v2.__doc__, AbstractAttribute.__doc__)
	- self.assertEqual(BaseClass.v3.__doc__, "AbstractAttribute: changed docstring")
	diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py
	deleted file mode 100644
	--- a/swh/lister/core/tests/test_lister.py
	+++ /dev/null
	@@ -1,453 +0,0 @@
	-# Copyright (C) 2019 the Software Heritage developers
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import abc
	-import datetime
	-import time
	-from typing import Any, Callable, Optional, Pattern, Type, Union
	-from unittest import TestCase
	-from unittest.mock import Mock, patch
	-
	-import requests_mock
	-from sqlalchemy import create_engine
	-
	-import swh.lister
	-from swh.lister.core.abstractattribute import AbstractAttribute
	-from swh.lister.tests.test_utils import init_db
	-
	-
	-def noop(args, *kwargs):
	- pass
	-
	-
	-def test_version_generation():
	- assert (
	- swh.lister.__version__ != "devel"
	- ), "Make sure swh.lister is installed (e.g. pip install -e .)"
	-
	-
	-class HttpListerTesterBase(abc.ABC):
	- """Testing base class for listers.
	- This contains methods for both :class:`HttpSimpleListerTester` and
	- :class:`HttpListerTester`.
	-
	- See :class:`swh.lister.gitlab.tests.test_lister` for an example of how
	- to customize for a specific listing service.
	-
	- """
	-
	- Lister = AbstractAttribute(
	- "Lister class to test"
	- ) # type: Union[AbstractAttribute, Type[Any]]
	- lister_subdir = AbstractAttribute(
	- "bitbucket, github, etc."
	- ) # type: Union[AbstractAttribute, str]
	- good_api_response_file = AbstractAttribute(
	- "Example good response body"
	- ) # type: Union[AbstractAttribute, str]
	- LISTER_NAME = "fake-lister"
	-
	- # May need to override this if the headers are used for something
	- def response_headers(self, request):
	- return {}
	-
	- # May need to override this if the server uses non-standard rate limiting
	- # method.
	- # Please keep the requested retry delay reasonably low.
	- def mock_rate_quota(self, n, request, context):
	- self.rate_limit += 1
	- context.status_code = 429
	- context.headers["Retry-After"] = "1"
	- return '{"error":"dummy"}'
	-
	- def __init__(self, args, *kwargs):
	- super().__init__(args, *kwargs)
	- self.rate_limit = 1
	- self.response = None
	- self.fl = None
	- self.helper = None
	- self.scheduler_tasks = []
	- if self.__class__ != HttpListerTesterBase:
	- self.run = TestCase.run.__get__(self, self.__class__)
	- else:
	- self.run = noop
	-
	- def mock_limit_n_response(self, n, request, context):
	- self.fl.reset_backoff()
	- if self.rate_limit <= n:
	- return self.mock_rate_quota(n, request, context)
	- else:
	- return self.mock_response(request, context)
	-
	- def mock_limit_twice_response(self, request, context):
	- return self.mock_limit_n_response(2, request, context)
	-
	- def get_api_response(self, identifier):
	- fl = self.get_fl()
	- if self.response is None:
	- self.response = fl.safely_issue_request(identifier)
	- return self.response
	-
	- def get_fl(self, override_config=None):
	- """Retrieve an instance of fake lister (fl).
	-
	- """
	- if override_config or self.fl is None:
	- self.fl = self.Lister(
	- url="https://fakeurl", override_config=override_config
	- )
	- self.fl.INITIAL_BACKOFF = 1
	-
	- self.fl.reset_backoff()
	- self.scheduler_tasks = []
	- return self.fl
	-
	- def disable_scheduler(self, fl):
	- fl.schedule_missing_tasks = Mock(return_value=None)
	-
	- def mock_scheduler(self, fl):
	- def _create_tasks(tasks):
	- task_id = 0
	- current_nb_tasks = len(self.scheduler_tasks)
	- if current_nb_tasks > 0:
	- task_id = self.scheduler_tasks[-1]["id"] + 1
	- for task in tasks:
	- scheduler_task = dict(task)
	- scheduler_task.update(
	- {
	- "status": "next_run_not_scheduled",
	- "retries_left": 0,
	- "priority": None,
	- "id": task_id,
	- "current_interval": datetime.timedelta(days=64),
	- }
	- )
	- self.scheduler_tasks.append(scheduler_task)
	- task_id = task_id + 1
	- return self.scheduler_tasks[current_nb_tasks:]
	-
	- def _disable_tasks(task_ids):
	- for task_id in task_ids:
	- self.scheduler_tasks[task_id]["status"] = "disabled"
	-
	- fl.scheduler.create_tasks = Mock(wraps=_create_tasks)
	- fl.scheduler.disable_tasks = Mock(wraps=_disable_tasks)
	-
	- def disable_db(self, fl):
	- fl.winnow_models = Mock(return_value=[])
	- fl.db_inject_repo = Mock(return_value=fl.MODEL())
	- fl.disable_deleted_repo_tasks = Mock(return_value=None)
	-
	- def init_db(self, db, model):
	- engine = create_engine(db.url())
	- model.metadata.create_all(engine)
	-
	- @requests_mock.Mocker()
	- def test_is_within_bounds(self, http_mocker):
	- fl = self.get_fl()
	- self.assertFalse(fl.is_within_bounds(1, 2, 3))
	- self.assertTrue(fl.is_within_bounds(2, 1, 3))
	- self.assertTrue(fl.is_within_bounds(1, 1, 1))
	- self.assertTrue(fl.is_within_bounds(1, None, None))
	- self.assertTrue(fl.is_within_bounds(1, None, 2))
	- self.assertTrue(fl.is_within_bounds(1, 0, None))
	- self.assertTrue(fl.is_within_bounds("b", "a", "c"))
	- self.assertFalse(fl.is_within_bounds("a", "b", "c"))
	- self.assertTrue(fl.is_within_bounds("a", None, "c"))
	- self.assertTrue(fl.is_within_bounds("a", None, None))
	- self.assertTrue(fl.is_within_bounds("b", "a", None))
	- self.assertFalse(fl.is_within_bounds("a", "b", None))
	- self.assertTrue(fl.is_within_bounds("aa:02", "aa:01", "aa:03"))
	- self.assertFalse(fl.is_within_bounds("aa:12", None, "aa:03"))
	- with self.assertRaises(TypeError):
	- fl.is_within_bounds(1.0, "b", None)
	- with self.assertRaises(TypeError):
	- fl.is_within_bounds("A:B", "A::B", None)
	-
	-
	-class HttpListerTester(HttpListerTesterBase, abc.ABC):
	- """Base testing class for subclass of
	-
	- :class:`swh.lister.core.indexing_lister.IndexingHttpLister`
	-
	- See :class:`swh.lister.github.tests.test_gh_lister` for an example of how
	- to customize for a specific listing service.
	-
	- """
	-
	- last_index = AbstractAttribute(
	- "Last index " "in good_api_response"
	- ) # type: Union[AbstractAttribute, int]
	- first_index = AbstractAttribute(
	- "First index in " " good_api_response"
	- ) # type: Union[AbstractAttribute, Optional[int]]
	- bad_api_response_file = AbstractAttribute(
	- "Example bad response body"
	- ) # type: Union[AbstractAttribute, str]
	- entries_per_page = AbstractAttribute(
	- "Number of results in " "good response"
	- ) # type: Union[AbstractAttribute, int]
	- test_re = AbstractAttribute(
	- "Compiled regex matching the server url. Must capture the " "index value."
	- ) # type: Union[AbstractAttribute, Pattern]
	- convert_type = str # type: Callable[..., Any]
	- """static method used to convert the "request_index" to its right type (for
	- indexing listers for example, this is in accordance with the model's
	- "indexable" column).
	-
	- """
	-
	- def mock_response(self, request, context):
	- self.fl.reset_backoff()
	- self.rate_limit = 1
	- context.status_code = 200
	- custom_headers = self.response_headers(request)
	- context.headers.update(custom_headers)
	- req_index = self.request_index(request)
	-
	- if req_index == self.first_index:
	- response_file = self.good_api_response_file
	- else:
	- response_file = self.bad_api_response_file
	-
	- with open(
	- "swh/lister/%s/tests/%s" % (self.lister_subdir, response_file),
	- "r",
	- encoding="utf-8",
	- ) as r:
	- return r.read()
	-
	- def request_index(self, request):
	- m = self.test_re.search(request.path_url)
	- if m and (len(m.groups()) > 0):
	- return self.convert_type(m.group(1))
	-
	- def create_fl_with_db(self, http_mocker):
	- http_mocker.get(self.test_re, text=self.mock_response)
	- db = init_db()
	-
	- fl = self.get_fl(
	- override_config={"lister": {"cls": "local", "args": {"db": db.url()}}}
	- )
	- fl.db = db
	- self.init_db(db, fl.MODEL)
	-
	- self.mock_scheduler(fl)
	- return fl
	-
	- @requests_mock.Mocker()
	- def test_fetch_no_bounds_yesdb(self, http_mocker):
	- fl = self.create_fl_with_db(http_mocker)
	-
	- fl.run()
	-
	- self.assertEqual(fl.db_last_index(), self.last_index)
	- ingested_repos = list(fl.db_query_range(self.first_index, self.last_index))
	- self.assertEqual(len(ingested_repos), self.entries_per_page)
	-
	- @requests_mock.Mocker()
	- def test_fetch_multiple_pages_yesdb(self, http_mocker):
	-
	- fl = self.create_fl_with_db(http_mocker)
	- fl.run(min_bound=self.first_index)
	-
	- self.assertEqual(fl.db_last_index(), self.last_index)
	-
	- partitions = fl.db_partition_indices(5)
	- self.assertGreater(len(partitions), 0)
	- for k in partitions:
	- self.assertLessEqual(len(k), 5)
	- self.assertGreater(len(k), 0)
	-
	- @requests_mock.Mocker()
	- def test_fetch_none_nodb(self, http_mocker):
	- http_mocker.get(self.test_re, text=self.mock_response)
	- fl = self.get_fl()
	-
	- self.disable_scheduler(fl)
	- self.disable_db(fl)
	-
	- fl.run(min_bound=1, max_bound=1) # stores no results
	- # FIXME: Determine what this method tries to test and add checks to
	- # actually test
	-
	- @requests_mock.Mocker()
	- def test_fetch_one_nodb(self, http_mocker):
	- http_mocker.get(self.test_re, text=self.mock_response)
	- fl = self.get_fl()
	-
	- self.disable_scheduler(fl)
	- self.disable_db(fl)
	-
	- fl.run(min_bound=self.first_index, max_bound=self.first_index)
	- # FIXME: Determine what this method tries to test and add checks to
	- # actually test
	-
	- @requests_mock.Mocker()
	- def test_fetch_multiple_pages_nodb(self, http_mocker):
	- http_mocker.get(self.test_re, text=self.mock_response)
	- fl = self.get_fl()
	-
	- self.disable_scheduler(fl)
	- self.disable_db(fl)
	-
	- fl.run(min_bound=self.first_index)
	- # FIXME: Determine what this method tries to test and add checks to
	- # actually test
	-
	- @requests_mock.Mocker()
	- def test_repos_list(self, http_mocker):
	- """Test the number of repos listed by the lister
	-
	- """
	- http_mocker.get(self.test_re, text=self.mock_response)
	- li = self.get_fl().transport_response_simplified(
	- self.get_api_response(self.first_index)
	- )
	- self.assertIsInstance(li, list)
	- self.assertEqual(len(li), self.entries_per_page)
	-
	- @requests_mock.Mocker()
	- def test_model_map(self, http_mocker):
	- """Check if all the keys of model are present in the model created by
	- the `transport_response_simplified`
	-
	- """
	- http_mocker.get(self.test_re, text=self.mock_response)
	- fl = self.get_fl()
	- li = fl.transport_response_simplified(self.get_api_response(self.first_index))
	- di = li[0]
	- self.assertIsInstance(di, dict)
	- pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith("_")]
	- for k in pubs:
	- if k not in ["last_seen", "task_id", "id"]:
	- self.assertIn(k, di)
	-
	- @requests_mock.Mocker()
	- def test_api_request(self, http_mocker):
	- """Test API request for rate limit handling
	-
	- """
	- http_mocker.get(self.test_re, text=self.mock_limit_twice_response)
	- with patch.object(time, "sleep", wraps=time.sleep) as sleepmock:
	- self.get_api_response(self.first_index)
	- self.assertEqual(sleepmock.call_count, 2)
	-
	- @requests_mock.Mocker()
	- def test_request_headers(self, http_mocker):
	- fl = self.create_fl_with_db(http_mocker)
	- fl.run()
	- self.assertNotEqual(len(http_mocker.request_history), 0)
	- for request in http_mocker.request_history:
	- assert "User-Agent" in request.headers
	- user_agent = request.headers["User-Agent"]
	- assert "Software Heritage Lister" in user_agent
	- assert swh.lister.__version__ in user_agent
	-
	- def scheduled_tasks_test(
	- self, next_api_response_file, next_last_index, http_mocker
	- ):
	- """Check that no loading tasks get disabled when processing a new
	- page of repositories returned by a forge API
	- """
	- fl = self.create_fl_with_db(http_mocker)
	-
	- # process first page of repositories listing
	- fl.run()
	-
	- # process second page of repositories listing
	- prev_last_index = self.last_index
	- self.first_index = self.last_index
	- self.last_index = next_last_index
	- self.good_api_response_file = next_api_response_file
	- fl.run(min_bound=prev_last_index)
	-
	- # check expected number of ingested repos and loading tasks
	- ingested_repos = list(fl.db_query_range(0, self.last_index))
	- self.assertEqual(len(ingested_repos), len(self.scheduler_tasks))
	- self.assertEqual(len(ingested_repos), 2 * self.entries_per_page)
	-
	- # check tasks are not disabled
	- for task in self.scheduler_tasks:
	- self.assertTrue(task["status"] != "disabled")
	-
	-
	-class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC):
	- """Base testing class for subclass of
	- :class:`swh.lister.core.simple)_lister.SimpleLister`
	-
	- See :class:`swh.lister.pypi.tests.test_lister` for an example of how
	- to customize for a specific listing service.
	-
	- """
	-
	- entries = AbstractAttribute(
	- "Number of results " "in good response"
	- ) # type: Union[AbstractAttribute, int]
	- PAGE = AbstractAttribute(
	- "URL of the server api's unique page to retrieve and " "parse for information"
	- ) # type: Union[AbstractAttribute, str]
	-
	- def get_fl(self, override_config=None):
	- """Retrieve an instance of fake lister (fl).
	-
	- """
	- if override_config or self.fl is None:
	- self.fl = self.Lister(override_config=override_config)
	- self.fl.INITIAL_BACKOFF = 1
	-
	- self.fl.reset_backoff()
	- return self.fl
	-
	- def mock_response(self, request, context):
	- self.fl.reset_backoff()
	- self.rate_limit = 1
	- context.status_code = 200
	- custom_headers = self.response_headers(request)
	- context.headers.update(custom_headers)
	- response_file = self.good_api_response_file
	-
	- with open(
	- "swh/lister/%s/tests/%s" % (self.lister_subdir, response_file),
	- "r",
	- encoding="utf-8",
	- ) as r:
	- return r.read()
	-
	- @requests_mock.Mocker()
	- def test_api_request(self, http_mocker):
	- """Test API request for rate limit handling
	-
	- """
	- http_mocker.get(self.PAGE, text=self.mock_limit_twice_response)
	- with patch.object(time, "sleep", wraps=time.sleep) as sleepmock:
	- self.get_api_response(0)
	- self.assertEqual(sleepmock.call_count, 2)
	-
	- @requests_mock.Mocker()
	- def test_model_map(self, http_mocker):
	- """Check if all the keys of model are present in the model created by
	- the `transport_response_simplified`
	-
	- """
	- http_mocker.get(self.PAGE, text=self.mock_response)
	- fl = self.get_fl()
	- li = fl.list_packages(self.get_api_response(0))
	- li = fl.transport_response_simplified(li)
	- di = li[0]
	- self.assertIsInstance(di, dict)
	- pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith("_")]
	- for k in pubs:
	- if k not in ["last_seen", "task_id", "id"]:
	- self.assertIn(k, di)
	-
	- @requests_mock.Mocker()
	- def test_repos_list(self, http_mocker):
	- """Test the number of packages listed by the lister
	-
	- """
	- http_mocker.get(self.PAGE, text=self.mock_response)
	- li = self.get_fl().list_packages(self.get_api_response(0))
	- self.assertIsInstance(li, list)
	- self.assertEqual(len(li), self.entries)
	diff --git a/swh/lister/core/tests/test_model.py b/swh/lister/core/tests/test_model.py
	deleted file mode 100644
	--- a/swh/lister/core/tests/test_model.py
	+++ /dev/null
	@@ -1,91 +0,0 @@
	-# Copyright (C) 2017 the Software Heritage developers
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import unittest
	-
	-from sqlalchemy import Column, Integer
	-
	-from swh.lister.core.models import IndexingModelBase, ModelBase
	-
	-
	-class BadSubclass1(ModelBase):
	- __abstract__ = True
	- pass
	-
	-
	-class BadSubclass2(ModelBase):
	- __abstract__ = True
	- __tablename__ = "foo"
	-
	-
	-class BadSubclass3(BadSubclass2):
	- __abstract__ = True
	- pass
	-
	-
	-class GoodSubclass(BadSubclass2):
	- uid = Column(Integer, primary_key=True)
	- indexable = Column(Integer, index=True)
	-
	-
	-class IndexingBadSubclass(IndexingModelBase):
	- __abstract__ = True
	- pass
	-
	-
	-class IndexingBadSubclass2(IndexingModelBase):
	- __abstract__ = True
	- __tablename__ = "foo"
	-
	-
	-class IndexingBadSubclass3(IndexingBadSubclass2):
	- __abstract__ = True
	- pass
	-
	-
	-class IndexingGoodSubclass(IndexingModelBase):
	- uid = Column(Integer, primary_key=True)
	- indexable = Column(Integer, index=True)
	- __tablename__ = "bar"
	-
	-
	-class TestModel(unittest.TestCase):
	- def test_model_instancing(self):
	- with self.assertRaises(TypeError):
	- ModelBase()
	-
	- with self.assertRaises(TypeError):
	- BadSubclass1()
	-
	- with self.assertRaises(TypeError):
	- BadSubclass2()
	-
	- with self.assertRaises(TypeError):
	- BadSubclass3()
	-
	- self.assertIsInstance(GoodSubclass(), GoodSubclass)
	- gsc = GoodSubclass(uid="uid")
	-
	- self.assertEqual(gsc.__tablename__, "foo")
	- self.assertEqual(gsc.uid, "uid")
	-
	- def test_indexing_model_instancing(self):
	- with self.assertRaises(TypeError):
	- IndexingModelBase()
	-
	- with self.assertRaises(TypeError):
	- IndexingBadSubclass()
	-
	- with self.assertRaises(TypeError):
	- IndexingBadSubclass2()
	-
	- with self.assertRaises(TypeError):
	- IndexingBadSubclass3()
	-
	- self.assertIsInstance(IndexingGoodSubclass(), IndexingGoodSubclass)
	- gsc = IndexingGoodSubclass(uid="uid", indexable="indexable")
	-
	- self.assertEqual(gsc.__tablename__, "bar")
	- self.assertEqual(gsc.uid, "uid")
	- self.assertEqual(gsc.indexable, "indexable")
	diff --git a/swh/lister/pytest_plugin.py b/swh/lister/pytest_plugin.py
	deleted file mode 100644
	--- a/swh/lister/pytest_plugin.py
	+++ /dev/null
	@@ -1,62 +0,0 @@
	-# Copyright (C) 2019-2020 The Software Heritage developers
	-# See the AUTHORS file at the top-level directory of this distribution
	-# License: GNU General Public License version 3, or any later version
	-# See top-level LICENSE file for more information
	-
	-import logging
	-import os
	-
	-import pytest
	-from sqlalchemy import create_engine
	-import yaml
	-
	-from swh.lister import SUPPORTED_LISTERS, get_lister
	-from swh.lister.core.models import initialize
	-
	-logger = logging.getLogger(__name__)
	-
	-
	-@pytest.fixture
	-def lister_db_url(postgresql):
	- db_params = postgresql.get_dsn_parameters()
	- db_url = "postgresql://{user}@{host}:{port}/{dbname}".format(**db_params)
	- logger.debug("lister db_url: %s", db_url)
	- return db_url
	-
	-
	-@pytest.fixture
	-def lister_under_test():
	- """Fixture to determine which lister to test"""
	- return "core"
	-
	-
	-@pytest.fixture
	-def swh_lister_config(lister_db_url, swh_scheduler_config):
	- return {
	- "scheduler": {"cls": "local", **swh_scheduler_config},
	- "lister": {"cls": "local", "args": {"db": lister_db_url},},
	- "credentials": {},
	- "cache_responses": False,
	- }
	-
	-
	-@pytest.fixture(autouse=True)
	-def swh_config(swh_lister_config, monkeypatch, tmp_path):
	- conf_path = os.path.join(str(tmp_path), "lister.yml")
	- with open(conf_path, "w") as f:
	- f.write(yaml.dump(swh_lister_config))
	- monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path)
	- return conf_path
	-
	-
	-@pytest.fixture
	-def engine(lister_db_url):
	- engine = create_engine(lister_db_url)
	- initialize(engine, drop_tables=True)
	- return engine
	-
	-
	-@pytest.fixture
	-def swh_lister(engine, lister_db_url, lister_under_test, swh_config):
	- assert lister_under_test in SUPPORTED_LISTERS
	- return get_lister(lister_under_test, db_url=lister_db_url)
	diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
	--- a/swh/lister/tests/test_cli.py
	+++ b/swh/lister/tests/test_cli.py
	@@ -1,4 +1,4 @@
	-# Copyright (C) 2019-2020 The Software Heritage developers
	+# Copyright (C) 2019-2021 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information
	@@ -7,8 +7,6 @@

	from swh.lister.cli import SUPPORTED_LISTERS, get_lister

	-from .test_utils import init_db
	-
	lister_args = {
	"cgit": {"url": "https://git.eclipse.org/c/",},
	"phabricator": {
	@@ -33,13 +31,11 @@
	"""Instantiating a supported lister should be ok

	"""
	- db_url = init_db().url()
	# Drop launchpad lister from the lister to check, its test setup is more involved
	# than the other listers and it's not currently done here
	for lister_name in SUPPORTED_LISTERS:
	lst = get_lister(
	lister_name,
	- db_url,
	scheduler={"cls": "local", **swh_scheduler_config},
	**lister_args.get(lister_name, {}),
	)
	diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py
	--- a/swh/lister/tests/test_utils.py
	+++ b/swh/lister/tests/test_utils.py
	@@ -6,7 +6,6 @@
	import requests
	from requests.status_codes import codes
	from tenacity.wait import wait_fixed
	-from testing.postgresql import Postgresql

	from swh.lister.utils import (
	MAX_NUMBER_ATTEMPTS,
	@@ -37,18 +36,6 @@
	next(split_range(total_pages, nb_pages))


	-def init_db():
	- """Factorize the db_url instantiation
	-
	- Returns:
	- db object to ease db manipulation
	-
	- """
	- initdb_args = Postgresql.DEFAULT_SETTINGS["initdb_args"]
	- initdb_args = " ".join([initdb_args, "-E UTF-8"])
	- return Postgresql(initdb_args=initdb_args)
	-
	-
	TEST_URL = "https://example.og/api/repositories"

File Metadata

Mime Type: text/plain
Expires: Dec 17 2024, 12:12 PM (29 w, 6 h ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3219798

D4992.diffNo OneTemporaryActions

D4992.diffView Options

File Metadata

Event Timeline

D4992.diff
No OneTemporary
Actions

D4992.diff
View Options