diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py --- a/swh/lister/debian/__init__.py +++ b/swh/lister/debian/__init__.py @@ -1,76 +1,16 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging -from typing import Any, List, Mapping - -logger = logging.getLogger(__name__) - - -def debian_init( - db_engine, - override_conf: Mapping[str, Any] = {}, - distribution_name: str = "Debian", - suites: List[str] = ["stretch", "buster", "bullseye"], - components: List[str] = ["main", "contrib", "non-free"], -): - """Initialize the debian data model. - - Args: - db_engine: SQLAlchemy manipulation database object - override_conf: Override conf to pass to instantiate a lister - distribution_name: Distribution to initialize - suites: Default suites to register with the lister - components: Default components to register per suite - - """ - from sqlalchemy.orm import sessionmaker - - from swh.lister.debian.models import Area, Distribution - - db_session = sessionmaker(bind=db_engine)() - distrib = ( - db_session.query(Distribution) - .filter(Distribution.name == distribution_name) - .one_or_none() - ) - - if distrib is None: - distrib = Distribution( - name=distribution_name, - type="deb", - mirror_uri="http://deb.debian.org/debian/", - ) - db_session.add(distrib) - - # Check the existing - existing_area = db_session.query(Area).filter(Area.distribution == distrib).all() - existing_area = set([a.name for a in existing_area]) - - logger.debug("Area already known: %s", ", ".join(existing_area)) - - # Create only the new ones - for suite in suites: - for component in components: - area_name = f"{suite}/{component}" - if area_name in existing_area: - logger.debug("Area '%s' already set, skipping", 
area_name) - continue - area = Area(name=area_name, distribution=distrib) - db_session.add(area) - - db_session.commit() - db_session.close() +from typing import Any, Mapping def register() -> Mapping[str, Any]: from .lister import DebianLister return { - "models": [DebianLister.MODEL], + "models": [], "lister": DebianLister, "task_modules": ["%s.tasks" % __name__], - "init": debian_init, } diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -1,260 +1,287 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import bz2 from collections import defaultdict -import datetime +from dataclasses import dataclass, field import gzip +from itertools import product import logging import lzma -from typing import Any, Dict, Mapping, Optional +from typing import Any, Callable, Dict, Iterator, List, Set, Tuple +from urllib.parse import urljoin from debian.deb822 import Sources -from requests import Response -from sqlalchemy.orm import joinedload, load_only -from sqlalchemy.schema import CreateTable, DropTable - -from swh.lister.core.lister_base import FetchError, ListerBase -from swh.lister.core.lister_transports import ListerHttpTransport -from swh.lister.debian.models import ( - AreaSnapshot, - Distribution, - DistributionSnapshot, - Package, - TempPackage, -) - -decompressors = { +import requests + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. 
import USER_AGENT +from ..pattern import Lister + +logger = logging.getLogger(__name__) + +decompressors: Dict[str, Callable[[Any], Any]] = { "gz": lambda f: gzip.GzipFile(fileobj=f), "bz2": bz2.BZ2File, "xz": lzma.LZMAFile, } +Suite = str +Component = str +PkgName = str +PkgVersion = str +DebianOrigin = str +DebianPageType = Iterator[Sources] -logger = logging.getLogger(__name__) +@dataclass +class DebianListerState: + """State of debian lister""" -class DebianLister(ListerHttpTransport, ListerBase): - MODEL = Package - PATH_TEMPLATE = None - LISTER_NAME = "debian" - instance = "debian" + package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) + """Dictionary mapping a package name to all the versions found during + last listing""" - def __init__( - self, - distribution: str = "Debian", - date: Optional[datetime.datetime] = None, - override_config: Mapping = {}, - ): - """Initialize the debian lister for a given distribution at a given - date. - Args: - distribution: name of the distribution (e.g. "Debian") - date: date the snapshot is taken (defaults to now if empty) - override_config: Override configuration (which takes precedence - over the parameters if provided) - - """ - ListerHttpTransport.__init__(self, url="notused") - ListerBase.__init__(self, override_config=override_config) - self.distribution = override_config.get("distribution", distribution) - self.date = override_config.get("date", date) or datetime.datetime.now( - tz=datetime.timezone.utc - ) +class DebianLister(Lister[DebianListerState, DebianPageType]): + """ + List source packages for a given debian or derivative distribution. - def transport_request(self, identifier) -> Response: - """Subvert ListerHttpTransport.transport_request, to try several - index URIs in turn. + The lister will create a snapshot for each package name from all its + available versions. - The Debian repository format supports several compression algorithms - across the ages, so we try several URIs. 
+ If a package snapshot is different from the last listing operation, + it will be sent to the scheduler that will create a loading task + to archive newly found source code. - Once we have found a working URI, we break and set `self.decompressor` - to the one that matched. + Args: + scheduler: instance of SchedulerInterface + distribution: identifier of listed distribution (e.g. Debian, Ubuntu) + mirror_url: debian package archives mirror URL + suites: list of distribution suites to process + components: list of package components to process + """ - Returns: - a requests Response object. + LISTER_NAME = "debian" - Raises: - FetchError: when all the URIs failed to be retrieved. - """ - response = None - compression = None + def __init__( + self, + scheduler: SchedulerInterface, + distribution: str = "Debian", + mirror_url: str = "http://deb.debian.org/debian/", + suites: List[Suite] = ["stretch", "buster", "bullseye"], + components: List[Component] = ["main", "contrib", "non-free"], + ): + super().__init__( + scheduler=scheduler, url=mirror_url, instance=distribution, + ) - for uri, compression in self.area.index_uris(): - response = super().transport_request(uri) + # to ensure urljoin will produce valid Sources URL + if not self.url.endswith("/"): + self.url += "/" + + self.distribution = distribution + self.suites = suites + self.components = components + + self.session = requests.Session() + self.session.headers.update({"User-Agent": USER_AGENT}) + + # will hold all listed origins info + self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {} + # will contain origin urls that have already been listed + # in a previous page + self.sent_origins: Set[DebianOrigin] = set() + # will contain already listed package info that needs to be sent + # to the scheduler for update in the commit_page method + self.origins_to_update: Dict[DebianOrigin, ListedOrigin] = {} + # will contain the lister state after a call to run + self.package_versions: Dict[PkgName,
Set[PkgVersion]] = {} + + def state_from_dict(self, d: Dict[str, Any]) -> DebianListerState: + return DebianListerState(package_versions={k: set(v) for k, v in d.items()}) + + def state_to_dict(self, state: DebianListerState) -> Dict[str, Any]: + return {k: list(v) for k, v in state.package_versions.items()} + + def debian_index_urls( + self, suite: Suite, component: Component + ) -> Iterator[Tuple[str, str]]: + """Return an iterator on possible Sources file URLs as multiple compression + formats can be used.""" + compression_exts = ("xz", "bz2", "gz") + base_url = urljoin(self.url, f"dists/{suite}/{component}/source/Sources") + for ext in compression_exts: + yield (f"{base_url}.{ext}", ext) + yield (base_url, "") + + def page_request(self, suite: Suite, component: Component) -> DebianPageType: + """Return parsed package Sources file for a given debian suite and component.""" + for url, compression in self.debian_index_urls(suite, component): + response = requests.get(url, stream=True) + logging.debug("Fetched URL: %s, status code: %s", url, response.status_code) if response.status_code == 200: break else: - raise FetchError("Could not retrieve index for %s" % self.area) - self.decompressor = decompressors.get(compression) - return response - - def request_uri(self, identifier): - # In the overridden transport_request, we pass - # ListerBase.transport_request() the full URI as identifier, so we - # need to return it here. - return identifier - - def request_params(self, identifier) -> Dict[str, Any]: - # Enable streaming to allow wrapping the response in the decompressor - # in transport_response_simplified. - params = super().request_params(identifier) - params["stream"] = True - return params - - def transport_response_simplified(self, response): - """Decompress and parse the package index fetched in `transport_request`. 
- - For each package, we "pivot" the file list entries (Files, - Checksums-Sha1, Checksums-Sha256), to return a files dict mapping - filenames to their checksums. - """ - if self.decompressor: - data = self.decompressor(response.raw) + raise Exception( + "Could not retrieve sources index for %s/%s", suite, component + ) + + decompressor = decompressors.get(compression) + if decompressor: + data = decompressor(response.raw) else: data = response.raw - for src_pkg in Sources.iter_paragraphs(data.readlines()): - files = defaultdict(dict) + return Sources.iter_paragraphs(data.readlines()) + + def get_pages(self) -> Iterator[DebianPageType]: + """Return an iterator on parsed debian package Sources files, one per combination + of debian suite and component.""" + for suite, component in product(self.suites, self.components): + logger.debug( + "Processing %s %s source packages info for %s component.", + self.instance, + suite, + component, + ) + self.current_suite = suite + self.current_component = component + yield self.page_request(suite, component) + + def origin_url_for_package(self, package_name: PkgName) -> DebianOrigin: + """Return the origin url for the given package""" + return f"deb://{self.instance}/packages/{package_name}" + + def get_origins_from_page(self, page: DebianPageType) -> Iterator[ListedOrigin]: + """Convert a page of debian package sources into an iterator of ListedOrigin. + + Please note that the returned origins correspond to packages only + listed for the first time in order to get an accurate origins counter + in the statistics returned by the run method of the lister. - for field in src_pkg._multivalued_fields: - if field.startswith("checksums-"): - sum_name = field[len("checksums-") :] + Packages already listed in another page but with different versions will + be put in cache by the method and updated ListedOrigin objects will + be sent to the scheduler later in the commit_page method. 
+ + Indeed as multiple debian suites can be processed, a similar set of + package names can be listed for two different package source pages, + only their version will differ, resulting in origins counted multiple + times in lister statistics. + """ + assert self.lister_obj.id is not None + + origins_to_send = {} + self.origins_to_update = {} + + # iterate on each package source info + for src_pkg in page: + # gather package files info that will be used by the debian loader + files: Dict[str, Dict[str, Any]] = defaultdict(dict) + for field_ in src_pkg._multivalued_fields: + if field_.startswith("checksums-"): + sum_name = field_[len("checksums-") :] else: sum_name = "md5sum" - if field in src_pkg: - for entry in src_pkg[field]: + if field_ in src_pkg: + for entry in src_pkg[field_]: name = entry["name"] files[name]["name"] = entry["name"] files[name]["size"] = int(entry["size"], 10) files[name][sum_name] = entry[sum_name] - yield { - "name": src_pkg["Package"], - "version": src_pkg["Version"], - "directory": src_pkg["Directory"], - "files": files, - } - - def inject_repo_data_into_db(self, models_list): - """Generate the Package entries that didn't previously exist. - - Contrary to ListerBase, we don't actually insert the data in - database. `schedule_missing_tasks` does it once we have the - origin and task identifiers. 
- """ - by_name_version = {} - temp_packages = [] - - area_id = self.area.id + # extract package name and version + package_name = src_pkg["Package"] + package_version = src_pkg["Version"] + # build origin url + origin_url = self.origin_url_for_package(package_name) - for model in models_list: - name = model["name"] - version = model["version"] - temp_packages.append( - {"area_id": area_id, "name": name, "version": version,} + # create package version key as expected by the debian loader + package_version_key = ( + f"{self.current_suite}/{self.current_component}/{package_version}" ) - by_name_version[name, version] = model - - # Add all the listed packages to a temporary table - self.db_session.execute(CreateTable(TempPackage.__table__)) - self.db_session.bulk_insert_mappings(TempPackage, temp_packages) - - def exists_tmp_pkg(db_session, model): - return ( - db_session.query(model) - .filter(Package.area_id == TempPackage.area_id) - .filter(Package.name == TempPackage.name) - .filter(Package.version == TempPackage.version) - .exists() - ) - - # Filter out the packages that already exist in the main Package table - new_packages = ( - self.db_session.query(TempPackage) - .options(load_only("name", "version")) - .filter(~exists_tmp_pkg(self.db_session, Package)) - .all() - ) - - self.old_area_packages = ( - self.db_session.query(Package) - .filter(exists_tmp_pkg(self.db_session, TempPackage)) - .all() - ) - - self.db_session.execute(DropTable(TempPackage.__table__)) - - added_packages = [] - for package in new_packages: - model = by_name_version[package.name, package.version] - - added_packages.append(Package(area=self.area, **model)) - - self.db_session.add_all(added_packages) - return added_packages - def schedule_missing_tasks(self, models_list, added_packages): - """We create tasks at the end of the full snapshot processing""" - return - - def create_tasks_for_snapshot(self, snapshot): - tasks = [ - snapshot.task_for_package(name, versions) - for name, versions in 
snapshot.get_packages().items() - ] - - return self.scheduler.create_tasks(tasks) + # this is the first time a package is listed + if origin_url not in self.listed_origins: + # create a ListedOrigin object for it that can be later + # updated with new package versions info + self.listed_origins[origin_url] = ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="deb", + extra_loader_arguments={"date": None, "packages": {}}, + ) + # origin will be yielded at the end of that method + origins_to_send[origin_url] = self.listed_origins[origin_url] + # init set that will contain all listed package versions + self.package_versions[package_name] = set() + + # package has already been listed in a previous page or current page + elif origin_url not in origins_to_send: + # if package has been listed in a previous page, its new versions + # will be added to its ListedOrigin object but the update will + # be sent to the scheduler in the commit_page method + self.origins_to_update[origin_url] = self.listed_origins[origin_url] + + # update package versions data in parameter that will be provided + # to the debian loader + self.listed_origins[origin_url].extra_loader_arguments["packages"].update( + { + package_version_key: { + "name": package_name, + "version": package_version, + "files": files, + } + } + ) - def run(self): - """Run the lister for a given (distribution, area) tuple. 
+ # add package version key to the set of found versions + self.package_versions[package_name].add(package_version_key) + + # package has already been listed during a previous listing process + if package_name in self.state.package_versions: + new_versions = ( + self.package_versions[package_name] + - self.state.package_versions[package_name] + ) + # no new versions so far, no need to send the origin to the scheduler + if not new_versions: + origins_to_send.pop(origin_url, None) + self.origins_to_update.pop(origin_url, None) + # new versions found, ensure the origin will be sent to the scheduler + elif origin_url not in self.sent_origins: + self.origins_to_update.pop(origin_url, None) + origins_to_send[origin_url] = self.listed_origins[origin_url] + + # update already counted origins with changes since last page + self.sent_origins.update(origins_to_send.keys()) - """ - distribution = ( - self.db_session.query(Distribution) - .options(joinedload(Distribution.areas)) - .filter(Distribution.name == self.distribution) - .one_or_none() + logger.debug( + "Found %s new packages, %s packages with new versions.", + len(origins_to_send), + len(self.origins_to_update), ) - - if not distribution: - logger.error("Distribution %s is not registered" % self.distribution) - return {"status": "failed"} - - if not distribution.type == "deb": - logger.error("Distribution %s is not a Debian derivative" % distribution) - return {"status": "failed"} - - date = self.date - logger.debug( - "Creating snapshot for distribution %s on date %s" % (distribution, date) + "Current total number of listed packages is equal to %s.", + len(self.listed_origins), ) - snapshot = DistributionSnapshot(date=date, distribution=distribution) - - self.db_session.add(snapshot) - - for area in distribution.areas: - if not area.active: - continue - - self.area = area - - logger.debug("Processing area %s" % area) - - _, new_area_packages = self.ingest_data(None) - area_snapshot = AreaSnapshot(snapshot=snapshot, 
area=area) - self.db_session.add(area_snapshot) - area_snapshot.packages.extend(new_area_packages) - area_snapshot.packages.extend(self.old_area_packages) + yield from origins_to_send.values() - self.create_tasks_for_snapshot(snapshot) + def get_origins_to_update(self) -> Iterator[ListedOrigin]: + yield from self.origins_to_update.values() - self.db_session.commit() + def commit_page(self, page: DebianPageType): + """Send to scheduler already listed origins where new versions have been found + in current page.""" + self.send_origins(self.get_origins_to_update()) - return {"status": "eventful"} + def finalize(self): + # set mapping between listed package names and versions as lister state + self.state.package_versions = self.package_versions + self.updated = len(self.sent_origins) > 0 diff --git a/swh/lister/debian/models.py b/swh/lister/debian/models.py deleted file mode 100644 --- a/swh/lister/debian/models.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (C) 2017-2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import binascii -from collections import defaultdict -import datetime -from typing import Any, Mapping - -from sqlalchemy import ( - Boolean, - Column, - DateTime, - Enum, - ForeignKey, - Integer, - LargeBinary, - String, - Table, - UniqueConstraint, -) - -try: - from sqlalchemy import JSON -except ImportError: - # SQLAlchemy < 1.1 - from sqlalchemy.dialects.postgresql import JSONB as JSON - -from sqlalchemy.orm import relationship - -from swh.lister.core.models import SQLBase - - -class Distribution(SQLBase): - """A distribution (e.g. 
Debian, Ubuntu, Fedora, ...)""" - - __tablename__ = "distribution" - - id = Column(Integer, primary_key=True) - name = Column(String, unique=True, nullable=False) - type = Column(Enum("deb", "rpm", name="distribution_types"), nullable=False) - mirror_uri = Column(String, nullable=False) - - areas = relationship("Area", back_populates="distribution") - - def origin_for_package(self, package_name: str) -> str: - """Return the origin url for the given package - - """ - return "%s://%s/packages/%s" % (self.type, self.name, package_name) - - def __repr__(self): - return "Distribution(%s (%s) on %s)" % (self.name, self.type, self.mirror_uri,) - - -class Area(SQLBase): - __tablename__ = "area" - __table_args__ = (UniqueConstraint("distribution_id", "name"),) - - id = Column(Integer, primary_key=True) - distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False) - name = Column(String, nullable=False) - active = Column(Boolean, nullable=False, default=True) - - distribution = relationship("Distribution", back_populates="areas") - - def index_uris(self): - """Get possible URIs for this component's package index""" - if self.distribution.type == "deb": - compression_exts = ("xz", "bz2", "gz", None) - base_uri = "%s/dists/%s/source/Sources" % ( - self.distribution.mirror_uri, - self.name, - ) - for ext in compression_exts: - if ext: - yield (base_uri + "." 
+ ext, ext) - else: - yield (base_uri, None) - else: - raise NotImplementedError( - "Do not know how to build index URI for Distribution type %s" - % self.distribution.type - ) - - def __repr__(self): - return "Area(%s of %s)" % (self.name, self.distribution.name,) - - -class Package(SQLBase): - __tablename__ = "package" - __table_args__ = (UniqueConstraint("area_id", "name", "version"),) - - id = Column(Integer, primary_key=True) - area_id = Column(Integer, ForeignKey("area.id"), nullable=False) - name = Column(String, nullable=False) - version = Column(String, nullable=False) - directory = Column(String, nullable=False) - files = Column(JSON, nullable=False) - - origin_id = Column(Integer) - task_id = Column(Integer) - - revision_id = Column(LargeBinary(20)) - - area = relationship("Area") - - @property - def distribution(self): - return self.area.distribution - - def fetch_uri(self, filename): - """Get the URI to fetch the `filename` file associated with the - package""" - if self.distribution.type == "deb": - return "%s/%s/%s" % ( - self.distribution.mirror_uri, - self.directory, - filename, - ) - else: - raise NotImplementedError( - "Do not know how to build fetch URI for Distribution type %s" - % self.distribution.type - ) - - def loader_dict(self): - ret = { - "id": self.id, - "name": self.name, - "version": self.version, - } - if self.revision_id: - ret["revision_id"] = binascii.hexlify(self.revision_id).decode() - else: - files = {name: checksums.copy() for name, checksums in self.files.items()} - for name in files: - files[name]["uri"] = self.fetch_uri(name) - - ret.update( - {"revision_id": None, "files": files,} - ) - return ret - - def __repr__(self): - return "Package(%s_%s of %s %s)" % ( - self.name, - self.version, - self.distribution.name, - self.area.name, - ) - - -class DistributionSnapshot(SQLBase): - __tablename__ = "distribution_snapshot" - - id = Column(Integer, primary_key=True) - date = Column(DateTime, nullable=False, index=True) - 
distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False) - - distribution = relationship("Distribution") - areas = relationship("AreaSnapshot", back_populates="snapshot") - - def task_for_package( - self, package_name: str, package_versions: Mapping - ) -> Mapping[str, Any]: - """Return the task dictionary for the given list of package versions - - """ - origin_url = self.distribution.origin_for_package(package_name) - - return { - "policy": "oneshot", - "type": "load-%s-package" % self.distribution.type, - "next_run": datetime.datetime.now(tz=datetime.timezone.utc), - "arguments": { - "args": [], - "kwargs": { - "url": origin_url, - "date": self.date.isoformat(), - "packages": package_versions, - }, - }, - "retries_left": 3, - } - - def get_packages(self): - packages = defaultdict(dict) - for area_snapshot in self.areas: - area_name = area_snapshot.area.name - for package in area_snapshot.packages: - ref_name = "%s/%s" % (area_name, package.version) - packages[package.name][ref_name] = package.loader_dict() - - return packages - - -area_snapshot_package_assoc = Table( - "area_snapshot_package", - SQLBase.metadata, - Column("area_snapshot_id", Integer, ForeignKey("area_snapshot.id"), nullable=False), - Column("package_id", Integer, ForeignKey("package.id"), nullable=False), -) - - -class AreaSnapshot(SQLBase): - __tablename__ = "area_snapshot" - - id = Column(Integer, primary_key=True) - snapshot_id = Column( - Integer, ForeignKey("distribution_snapshot.id"), nullable=False - ) - area_id = Column(Integer, ForeignKey("area.id"), nullable=False) - - snapshot = relationship("DistributionSnapshot", back_populates="areas") - area = relationship("Area") - packages = relationship("Package", secondary=area_snapshot_package_assoc) - - -class TempPackage(SQLBase): - __tablename__ = "temp_package" - __table_args__ = { - "prefixes": ["TEMPORARY"], - } - - id = Column(Integer, primary_key=True) - area_id = Column(Integer) - name = Column(String) - 
version = Column(String) diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py --- a/swh/lister/debian/tasks.py +++ b/swh/lister/debian/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 the Software Heritage developers +# Copyright (C) 2017-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,9 +8,9 @@ @shared_task(name=__name__ + ".DebianListerTask") -def list_debian_distribution(distribution, **lister_args): +def list_debian_distribution(**lister_args): """List a Debian distribution""" - return DebianLister(distribution=distribution, **lister_args).run() + return DebianLister.from_configfile(**lister_args).run().dict() @shared_task(name=__name__ + ".ping") diff --git a/swh/lister/debian/tests/conftest.py b/swh/lister/debian/tests/conftest.py deleted file mode 100644 --- a/swh/lister/debian/tests/conftest.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os - -import pytest -from sqlalchemy.orm import sessionmaker - -from swh.core.db.pytest_plugin import postgresql_fact - -from swh.lister.debian import debian_init -import swh.scheduler - -SQL_DIR = os.path.join(os.path.dirname(swh.scheduler.__file__), "sql") -postgresql_scheduler = postgresql_fact( - "postgresql_proc", - db_name="scheduler-lister", - dump_files=os.path.join(SQL_DIR, "*.sql"), - # do not truncate the task tables, it's required in between test - no_truncate_tables={"dbversion", "priority_ratio", "task"}, -) - - -@pytest.fixture -def swh_scheduler_config(postgresql_scheduler): - return {"db": postgresql_scheduler.dsn} - - -@pytest.fixture -def lister_under_test(): - return "debian" - - -@pytest.fixture -def 
lister_debian(swh_lister): - # Initialize the debian data model - debian_init( - swh_lister.db_engine, suites=["stretch"], components=["main", "contrib"] - ) - - # Add the load-deb-package in the scheduler backend - swh_lister.scheduler.create_task_type( - { - "type": "load-deb-package", - "description": "Load a Debian package", - "backend_name": "swh.loader.packages.debian.tasks.LoaderDebianPackage", - "default_interval": "1 day", - } - ) - - return swh_lister - - -@pytest.fixture -def session(lister_db_url, engine): - session = sessionmaker(bind=engine)() - yield session - session.close() - engine.dispose() diff --git a/swh/lister/debian/tests/data/Sources_bullseye b/swh/lister/debian/tests/data/Sources_bullseye new file mode 100644 --- /dev/null +++ b/swh/lister/debian/tests/data/Sources_bullseye @@ -0,0 +1,107 @@ +Package: git +Binary: git, git-man, git-doc, git-cvs, git-svn, git-mediawiki, git-email, git-daemon-run, git-daemon-sysvinit, git-gui, gitk, git-el, gitweb, git-all +Version: 1:2.29.2-1 +Maintainer: Jonathan Nieder +Uploaders: Anders Kaseorg +Build-Depends: libz-dev, gettext, libpcre2-dev | libpcre3-dev, libcurl4-gnutls-dev, libexpat1-dev, subversion, libsvn-perl, libyaml-perl, tcl, python3, libhttp-date-perl | libtime-parsedate-perl, libcgi-pm-perl, liberror-perl, libmailtools-perl, cvs, cvsps, libdbd-sqlite3-perl, unzip, libio-pty-perl, debhelper-compat (= 10), dh-exec (>= 0.7), dh-apache2, dpkg-dev (>= 1.16.2~) +Build-Depends-Indep: asciidoc (>= 8.6.10), xmlto, docbook-xsl +Architecture: any all +Standards-Version: 4.3.0.1 +Format: 3.0 (quilt) +Files: + ef246c390b2673819cd55085984fb6bc 2867 git_2.29.2-1.dsc + f5f9d4e7a3c633bc7a9178cfd822045f 6187988 git_2.29.2.orig.tar.xz + cfed1fd3dffd4fb31a0319e51471877f 663292 git_2.29.2-1.debian.tar.xz +Vcs-Browser: https://repo.or.cz/w/git/debian.git/ +Vcs-Git: https://repo.or.cz/r/git/debian.git/ +Checksums-Sha256: + 9f2203314f0d076e24750fa29f38d1bb49d4124f3e8d8789b751c84473e57ead 2867 git_2.29.2-1.dsc + 
f2fc436ebe657821a1360bcd1e5f4896049610082419143d60f6fa13c2f607c1 6187988 git_2.29.2.orig.tar.xz + ad79671893257ca6205156c7c58d06e265d793f076c0efc8e225e832217f760a 663292 git_2.29.2-1.debian.tar.xz +Homepage: https://git-scm.com/ +Package-List: + git deb vcs optional arch=any + git-all deb vcs optional arch=all + git-cvs deb vcs optional arch=all + git-daemon-run deb vcs optional arch=all + git-daemon-sysvinit deb vcs optional arch=all + git-doc deb doc optional arch=all + git-el deb vcs optional arch=all + git-email deb vcs optional arch=all + git-gui deb vcs optional arch=all + git-man deb doc optional arch=all + git-mediawiki deb vcs optional arch=all + git-svn deb vcs optional arch=all + gitk deb vcs optional arch=all + gitweb deb vcs optional arch=all +Directory: pool/main/g/git +Priority: source +Section: vcs + +Package: subversion +Binary: subversion, libsvn1, libsvn-dev, libsvn-doc, libapache2-mod-svn, python3-subversion, subversion-tools, libsvn-java, libsvn-perl, ruby-svn +Version: 1.14.0-3 +Maintainer: James McCoy +Build-Depends: autoconf, bash-completion, chrpath, debhelper-compat (= 12), default-jdk-headless (>= 2:1.8) [!hurd-i386 !hppa !sparc] , dh-apache2, dh-python, doxygen, junit4 [!hurd-i386 !hppa !sparc] , libapr1-dev, libaprutil1-dev, libdb5.3-dev, libdbus-1-dev, liblz4-dev (>= 0.0~r129), libkf5coreaddons-dev , libkf5i18n-dev , libkf5wallet-dev , libperl-dev, libsasl2-dev, libsecret-1-dev, libserf-dev (>= 1.3.9-4~), libsqlite3-dev (>= 3.8.7), libtool, libutf8proc-dev, perl, py3c-dev, python3-all-dev, rename, ruby , ruby-dev , swig (>= 3.0.10), zlib1g-dev +Build-Conflicts: libsvn-dev (>= 1.15~), libsvn-dev (<< 1.14~), libsvn1 (>= 1.15~), libsvn1 (<< 1.14~) +Architecture: any all +Standards-Version: 4.5.0 +Format: 3.0 (quilt) +Files: + 65f7c225ddbcc855b57341954268098b 3807 subversion_1.14.0-3.dsc + 0136e67d8f58731b2858b9f2dba7c536 11519871 subversion_1.14.0.orig.tar.gz + f68b938ba71e19f333069bfd3c6ec236 3917 subversion_1.14.0.orig.tar.gz.asc + 
de6248e80a7f8b6481606ff16a9e9237 427396 subversion_1.14.0-3.debian.tar.xz +Vcs-Browser: https://salsa.debian.org/jamessan/subversion +Vcs-Git: https://salsa.debian.org/jamessan/subversion.git +Checksums-Sha256: + ebe6e2417a79ad5254072d994ccf6313489a90f299304ee2ccfb6ebe1392c580 3807 subversion_1.14.0-3.dsc + ef3d1147535e41874c304fb5b9ea32745fbf5d7faecf2ce21d4115b567e937d0 11519871 subversion_1.14.0.orig.tar.gz + 98333df38d29a64500d4ad1693741d3d087485555207289b4e53af309abac71a 3917 subversion_1.14.0.orig.tar.gz.asc + fd5383bf82ccf89acd7caf0fd80dc01ee2f7a3e163dcab6b2646ad01b7b746d9 427396 subversion_1.14.0-3.debian.tar.xz +Homepage: http://subversion.apache.org/ +Dgit: 6ef306f777223c0d5c2eaab0586420ada61435f3 debian archive/debian/1.14.0-3 https://git.dgit.debian.org/subversion +Package-List: + libapache2-mod-svn deb httpd optional arch=any + libsvn-dev deb libdevel optional arch=any + libsvn-doc deb doc optional arch=all + libsvn-java deb java optional arch=any profile=!pkg.subversion.nojava + libsvn-perl deb perl optional arch=any + libsvn1 deb libs optional arch=any + python3-subversion deb python optional arch=any + ruby-svn deb ruby optional arch=any profile=!pkg.subversion.noruby + subversion deb vcs optional arch=any + subversion-tools deb vcs optional arch=any +Testsuite: autopkgtest +Testsuite-Triggers: apache2, wget +Directory: pool/main/s/subversion +Priority: source +Section: vcs + +Package: hg-git +Binary: mercurial-git +Version: 0.9.0-2 +Maintainer: Debian Python Team +Uploaders: Tristan Seligmann +Build-Depends: debhelper-compat (= 13), dh-python, git, python3-mercurial, openssh-client, python3, python3-dulwich (>= 0.20.6), python3-setuptools, unzip +Architecture: all +Standards-Version: 4.5.0 +Format: 3.0 (quilt) +Files: + 7dee1b877cf129c1f6ee618ebf690179 2090 hg-git_0.9.0-2.dsc + bcf30d513d8463332288aa93c1c67d3e 129138 hg-git_0.9.0.orig.tar.bz2 + 5674d6e2e8271150adf68b08833e4806 6996 hg-git_0.9.0-2.debian.tar.xz +Vcs-Browser: 
https://salsa.debian.org/python-team/packages/hg-git +Vcs-Git: https://salsa.debian.org/python-team/packages/hg-git.git +Checksums-Sha256: + a40beaef731c00a820d89918afedc1f01580d87f6e8c29e74903b1e108e38b27 2090 hg-git_0.9.0-2.dsc + eedd8773de76b21b47fd21a7e5c04c05c7ab0ecfc62a54bc947eb225b2c44424 129138 hg-git_0.9.0.orig.tar.bz2 + ded524f1688a248a0eefbd0cf9843daedf60001cc39bfbb9e89734742fa4a4d2 6996 hg-git_0.9.0-2.debian.tar.xz +Homepage: https://hg-git.github.io/ +Package-List: + mercurial-git deb vcs optional arch=all +Testsuite: autopkgtest +Testsuite-Triggers: git, openssh-client, unzip +Directory: pool/main/h/hg-git +Priority: source +Section: vcs diff --git a/swh/lister/debian/tests/data/Sources_buster b/swh/lister/debian/tests/data/Sources_buster new file mode 100644 --- /dev/null +++ b/swh/lister/debian/tests/data/Sources_buster @@ -0,0 +1,78 @@ +Package: git +Binary: git, git-man, git-doc, git-cvs, git-svn, git-mediawiki, git-email, git-daemon-run, git-daemon-sysvinit, git-gui, gitk, git-el, gitweb, git-all +Version: 1:2.20.1-2+deb10u3 +Maintainer: Gerrit Pape +Uploaders: Jonathan Nieder , Anders Kaseorg +Build-Depends: libz-dev, gettext, libpcre2-dev | libpcre3-dev, libcurl4-gnutls-dev, libexpat1-dev, subversion, libsvn-perl, libyaml-perl, tcl, python, libhttp-date-perl | libtime-parsedate-perl, libcgi-pm-perl, liberror-perl, libmailtools-perl, cvs, cvsps, libdbd-sqlite3-perl, unzip, libio-pty-perl, debhelper (>= 9), dh-exec (>= 0.7), dh-apache2, dpkg-dev (>= 1.16.2~) +Build-Depends-Indep: asciidoc (>= 8.6.10), xmlto, docbook-xsl +Architecture: any all +Standards-Version: 4.3.0.1 +Format: 3.0 (quilt) +Files: + fcfb1e01b74dfa383f8171ae7d331de9 2923 git_2.20.1-2+deb10u3.dsc + 5fb4ff92b56ce3172b99c1c74c046c1a 5359872 git_2.20.1.orig.tar.xz + 3b629f9b0d2da6fa6ce5816478a57e09 646216 git_2.20.1-2+deb10u3.debian.tar.xz +Vcs-Browser: https://repo.or.cz/w/git/debian.git/ +Vcs-Git: https://repo.or.cz/r/git/debian.git/ +Checksums-Sha256: + 
6322d0dbe9b867a6cd1cd75f95a4a20335faa2030c38688f460ddaaaacbd4d06 2923 git_2.20.1-2+deb10u3.dsc + 9d2e91e2faa2ea61ba0a70201d023b36f54d846314591a002c610ea2ab81c3e9 5359872 git_2.20.1.orig.tar.xz + 3c6e2f8495350bccd0981d579d4d1cac6b0e051e1f7ba8b1d22c842bd4cb3453 646216 git_2.20.1-2+deb10u3.debian.tar.xz +Homepage: https://git-scm.com/ +Package-List: + git deb vcs optional arch=any + git-all deb vcs optional arch=all + git-cvs deb vcs optional arch=all + git-daemon-run deb vcs optional arch=all + git-daemon-sysvinit deb vcs optional arch=all + git-doc deb doc optional arch=all + git-el deb vcs optional arch=all + git-email deb vcs optional arch=all + git-gui deb vcs optional arch=all + git-man deb doc optional arch=all + git-mediawiki deb vcs optional arch=all + git-svn deb vcs optional arch=all + gitk deb vcs optional arch=all + gitweb deb vcs optional arch=all +Directory: pool/main/g/git +Priority: source +Section: vcs + +Package: subversion +Binary: subversion, libsvn1, libsvn-dev, libsvn-doc, libapache2-mod-svn, python-subversion, subversion-tools, libsvn-java, libsvn-perl, ruby-svn +Version: 1.10.4-1+deb10u1 +Maintainer: James McCoy +Build-Depends: apache2-dev (>= 2.4.16), autoconf, bash-completion, chrpath, debhelper (>= 11~), default-jdk-headless (>= 2:1.6) [!hurd-i386 !hppa !sparc], dh-apache2, dh-python, doxygen, junit [!hurd-i386 !hppa !sparc], libapr1-dev, libaprutil1-dev, libdb5.3-dev, libdbus-1-dev, liblz4-dev (>= 0.0~r129), libkf5coreaddons-dev, libkf5i18n-dev, libkf5wallet-dev, libperl-dev, libsasl2-dev, libsecret-1-dev, libserf-dev (>= 1.3.9-4~), libsqlite3-dev (>= 3.8.7), libtool, libutf8proc-dev, perl, python-all-dev (>= 2.7), rename, ruby, ruby-dev, swig, zlib1g-dev +Build-Conflicts: libsvn-dev (<< 1.10~) +Architecture: any all +Standards-Version: 4.3.0 +Format: 3.0 (quilt) +Files: + 70b1d3c8ae91301a3f7766b8181d09c9 3428 subversion_1.10.4-1+deb10u1.dsc + fcfd1bcd95a8b44e6a6de3a97425aead 11347907 subversion_1.10.4.orig.tar.gz + 
98e9c6902e6a18973b3d936657384a88 2107 subversion_1.10.4.orig.tar.gz.asc + a4a14bcff3cef49d0d9388356213f3e4 438024 subversion_1.10.4-1+deb10u1.debian.tar.xz +Vcs-Browser: https://salsa.debian.org/jamessan/subversion +Vcs-Git: https://salsa.debian.org/jamessan/subversion.git +Checksums-Sha256: + c9956fd5b850924dd123048b39195b3d591f55b9cbdf18d4d2a0f496f7decc72 3428 subversion_1.10.4-1+deb10u1.dsc + 354022a837596eb1b5676639ea8d73aa326fa8b2c610d8e1b39aeb7228921f4e 11347907 subversion_1.10.4.orig.tar.gz + bc6173c43ac837f875d9f2921e118c194455796b419769e155496cf084376428 2107 subversion_1.10.4.orig.tar.gz.asc + 1bc8900ef1b9d2af84827dab0fd0164e2058381be3bba0db6fd13cbc858c9b1e 438024 subversion_1.10.4-1+deb10u1.debian.tar.xz +Homepage: http://subversion.apache.org/ +Package-List: + libapache2-mod-svn deb httpd optional arch=any + libsvn-dev deb libdevel optional arch=any + libsvn-doc deb doc optional arch=all + libsvn-java deb java optional arch=any + libsvn-perl deb perl optional arch=any + libsvn1 deb libs optional arch=any + python-subversion deb python optional arch=any + ruby-svn deb ruby optional arch=any + subversion deb vcs optional arch=any + subversion-tools deb vcs optional arch=any +Testsuite: autopkgtest +Testsuite-Triggers: apache2, wget +Directory: pool/main/s/subversion +Priority: source +Section: vcs diff --git a/swh/lister/debian/tests/data/Sources_stretch b/swh/lister/debian/tests/data/Sources_stretch new file mode 100644 --- /dev/null +++ b/swh/lister/debian/tests/data/Sources_stretch @@ -0,0 +1,113 @@ +Package: dh-elpa +Binary: dh-elpa +Version: 0.0.18 +Maintainer: Debian Emacs addons team +Uploaders: David Bremner +Build-Depends: debhelper (>= 9), emacs24-nox | emacs24 (>= 24~) | emacs24-lucid (>= 24~) +Architecture: all +Standards-Version: 3.9.6 +Format: 1.0 +Files: + 25beb4376110fe075460f4b7776d0349 1471 dh-elpa_0.0.18.dsc + dc0d3b42c1db80cac9817f43c171bfb3 10038 dh-elpa_0.0.18.tar.gz +Vcs-Browser: 
http://anonscm.debian.org/cgit/pkg-emacsen/pkg/dh-elpa.git/ +Vcs-Git: git://anonscm.debian.org/pkg-emacsen/pkg/dh-elpa.git +Checksums-Sha256: + 87fb2f13d4a8cdea0cec752cc9873eef1c92961655315d2f14d178f9b1b7fc43 1471 dh-elpa_0.0.18.dsc + 24e5be28cda286398db0018d9577493445c61a0602e239ca285a2981f1068b10 10038 dh-elpa_0.0.18.tar.gz +Package-List: + dh-elpa deb devel optional arch=all +Extra-Source-Only: yes +Directory: pool/main/d/dh-elpa +Priority: extra +Section: misc + +Package: dh-elpa +Binary: dh-elpa +Version: 0.0.19 +Maintainer: Debian Emacs addons team +Uploaders: David Bremner +Build-Depends: debhelper (>= 9), emacs24-nox | emacs24 (>= 24~) | emacs24-lucid (>= 24~) +Architecture: all +Standards-Version: 3.9.6 +Format: 1.0 +Files: + e4513c0f2112ba60031777ad0a65f9dc 1471 dh-elpa_0.0.19.dsc + ac70db483578ecac510612e1b894e53b 10291 dh-elpa_0.0.19.tar.gz +Vcs-Browser: http://anonscm.debian.org/cgit/pkg-emacsen/pkg/dh-elpa.git/ +Vcs-Git: git://anonscm.debian.org/pkg-emacsen/pkg/dh-elpa.git +Checksums-Sha256: + 796a96fad0b03eb589f47c44406f8d32e5b8881dce34c425f1c915650618235c 1471 dh-elpa_0.0.19.dsc + 4bb0a0ecdb75585e168a56a53c79e620b2da70584db9d29e136a3ae9f8a92a76 10291 dh-elpa_0.0.19.tar.gz +Package-List: + dh-elpa deb devel optional arch=all +Extra-Source-Only: yes +Directory: pool/main/d/dh-elpa +Priority: extra +Section: misc + +Package: dh-elpa +Binary: dh-elpa +Version: 0.0.20 +Maintainer: Debian Emacs addons team +Uploaders: David Bremner , Sean Whitton , +Build-Depends: debhelper (>= 9.20151004), emacs24-nox | emacs24 (>= 24~) | emacs24-lucid (>= 24~) +Architecture: all +Standards-Version: 3.9.8 +Format: 1.0 +Files: + 82455df65ccd88896cdc083541d29236 1526 dh-elpa_0.0.20.dsc + 4a7cc13b097e44228b5635c400e33202 12884 dh-elpa_0.0.20.tar.gz +Vcs-Browser: https://anonscm.debian.org/cgit/pkg-emacsen/pkg/dh-elpa.git/ +Vcs-Git: https://anonscm.debian.org/pkg-emacsen/pkg/dh-elpa.git +Checksums-Sha256: + 77c9761b1359c256ad25d4c7a826a27643a0094929a4cb3ac8cdaa0fcdb02d1b 
1526 dh-elpa_0.0.20.dsc + 13e4c6ffaaa6cd793d19de677af470ac0edac098779627e9f8555644a7da42f0 12884 dh-elpa_0.0.20.tar.gz +Package-List: + dh-elpa deb devel optional arch=all +Extra-Source-Only: yes +Directory: pool/main/d/dh-elpa +Priority: extra +Section: misc + +Package: git +Binary: git, git-man, git-core, git-doc, git-arch, git-cvs, git-svn, git-mediawiki, git-email, git-daemon-run, git-daemon-sysvinit, git-gui, gitk, git-el, gitweb, git-all +Version: 1:2.11.0-3+deb9u7 +Maintainer: Gerrit Pape +Uploaders: Jonathan Nieder , Anders Kaseorg +Build-Depends: libz-dev, libpcre3-dev, gettext, libcurl4-gnutls-dev, libexpat1-dev, subversion, libsvn-perl, libyaml-perl, tcl, libhttp-date-perl | libtime-modules-perl, libcgi-pm-perl, python, cvs, cvsps, libdbd-sqlite3-perl, unzip, libio-pty-perl, debhelper (>= 9), dh-exec (>= 0.7), dh-apache2, dpkg-dev (>= 1.16.2~) +Build-Depends-Indep: asciidoc, xmlto, docbook-xsl +Architecture: any all +Standards-Version: 3.9.6.0 +Format: 3.0 (quilt) +Files: + e594aeada05ecb15253cc5768412ce3b 2944 git_2.11.0-3+deb9u7.dsc + dd4e3360e28aec5bb902fb34dd7fce3b 4197984 git_2.11.0.orig.tar.xz + e8d896e5307397f0e106e6a85c1b8682 610188 git_2.11.0-3+deb9u7.debian.tar.xz +Vcs-Browser: http://repo.or.cz/w/git/debian.git/ +Vcs-Git: https://repo.or.cz/r/git/debian.git/ +Checksums-Sha256: + 7f2be1b1709c216ad06590687cc8fc0ff6b55a6c3e0ad6ec32b2567ce10adec1 2944 git_2.11.0-3+deb9u7.dsc + 7e7e8d69d494892373b87007674be5820a4bc1ef596a0117d03ea3169119fd0b 4197984 git_2.11.0.orig.tar.xz + 3f54b7ea7b8cda477ddb559c63de063c5bd49d8ab772330c05c79ace546ce38d 610188 git_2.11.0-3+deb9u7.debian.tar.xz +Homepage: https://git-scm.com/ +Package-List: + git deb vcs optional arch=any + git-all deb vcs optional arch=all + git-arch deb vcs optional arch=all + git-core deb vcs optional arch=all + git-cvs deb vcs optional arch=all + git-daemon-run deb vcs optional arch=all + git-daemon-sysvinit deb vcs extra arch=all + git-doc deb doc optional arch=all + git-el deb vcs optional 
arch=all + git-email deb vcs optional arch=all + git-gui deb vcs optional arch=all + git-man deb doc optional arch=all + git-mediawiki deb vcs optional arch=all + git-svn deb vcs optional arch=all + gitk deb vcs optional arch=all + gitweb deb vcs optional arch=all +Directory: pool/main/g/git +Priority: source +Section: vcs diff --git a/swh/lister/debian/tests/data/http_deb.debian.org/debian__dists_stretch_contrib_source_Sources.xz b/swh/lister/debian/tests/data/http_deb.debian.org/debian__dists_stretch_contrib_source_Sources.xz deleted file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ SourcesText: + return Path(datadir, f"Sources_{suite}").read_text() + + +@pytest.fixture +def debian_sources(datadir: str) -> Dict[Suite, SourcesText]: + return {suite: _debian_sources_content(datadir, suite) for suite in _suites} + + +# suite -> package name -> list of versions +DebianSuitePkgSrcInfo = Dict[Suite, Dict[PkgName, List[Sources]]] + + +def _init_test( + swh_scheduler: SchedulerInterface, + debian_sources: Dict[Suite, SourcesText], + requests_mock, +) -> Tuple[DebianLister, DebianSuitePkgSrcInfo]: + lister = DebianLister( + scheduler=swh_scheduler, + mirror_url=_mirror_url, + suites=list(debian_sources.keys()), + components=_components, + ) + + suite_pkg_info: DebianSuitePkgSrcInfo = {} + + for suite, sources in debian_sources.items(): + suite_pkg_info[suite] = defaultdict(list) + for pkg_src in Sources.iter_paragraphs(sources): + suite_pkg_info[suite][pkg_src["Package"]].append(pkg_src) + + for idx_url, compression in lister.debian_index_urls(suite, _components[0]): + if compression: + requests_mock.get(idx_url, status_code=404) + else: + requests_mock.get(idx_url, text=sources) + + return lister, suite_pkg_info + + +def _check_listed_origins( + swh_scheduler: SchedulerInterface, + lister: DebianLister, + suite_pkg_info: DebianSuitePkgSrcInfo, + lister_previous_state: Dict[PkgName, 
Set[PkgVersion]], +) -> Set[DebianOrigin]: + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = set() + + # iterate on each debian suite for the main component + for suite, pkg_info in suite_pkg_info.items(): + # iterate on each package + for package_name, pkg_srcs in pkg_info.items(): + # iterate on each package version info + for pkg_src in pkg_srcs: + # build package version key + package_version_key = f"{suite}/{_components[0]}/{pkg_src['Version']}" + # if package or its version not previously listed, those info should + # have been sent to the scheduler database + if ( + package_name not in lister_previous_state + or package_version_key not in lister_previous_state[package_name] + ): + # build origin url + origin_url = lister.origin_url_for_package(package_name) + origin_urls.add(origin_url) + # get ListerOrigin object from scheduler database + filtered_origins = [ + scheduler_origin + for scheduler_origin in scheduler_origins + if scheduler_origin.url == origin_url + ] + + assert filtered_origins + # check the version info are available + assert ( + package_version_key + in filtered_origins[0].extra_loader_arguments["packages"] + ) + + # check listed package version is in lister state + assert package_name in lister.state.package_versions + assert ( + package_version_key + in lister.state.package_versions[package_name] + ) + return origin_urls + + +def test_lister_debian_all_suites( + swh_scheduler: SchedulerInterface, + debian_sources: Dict[Suite, SourcesText], + requests_mock, +): + """ + Simulate a full listing of main component packages for all debian suites. 
+ """ + lister, suite_pkg_info = _init_test(swh_scheduler, debian_sources, requests_mock) + + stats = lister.run() + + origin_urls = _check_listed_origins( + swh_scheduler, lister, suite_pkg_info, lister_previous_state={} + ) + + assert stats.pages == len(_suites) * len(_components) + assert stats.origins == len(origin_urls) + + stats = lister.run() + + assert stats.pages == len(_suites) * len(_components) + assert stats.origins == 0 + + +@pytest.mark.parametrize( + "suites_params", + [[_suites[:1]], [_suites[:1], _suites[:2]], [_suites[:1], _suites[:2], _suites],], +) +def test_lister_debian_updated_packages( + swh_scheduler: SchedulerInterface, + debian_sources: Dict[Suite, SourcesText], + requests_mock, + suites_params: List[Suite], +): + """ + Simulate incremental listing of main component packages by adding new suite + to process between each listing operation. """ - # Run the lister - lister_debian.run() - r = lister_debian.scheduler.search_tasks(task_type="load-deb-package") - assert len(r) == 151 + lister_previous_state: Dict[PkgName, Set[PkgVersion]] = {} + + for idx, suites in enumerate(suites_params): + + sources = {suite: debian_sources[suite] for suite in suites} + + lister, suite_pkg_info = _init_test(swh_scheduler, sources, requests_mock) + + stats = lister.run() - for row in r: - assert row["type"] == "load-deb-package" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 + origin_urls = _check_listed_origins( + swh_scheduler, + lister, + suite_pkg_info, + lister_previous_state=lister_previous_state, + ) - # kwargs - kwargs = row["arguments"]["kwargs"] - assert set(kwargs.keys()) == {"url", "date", "packages"} + assert stats.pages == len(sources) + assert stats.origins == len(origin_urls) - logger.debug("kwargs: %s", kwargs) - assert isinstance(kwargs["url"], str) + lister_previous_state = lister.state.package_versions - assert row["policy"] == "oneshot" - assert row["priority"] is None + # only new packages or packages with 
new versions should be listed + if len(suites) > 1 and idx < len(suites) - 1: + assert stats.origins == 0 + else: + assert stats.origins != 0 diff --git a/swh/lister/debian/tests/test_models.py b/swh/lister/debian/tests/test_models.py deleted file mode 100644 --- a/swh/lister/debian/tests/test_models.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (C) 2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - -from swh.lister.debian.models import Area, Distribution - - -def test_area_index_uris_deb(session): - d = Distribution( - name="Debian", type="deb", mirror_uri="http://deb.debian.org/debian" - ) - a = Area(distribution=d, name="unstable/main", active=True,) - session.add_all([d, a]) - session.commit() - - uris = list(a.index_uris()) - assert uris - - -def test_area_index_uris_rpm(session): - d = Distribution( - name="CentOS", type="rpm", mirror_uri="http://centos.mirrors.proxad.net/" - ) - a = Area(distribution=d, name="8", active=True,) - session.add_all([d, a]) - session.commit() - - with pytest.raises(NotImplementedError): - list(a.index_uris()) diff --git a/swh/lister/debian/tests/test_tasks.py b/swh/lister/debian/tests/test_tasks.py --- a/swh/lister/debian/tests/test_tasks.py +++ b/swh/lister/debian/tests/test_tasks.py @@ -1,10 +1,12 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch +from swh.lister.pattern import ListerStats + def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.debian.tasks.ping") @@ 
-17,15 +19,25 @@ @patch("swh.lister.debian.tasks.DebianLister") def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): # setup the mocked DebianLister - lister.return_value = lister - lister.run.return_value = None + lister.from_configfile.return_value = lister + stats = ListerStats(pages=12, origins=35618) + lister.run.return_value = stats + + kwargs = dict( + mirror_url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/", + distribution="Ubuntu", + suites=["xenial", "bionic", "focal"], + components=["main", "multiverse", "restricted", "universe"], + ) res = swh_scheduler_celery_app.send_task( - "swh.lister.debian.tasks.DebianListerTask", ("stretch",) + "swh.lister.debian.tasks.DebianListerTask", kwargs=kwargs ) assert res res.wait() assert res.successful() - lister.assert_called_once_with(distribution="stretch") + lister.from_configfile.assert_called_once_with(**kwargs) lister.run.assert_called_once_with() + + assert res.result == stats.dict() diff --git a/swh/lister/debian/utils.py b/swh/lister/debian/utils.py deleted file mode 100644 --- a/swh/lister/debian/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (C) 2017-2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import logging - -import click - -from swh.lister.debian.lister import DebianLister -from swh.lister.debian.models import Area, Distribution, SQLBase - - -@click.group() -@click.option("--verbose/--no-verbose", default=False) -@click.pass_context -def cli(ctx, verbose): - ctx.obj["lister"] = DebianLister() - if verbose: - loglevel = logging.DEBUG - logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO) - else: - loglevel = logging.INFO - - logging.basicConfig( - format="%(asctime)s %(process)d %(levelname)s %(message)s", level=loglevel, - ) - - -@cli.command() -@click.pass_context -def create_schema(ctx): - """Create the schema from the 
models""" - SQLBase.metadata.create_all(ctx.obj["lister"].db_engine) - - -@cli.command() -@click.option("--name", help="The name of the distribution") -@click.option("--type", help="The type of distribution") -@click.option("--mirror-uri", help="The URL to the mirror of the distribution") -@click.option("--area", help="The areas for the distribution", multiple=True) -@click.pass_context -def create_distribution(ctx, name, type, mirror_uri, area): - to_add = [] - db_session = ctx.obj["lister"].db_session - d = ( - db_session.query(Distribution) - .filter(Distribution.name == name) - .filter(Distribution.type == type) - .one_or_none() - ) - - if not d: - d = Distribution(name=name, type=type, mirror_uri=mirror_uri) - to_add.append(d) - - for area_name in area: - a = None - if d.id: - a = ( - db_session.query(Area) - .filter(Area.distribution == d) - .filter(Area.name == area_name) - .one_or_none() - ) - - if not a: - a = Area(name=area_name, distribution=d) - to_add.append(a) - - db_session.add_all(to_add) - db_session.commit() - - -@cli.command() -@click.option("--name", help="The name of the distribution") -@click.pass_context -def list_distribution(ctx, name): - """List the distribution""" - ctx.obj["lister"].run(name) - - -if __name__ == "__main__": - cli(obj={})