diff --git a/swh/lister/packagist/__init__.py b/swh/lister/packagist/__init__.py --- a/swh/lister/packagist/__init__.py +++ b/swh/lister/packagist/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import PackagistLister - from .models import PackagistModel return { - "models": [PackagistModel], "lister": PackagistLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -1,102 +1,182 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version +# License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json +from dataclasses import dataclass +from datetime import datetime, timezone import logging -import random -from typing import Any, Dict, List, Mapping +from typing import Any, Dict, Iterator, List, Optional -from swh.lister.core.lister_transports import ListerOnePageApiTransport -from swh.lister.core.simple_lister import SimpleLister -from swh.scheduler import utils +import iso8601 +import requests -from .models import PackagistModel +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. import USER_AGENT +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) +PackagistPageType = List[str] -def compute_package_url(repo_name: str) -> str: - """Compute packgist package url from repo name. 
- """ - return "https://repo.packagist.org/p/%s.json" % repo_name +@dataclass +class PackagistListerState: + """State of Packagist lister""" + last_listing_date: Optional[datetime] = None + """Last date when the Packagist lister was executed""" -class PackagistLister(ListerOnePageApiTransport, SimpleLister): - """List packages available in the Packagist package manager. - The lister sends the request to the url present in the class - variable `PAGE`, to receive a list of all the package names - present in the Packagist package manager. Iterates over all the - packages and constructs the metadata url of the package from - the name of the package and creates a loading task:: +class PackagistLister(Lister[PackagistListerState, PackagistPageType]): + """ + List all Packagist projects and send associated origins to scheduler. + + The lister queries the Packagist API, whose documentation can be found at + https://packagist.org/apidoc. + + For each package, its metadata are retrieved using Packagist API endpoints + whose responses are served from static files, which are guaranteed to be + efficient on the Packagist side (no dynamic queries). + Furthermore, subsequent listing will send the "If-Modified-Since" HTTP + header to only retrieve packages metadata updated since the previous listing + operation in order to save bandwidth and return only origins which might have + new released versions. 
+ """ - Task: - Type: load-packagist - Policy: recurring - Args: - - + LISTER_NAME = "Packagist" + PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" + PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p" + + def __init__( + self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + url=self.PACKAGIST_PACKAGES_LIST_URL, + instance="packagist", + credentials=credentials, + ) - Example:: + self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) + self.listing_date = datetime.now().astimezone(tz=timezone.utc) - Task: - Type: load-packagist - Policy: recurring - Args: - 'hypejunction/hypegamemechanics' - 'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json' + def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: + last_listing_date = d.get("last_listing_date") + if last_listing_date is not None: + d["last_listing_date"] = iso8601.parse_date(last_listing_date) + return PackagistListerState(**d) - """ + def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]: + d: Dict[str, Optional[str]] = {"last_listing_date": None} + last_listing_date = state.last_listing_date + if last_listing_date is not None: + d["last_listing_date"] = last_listing_date.isoformat() + return d - MODEL = PackagistModel - LISTER_NAME = "packagist" - PAGE = "https://packagist.org/packages/list.json" - instance = "packagist" + def api_request(self, url: str) -> Any: + logger.debug("Fetching URL %s", url) - def __init__(self, override_config=None): - ListerOnePageApiTransport.__init__(self) - SimpleLister.__init__(self, override_config=override_config) + response = self.session.get(url) - def task_dict( - self, origin_type: str, origin_url: str, **kwargs: Mapping[str, str] - ) -> Dict[str, Any]: - """Return task format dict + if response.status_code not in (200, 304): + logger.warning( + "Unexpected 
HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) - This is overridden from the lister_base as more information is - needed for the ingestion task creation. + response.raise_for_status() - """ - return utils.create_task_dict( - "load-%s" % origin_type, - kwargs.get("policy", "recurring"), - kwargs.get("name"), - origin_url, - retries_left=3, - ) - - def list_packages(self, response: Any) -> List[str]: - """List the actual packagist origins from the response. + # response is empty when status code is 304 + return response.json() if response.status_code == 200 else {} + def get_pages(self) -> Iterator[PackagistPageType]: """ - response = json.loads(response.text) - packages = [name for name in response["packageNames"]] - logger.debug("Number of packages: %s", len(packages)) - random.shuffle(packages) - return packages - - def get_model_from_repo(self, repo_name: str) -> Mapping[str, str]: - """Transform from repository representation to model + Yield a single page listing all Packagist projects. + """ + yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"] + def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]: + """ + Iterate on all Packagist projects and yield ListedOrigin instances. 
""" - url = compute_package_url(repo_name) - return { - "uid": repo_name, - "name": repo_name, - "full_name": repo_name, - "html_url": url, - "origin_url": url, - "origin_type": "packagist", - } + assert self.lister_obj.id is not None + + # save some bandwidth by only getting packages metadata updated since + # last listing + if self.state.last_listing_date is not None: + if_modified_since = self.state.last_listing_date.strftime( + "%a, %d %b %Y %H:%M:%S GMT" + ) + self.session.headers["If-Modified-Since"] = if_modified_since + + # to ensure origins will not be listed multiple times + origin_urls = set() + + for package_name in page: + try: + metadata = self.api_request( + f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json" + ) + if not metadata.get("packages", {}): + # package metadata not updated since last listing + continue + if package_name not in metadata["packages"]: + # missing package metadata in response + continue + versions_info = metadata["packages"][package_name].values() + except requests.exceptions.HTTPError: + # error when getting package metadata (usually 404 when a + # package has been removed), skip it and process next package + continue + + origin_url = None + visit_type = None + last_update = None + + # extract origin url for package, vcs type and latest release date + for version_info in versions_info: + origin_url = version_info.get("source", {}).get("url", "") + if not origin_url: + continue + # can be git, hg or svn + visit_type = version_info.get("source", {}).get("type", "") + dist_time_str = version_info.get("time", "") + if not dist_time_str: + continue + dist_time = iso8601.parse_date(dist_time_str) + if last_update is None or dist_time > last_update: + last_update = dist_time + + # skip package with already seen origin url or with missing required info + if visit_type is None or origin_url is None or origin_url in origin_urls: + continue + + # bitbucket closed its mercurial hosting service, those origins can not be + # loaded into 
the archive anymore + if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"): + continue + + origin_urls.add(origin_url) + + logger.debug( + "Found package %s last updated on %s", package_name, last_update + ) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type=visit_type, + last_update=last_update, + ) + + def finalize(self) -> None: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/packagist/models.py b/swh/lister/packagist/models.py deleted file mode 100644 --- a/swh/lister/packagist/models.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, String - -from ..core.models import ModelBase - - -class PackagistModel(ModelBase): - """a Packagist repository representation - - """ - - __tablename__ = "packagist_repo" - - uid = Column(String, primary_key=True) diff --git a/swh/lister/packagist/tasks.py b/swh/lister/packagist/tasks.py --- a/swh/lister/packagist/tasks.py +++ b/swh/lister/packagist/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,7 +10,7 @@ @shared_task(name=__name__ + ".PackagistListerTask") def list_packagist(**lister_args): "List the packagist (php) registry" - PackagistLister(**lister_args).run() + return PackagistLister.from_configfile(**lister_args).run().dict() @shared_task(name=__name__ + ".ping") diff --git a/swh/lister/packagist/tests/conftest.py b/swh/lister/packagist/tests/conftest.py deleted file mode 100644 --- a/swh/lister/packagist/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers 
-# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - - -@pytest.fixture -def lister_under_test(): - return "packagist" - - -@pytest.fixture -def lister_packagist(swh_lister): - # Amend the scheduler with an unknown yet load-packagist task type - swh_lister.scheduler.create_task_type( - { - "type": "load-packagist", - "description": "Load packagist origin", - "backend_name": "swh.loader.package.tasks.LoaderPackagist", - "default_interval": "1 day", - } - ) - - return swh_lister diff --git a/swh/lister/packagist/tests/data/https_packagist.org/packages_list.json b/swh/lister/packagist/tests/data/https_packagist.org/packages_list.json deleted file mode 100644 --- a/swh/lister/packagist/tests/data/https_packagist.org/packages_list.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "packageNames": [ - "0.0.0/composer-include-files", - "0.0.0/laravel-env-shim", - "0.0.1/try-make-package", - "0099ff/dialogflowphp", - "00f100/array_dot" - ] -} \ No newline at end of file diff --git a/swh/lister/packagist/tests/data/ljjackson_linnworks.json b/swh/lister/packagist/tests/data/ljjackson_linnworks.json new file mode 100644 --- /dev/null +++ b/swh/lister/packagist/tests/data/ljjackson_linnworks.json @@ -0,0 +1,83 @@ +{ + "packages": { + "ljjackson/linnworks": { + "0.1": { + "name": "ljjackson/linnworks", + "description": "A PHP API Integration of Linnworks.", + "keywords": [], + "homepage": "https://github.com/ljjackson", + "version": "0.1", + "version_normalized": "0.1.0.0", + "license": [], + "authors": [{ + "name": "Liam Jackson", + "homepage": "https://github.com/ljjackson", + "role": "Developer" + }], + "source": { + "type": "git", + "url": "https://github.com/ljjackson/linnworks.git", + "reference": "b2d16490823a8a9012a83b80cdcd6a129cfc5dea" + }, + "dist": { + "type": "zip", + "url": 
"https://api.github.com/repos/ljjackson/linnworks/zipball/b2d16490823a8a9012a83b80cdcd6a129cfc5dea", + "reference": "b2d16490823a8a9012a83b80cdcd6a129cfc5dea", + "shasum": "" + }, + "type": "library", + "time": "2018-10-22T19:52:25+00:00", + "autoload": { + "psr-4": { + "LJJackson\\Linnworks\\": "src/" + } + }, + "require": { + "php": "^7.0", + "guzzlehttp/guzzle": "^6.3", + "ext-json": "*" + }, + "uid": 2535139 + }, + "dev-master": { + "name": "ljjackson/linnworks", + "description": "A PHP API Integration of Linnworks.", + "keywords": [], + "homepage": "https://github.com/ljjackson", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [], + "authors": [{ + "name": "Liam Jackson", + "homepage": "https://github.com/ljjackson", + "role": "Developer" + }], + "source": { + "type": "git", + "url": "https://github.com/ljjackson/linnworks.git", + "reference": "7c6b1209dc3bafad4284b130bda8450f3478ea26" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/ljjackson/linnworks/zipball/7c6b1209dc3bafad4284b130bda8450f3478ea26", + "reference": "7c6b1209dc3bafad4284b130bda8450f3478ea26", + "shasum": "" + }, + "type": "library", + "time": "2018-11-01T21:45:50+00:00", + "autoload": { + "psr-4": { + "LJJackson\\Linnworks\\": "src/" + } + }, + "require": { + "guzzlehttp/guzzle": "^6.3", + "ext-json": "*", + "php": "^7.1.3", + "nesbot/carbon": "*" + }, + "uid": 2517334 + } + } + } +} \ No newline at end of file diff --git a/swh/lister/packagist/tests/data/lky_wx_article.json b/swh/lister/packagist/tests/data/lky_wx_article.json new file mode 100644 --- /dev/null +++ b/swh/lister/packagist/tests/data/lky_wx_article.json @@ -0,0 +1,240 @@ +{ + "packages": { + "lky/wx_article": { + "1.0": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wxgzharticle", + "version": "1.0", + "version_normalized": "1.0.0.0", + "license": [ + "MIT" + ], + 
"authors": [{ + "name": "lky", + "email": "2747865797@qq.com", + "homepage": "http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "bd1826f17a42a1d3da44c4562af3be370687466b" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/bd1826f17a42a1d3da44c4562af3be370687466b", + "reference": "bd1826f17a42a1d3da44c4562af3be370687466b", + "shasum": "" + }, + "type": "library", + "time": "2018-08-28T06:51:46+00:00", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + "lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "require": { + "illuminate/support": "~5", + "ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": "5.2.*", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 2493149 + }, + "dev-master": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wx_article", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [ + "MIT" + ], + "authors": [{ + "name": "lky", + "email": "2747865797@qq.com", + "homepage": "http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be", + "reference": "9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be", + "shasum": "" + }, + "type": "library", + "time": "2018-08-30T07:37:09+00:00", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + 
"lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "default-branch": true, + "require": { + "ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": ">=5.2.0", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 4096807 + }, + "v1.2": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wx_article", + "version": "v1.2", + "version_normalized": "1.2.0.0", + "license": [ + "MIT" + ], + "authors": [{ + "name": "lky", + "email": "2747865797@qq.com", + "homepage": "http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "shasum": "" + }, + "type": "library", + "time": "2018-08-29T08:26:06+00:00", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + "lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "require": { + "ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": ">=5.2.0", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 2493150 + }, + "v1.6": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wx_article", + "version": "v1.6", + "version_normalized": "1.6.0.0", + "license": [ + "MIT" + ], + "authors": [{ + "name": "lky", + "email": 
"2747865797@qq.com", + "homepage": "http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "shasum": "" + }, + "type": "library", + "time": "2018-08-29T08:26:06+00:00", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + "lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "require": { + "ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": ">=5.2.0", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 2427550 + } + } + } +} \ No newline at end of file diff --git a/swh/lister/packagist/tests/data/spryker-eco_computop-api.json b/swh/lister/packagist/tests/data/spryker-eco_computop-api.json new file mode 100644 --- /dev/null +++ b/swh/lister/packagist/tests/data/spryker-eco_computop-api.json @@ -0,0 +1,145 @@ +{ + "packages": { + "spryker-eco/computop-api": { + "1.0.0": { + "name": "spryker-eco/computop-api", + "description": "Computop API Module", + "keywords": [], + "homepage": "", + "version": "1.0.0", + "version_normalized": "1.0.0.0", + "license": [ + "MIT" + ], + "authors": [], + "source": { + "type": "git", + "url": "https://github.com/spryker-eco/computop-api.git", + "reference": "d75dc7d2c80bd93e65081b26433ee559d2c92f0a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/d75dc7d2c80bd93e65081b26433ee559d2c92f0a", + "reference": "d75dc7d2c80bd93e65081b26433ee559d2c92f0a", + "shasum": "" + }, + "type": "library", + "time": "2018-08-31T11:51:23+00:00", + 
"autoload": { + "psr-4": { + "SprykerEco\\": "src/SprykerEco/" + } + }, + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "require": { + "php": ">=7.1", + "spryker/kernel": "^3.0.0", + "spryker/transfer": "^3.0.0", + "spryker/util-text": "^1.0.0", + "spryker/guzzle": "^2.2.0" + }, + "require-dev": { + "spryker/code-sniffer": "dev-master" + }, + "uid": 2432548 + }, + "dev-dev": { + "name": "spryker-eco/computop-api", + "description": "Computop API Module", + "keywords": [], + "homepage": "", + "version": "dev-dev", + "version_normalized": "dev-dev", + "license": [ + "MIT" + ], + "authors": [], + "source": { + "type": "git", + "url": "https://github.com/spryker-eco/computop-api.git", + "reference": "7a695d1e412132296546d072364f410186572790" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/7a695d1e412132296546d072364f410186572790", + "reference": "7a695d1e412132296546d072364f410186572790", + "shasum": "" + }, + "type": "library", + "time": "2018-08-31T11:38:22+00:00", + "autoload": { + "psr-4": { + "SprykerEco\\": "src/SprykerEco/" + } + }, + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "require": { + "php": ">=7.1", + "spryker/kernel": "^3.0.0", + "spryker/transfer": "^3.0.0", + "spryker/util-text": "^1.0.0", + "spryker/guzzle": "^2.2.0" + }, + "require-dev": { + "spryker/code-sniffer": "dev-master" + }, + "uid": 2209824 + }, + "dev-master": { + "name": "spryker-eco/computop-api", + "description": "ComputopApi module", + "keywords": [], + "homepage": "", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [ + "MIT" + ], + "authors": [], + "source": { + "type": "git", + "url": "https://github.com/spryker-eco/computop-api.git", + "reference": "7ac81d5db52c0639bc06a61a35d7738a964fde88" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/7ac81d5db52c0639bc06a61a35d7738a964fde88", + 
"reference": "7ac81d5db52c0639bc06a61a35d7738a964fde88", + "shasum": "" + }, + "type": "library", + "time": "2020-06-22T15:50:29+00:00", + "autoload": { + "psr-4": { + "SprykerEco\\": "src/SprykerEco/" + } + }, + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "default-branch": true, + "require": { + "php": ">=7.1", + "spryker/kernel": "^3.0.0", + "spryker/transfer": "^3.0.0", + "spryker/util-text": "^1.0.0", + "spryker/guzzle": "^2.2.0" + }, + "require-dev": { + "spryker/code-sniffer": "dev-master" + }, + "uid": 4006827 + } + } + } +} \ No newline at end of file diff --git a/swh/lister/packagist/tests/test_lister.py b/swh/lister/packagist/tests/test_lister.py --- a/swh/lister/packagist/tests/test_lister.py +++ b/swh/lister/packagist/tests/test_lister.py @@ -1,104 +1,106 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import unittest -from unittest.mock import patch - -import requests_mock - -from swh.lister.core.tests.test_lister import HttpSimpleListerTester -from swh.lister.packagist.lister import PackagistLister, compute_package_url - -expected_packages = [ - "0.0.0/composer-include-files", - "0.0.0/laravel-env-shim", - "0.0.1/try-make-package", - "0099ff/dialogflowphp", - "00f100/array_dot", -] - -expected_model = { - "uid": "0099ff/dialogflowphp", - "name": "0099ff/dialogflowphp", - "full_name": "0099ff/dialogflowphp", - "html_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json", - "origin_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json", - "origin_type": "packagist", -} +import json +from pathlib import Path +import iso8601 -class PackagistListerTester(HttpSimpleListerTester, unittest.TestCase): - Lister = PackagistLister - PAGE = 
"https://packagist.org/packages/list.json" - lister_subdir = "packagist" - good_api_response_file = "data/https_packagist.org/packages_list.json" - entries = 5 +from swh.lister.packagist.lister import PackagistLister - @requests_mock.Mocker() - def test_list_packages(self, http_mocker): - """List packages from simple api page should retrieve all packages within +_packages_list = { + "packageNames": [ + "ljjackson/linnworks", + "lky/wx_article", + "spryker-eco/computop-api", + ] +} - """ - http_mocker.get(self.PAGE, text=self.mock_response) - fl = self.get_fl() - packages = fl.list_packages(self.get_api_response(0)) - for package in expected_packages: - assert package in packages +def _package_metadata(datadir, package_name): + return json.loads( + Path(datadir, f"{package_name.replace('/', '_')}.json").read_text() + ) - def test_transport_response_simplified(self): - """Test model created by the lister - """ - fl = self.get_fl() - model = fl.transport_response_simplified(["0099ff/dialogflowphp"]) - assert len(model) == 1 - for key, values in model[0].items(): - assert values == expected_model[key] +def _package_origin_info(package_name, package_metadata): + origin_url = None + visit_type = None + last_update = None + for version_info in package_metadata["packages"][package_name].values(): + origin_url = version_info["source"]["url"] + visit_type = version_info["source"]["type"] + version_date = iso8601.parse_date(version_info["time"]) + if last_update is None or version_date > last_update: + last_update = version_date + return origin_url, visit_type, last_update - @patch("swh.lister.packagist.lister.utils.create_task_dict") - def test_task_dict(self, mock_create_tasks): - """Test the task creation of lister - """ - fl = self.get_fl() - fl.task_dict( - origin_type="packagist", origin_url="https://abc", name="test_pack" - ) - mock_create_tasks.assert_called_once_with( - "load-packagist", "recurring", "test_pack", "https://abc", retries_left=3 - ) +def 
_request_without_if_modified_since(request): + return request.headers.get("If-Modified-Since") is None -def test_compute_package_url(): - expected_url = "https://repo.packagist.org/p/hello.json" - actual_url = compute_package_url("hello") - assert actual_url == expected_url +def _request_with_if_modified_since(request): + return request.headers.get("If-Modified-Since") is not None -def test_packagist_lister(lister_packagist, requests_mock_datadir): - lister_packagist.run() +def test_packagist_lister(swh_scheduler, requests_mock, datadir): + # first listing, should return one origin per package + lister = PackagistLister(scheduler=swh_scheduler) + requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list) + packages_metadata = {} + for package_name in _packages_list["packageNames"]: + metadata = _package_metadata(datadir, package_name) + packages_metadata[package_name] = metadata + requests_mock.get( + f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", + json=metadata, + additional_matcher=_request_without_if_modified_since, + ) + stats = lister.run() - r = lister_packagist.scheduler.search_tasks(task_type="load-packagist") - assert len(r) == 5 + assert stats.pages == 1 + assert stats.origins == len(_packages_list["packageNames"]) + assert lister.updated + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + for package_name, package_metadata in packages_metadata.items(): + origin_url, visit_type, last_update = _package_origin_info( + package_name, package_metadata + ) + filtered_origins = [o for o in scheduler_origins if o.url == origin_url] + assert filtered_origins + assert filtered_origins[0].visit_type == visit_type + assert filtered_origins[0].last_update == last_update + + # second listing, should return 0 origins as no package metadata + # has been updated since first listing + lister = PackagistLister(scheduler=swh_scheduler) + for package_name in _packages_list["packageNames"]: + requests_mock.get( 
+ f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", + additional_matcher=_request_with_if_modified_since, + status_code=304, + ) - for row in r: - assert row["type"] == "load-packagist" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 2 + assert lister.get_state_from_scheduler().last_listing_date is not None - package = args[0] - url = args[1] + stats = lister.run() - expected_url = compute_package_url(package) - assert url == expected_url + assert stats.pages == 1 + assert stats.origins == 0 + assert lister.updated - # kwargs - kwargs = row["arguments"]["kwargs"] - assert kwargs == {} - assert row["policy"] == "recurring" - assert row["priority"] is None +def test_lister_from_configfile(swh_scheduler_config, mocker): + load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") + load_from_envvar.return_value = { + "scheduler": {"cls": "local", **swh_scheduler_config}, + "credentials": {}, + } + lister = PackagistLister.from_configfile() + assert lister.scheduler is not None + assert lister.credentials is not None diff --git a/swh/lister/packagist/tests/test_tasks.py b/swh/lister/packagist/tests/test_tasks.py --- a/swh/lister/packagist/tests/test_tasks.py +++ b/swh/lister/packagist/tests/test_tasks.py @@ -1,8 +1,8 @@ -# Copyright (C) 2019-2020 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +from swh.lister.pattern import ListerStats def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): @@ -13,11 +13,11 @@ assert res.result == "OK" -@patch("swh.lister.packagist.tasks.PackagistLister") -def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - # setup the mocked PackagistLister - lister.return_value = lister - lister.run.return_value = None +def test_lister(swh_scheduler_celery_app, 
swh_scheduler_celery_worker, mocker): + lister = mocker.patch("swh.lister.packagist.tasks.PackagistLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=286500) + lister.run.return_value = stats res = swh_scheduler_celery_app.send_task( "swh.lister.packagist.tasks.PackagistListerTask" @@ -25,7 +25,7 @@ assert res res.wait() assert res.successful() + assert res.result == stats.dict() - lister.assert_called_once_with() - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with() lister.run.assert_called_once_with()