diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` +- `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` @@ -36,7 +37,7 @@ ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) +`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). diff --git a/docs/index.rst b/docs/index.rst --- a/docs/index.rst +++ b/docs/index.rst @@ -19,6 +19,7 @@ tutorial run_a_new_lister + save_forge Reference Documentation diff --git a/docs/run_a_new_lister.rst b/docs/run_a_new_lister.rst --- a/docs/run_a_new_lister.rst +++ b/docs/run_a_new_lister.rst @@ -9,8 +9,8 @@ Here are the steps you need to follow to run a lister within your local environment. -1. You must edit the docker-compose override file (`docker-compose.override.yml`). - following the sample provided :: +1. You must edit the docker-compose override file (:file:`docker-compose.override.yml`). + following the sample provided:: version: '2' @@ -19,7 +19,7 @@ volumes: - "$SWH_ENVIRONMENT_HOME/swh-lister:/src/swh-lister" - The file named `docker-compose.override.yml` will automatically be loaded by + The file named :file:`docker-compose.override.yml` will automatically be loaded by ``docker-compose``.Having an override makes it possible to run a docker container with some swh packages installed from sources instead of using the latest published packages from pypi. For more details, you may refer to README.md @@ -45,7 +45,7 @@ ~/swh-environment/swh-docker-dev$ docker-compose up -d 5. Add the lister task-type in the scheduler. 
For example, if you want to - add pypi lister task-type :: + add pypi lister task-type:: ~/swh-environment$ swh scheduler task-type add list-gnu-full \ "swh.lister.gnu.tasks.GNUListerTask" "Full GNU lister" \ diff --git a/docs/save_forge.rst b/docs/save_forge.rst new file mode 100644 --- /dev/null +++ b/docs/save_forge.rst @@ -0,0 +1,77 @@ +.. _save-forge: + +Save a forge +============ + +Assuming the forge's :ref:`listing type is already supported in the +scheduler`, use ``swh scheduler task add`` command: + +:: + + swh scheduler --config-file /etc/softwareheritage/scheduler.yml \ + task add [--policy [recurring|oneshot]] [param1=value1] [param2=value2] + +For example: + +- To add a task requiring no parameters (launchpad lister) + +:: + + $ swh scheduler --config-file /etc/softwareheritage/scheduler.yml \ + task add list-launchpad-full + INFO:swh.core.config:Loading config file /etc/softwareheritage/scheduler.yml + Created 1 tasks + + Task 1240540 + Next run: just now (2020-09-08 13:08:07+00:00) + Interval: 90 days, 0:00:00 + Type: list-launchpad-full + Policy: recurring + Args: + Keyword args: + +- To add a one-shot task with parameters: + +:: + + $ swh scheduler --config-file /etc/softwareheritage/scheduler.yml \ + task add --policy oneshot \ + list-gitea-full url=https://codeberg.org/api/v1/ limit=100 + INFO:swh.core.config:Loading config file /etc/softwareheritage/scheduler.yml + Created 1 tasks + + Task 1240540 + Next run: just now (2020-09-11 14:25:45+00:00) + Interval: 90 days, 0:00:00 + Type: list-gitea-full + Policy: oneshot + Args: + Keyword args: + limit: 100 + url: 'https://codeberg.org/api/v1/' + +.. 
_register-task-type: + +Register task types to the scheduler +------------------------------------ + +- To register new task types, ensure you have the code at the required version: + + - docker environment: use :file:`docker-compose.override.yml` with the desired + :ref:`volume for both lister and scheduler* containers` + - for production/staging, upgrade the swh package first then trigger the cli. + +- Use the ``swh scheduler task-type register`` command: + +:: + + $ swh scheduler --config-file /etc/softwareheritage/scheduler.yml task-type register + INFO:swh.core.config:Loading config file /etc/softwareheritage/scheduler.yml + INFO:swh.scheduler.cli.task_type:Loading entrypoint for plugin lister.launchpad + INFO:swh.scheduler.cli.task_type:Create task type list-launchpad-incremental in scheduler + INFO:swh.scheduler.cli.task_type:Create task type list-launchpad-full in scheduler + INFO:swh.scheduler.cli.task_type:Create task type list-launchpad-new in scheduler + ... + + +Note: The command is idempotent so it can be executed multiple times. diff --git a/docs/tutorial.rst b/docs/tutorial.rst --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -172,7 +172,7 @@ :ref:`handling-specific-topics`. In any case, we want the data we return to be usefully filtered and structured. The -easiest way to create an iterator is to use the `yield` keyword. Yield each data page +easiest way to create an iterator is to use the ``yield`` keyword. Yield each data page you have structured in accordance with the page type you have declared. The page type exists only for static type checking of data passed from :py:meth:`get_pages` to :py:meth:`get_origins_from_page`; you can choose whatever fits the bill. @@ -204,7 +204,7 @@ able to load the origin. It is needed for example when additional context is needed along with the URL to effectively load from the origin. -See the definition of ListedOrigin_. +See the definition of :swh_web:`ListedOrigin `. 
Now that that we showed how those two methods operate, let's put it together by showing how they fit in the principal :py:meth:`Lister.run` method:: @@ -239,8 +239,6 @@ role of the remaining methods and attributes appearing here in the next section as it is related to the lister state. -.. _ListedOrigin: https://archive.softwareheritage.org/browse/swh:1:rev:03460207a17d82635ef5a6f12358392143eb9eef/?origin_url=https://forge.softwareheritage.org/source/swh-scheduler.git&path=swh/scheduler/model.py&revision=03460207a17d82635ef5a6f12358392143eb9eef#L134-L177 - .. _handling-lister-state: Handling lister state @@ -317,9 +315,9 @@ We generally recommend logging every unhandleable error with the response content and then immediately stop the listing by doing an equivalent of -:py:meth:`Response.raise_for_status` from the `requests` library. As for rate-limiting +:py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting errors, we have a strategy of using a flexible decorator to handle the retrying for us. -It is based on the `tenacity` library and accessible as :py:func:`throttling_retry` from +It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from :py:mod:`swh.lister.utils`. Pagination @@ -343,11 +341,11 @@ information needed into a structured page. This all makes for easier debugging. Misc files -^^^^^^^^^^^^^^^ +^^^^^^^^^^ There are also a few files that need to be modified outside of the lister directory, namely: -* `/setup.py` to add your lister to the end of the list in the *setup* section: +* :file:`/setup.py` to add your lister to the end of the list in the *setup* section:: entry_points=""" [swh.cli.subcommands] @@ -357,16 +355,16 @@ lister.cgit=swh.lister.cgit:register ...""" -* `/swh/lister/tests/test_cli.py` to get a default set of parameters in scheduler-related tests. -* `/README.md` to reference the new lister. -* `/CONTRIBUTORS` to add your name. 
+* :file:`/swh/lister/tests/test_cli.py` to get a default set of parameters in scheduler-related tests. +* :file:`/README.md` to reference the new lister. +* :file:`/CONTRIBUTORS` to add your name. Testing your lister ------------------- When developing a new lister, it's important to test. For this, add the tests -(check `swh/lister/*/tests/`) and register the celery tasks in the main -conftest.py (`swh/lister/core/tests/conftest.py`). +(check :file:`swh/lister/*/tests/`) and register the celery tasks in the main +conftest.py (:file:`swh/lister/core/tests/conftest.py`). Another important step is to actually run it within the docker-dev (:ref:`run-lister-tutorial`). diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -36,3 +36,7 @@ [mypy-urllib3.util.*] ignore_missing_imports = True + +[mypy-xmltodict.*] +ignore_missing_imports = True + diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ iso8601 beautifulsoup4 launchpadlib -tenacity +tenacity >= 6.2 +xmltodict diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -71,6 +71,7 @@ lister.pypi=swh.lister.pypi:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register + lister.maven=swh.lister.maven:register """, classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/__init__.py b/swh/__init__.py --- a/swh/__init__.py +++ b/swh/__init__.py @@ -1,4 +1,3 @@ from pkgutil import extend_path -from typing import List -__path__: List[str] = extend_path(__path__, __name__) +__path__ = extend_path(__path__, __name__) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -11,9 +11,11 @@ from bs4 import BeautifulSoup import requests from requests.exceptions import HTTPError +from tenacity.before_sleep import before_sleep_log from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, 
StatelessLister +from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -74,6 +76,7 @@ ) self.base_git_url = base_git_url + @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" response = self.session.get(url) diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -3,6 +3,7 @@ # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone +import os from typing import List import pytest @@ -229,3 +230,30 @@ assert ( listed_origin.url.startswith(url) is False ), f"url should be mapped to {base_git_url}" + + +def test_lister_cgit_get_pages_with_pages_and_retry( + requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler +): + url = "https://git.tizen/cgit/" + + with open(os.path.join(datadir, "https_git.tizen/cgit,ofs=50"), "rb") as page: + + requests_mock.get( + f"{url}?ofs=50", + [ + {"content": None, "status_code": 429}, + {"content": None, "status_code": 429}, + {"content": page.read(), "status_code": 200}, + ], + ) + + lister_cgit = CGitLister(swh_scheduler, url=url) + + mocker.patch.object(lister_cgit._get_and_parse.retry, "sleep") + + repos: List[List[str]] = list(lister_cgit.get_pages()) + flattened_repos = sum(repos, []) + # we should have 16 repos (listed on 3 pages) + assert len(repos) == 3 + assert len(flattened_repos) == 16 diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -60,11 +60,15 @@ yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, - visit_type="tar", + visit_type="cran", last_update=parse_packaged_date(package_info), extra_loader_arguments={ 
"artifacts": [ - {"url": artifact_url, "version": package_info["Version"]} + { + "url": artifact_url, + "version": package_info["Version"], + "package": package_info["Package"], + } ] }, ) diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -99,7 +99,13 @@ assert len(filtered_origins) == 1 assert filtered_origins[0].extra_loader_arguments == { - "artifacts": [{"url": artifact_url, "version": package_info["Version"]}] + "artifacts": [ + { + "url": artifact_url, + "version": package_info["Version"], + "package": package_info["Package"], + } + ] } filtered_origins[0].last_update == parse_packaged_date(package_info) diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -7,10 +7,12 @@ import bz2 from collections import defaultdict from dataclasses import dataclass, field +from email.utils import parsedate_to_datetime import gzip from itertools import product import logging import lzma +import os from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple from urllib.parse import urljoin @@ -133,19 +135,21 @@ response = requests.get(url, stream=True) logging.debug("Fetched URL: %s, status code: %s", url, response.status_code) if response.status_code == 200: + last_modified = response.headers.get("Last-Modified") + self.last_sources_update = ( + parsedate_to_datetime(last_modified) if last_modified else None + ) + decompressor = decompressors.get(compression) + if decompressor: + data = decompressor(response.raw).readlines() + else: + data = response.raw.readlines() break else: - raise Exception( - "Could not retrieve sources index for %s/%s", suite, component - ) - - decompressor = decompressors.get(compression) - if decompressor: - data = decompressor(response.raw) - else: - data = response.raw + data = "" + logger.debug("Could not retrieve 
sources index for %s/%s", suite, component) - return Sources.iter_paragraphs(data.readlines()) + return Sources.iter_paragraphs(data) def get_pages(self) -> Iterator[DebianPageType]: """Return an iterator on parsed debian package Sources files, one per combination @@ -198,9 +202,12 @@ if field_ in src_pkg: for entry in src_pkg[field_]: name = entry["name"] - files[name]["name"] = entry["name"] + files[name]["name"] = name files[name]["size"] = int(entry["size"], 10) files[name][sum_name] = entry[sum_name] + files[name]["uri"] = os.path.join( + self.url, src_pkg["Directory"], name + ) # extract package name and version package_name = src_pkg["Package"] @@ -221,7 +228,8 @@ lister_id=self.lister_obj.id, url=origin_url, visit_type="deb", - extra_loader_arguments={"date": None, "packages": {}}, + extra_loader_arguments={"packages": {}}, + last_update=self.last_sources_update, ) # origin will be yielded at the end of that method origins_to_send[origin_url] = self.listed_origins[origin_url] @@ -247,6 +255,15 @@ } ) + if self.listed_origins[origin_url].last_update is None or ( + self.last_sources_update is not None + and self.last_sources_update # type: ignore + > self.listed_origins[origin_url].last_update + ): + # update debian package last update if current processed sources index + # has a greater modification date + self.listed_origins[origin_url].last_update = self.last_sources_update + # add package version key to the set of found versions self.package_versions[package_name].add(package_version_key) diff --git a/swh/lister/debian/tests/test_lister.py b/swh/lister/debian/tests/test_lister.py --- a/swh/lister/debian/tests/test_lister.py +++ b/swh/lister/debian/tests/test_lister.py @@ -4,6 +4,9 @@ # See top-level LICENSE file for more information from collections import defaultdict +from datetime import datetime +from email.utils import formatdate, parsedate_to_datetime +import os from pathlib import Path from typing import Dict, List, Set, Tuple @@ -35,7 +38,8 @@ 
_mirror_url = "http://deb.debian.org/debian" _suites = ["stretch", "buster", "bullseye"] -_components = ["main"] +_components = ["main", "foo"] +_last_modified = {} SourcesText = str @@ -58,6 +62,7 @@ debian_sources: Dict[Suite, SourcesText], requests_mock, ) -> Tuple[DebianLister, DebianSuitePkgSrcInfo]: + lister = DebianLister( scheduler=swh_scheduler, mirror_url=_mirror_url, @@ -67,16 +72,26 @@ suite_pkg_info: DebianSuitePkgSrcInfo = {} - for suite, sources in debian_sources.items(): + for i, (suite, sources) in enumerate(debian_sources.items()): + # ensure to generate a different date for each suite + last_modified = formatdate(timeval=datetime.now().timestamp() + i, usegmt=True) suite_pkg_info[suite] = defaultdict(list) for pkg_src in Sources.iter_paragraphs(sources): suite_pkg_info[suite][pkg_src["Package"]].append(pkg_src) + # backup package last update date + global _last_modified + _last_modified[pkg_src["Package"]] = last_modified for idx_url, compression in lister.debian_index_urls(suite, _components[0]): if compression: requests_mock.get(idx_url, status_code=404) else: - requests_mock.get(idx_url, text=sources) + requests_mock.get( + idx_url, text=sources, headers={"Last-Modified": last_modified}, + ) + + for idx_url, _ in lister.debian_index_urls(suite, _components[1]): + requests_mock.get(idx_url, status_code=404) return lister, suite_pkg_info @@ -117,11 +132,23 @@ ] assert filtered_origins - # check the version info are available - assert ( - package_version_key - in filtered_origins[0].extra_loader_arguments["packages"] + expected_last_update = parsedate_to_datetime( + _last_modified[pkg_src["Package"]] ) + assert filtered_origins[0].last_update == expected_last_update + packages = filtered_origins[0].extra_loader_arguments["packages"] + # check the version info are available + assert package_version_key in packages + + # check package files URIs are available + for file in pkg_src["files"]: + filename = file["name"] + file_uri = os.path.join( + 
_mirror_url, pkg_src["Directory"], filename + ) + package_files = packages[package_version_key]["files"] + assert filename in package_files + assert package_files[filename]["uri"] == file_uri # check listed package version is in lister state assert package_name in lister.state.package_versions @@ -189,7 +216,7 @@ lister_previous_state=lister_previous_state, ) - assert stats.pages == len(sources) + assert stats.pages == len(sources) * len(_components) assert stats.origins == len(origin_urls) lister_previous_state = lister.state.package_versions diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -8,7 +8,7 @@ import logging import random import time -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional, Set from urllib.parse import parse_qs, urlparse import iso8601 @@ -305,11 +305,17 @@ """ assert self.lister_obj.id is not None + seen_in_page: Set[str] = set() + for repo in page: if not repo: # null repositories in listings happen sometimes... 
continue + if repo["html_url"] in seen_in_page: + continue + seen_in_page.add(repo["html_url"]) + pushed_at_str = repo.get("pushed_at") pushed_at: Optional[datetime.datetime] = None if pushed_at_str: diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -17,12 +17,16 @@ from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import is_retryable_exception, retry_attempt, throttling_retry +from swh.lister.utils import is_retryable_exception, throttling_retry from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) +# Some instance provides hg_git type which can be ingested as hg origins +VCS_MAPPING = {"hg_git": "hg"} + + @dataclass class GitLabListerState: """State of the GitLabLister""" @@ -49,7 +53,7 @@ with specific ratelimit header. """ - attempt = retry_attempt(retry_state) + attempt = retry_state.outcome if attempt.failed: exc = attempt.exception() return ( @@ -93,16 +97,17 @@ """ - LISTER_NAME = "gitlab" - def __init__( self, scheduler, url: str, + name: Optional[str] = "gitlab", instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, incremental: bool = False, ): + if name is not None: + self.LISTER_NAME = name super().__init__( scheduler=scheduler, url=url.rstrip("/"), @@ -203,10 +208,12 @@ repositories = page_result.repositories if page_result.repositories else [] for repo in repositories: + visit_type = repo.get("vcs_type", "git") + visit_type = VCS_MAPPING.get(visit_type, visit_type) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["http_url_to_repo"], - visit_type="git", + visit_type=visit_type, last_update=iso8601.parse_date(repo["last_activity_at"]), ) diff --git a/swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json b/swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json new file mode 100644 --- /dev/null 
+++ b/swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json @@ -0,0 +1,320 @@ +[ + { + "id": 1, + "description": "Slides for a Heptapod presentation at Mercurial Conference - Paris 2019", + "vcs_type": "hg_git", + "name": "2019-hg-paris", + "name_with_namespace": "heptapod / slides / 2019-hg-paris", + "path": "2019-hg-paris", + "path_with_namespace": "heptapod/slides/2019-hg-paris", + "created_at": "2019-05-28T00:53:04.064Z", + "default_branch": "branch/default", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/slides/2019-hg-paris", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/slides/2019-hg-paris", + "web_url": "https://foss.heptapod.net/heptapod/slides/2019-hg-paris", + "readme_url": null, + "avatar_url": null, + "forks_count": 0, + "star_count": 0, + "last_activity_at": "2019-06-11T16:39:49.827Z", + "namespace": { + "id": 4, + "name": "slides", + "path": "slides", + "kind": "group", + "full_path": "heptapod/slides", + "parent_id": 3, + "avatar_url": null, + "web_url": "https://foss.heptapod.net/groups/heptapod/slides" + } + }, + { + "id": 3, + "description": "Obsolete fork of omnibus-gitlab, predating the creation of the separate heptapod-docker project done for Heptapod *0.6.1*", + "vcs_type": "hg_git", + "name": "omnibus", + "name_with_namespace": "heptapod / omnibus", + "path": "omnibus", + "path_with_namespace": "heptapod/omnibus", + "created_at": "2019-06-01T17:15:28.005Z", + "default_branch": "branch/heptapod", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/omnibus", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/omnibus", + "web_url": "https://foss.heptapod.net/heptapod/omnibus", + "readme_url": "https://foss.heptapod.net/heptapod/omnibus/-/blob/branch/heptapod/README.md", + "avatar_url": null, + "forks_count": 0, + "star_count": 0, + "last_activity_at": "2020-04-24T13:57:28.102Z", + "namespace": { + "id": 3, + "name": 
"heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": "https://foss.heptapod.net/groups/heptapod" + } + }, + { + "id": 5, + "description": "GitLab CE Rails application, converted to a Mercurial repository and modified for Mercurial support in a Mercurial branch called \"Heptapod\".", + "vcs_type": "hg_git", + "name": "heptapod", + "name_with_namespace": "heptapod / heptapod", + "path": "heptapod", + "path_with_namespace": "heptapod/heptapod", + "created_at": "2019-06-02T10:49:49.250Z", + "default_branch": "branch/heptapod", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/heptapod", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/heptapod", + "web_url": "https://foss.heptapod.net/heptapod/heptapod", + "readme_url": "https://foss.heptapod.net/heptapod/heptapod/-/blob/branch/heptapod/README.md", + "avatar_url": "https://foss.heptapod.net/uploads/-/system/project/avatar/5/heptapod.png", + "forks_count": 0, + "star_count": 17, + "last_activity_at": "2021-09-13T16:11:42.053Z", + "namespace": { + "id": 3, + "name": "heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": "https://foss.heptapod.net/groups/heptapod" + } + }, + { + "id": 7, + "description": "Mercurial-Git bridge, modified for the needs of Heptapod in a branch called \"heptapod\"", + "vcs_type": "hg_git", + "name": "hg-git", + "name_with_namespace": "heptapod / hg-git", + "path": "hg-git", + "path_with_namespace": "heptapod/hg-git", + "created_at": "2019-06-02T14:40:36.730Z", + "default_branch": "branch/heptapod-0-8", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/hg-git", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/hg-git", + "web_url": 
"https://foss.heptapod.net/heptapod/hg-git", + "readme_url": "https://foss.heptapod.net/heptapod/hg-git/-/blob/branch/heptapod-0-8/README.md", + "avatar_url": null, + "forks_count": 0, + "star_count": 1, + "last_activity_at": "2020-05-06T13:29:51.900Z", + "namespace": { + "id": 3, + "name": "heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": "https://foss.heptapod.net/groups/heptapod" + } + }, + { + "id": 9, + "description": "A Mercurial extension to provide logs via the `logging` module of the Python standard library.", + "vcs_type": "hg", + "name": "hgext-loggingmod", + "name_with_namespace": "heptapod / hgext-loggingmod", + "path": "hgext-loggingmod", + "path_with_namespace": "heptapod/hgext-loggingmod", + "created_at": "2019-07-05T17:48:54.928Z", + "default_branch": "branch/default", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/hgext-loggingmod", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/hgext-loggingmod", + "web_url": "https://foss.heptapod.net/heptapod/hgext-loggingmod", + "readme_url": "https://foss.heptapod.net/heptapod/hgext-loggingmod/-/blob/branch/default/README.md", + "avatar_url": null, + "forks_count": 0, + "star_count": 0, + "last_activity_at": "2020-08-03T11:37:39.413Z", + "namespace": { + "id": 3, + "name": "heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": "https://foss.heptapod.net/groups/heptapod" + } + }, + { + "id": 11, + "description": "Source for the statically generated website at https://heptapod.net", + "vcs_type": "hg_git", + "name": "website", + "name_with_namespace": "heptapod / website", + "path": "website", + "path_with_namespace": "heptapod/website", + "created_at": "2019-07-17T15:14:17.576Z", + "default_branch": 
"branch/default", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/website", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/website", + "web_url": "https://foss.heptapod.net/heptapod/website", + "readme_url": "https://foss.heptapod.net/heptapod/website/-/blob/branch/default/README.rst", + "avatar_url": "https://foss.heptapod.net/uploads/-/system/project/avatar/11/logo-heptapod-www.png", + "forks_count": 0, + "star_count": 0, + "last_activity_at": "2021-08-25T08:21:51.036Z", + "namespace": { + "id": 3, + "name": "heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": "https://foss.heptapod.net/groups/heptapod" + } + }, + { + "id": 12, + "description": "A suite of functional / API tests written with Selenium and the Python requests library (for the API part)", + "vcs_type": "hg_git", + "name": "heptapod-tests", + "name_with_namespace": "heptapod / heptapod-tests", + "path": "heptapod-tests", + "path_with_namespace": "heptapod/heptapod-tests", + "created_at": "2019-07-19T14:51:15.657Z", + "default_branch": "branch/default", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/heptapod-tests", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/heptapod-tests", + "web_url": "https://foss.heptapod.net/heptapod/heptapod-tests", + "readme_url": "https://foss.heptapod.net/heptapod/heptapod-tests/-/blob/branch/default/README.md", + "avatar_url": "https://foss.heptapod.net/uploads/-/system/project/avatar/12/selenium-logo.png", + "forks_count": 0, + "star_count": 0, + "last_activity_at": "2021-09-08T13:10:15.911Z", + "namespace": { + "id": 3, + "name": "heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": 
"https://foss.heptapod.net/groups/heptapod" + } + }, + { + "id": 14, + "description": "Generic Docker images to serve Mercurial content over HTTP, and notably to act as a mirror.\r\nThese are published on [Docker Hub](https://hub.docker.com/u/octobus)", + "vcs_type": "hg_git", + "name": "mercurial-mirror", + "name_with_namespace": "heptapod / mercurial-mirror", + "path": "mercurial-mirror", + "path_with_namespace": "heptapod/mercurial-mirror", + "created_at": "2019-08-21T13:10:30.330Z", + "default_branch": "branch/default", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/mercurial-mirror", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/mercurial-mirror", + "web_url": "https://foss.heptapod.net/heptapod/mercurial-mirror", + "readme_url": null, + "avatar_url": null, + "forks_count": 0, + "star_count": 0, + "last_activity_at": "2019-08-21T13:10:30.330Z", + "namespace": { + "id": 3, + "name": "heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": "https://foss.heptapod.net/groups/heptapod" + } + }, + { + "id": 15, + "description": "This is the development repository for the evolve extension.\r\n\r\nOfficial repository at: https://mercurial-scm.org/repo/evolve/\r\n\r\nOfficial bug tracker: https://bz.mercurial-scm.org/ (component, \"evolution\")\r\n", + "vcs_type": "hg_git", + "name": "evolve", + "name_with_namespace": "mercurial / evolve", + "path": "evolve", + "path_with_namespace": "mercurial/evolve", + "created_at": "2019-08-31T07:34:31.812Z", + "default_branch": "branch/default", + "tag_list": [ + "extension", + "history-rewriting", + "mercurial" + ], + "topics": [ + "extension", + "history-rewriting", + "mercurial" + ], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/mercurial/evolve", + "http_url_to_repo": "https://foss.heptapod.net/mercurial/evolve", + "web_url": 
"https://foss.heptapod.net/mercurial/evolve", + "readme_url": "https://foss.heptapod.net/mercurial/evolve/-/blob/branch/default/README.rst", + "avatar_url": "https://foss.heptapod.net/uploads/-/system/project/avatar/15/ammonoid.png", + "forks_count": 0, + "star_count": 11, + "last_activity_at": "2021-09-12T18:01:37.794Z", + "namespace": { + "id": 18, + "name": "mercurial", + "path": "mercurial", + "kind": "group", + "full_path": "mercurial", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/18/droplets-100.png", + "web_url": "https://foss.heptapod.net/groups/mercurial" + } + }, + { + "id": 21, + "description": "This project is deprecated in favour of Omnibus Heptapod Docker build capability, and will be archived once Heptapod 0.17 becomes the new stable series.", + "vcs_type": "hg", + "name": "heptapod-docker", + "name_with_namespace": "heptapod / heptapod-docker", + "path": "heptapod-docker", + "path_with_namespace": "heptapod/heptapod-docker", + "created_at": "2019-09-17T17:06:28.678Z", + "default_branch": "branch/default", + "tag_list": [], + "topics": [], + "ssh_url_to_repo": "ssh://hg@foss.heptapod.net/heptapod/heptapod-docker", + "http_url_to_repo": "https://foss.heptapod.net/heptapod/heptapod-docker", + "web_url": "https://foss.heptapod.net/heptapod/heptapod-docker", + "readme_url": "https://foss.heptapod.net/heptapod/heptapod-docker/-/blob/branch/default/README.md", + "avatar_url": null, + "forks_count": 0, + "star_count": 0, + "last_activity_at": "2021-06-21T15:14:34.070Z", + "namespace": { + "id": 3, + "name": "heptapod", + "path": "heptapod", + "kind": "group", + "full_path": "heptapod", + "parent_id": null, + "avatar_url": "/uploads/-/system/group/avatar/3/heptapod.png", + "web_url": "https://foss.heptapod.net/groups/heptapod" + } + } +] diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -56,6 +56,42 
@@ assert listed_origin.last_update is not None +def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock): + """Heptapod lister happily lists hg, hg_git as hg and git origins + + """ + name = "heptapod" + instance = "foss.heptapod.net" + lister = GitLabLister( + swh_scheduler, url=api_url(instance), name=name, instance=instance + ) + assert lister.LISTER_NAME == name + + response = gitlab_page_response(datadir, instance, 1) + + requests_mock.get( + lister.page_url(), [{"json": response}], additional_matcher=_match_request, + ) + + listed_result = lister.run() + expected_nb_origins = len(response) + + for entry in response: + assert entry["vcs_type"] in ("hg", "hg_git") + + assert listed_result == ListerStats(pages=1, origins=expected_nb_origins) + + scheduler_origins = lister.scheduler.get_listed_origins( + lister.lister_obj.id + ).results + assert len(scheduler_origins) == expected_nb_origins + + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "hg" + assert listed_origin.url.startswith(f"https://{instance}") + assert listed_origin.last_update is not None + + def gitlab_page_response(datadir, instance: str, id_after: int) -> List[Dict]: """Return list of repositories (out of test dataset)""" datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json") diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information import logging -from typing import Any, Iterator, Mapping +from typing import Any, Iterator, Mapping, Optional import iso8601 @@ -36,12 +36,16 @@ instance="GNU", credentials=credentials, ) - self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") + # no side-effect calls in constructor, if extra state is needed, as preconized + # by the pattern docstring, this must happen in the get_pages method. 
+ self.gnu_tree: Optional[GNUTree] = None def get_pages(self) -> Iterator[GNUPageType]: """ Yield a single page listing all GNU projects. """ + # first fetch the manifest to parse + self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") yield self.gnu_tree.projects def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: @@ -49,6 +53,7 @@ Iterate on all GNU projects and yield ListedOrigin instances. """ assert self.lister_obj.id is not None + assert self.gnu_tree is not None artifacts = self.gnu_tree.artifacts diff --git a/swh/lister/maven/README.md b/swh/lister/maven/README.md new file mode 100644 --- /dev/null +++ b/swh/lister/maven/README.md @@ -0,0 +1,142 @@ + +## The Maven lister + +This readme describes the design decisions made during development. + +More information can be found on the Software Heritage forge at [https://forge.softwareheritage.org/T1724](https://forge.softwareheritage.org/T1724) and on the diff of the lister at [https://forge.softwareheritage.org/D6133](https://forge.softwareheritage.org/D6133) . + +## Execution sequence (TL;DR) + +The complete sequence of actions to list the source artifacts and scm urls is as follows: + +On the `index_exporter` server (asynchronously): + +* Check the list of remote indexes, and compare it to the list of local index files. +* Retrieve the missing Maven Indexer indexes from the remote repository. \ + Example of index from Maven Central: [https://repo1.maven.org/maven2/.index/](https://repo1.maven.org/maven2/.index/) +* Start execution of the Docker container: + * If the `indexes` directory doesn't exist, unpack the Lucene indexes from the Maven Indexer indexes using `indexer-cli`.\ + This generates a set of binary files as shown below: + + ``` + boris@castalia:maven$ ls -lh /media/home2/work/indexes/ + total 5,2G + -rw-r--r-- 1 root root 500M juil. 7 22:06 _4m.fdt + -rw-r--r-- 1 root root 339K juil. 7 22:06 _4m.fdx + -rw-r--r-- 1 root root 2,2K juil. 
7 22:07 _4m.fnm + -rw-r--r-- 1 root root 166M juil. 7 22:07 _4m_Lucene50_0.doc + -rw-r--r-- 1 root root 147M juil. 7 22:07 _4m_Lucene50_0.pos + -rw-r--r-- 1 root root 290M juil. 7 22:07 _4m_Lucene50_0.time + -rw-r--r-- 1 root root 3,1M juil. 7 22:07 _4m_Lucene50_0.tip + [SNIP] + -rw-r--r-- 1 root root 363 juil. 7 22:06 _e0.si + -rw-r--r-- 1 root root 1,7K juil. 7 22:07 segments_2 + -rw-r--r-- 1 root root 8 juil. 7 21:54 timestamp + -rw-r--r-- 1 root root 0 juil. 7 21:54 write.lock + ``` + * If the `export` directory doesn't exist, export the Lucene documents from the Lucene indexes using `clue`.\ + This generates a set of text files as shown below: + + ``` + boris@castalia:~$ ls -lh /work/export/ + total 49G + -rw-r--r-- 1 root root 13G juil. 7 22:12 _p.fld + -rw-r--r-- 1 root root 7,0K juil. 7 22:21 _p.inf + -rw-r--r-- 1 root root 2,9G juil. 7 22:21 _p.len + -rw-r--r-- 1 root root 33G juil. 7 22:20 _p.pst + -rw-r--r-- 1 root root 799 juil. 7 22:21 _p.si + -rw-r--r-- 1 root root 138 juil. 7 22:21 segments_1 + -rw-r--r-- 1 root root 0 juil. 7 22:07 write.lock + ``` +* On the host, copy export files to `/var/www/html/` to make them available on the network. + +On the lister side: + +* Get the exports from the above local index server. +* Extract the list of all pom and source artefacts from the Lucene export. +* Yield the list of source artefacts to the Maven Loader as they are found. +* Download all poms from the above list. +* Parse all poms to extract the scm attribute, and yield the list of scm urls towards the classic loaders (git, svn, hg..). + +The process has been optimised as much as it could be, scaling down from 140 GB on disk / 60 GB RAM / 90 mn exec time to 60 GB on disk / 2 GB (excl. docker) / 32 mn exec time. + +For the long read about why we came to here, please continue. + +## About the Maven ecosystem + +Maven repositories are a loose, decentralised network of HTTP servers with a well-defined hosted structure. 
They are used according to the Maven dependency resolver[i](#sdendnote1sym), an inheritance-based mechanism used to identify and locate artefacts required in Maven builds. + +There is no uniform, standardised way to list the contents of maven repositories, since consumers are supposed to know what artefacts they need. Instead, Maven repository owners usually set up a Maven Indexer[ii](#sdendnote2sym) to enable source code identification and listing in IDEs – for this reason, source jars usually don’t have build files and information, only providing pure sources. + +Maven Indexer is not a mandatory part of the maven repository stack, but it is the *de facto* standard for maven repositories indexing and querying. All major Maven repositories we have seen so far use it. Most artefacts are located in the main central repository: Maven Central[iii](#sdendnote3sym), hosted and run by Sonatype[iv](#sdendnote4sym). Other well-known repositories are listed on MVN Repository[v](#sdendnote5sym). + +Maven repositories are mainly used for binary content (e.g. class jars), but the following sources of information are relevant to our goal in the maven repositories/ecosystem: + +* SCM attributes in pom XML files contain the **scm URL** of the associated source code. They can be fed to standard Git/SVN/others loaders. +* **Source artefacts** contain pure source code (i.e. no build files) associated to the artefact. There are two main naming conventions for them, although not always enforced: + * ${artifactId}-${version}-source-release.zip + * ${artifactId}-${version}-src.zip + + They come in various archiving formats (jar, zip, tar.bz2, tar.gz) and require a specific loader to attach the artefact metadata.
+ +[i](#sdendnote1anc)Maven dependency resolver: [https://maven.apache.org/resolver/index.html](https://maven.apache.org/resolver/index.html) + +[ii](#sdendnote2anc)Maven Indexer: [https://maven.apache.org/maven-indexer/](https://maven.apache.org/maven-indexer/) + +[iii](#sdendnote3anc)Maven Central: [https://search.maven.org/](https://search.maven.org/) + +[iv](#sdendnote4anc)Sonatype Company: [https://www.sonatype.com/](https://www.sonatype.com/) + +[v](#sdendnote5anc)MVN Repository: [https://mvnrepository.com/repos](https://mvnrepository.com/repos) + +## Preliminary research + +Listing the full content of a Maven repository is very unusual, and the whole system has not been built for this purpose. Instead, tools and build systems can easily fetch individual artefacts according to their Maven coordinates (groupId, artifactId, version, classifier, extension). Usual listing means (e.g. scapping) are highly discouraged and will trigger bannishment easily. There is no common API defined either. + +Once we have the artifactId/group we can easily get the list of versions (e.g. for updates) by reading the [maven-metadata.xml file at the package level](https://repo1.maven.org/maven2/ant/ant/maven-metadata.xml), although this is not always reliable. The various options that were investigated to get the interesting artefacts are: + +* **Scrapping** could work but is explicitly forbidden[i](#sdendnote1sym). Pages could easily be parsed through, and it would allow to identify \*all\* artifacts. +* Using **Maven indexes** is the "official" way to retrieve information from a maven repository and most repositories provide this feature. It would also enable a smart incremental listing. The Maven Indexer data format however is not we + ll documented. It relies under the hood on an old version (Lucene54) of a lucene indexes, and the only libraries that can access it are written in java. 
This implies a dedicated Docker container with a jvm and some specific tools (maven indexer and luke for the lucene index), and thus would bring some complexity to the docker & prod setups. +* A third path could be to **parse all the pom.xml's** that we find and follow all artifactId's recursively, building a graph of dependencies and parent poms. This is more of a non-complete heuristic, and we would miss leaf nodes (i.e. artifacts that are not used by others), but it could help setup a basic list. +* It should be noted also that there are two main implementations of maven repositories: Nexus and Artifactory. By being more specific we could use the respective APIs of these products to get information. But getting the full list of artefacts is still not straightforward, and we'd lose any generic treatment doing so. + +The best option in our opinion is to go with the Maven Indexer, for it is the most complete listing available (notably for the biggest repository by far: maven central). + +[i](#sdendnote1anc)Maven repository’s Terms of Service: [https://repo1.maven.org/terms.html](https://repo1.maven.org/terms.html) + +## Maven indexes conversion + +[Maven-Indexer](https://maven.apache.org/maven-indexer/) is a (thick) wrapper around lucene. It parses the repository and stores documents, fields and terms in an index. One can extract the lucene index from a maven index using the command: `java -jar indexer-cli-5.1.1.jar --unpack nexus-maven-repository-index.gz --destination test --type full`. Note however that 5.1.1 is an old version of maven indexer; newer versions of the maven indexer won't work on the central indexes. + +[Clue](https://maven.apache.org/maven-indexer/) is a CLI tool to read lucene indexes, and version 6.2.0 works with our maven indexes. One can use the following command to export the index to text: `java -jar clue-6.2.0-1.0.0.jar maven/central-lucene-index/ export central_export text`. 
+ +The exported text file looks like this: + +``` +doc 0 + field 0 + name u + type string + value com.redhat.rhevm.api|rhevm-api-powershell-jaxrs|1.0-rc1.16|javadoc|jar + field 1 + name m + type string + value 1321264789727 + field 2 + name i + type string + value jar|1320743675000|768291|2|2|1|jar + field 10 + name n + type string + value RHEV-M API Powershell Wrapper Implementation JAX-RS + field 13 + name 1 + type string + value 454eb6762e5bb14a75a21ae611ce2048dd548550 +``` + +The execution of these two jars requires a Java virtual machine -- java execution in python is not possible without a JVM. Docker is a good way to run both tools and generate the exports independently, rather than add a JVM to the existing production environment. + +We decided (2021-08-25) to install and execute a docker container on a separate server so the lister would simply have to fetch it on the network and parse it (the latter part in pure python). diff --git a/swh/lister/maven/__init__.py b/swh/lister/maven/__init__.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/__init__.py @@ -0,0 +1,12 @@ +# Copyright (C) 2021 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import MavenLister + + return { + "lister": MavenLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/lister.py @@ -0,0 +1,361 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import asdict, dataclass +from datetime import datetime, timezone +import logging +import re +from typing import Any, Dict, Iterator, Optional +from 
import logging
import re
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from typing import Any, Dict, Iterator, Optional
from urllib.parse import urljoin

import requests
from tenacity.before_sleep import before_sleep_log
from urllib3.util import parse_url
import xmltodict

from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT
from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

RepoPage = Dict[str, Any]


@dataclass
class MavenListerState:
    """State of the MavenLister, persisted between incremental passes."""

    # Highest doc ID of a source artifact ingested during an incremental pass.
    last_seen_doc: int = -1
    # Highest doc ID of a pom ingested during an incremental pass.
    last_seen_pom: int = -1


class MavenLister(Lister[MavenListerState, RepoPage]):
    """List origins from a Maven repository.

    Maven Central provides artifacts for Java builds. It includes POM files
    and source archives, which we download to get the source code of
    artifacts and links to their scm repository.

    This lister yields origins of types: git/svn/hg or whatever the artifacts
    use as repository type, plus maven types for the maven loader (tgz, jar).
    """

    LISTER_NAME = "maven"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        incremental: bool = True,
    ):
        """Lister class for Maven repositories.

        Args:
            scheduler: instance of the scheduler the listed origins are sent to
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                See README.md in this directory for more information.
            instance: Name of maven instance. Defaults to url's network location
                if unset.
            credentials: optional credentials dict (kept for interface
                consistency with the other listers)
            incremental: bool, defaults to True. Defines if incremental listing
                is activated or not.
        """
        self.BASE_URL = url
        self.INDEX_URL = index_url
        self.incremental = incremental

        if instance is None:
            instance = parse_url(url).host

        super().__init__(
            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
        """Deserialize the state dict provided by the scheduler."""
        return MavenListerState(**d)

    def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
        """Serialize the lister state for storage in the scheduler."""
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """GET ``url`` (retrying on throttling); raise on non-200 replies."""
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
            response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[RepoPage]:
        """Retrieve and parse exported maven indexes to identify all pom files
        and src archives.

        Yields all "maven" pages (source artifacts) first, while collecting
        the pom URLs seen in the index, then all "scm" pages extracted from
        those poms.
        """
        # Example of returned RepoPage's:
        # [
        #     {
        #         "type": "maven",
        #         "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
        #         "time": 1626109619335,
        #         "gid": "org.xwiki.platform",
        #         "aid": "xwiki-platform-wikistream-events-xwiki",
        #         "version": "5.4.2",
        #     },
        #     {
        #         "type": "scm",
        #         "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
        #         "project": "openengsb-framework",
        #     },
        #     ...
        # ]
        out_pom: Dict[str, int] = {}  # pom url -> doc id

        yield from self._scan_index_export(out_pom)
        logger.info("Found %s poms.", len(out_pom))
        yield from self._scan_poms(out_pom)

    def _scan_index_export(self, out_pom: Dict[str, int]) -> Iterator[RepoPage]:
        """Stream the text index export, yield a "maven" page per source
        artifact and record every pom url (with its doc id) in ``out_pom``.
        """
        logger.info("Downloading text index from %s.", self.INDEX_URL)
        assert self.INDEX_URL is not None
        # Streamed to avoid loading the (potentially huge) export in memory.
        response = requests.get(self.INDEX_URL, stream=True)
        response.raise_for_status()

        # Prepare regexes to parse index exports.

        # Parse doc id.
        # Example line: "doc 13"
        re_doc = re.compile(r"^doc (?P<doc>\d+)$")

        # Parse gid, aid, version, classifier, extension.
        # Example line: "    value al.aldi|sprova4j|0.1.0|sources|jar"
        re_val = re.compile(
            r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
            r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
        )

        # Parse last modification time.
        # Example line: "    value jar|1626109619335|14316|2|2|0|jar"
        re_time = re.compile(
            r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
            r"\|([^|]+)\|([^|]+)$"
        )

        # jar_src["doc"] holds the id of the current document, whatever its
        # type (scm or jar), or None when the document must be skipped.
        jar_src: Dict[str, Any] = {"doc": None}
        doc_id: int = 0
        url_src = None

        for line_bytes in response.iter_lines(chunk_size=1024):
            line = line_bytes.decode(errors="ignore")

            m_doc = re_doc.match(line)
            if m_doc is not None:
                doc_id = int(m_doc.group("doc"))
                # In incremental mode, skip any document already ingested
                # during a previous pass. (No truthiness test on the counter:
                # doc ids start at 0 and the unset default is -1.)
                if (
                    self.incremental
                    and self.state
                    and self.state.last_seen_doc >= doc_id
                ):
                    jar_src["doc"] = None
                else:
                    jar_src["doc"] = doc_id
                continue

            # If incremental mode, don't record any line belonging to a
            # document that is at or before our last recorded doc id.
            if self.incremental and jar_src["doc"] is None:
                continue

            m_val = re_val.match(line)
            if m_val is not None:
                (gid, aid, version, classifier, ext) = m_val.groups()
                ext = ext.strip()
                path = "/".join(gid.split("."))
                if classifier == "NA" and ext.lower() == "pom":
                    # Poms have their own incremental counter, as they are
                    # processed in a second phase.
                    if (
                        self.incremental
                        and self.state
                        and self.state.last_seen_pom >= doc_id
                    ):
                        continue
                    url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}"
                    out_pom[urljoin(self.BASE_URL, url_path)] = doc_id
                elif (
                    classifier.lower() == "sources" or ("src" in classifier)
                ) and ext.lower() in ("zip", "jar"):
                    url_path = (
                        f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}"
                    )
                    url_src = urljoin(self.BASE_URL, url_path)
                    jar_src["gid"] = gid
                    jar_src["aid"] = aid
                    jar_src["version"] = version
                continue

            m_time = re_time.match(line)
            if m_time is not None and url_src is not None:
                jar_src["time"] = int(m_time.group("mtime"))
                artifact_metadata_d = {
                    "type": "maven",
                    "url": url_src,
                    **jar_src,
                }
                logger.debug("* Yielding jar %s: %s", url_src, artifact_metadata_d)
                yield artifact_metadata_d
                url_src = None

    def _scan_poms(self, out_pom: Dict[str, int]) -> Iterator[RepoPage]:
        """Fetch the recorded pom files and yield an "scm" page for each pom
        exposing an scm connection attribute.
        """
        logger.info("Fetching poms..")
        for pom_url, pom_doc_id in out_pom.items():
            try:
                response = self.page_request(pom_url, {})
            except requests.HTTPError:
                # Best effort: a vanished or forbidden pom must not abort the
                # whole listing.
                logger.warning("POM %s could not be fetched. Next.", pom_url)
                continue
            try:
                project = xmltodict.parse(response.content.decode())
                if "scm" in project["project"]:
                    if "connection" in project["project"]["scm"]:
                        scm = project["project"]["scm"]["connection"]
                        gid = project["project"]["groupId"]
                        aid = project["project"]["artifactId"]
                        artifact_metadata_d = {
                            "type": "scm",
                            "doc": pom_doc_id,
                            "url": scm,
                            "project": f"{gid}.{aid}",
                        }
                        logger.debug(
                            "* Yielding pom %s: %s", pom_url, artifact_metadata_d
                        )
                        yield artifact_metadata_d
                    else:
                        logger.debug("No scm.connection in pom %s", pom_url)
                else:
                    logger.debug("No scm in pom %s", pom_url)
            except xmltodict.expat.ExpatError as error:
                logger.info("Could not parse POM %s XML: %s. Next.", pom_url, error)

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a page of Maven repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None
        if page["type"] == "scm":
            yield from self._origins_from_scm_page(page)
        else:
            yield from self._origins_from_jar_page(page)

    def _origins_from_scm_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Yield an origin for an scm page whose vcs type is supported.

        Note that the official format is:
        scm:git:git://github.com/openengsb/openengsb-framework.git
        but many, many projects directly put the repo url, so we have to
        detect the content to match it properly.
        """
        scm_types_ok = ("git", "svn", "hg", "cvs", "bzr")
        m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
        if m_scm is not None:
            scm_type = m_scm.group("type")
            if scm_type in scm_types_ok:
                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=m_scm.group("url"),
                    visit_type=scm_type,
                )
        elif page["url"].endswith(".git"):
            # Not in "scm:<type>:<url>" form: fall back to a git heuristic.
            yield ListedOrigin(
                lister_id=self.lister_obj.id, url=page["url"], visit_type="git",
            )

    def _origins_from_jar_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Yield the origin of a source archive page, with the artifact
        metadata required by the maven loader.
        """
        last_update_dt = None
        last_update_iso = ""
        # Index times are in milliseconds; drop the last three digits.
        last_update_seconds = str(page["time"])[:-3]
        try:
            # Build a timezone-aware UTC datetime directly (the previous
            # naive-then-astimezone dance yielded the same instant).
            last_update_dt = datetime.fromtimestamp(
                int(last_update_seconds), tz=timezone.utc
            )
        except (OverflowError, OSError, ValueError):
            logger.warning("- Failed to convert datetime %s.", last_update_seconds)
        if last_update_dt:
            last_update_iso = last_update_dt.isoformat()
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=page["url"],
            visit_type=page["type"],
            last_update=last_update_dt,
            extra_loader_arguments={
                "artifacts": [
                    {
                        "time": last_update_iso,
                        "gid": page["gid"],
                        "aid": page["aid"],
                        "version": page["version"],
                        "base_url": self.BASE_URL,
                    }
                ]
            },
        )

    def commit_page(self, page: RepoPage) -> None:
        """Update currently stored state using the latest listed doc.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            # We need to differentiate the two state counters according
            # to the type of origin. Bug fix: the scm branch previously
            # rewrote last_seen_doc unconditionally, which could move it
            # *backwards* (poms are yielded after all jars and may carry
            # lower doc ids). Each counter now only ever increases.
            doc_id = page["doc"]
            if page["type"] == "maven":
                if doc_id > self.state.last_seen_doc:
                    self.state.last_seen_doc = doc_id
            elif doc_id > self.state.last_seen_pom:
                self.state.last_seen_pom = doc_id

    def finalize(self) -> None:
        """Finalize the lister state, set update if any progress has been made.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            scheduler_state = self.get_state_from_scheduler()
            # Compare counters directly instead of truthiness-testing them:
            # doc ids start at 0, so a legitimate value of 0 must still count
            # as progress.
            if (
                self.state.last_seen_doc > scheduler_state.last_seen_doc
                or self.state.last_seen_pom > scheduler_state.last_seen_pom
            ):
                self.updated = True
b/swh/lister/maven/tests/data/http_indexes/export.fld new file mode 100755 --- /dev/null +++ b/swh/lister/maven/tests/data/http_indexes/export.fld @@ -0,0 +1,113 @@ +doc 0 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|sources|jar + field 1 + name m + type string + value 1626111735737 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1626111735764 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1626111784883 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1626111784915 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 14 + name DESCRIPTOR + type string + value NexusIndex + field 15 + name IDXINFO + type string + value 1.0|index +doc 5 + field 16 + name allGroups + type string + value allGroups + field 17 + name allGroupsList + type string + value al.aldi +doc 6 + field 18 + name rootGroups + type string + value rootGroups + field 19 + name rootGroupsList + type string + value al +END +checksum 00000000003321211082 diff --git 
a/swh/lister/maven/tests/data/http_indexes/export_incr.fld b/swh/lister/maven/tests/data/http_indexes/export_incr.fld new file mode 100755 --- /dev/null +++ b/swh/lister/maven/tests/data/http_indexes/export_incr.fld @@ -0,0 +1,134 @@ +doc 0 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|sources|jar + field 1 + name m + type string + value 1633786348254 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1633786348271 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1633786370818 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1633786370857 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 0 + name u + type string + value com.arangodb|arangodb-graphql|1.2|NA|pom + field 1 + name m + type string + value 1634498235946 + field 2 + name i + type string + value jar|1624265143830|-1|0|0|0|pom + field 10 + name n + type string + value arangodb-graphql + field 11 + name d + type string + value ArangoDB Graphql +doc 5 + field 14 + name DESCRIPTOR + type string + value 
NexusIndex + field 15 + name IDXINFO + type string + value 1.0|index_1 +doc 6 + field 16 + name allGroups + type string + value allGroups + field 17 + name allGroupsList + type string + value com.arangodb|al.aldi +doc 7 + field 18 + name rootGroups + type string + value rootGroups + field 19 + name rootGroupsList + type string + value com|al +END +checksum 00000000004102281591 diff --git a/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom b/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom new file mode 100755 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom @@ -0,0 +1,208 @@ + + + + + 4.0.0 + + com.arangodb + arangodb-graphql + 1.2 + + arangodb-graphql + ArangoDB Graphql + https://github.com/ArangoDB-Community/arangodb-graphql-java + + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0 + repo + + + + + + Colin Findlay + + + Michele Rastelli + https://github.com/rashtao + + + + + UTF-8 + 1.8 + 1.8 + 1.8 + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.8 + true + + ossrh + https://oss.sonatype.org/ + 84aff6e87e214c + false + + + + org.apache.maven.plugins + maven-resources-plugin + 3.1.0 + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + 3.1.0 + + + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.1.1 + + + attach-javadocs + + jar + + + + + + maven-deploy-plugin + 2.8.2 + + false + 10 + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + assembly + package + + single + + + + + + ${project.artifactId}-${project.version}-standalone + + false + false + + jar-with-dependencies + + + + + + + + + com.graphql-java + graphql-java + 11.0 + + + com.arangodb + arangodb-java-driver + 6.5.0 + + + junit + junit + 4.12 + test + + + org.mockito + mockito-core + 2.15.0 + test + + + org.hamcrest + hamcrest-library + 1.3 + test 
+ + + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + https://github.com/ArangoDB-Community/arangodb-graphql-java + scm:git:git://github.com/ArangoDB-Community/arangodb-graphql-java.git + scm:git:git://github.com/ArangoDB-Community/arangodb-graphql-java.git + + + + ArangoDB GmbH + https://www.arangodb.com + + + \ No newline at end of file diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:https://github.com/aldialimucaj/sprova4j.git + scm:ghttps://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java 
client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:git:git://github.com/aldialimucaj/sprova4j.git + scm:git:git://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.1 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.5 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.14.1 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + 
test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/test_lister.py @@ -0,0 +1,320 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import timezone +from pathlib import Path + +import iso8601 +import pytest +import requests + +from swh.lister.maven.lister import MavenLister + +MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url +INDEX_URL = "http://indexes/export.fld" # index directory url + +URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" +URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" +URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" + +LIST_GIT = ( + "git://github.com/aldialimucaj/sprova4j.git", + "https://github.com/aldialimucaj/sprova4j.git", +) + +LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) + +LIST_SRC = ( + MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", + MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", +) + +LIST_SRC_DATA = ( + { + "type": "maven", + "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + + "/0.1.0/sprova4j-0.1.0-sources.jar", + "time": "2021-07-12T17:06:59+00:00", + "gid": "al.aldi", + "aid": "sprova4j", + "version": "0.1.0", + }, + { + "type": "maven", + "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + + "/0.1.1/sprova4j-0.1.1-sources.jar", + "time": "2021-07-12T17:37:05+00:00", + "gid": "al.aldi", + "aid": "sprova4j", + "version": "0.1.1", + }, +) + + +@pytest.fixture +def maven_index(datadir) -> str: + return Path(datadir, "http_indexes", "export.fld").read_text() + + +@pytest.fixture 
+def maven_index_incr(datadir) -> str: + return Path(datadir, "http_indexes", "export_incr.fld").read_text() + + +@pytest.fixture +def maven_pom_1(datadir) -> str: + return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text() + + +@pytest.fixture +def maven_pom_1_malformed(datadir) -> str: + return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_text() + + +@pytest.fixture +def maven_pom_2(datadir) -> str: + return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text() + + +@pytest.fixture +def maven_pom_3(datadir) -> str: + return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() + + +def test_maven_full_listing( + swh_scheduler, requests_mock, mocker, maven_index, maven_pom_1, maven_pom_2, +): + """Covers full listing of multiple pages, checking page results and listed + origins, statelessness.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=False, + ) + + # Set up test. + index_text = maven_index + requests_mock.get(INDEX_URL, text=index_text) + requests_mock.get(URL_POM_1, text=maven_pom_1) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
+ assert stats.pages == 4 + assert stats.origins == 4 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + last_update_src = iso8601.parse_date(src.get("time")).astimezone( + tz=timezone.utc + ) + assert last_update_src == origin.last_update + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + assert MVN_URL == artifact["base_url"] + break + else: + raise AssertionError( + "Could not find scheduler origin in referenced origins." + ) + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == -1 + assert scheduler_state.last_seen_pom == -1 + + +def test_maven_full_listing_malformed( + swh_scheduler, + requests_mock, + mocker, + maven_index, + maven_pom_1_malformed, + maven_pom_2, +): + """Covers full listing of multiple pages, checking page results with a malformed + scm entry in pom.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=False, + ) + + # Set up test. + index_text = maven_index + requests_mock.get(INDEX_URL, text=index_text) + requests_mock.get(URL_POM_1, text=maven_pom_1_malformed) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
+ assert stats.pages == 4 + assert stats.origins == 3 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + LIST_SRC_1 = ("https://github.com/aldialimucaj/sprova4j.git",) + assert sorted(origin_urls) == sorted(LIST_SRC_1 + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + assert MVN_URL == artifact["base_url"] + break + else: + raise AssertionError( + "Could not find scheduler origin in referenced origins." + ) + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == -1 + assert scheduler_state.last_seen_pom == -1 + + +def test_maven_incremental_listing( + swh_scheduler, + requests_mock, + mocker, + maven_index, + maven_index_incr, + maven_pom_1, + maven_pom_2, + maven_pom_3, +): + """Covers full listing of multiple pages, checking page results and listed + origins, with a second updated run for statefulness.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=True, + ) + + # Set up test. + requests_mock.get(INDEX_URL, text=maven_index) + requests_mock.get(URL_POM_1, text=maven_pom_1) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
+ assert lister.incremental + assert lister.updated + assert stats.pages == 4 + assert stats.origins == 4 + + # Second execution of the lister, incremental mode + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=True, + ) + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == 3 + assert scheduler_state.last_seen_pom == 3 + + # Set up test. + requests_mock.get(INDEX_URL, text=maven_index_incr) + requests_mock.get(URL_POM_3, text=maven_pom_3) + + # Then run the lister. + stats = lister.run() + + # Start test checks. + assert lister.incremental + assert lister.updated + assert stats.pages == 1 + assert stats.origins == 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + break + else: + raise AssertionError + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == 4 + assert scheduler_state.last_seen_pom == 4 + + +@pytest.mark.parametrize("http_code", [400, 404, 500, 502]) +def test_maven_list_http_error( + swh_scheduler, requests_mock, mocker, maven_index, http_code +): + """Test handling of some common HTTP errors: + - 400: Bad request. + - 404: Resource no found. + - 500: Internal server error. + - 502: Bad gateway ou proxy Error. 
+ """ + + lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) + + # Test failure of index retrieval. + + requests_mock.get(INDEX_URL, status_code=http_code) + + with pytest.raises(requests.HTTPError): + lister.run() + + # Test failure of artefacts retrieval. + + requests_mock.get(INDEX_URL, text=maven_index) + requests_mock.get(URL_POM_1, status_code=http_code) + + with pytest.raises(requests.HTTPError): + lister.run() + + # If the maven_index step succeeded but not the get_pom step, + # then we get only the 2 maven-jar origins (and not the 2 additional + # src origins). + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins) == 2 diff --git a/swh/lister/maven/tests/test_tasks.py b/swh/lister/maven/tests/test_tasks.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/test_tasks.py @@ -0,0 +1,45 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.lister.pattern import ListerStats + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.maven.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +@pytest.mark.parametrize( + "task_name,incremental", + [("IncrementalMavenLister", True), ("FullMavenLister", False)], +) +def test_task_lister_maven( + task_name, + incremental, + swh_scheduler_celery_app, + swh_scheduler_celery_worker, + mocker, +): + lister = mocker.patch("swh.lister.maven.tasks.MavenLister") + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://repo1.maven.org/maven2/", index_url="http://indexes/export.fld" + ) + res = 
swh_scheduler_celery_app.send_task( + f"swh.lister.maven.tasks.{task_name}", kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(incremental=incremental, **kwargs) + lister.run.assert_called_once_with() diff --git a/swh/lister/opam/lister.py b/swh/lister/opam/lister.py --- a/swh/lister/opam/lister.py +++ b/swh/lister/opam/lister.py @@ -7,8 +7,7 @@ import logging import os from subprocess import PIPE, Popen, call -import tempfile -from typing import Iterator +from typing import Any, Dict, Iterator, Optional from swh.lister.pattern import StatelessLister from swh.scheduler.interface import SchedulerInterface @@ -44,40 +43,34 @@ self, scheduler: SchedulerInterface, url: str, - instance: str, + instance: Optional[str] = None, credentials: CredentialsType = None, + opam_root: str = "/tmp/opam/", ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.env = os.environ.copy() - self.opamroot = tempfile.mkdtemp(prefix="swh_opam_lister") - call( - [ - "opam", - "init", - "--reinit", - "--bare", - "--no-setup", - "--root", - self.opamroot, - instance, - url, - ], - env=self.env, - ) + # Opam root folder is initialized in the :meth:`get_pages` method as no + # side-effect should happen in the constructor to ease instantiation + self.opam_root = opam_root def get_pages(self) -> Iterator[PageType]: + # Initialize the opam root directory + opam_init(self.opam_root, self.instance, self.url, self.env) + + # Actually list opam instance data proc = Popen( [ "opam", "list", "--all", "--no-switch", + "--safe", "--repos", self.instance, "--root", - self.opamroot, + self.opam_root, "--normalise", "--short", ], @@ -99,9 +92,50 @@ url=url, last_update=None, extra_loader_arguments={ - "opam_root": self.opamroot, + "opam_root": self.opam_root, "opam_instance": self.instance, "opam_url": self.url, "opam_package": page, }, ) + + +def opam_init(opam_root: str, instance: str, url: 
str, env: Dict[str, Any]) -> None: + """Initialize an opam_root folder. + + Args: + opam_root: The opam root folder to initialize + instance: Name of the opam repository to add or initialize + url: The associated url of the opam repository to add or initialize + env: The global environment to use for the opam command. + + Returns: + None. + + """ + if not os.path.exists(opam_root) or not os.listdir(opam_root): + command = [ + "opam", + "init", + "--reinit", + "--bare", + "--no-setup", + "--root", + opam_root, + instance, + url, + ] + else: + # The repository exists and is populated, we just add another instance in the + # repository. If it's already setup, it's a noop + command = [ + "opam", + "repository", + "add", + "--root", + opam_root, + instance, + url, + ] + # Actually execute the command + call(command, env=env) diff --git a/swh/lister/opam/tests/test_lister.py b/swh/lister/opam/tests/test_lister.py --- a/swh/lister/opam/tests/test_lister.py +++ b/swh/lister/opam/tests/test_lister.py @@ -4,26 +4,98 @@ # See top-level LICENSE file for more information import io +import os +from tempfile import mkdtemp from unittest.mock import MagicMock -from swh.lister.opam.lister import OpamLister +import pytest +from swh.lister.opam.lister import OpamLister, opam_init -def test_urls(swh_scheduler, mocker): +module_name = "swh.lister.opam.lister" - instance_url = "https://opam.ocaml.org" - lister = OpamLister(swh_scheduler, url=instance_url, instance="opam") +@pytest.fixture +def mock_opam(mocker): + """Fixture to bypass the actual opam calls within the test context. 
+ """ + # inhibits the real `subprocess.call` which prepares the required internal opam + # state + mock_init = mocker.patch(f"{module_name}.call", return_value=None) + # replaces the real Popen with a fake one (list origins command) mocked_popen = MagicMock() mocked_popen.stdout = io.BytesIO(b"bar\nbaz\nfoo\n") + mock_open = mocker.patch(f"{module_name}.Popen", return_value=mocked_popen) + return mock_init, mock_open + + +def test_mock_init_repository_init(mock_opam, tmp_path, datadir): + """Initializing opam root directory with an instance should be ok + + """ + mock_init, mock_popen = mock_opam + + instance = "fake" + instance_url = f"file://{datadir}/{instance}" + opam_root = str(tmp_path / "test-opam") + assert not os.path.exists(opam_root) + + # This will initialize an opam directory with the instance + opam_init(opam_root, instance, instance_url, {}) + + assert mock_init.called + + +def test_mock_init_repository_update(mock_opam, tmp_path, datadir): + """Updating opam root directory with another instance should be ok + + """ + mock_init, mock_popen = mock_opam + + instance = "fake_opam_repo" + instance_url = f"file://{datadir}/{instance}" + opam_root = str(tmp_path / "test-opam") - # replaces the real Popen with a fake one - mocker.patch("swh.lister.opam.lister.Popen", return_value=mocked_popen) + os.makedirs(opam_root, exist_ok=True) + with open(os.path.join(opam_root, "opam"), "w") as f: + f.write("one file to avoid empty folder") + + assert os.path.exists(opam_root) + assert os.listdir(opam_root) == ["opam"] # not empty + # This will update the repository opam with another instance + opam_init(opam_root, instance, instance_url, {}) + + assert mock_init.called + + +def test_lister_opam_optional_instance(swh_scheduler): + """Instance name should be optional and default to be built out of the netloc.""" + netloc = "opam.ocaml.org" + instance_url = f"https://{netloc}" + + lister = OpamLister(swh_scheduler, url=instance_url,) + assert lister.instance == netloc 
+ assert lister.opam_root == "/tmp/opam/" + + +def test_urls(swh_scheduler, mock_opam, tmp_path): + mock_init, mock_popen = mock_opam + instance_url = "https://opam.ocaml.org" + tmp_folder = mkdtemp(dir=tmp_path, prefix="swh_opam_lister") + + lister = OpamLister( + swh_scheduler, url=instance_url, instance="opam", opam_root=tmp_folder, + ) + assert lister.instance == "opam" + assert lister.opam_root == tmp_folder # call the lister and get all listed origins urls stats = lister.run() + assert mock_init.called + assert mock_popen.called + assert stats.pages == 3 assert stats.origins == 3 @@ -40,11 +112,44 @@ assert expected_urls == result_urls -def test_opam_binary(datadir, swh_scheduler): +def test_opam_binary(datadir, swh_scheduler, tmp_path): + instance_url = f"file://{datadir}/fake_opam_repo" + + lister = OpamLister( + swh_scheduler, + url=instance_url, + instance="fake", + opam_root=mkdtemp(dir=tmp_path, prefix="swh_opam_lister"), + ) + + stats = lister.run() + + assert stats.pages == 4 + assert stats.origins == 4 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + expected_urls = [ + f"opam+{instance_url}/packages/agrid/", + f"opam+{instance_url}/packages/calculon/", + f"opam+{instance_url}/packages/directories/", + f"opam+{instance_url}/packages/ocb/", + ] + + result_urls = [origin.url for origin in scheduler_origins] + + assert expected_urls == result_urls + +def test_opam_multi_instance(datadir, swh_scheduler, tmp_path): instance_url = f"file://{datadir}/fake_opam_repo" - lister = OpamLister(swh_scheduler, url=instance_url, instance="fake") + lister = OpamLister( + swh_scheduler, + url=instance_url, + instance="fake", + opam_root=mkdtemp(dir=tmp_path, prefix="swh_opam_lister"), + ) stats = lister.run() diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -13,7 +13,7 @@ from tenacity.before_sleep import before_sleep_log -from 
swh.lister.utils import retry_attempt, throttling_retry +from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -49,7 +49,7 @@ in 1 seconds.'> """ - attempt = retry_attempt(retry_state) + attempt = retry_state.outcome return attempt.failed and isinstance(attempt.exception(), Fault) diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -248,12 +248,13 @@ ) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for hit in page: + last_modified: str = str(hit.last_modified) + last_update: datetime.datetime = iso8601.parse_date(last_modified) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=hit.vcs.value, url=hit.url, - last_update=iso8601.parse_date(hit.last_modified), - enabled=False, + last_update=last_update, ) def _get_pages_from_subsitemap( diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -18,6 +18,10 @@ "tuleap": {"url": "https://tuleap.net",}, "gitlab": {"url": "https://gitlab.ow2.org/api/v4", "instance": "ow2",}, "opam": {"url": "https://opam.ocaml.org", "instance": "opam"}, + "maven": { + "url": "https://repo1.maven.org/maven2/", + "index_url": "http://indexes/export.fld", + }, } diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 the Software Heritage developers +# Copyright (C) 2018-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -47,14 +47,7 @@ def assert_sleep_calls(mocker, mock_sleep, sleep_params): - try: - mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) - 
except AssertionError: - # tenacity < 5.1 has a different behavior for wait_exponential - # https://github.com/jd/tenacity/commit/aac4307a0aa30d7befd0ebe4212ee4fc69083a95 - mock_sleep.assert_has_calls( - [mocker.call(param * WAIT_EXP_BASE) for param in sleep_params] - ) + mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) def test_throttling_retry(requests_mock, mocker): diff --git a/swh/lister/utils.py b/swh/lister/utils.py --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -55,24 +55,11 @@ return is_connection_error or is_throttling_exception(e) or is_500_error -def retry_attempt(retry_state): - """ - Utility function to get last retry attempt info based on the - tenacity version (as debian buster packages version 4.12). - """ - try: - attempt = retry_state.outcome - except AttributeError: - # tenacity < 5.0 - attempt = retry_state - return attempt - - def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> bool: """ Custom tenacity retry predicate for handling exceptions with the given predicate. """ - attempt = retry_attempt(retry_state) + attempt = retry_state.outcome if attempt.failed: exception = attempt.exception() return predicate(exception) diff --git a/tox.ini b/tox.ini --- a/tox.ini +++ b/tox.ini @@ -34,7 +34,7 @@ extras = testing deps = - mypy + mypy==0.920 commands = mypy swh