def register():
    """Describe the Maven lister plugin.

    This is the callable referenced by the ``lister.maven`` entry in
    ``setup.py``.

    Returns:
        A dict holding the lister class under ``"lister"`` and the names of
        this package's task modules under ``"task_modules"``.
    """
    # Imported inside the function, i.e. only when the plugin is registered.
    from .lister import MavenLister

    task_modules = ["%s.tasks" % __name__]
    return dict(lister=MavenLister, task_modules=task_modules)
# File: swh/lister/maven/lister.py

logger = logging.getLogger(__name__)

RepoPage = Dict[str, Any]

# Patterns used on the text export of the index, compiled once at import time.
# A "value" line made of exactly 5 "|"-separated fields, read as
# groupId|artifactId|version|classifier|extension.
_RE_INDEX_ARTIFACT = re.compile(
    r"^\s{4}value ([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)$"
)
# A "value" line made of exactly 7 "|"-separated fields; its second field is
# used as the timestamp of the source archive seen just before it.
_RE_INDEX_INFO = re.compile(
    r"^\s{4}value ([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
    + r"\|([^|]+)\|([^|]+)$"
)
# Official scm connection format: scm:<type>:<url>.
_RE_SCM_URL = re.compile(r"^scm:([^:]+):(.*)$")
# Bare repository url ending in ".git".
_RE_GIT_URL = re.compile(r".*\.git$")


class MavenLister(StatelessLister[RepoPage]):
    """List origins from a Maven repository.

    Maven Central provides artifacts for Java builds.
    It includes POM files and source archives, which we download to get
    the source code of artifacts and links to their scm repository.

    This lister yields origins of types: git/svn/hg or whatever the Artifacts
    use as repository type, plus maven types for the maven loader (tgz, jar)."""

    LISTER_NAME = "maven"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
    ):
        """Lister class for Maven repositories.

        Args:
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                It may be omitted at construction time but is required by
                :meth:`get_pages`.
            instance: Name of maven instance. Defaults to url's network location
                if unset.

        """
        self.BASE_URL = url
        self.INDEX_URL = index_url

        if instance is None:
            instance = parse_url(url).host

        super().__init__(
            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT,}
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """GET ``url`` with the lister session and return the response.

        Any status code other than 200 is logged as a warning, and
        ``raise_for_status`` then raises ``requests.HTTPError`` for error
        statuses.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def _artifact_url(
        self,
        gid: str,
        aid: str,
        version: str,
        ext: str,
        classifier: Optional[str] = None,
    ) -> str:
        """Build the URL of an artifact file below the repository base URL.

        The path is ``<gid with "/" for ".">/<aid>/<version>/<aid>-<version>``
        followed by ``-<classifier>`` when one is given, then ``.<ext>``.
        """
        # urljoin() replaces the last path segment of a base URL that does not
        # end with "/", which would silently drop e.g. "maven2".
        base_url = self.BASE_URL
        if not base_url.endswith("/"):
            base_url += "/"
        suffix = "" if classifier is None else f"-{classifier}"
        group_path = gid.replace(".", "/")
        return urljoin(
            base_url, f"{group_path}/{aid}/{version}/{aid}-{version}{suffix}.{ext}"
        )

    def _scm_page_from_pom(
        self, content: bytes, pom: str, index_aid: str
    ) -> Optional[RepoPage]:
        """Extract the "scm" page described by a POM, or None if there is none.

        Args:
            content: raw bytes of the POM file
            pom: URL of the POM, only used in log messages
            index_aid: artifactId read from the index, used when the POM
                does not declare one
        """
        try:
            # Bytes are handed to the parser as-is so that the encoding
            # declared in the XML prolog is honoured, instead of assuming UTF-8.
            document = xmltodict.parse(content)
        except xmltodict.expat.ExpatError as error:
            logger.info(f"Could not parse POM {pom} XML: {error}. Next.")
            return None

        project = document.get("project") if isinstance(document, dict) else None
        if not isinstance(project, dict):
            logger.debug(f"No project in pom {pom}")
            return None

        scm = project.get("scm")
        if not isinstance(scm, dict):
            # Covers a missing <scm> as well as an empty or text-only one.
            logger.debug(f"No scm in pom {pom}")
            return None

        connection = scm.get("connection")
        if not isinstance(connection, str):
            # Covers a missing <connection> as well as an empty one.
            logger.debug(f"No scm.connection in pom {pom}")
            return None

        gid = project.get("groupId")
        parent = project.get("parent")
        if gid is None and isinstance(parent, dict):
            # In Maven, groupId may be inherited from the <parent> element.
            gid = parent.get("groupId")
        aid = project.get("artifactId", index_aid)
        if gid is None or aid is None:
            logger.debug(f"No groupId or artifactId in pom {pom}")
            return None

        return {
            "type": "scm",
            "url": connection,
            "project": f"{gid}.{aid}",
        }

    def get_pages(self) -> Iterator[RepoPage]:
        """ Retrieve and parse exported maven indexes to
        identify all pom files and src archives.

        Source archive pages are yielded first, then every pom found in the
        index is fetched and one page is yielded per distinct scm connection.

        Raises:
            ValueError: if the lister was built without ``index_url``
            requests.HTTPError: if the index or a pom cannot be fetched

        """

        # Example of returned RepoPage's:
        # [
        #     {
        #         "type": "jar",
        #         "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
        #         "time": 1626109619335,
        #         "gid": "org.xwiki.platform",
        #         "aid": "xwiki-platform-wikistream-events-xwiki",
        #         "version": "5.4.2"
        #     },
        #     {
        #         "type": "scm",
        #         "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
        #         "project": "<groupId>.<artifactId>",
        #     },
        #     ...
        # ]

        if self.INDEX_URL is None:
            raise ValueError("index_url is required to list a Maven repository")

        # pom url -> artifactId, in index order.
        out_pom: Dict[str, str] = {}
        # source archive url -> {"g": gid, "a": aid, "v": version, "t": time}.
        out_src: Dict[str, Dict[str, Any]] = defaultdict(dict)

        # Download the main text index file.
        logger.info("Downloading text index file..")
        text_file = NamedTemporaryFile(delete=False)
        try:
            with text_file, requests.get(self.INDEX_URL, stream=True) as response:
                # Do not parse an HTTP error page as if it were an index.
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=1024):
                    text_file.write(chunk)
            logger.debug(f"File is {text_file.name}")

            # Read the index text export and get URLs and SCMs.
            with open(text_file.name, mode="rt") as file_txt:
                # Source archive still waiting for its timestamp line.
                url_src = None
                for line in file_txt:
                    m_val = _RE_INDEX_ARTIFACT.match(line)
                    if m_val is not None:
                        gid, aid, version, classifier, ext = m_val.groups()
                        # The last group may carry the end-of-line character.
                        ext = ext.strip()
                        if classifier == "NA" and ext == "pom":
                            url_pom = self._artifact_url(gid, aid, version, ext)
                            out_pom[url_pom] = aid
                        if (classifier == "sources" or "src" in classifier) and (
                            ext in ("zip", "jar")
                        ):
                            url_src = self._artifact_url(
                                gid, aid, version, ext, classifier
                            )
                            out_src[url_src]["g"] = gid
                            out_src[url_src]["a"] = aid
                            out_src[url_src]["v"] = version
                    else:
                        m_time = _RE_INDEX_INFO.match(line)
                        if m_time is not None and url_src is not None:
                            out_src[url_src]["t"] = int(m_time.group(2))
                            url_src = None
        finally:
            # Clean up the download afterwards (may be huge), including when
            # the download or the parsing failed.
            remove(text_file.name)

        logger.info(f"Found {len(out_pom)} poms and {len(out_src)} src items.")

        # Yield all src archives found.
        for src, info in out_src.items():
            if "t" not in info:
                # No timestamp line followed this entry in the index.
                logger.warning(f"No timestamp found for {src}, skipping.")
                continue
            logger.debug(f"* Yielding jar {src}.")
            yield {
                "type": "jar",
                "url": src,
                "time": info["t"],
                "gid": info["g"],
                "aid": info["a"],
                "version": info["v"],
            }

        # Now fetch pom files and scan them for scm info.

        logger.info("Fetching poms..")
        # Keyed by scm connection so that a repository shared by several poms
        # is only yielded once (the last pom seen provides "project").
        scm_pages: Dict[str, RepoPage] = {}
        for pom, index_aid in out_pom.items():
            response = self.page_request(pom, {})
            scm_page = self._scm_page_from_pom(response.content, pom, index_aid)
            if scm_page is not None:
                scm_pages[scm_page["url"]] = scm_page

        # Yield all scm repositories found.
        for scm, scm_page in scm_pages.items():
            logger.debug(f"* Yielding scm {scm}.")
            yield scm_page

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a page of Maven repositories into a list of ListedOrigins.

        A "scm" page yields at most one origin; it yields none when its url is
        neither of the form ``scm:<type>:<url>`` nor ends with ``.git``.
        Any other page is a source archive and yields one origin carrying the
        artifact description in ``extra_loader_arguments``.

        """
        assert self.lister_obj.id is not None

        if page["type"] == "scm":
            # If origin is a scm url: detect scm type and yield.
            # Note that the official format is:
            # scm:git:git://github.com/openengsb/openengsb-framework.git
            # but many, many projects directly put the repo url, so we have to
            # detect the content to match it properly.
            m_scm = _RE_SCM_URL.match(page["url"])
            if m_scm is not None:
                scm_type, scm_url = m_scm.groups()
                yield ListedOrigin(
                    lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type,
                )
            elif _RE_GIT_URL.match(page["url"]) is not None:
                yield ListedOrigin(
                    lister_id=self.lister_obj.id, url=page["url"], visit_type="git",
                )
        else:
            # Origin is a source archive:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=page["url"],
                visit_type=page["type"],
                extra_loader_arguments={
                    "artifacts": [
                        {
                            "time": page["time"],
                            "gid": page["gid"],
                            "aid": page["aid"],
                            "version": page["version"],
                        }
                    ]
                },
            )


# File: swh/lister/maven/tasks.py


@shared_task(name=__name__ + ".FullMavenLister")
def list_maven_full(**lister_args) -> Dict[str, int]:
    """Full update of a Maven repository instance"""
    lister = MavenLister.from_configfile(**lister_args)
    return lister.run().dict()


@shared_task(name=__name__ + ".ping")
def _ping() -> str:
    """Trivial task returning "OK"."""
    return "OK"
string + value al.aldi|sprova4j|0.1.0|sources|jar + field 1 + name m + type string + value 1626111735737 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1626111735764 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1626111784883 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1626111784915 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 14 + name DESCRIPTOR + type string + value NexusIndex + field 15 + name IDXINFO + type string + value 1.0|index +doc 5 + field 16 + name allGroups + type string + value allGroups + field 17 + name allGroupsList + type string + value al.aldi +doc 6 + field 18 + name rootGroups + type string + value rootGroups + field 19 + name rootGroupsList + type string + value al +END +checksum 00000000003321211082 diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom @@ 
-0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:git:git://github.com/aldialimucaj/sprova4j.git + scm:git:git://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.1 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.5 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.14.1 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + 
# File: swh/lister/maven/tests/test_lister.py

MVN_URL = "https://repo1.maven.org/maven2/"  # main maven repo url
INDEX_URL = "https://indexes/export.fld"  # index directory url

URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom"
URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"

# Source archive urls expected from the test index.
LIST_SRC = (
    MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar",
    MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar",
)

# Pages expected for those source archives.
LIST_SRC_DATA = (
    {
        "type": "jar",
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j"
        + "/0.1.0/sprova4j-0.1.0-sources.jar",
        "time": 1626109619335,
        "gid": "al.aldi",
        "aid": "sprova4j",
        "version": "0.1.0",
    },
    {
        "type": "jar",
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j"
        + "/0.1.1/sprova4j-0.1.1-sources.jar",
        "time": 1626111425534,
        "gid": "al.aldi",
        "aid": "sprova4j",
        "version": "0.1.1",
    },
)

# Repository urls expected from the scm sections of the two test poms.
LIST_GIT = (
    "git://github.com/aldialimucaj/sprova4j.git",
    "https://github.com/aldialimucaj/sprova4j.git",
)


@pytest.fixture
def maven_index(datadir) -> str:
    """Content of the test index export."""
    return Path(datadir, "https_indexes", "export.fld").read_text()


@pytest.fixture
def maven_pom_1(datadir) -> str:
    """Content of the sprova4j 0.1.0 pom."""
    return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text()


@pytest.fixture
def maven_pom_2(datadir) -> str:
    """Content of the sprova4j 0.1.1 pom."""
    return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text()


def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
    """Asserts that the two collections have the same origin URLs.

    Does not test last_update."""

    # Comparing the sorted lists checks both the length and every element.
    assert sorted(lister_urls) == sorted(origin.url for origin in scheduler_origins)


def test_maven_full_listing(
    swh_scheduler, requests_mock, mocker, maven_index, maven_pom_1, maven_pom_2,
):
    """Covers full listing of multiple pages, checking page results and listed
    origins, statelessness."""

    lister = MavenLister(
        scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL,
    )

    requests_mock.get(INDEX_URL, text=maven_index)
    requests_mock.get(URL_POM_1, text=maven_pom_1)
    requests_mock.get(URL_POM_2, text=maven_pom_2)

    # end test setup

    stats = lister.run()
    # start test checks
    assert stats.pages == 4
    assert stats.origins == 4

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    check_listed_origins(LIST_GIT + LIST_SRC, scheduler_origins)

    expected_by_url = {expected["url"]: expected for expected in LIST_SRC_DATA}
    for origin in scheduler_origins:
        if origin.visit_type != "jar":
            continue
        # Every jar origin must be one of the expected source archives.
        assert origin.url in expected_by_url
        expected = expected_by_url[origin.url]
        artifact = origin.extra_loader_arguments["artifacts"][0]
        for key in ("time", "gid", "aid", "version"):
            assert expected[key] == artifact[key]

    assert lister.get_state_from_scheduler() is None


@pytest.mark.parametrize("http_code", [400, 500, 502])
def test_maven_list_http_error(
    swh_scheduler, requests_mock, mocker, maven_index, http_code
):
    """Test handling of some HTTP errors commonly encountered"""

    lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)

    requests_mock.get(INDEX_URL, text=maven_index)
    requests_mock.get(URL_POM_1, status_code=http_code)

    with pytest.raises(requests.HTTPError):
        lister.run()

    # The two source archives are listed before the first pom is fetched.
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == 2


# File: swh/lister/maven/tests/test_tasks.py


def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task can be sent to the worker and answers "OK"."""
    res = swh_scheduler_celery_app.send_task("swh.lister.maven.tasks.ping")
    assert res
    res.wait()
    assert res.successful()
    assert res.result == "OK"


def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The full listing task builds the lister from its kwargs and runs it."""
    lister = mocker.patch("swh.lister.maven.tasks.MavenLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(
        url="https://repo1.maven.org/maven2/", index_url="http://indexes/export.fld"
    )
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.maven.tasks.FullMavenLister", kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()