diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -36,3 +36,7 @@
[mypy-urllib3.util.*]
ignore_missing_imports = True
+
+[mypy-xmltodict.*]
+ignore_missing_imports = True
+
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
beautifulsoup4
launchpadlib
tenacity
+xmltodict
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@
lister.pypi=swh.lister.pypi:register
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register
+ lister.maven=swh.lister.maven:register
""",
classifiers=[
"Programming Language :: Python :: 3",
diff --git a/swh/lister/maven/__init__.py b/swh/lister/maven/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2021 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    """Describe the Maven lister plugin: its lister class and its task modules."""
+    # The lister module is only imported when the plugin is actually looked up.
+    from .lister import MavenLister
+
+    task_modules = ["%s.tasks" % __name__]
+    return {"lister": MavenLister, "task_modules": task_modules}
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/lister.py
@@ -0,0 +1,293 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+import logging
+from os import remove
+import re
+from shutil import copyfileobj
+from tempfile import NamedTemporaryFile
+from typing import Any, Dict, Iterator, Optional
+from urllib import request
+from urllib.parse import urljoin
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+from urllib3.util import parse_url
+import xmltodict
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# A page holds a single listed item, see MavenLister.get_pages for its keys.
+RepoPage = Dict[str, Any]
+
+
+class MavenLister(StatelessLister[RepoPage]):
+    """List origins from a Maven repository.
+
+    Maven Central provides artifacts for Java builds.
+    It includes POM files and source archives, which we download to get
+    the source code of artifacts and links to their scm repository.
+
+    This lister yields origins of types: git/svn/hg or whatever the Artifacts
+    use as repository type, plus maven types for the maven loader (tgz, jar).
+
+    The listing is driven by a text export of the repository index, fetched from
+    ``index_url``: each source archive it references is yielded as a "jar" page,
+    and each POM it references is fetched and scanned for an scm connection.
+    """
+
+    LISTER_NAME = "maven"
+
+    # Index export line: "    value group|artifact|version|classifier|extension".
+    _INDEX_VALUE_RE = re.compile(
+        r"^\s{4}value ([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)$"
+    )
+    # Official scm url format, e.g. scm:git:git://github.com/owner/repo.git
+    _SCM_URL_RE = re.compile(r"^scm:([^:]+):(.*)$")
+    # Plain repository url ending in ".git", put as is in scm.connection.
+    _GIT_URL_RE = re.compile(r".*\.git$")
+    # Extensions of the source archives yielded as "jar" pages.
+    _SRC_EXTENSIONS = ("zip", "jar", "tar.gz", "tar.bz2")
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        url: str,
+        index_url: Optional[str] = None,
+        instance: Optional[str] = None,
+        credentials: CredentialsType = None,
+    ):
+        """Initialize the lister.
+
+        Args:
+            scheduler: scheduler interface handed over to the base lister
+            url: base url of the Maven repository, artifact urls are built on it
+            index_url: url of the text export of the repository index
+            instance: name of the lister instance, defaults to the host of ``url``
+            credentials: credentials handed over to the base lister
+        """
+        self.BASE_URL = url
+        self.INDEX_URL = index_url
+
+        if instance is None:
+            instance = parse_url(url).host
+
+        super().__init__(
+            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
+        )
+
+        self.session = requests.Session()
+        self.session.headers.update(
+            {"Accept": "application/json", "User-Agent": USER_AGENT,}
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+        """GET ``url`` through the lister session and return the response.
+
+        Raises:
+            requests.HTTPError: when the server answers with an HTTP error status.
+        """
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
+
+    def _index_entry(self, line: str) -> Optional[RepoPage]:
+        """Parse one line of the index export.
+
+        Returns:
+            None when the line is neither a POM nor a source archive, otherwise
+            a dict with keys "type" ("pom" or "jar"), "url", "group", "project"
+            and "version".
+        """
+        m = self._INDEX_VALUE_RE.match(line)
+        if m is None:
+            return None
+        group, artifact, version, classifier, extension = m.groups()
+        extension = extension.strip()
+
+        if classifier == "NA" and extension == "pom":
+            kind = "pom"
+            filename = f"{artifact}-{version}.{extension}"
+        elif (
+            classifier == "sources" or "src" in classifier
+        ) and extension in self._SRC_EXTENSIONS:
+            kind = "jar"
+            filename = f"{artifact}-{version}-{classifier}.{extension}"
+        else:
+            return None
+
+        # urljoin() drops the last path segment of a base url which does not end
+        # with a slash, so make sure the repository url is seen as a directory.
+        base_url = self.BASE_URL.rstrip("/") + "/"
+        path = "/".join([*group.split("."), artifact, version, filename])
+        return {
+            "type": kind,
+            "url": urljoin(base_url, path),
+            "group": group,
+            "project": artifact,
+            "version": version,
+        }
+
+    def _scm_page(
+        self, pom_url: str, content: bytes, entry: RepoPage
+    ) -> Optional[RepoPage]:
+        """Build the "scm" page advertised by a POM file, if any.
+
+        Args:
+            pom_url: url the POM was fetched from, only used in log messages
+            content: raw bytes of the POM file
+            entry: index entry of the POM, fallback for its group and artifact ids
+
+        Returns:
+            None when the POM cannot be parsed or has no textual scm.connection.
+        """
+        try:
+            # Parse the raw bytes: a forced UTF-8 decoding would raise on POMs
+            # written in another encoding and abort the whole listing.
+            document = xmltodict.parse(content)
+        except xmltodict.expat.ExpatError as error:
+            logger.info("Could not parse POM %s XML: %s. Next.", pom_url, error)
+            return None
+
+        project = document.get("project")
+        scm = project.get("scm") if isinstance(project, dict) else None
+        if not isinstance(scm, dict):
+            logger.debug("No scm in pom %s", pom_url)
+            return None
+        connection = scm.get("connection")
+        if not isinstance(connection, str):
+            logger.debug("No scm.connection in pom %s", pom_url)
+            return None
+
+        # A POM may omit groupId (it is then inherited from its parent): fall back
+        # on the coordinates found in the index instead of raising a KeyError.
+        gid = project.get("groupId") or entry["group"]
+        aid = project.get("artifactId") or entry["project"]
+        return {"type": "scm", "url": connection, "project": f"{gid}.{aid}"}
+
+    def get_pages(self) -> Iterator[RepoPage]:
+        """Yield one page per source archive, then one page per scm url.
+
+        Example of returned RepoPage's:
+        [
+            {
+                "type": "jar",
+                "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
+                "project": "xwiki-platform-wikistream-events-xwiki",
+                "version": "5.4.2"
+            },
+            {
+                "type": "scm",
+                "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
+                "project": "openengsb-framework",
+            },
+            ...
+        ]
+
+        Raises:
+            ValueError: when the lister was created without an ``index_url``.
+            requests.HTTPError: when a POM file cannot be fetched.
+        """
+        if self.INDEX_URL is None:
+            raise ValueError("MavenLister needs an index_url to list origins")
+
+        # Download the main text index file.
+        logger.info("Downloading text index file..")
+        # Index entries by type ("pom" or "jar"), then by url to drop duplicates.
+        entries: Dict[str, Dict[str, RepoPage]] = defaultdict(dict)
+        text_file = NamedTemporaryFile(delete=False)
+        try:
+            with text_file, request.urlopen(self.INDEX_URL) as fsrc:
+                copyfileobj(fsrc, text_file)
+            logger.debug("File is %s", text_file.name)
+
+            # Read the index text export and get URLs and SCMs.
+            with open(text_file.name, mode="rt", encoding="utf-8") as f:
+                for line in f:
+                    entry = self._index_entry(line)
+                    if entry is not None:
+                        entries[entry["type"]][entry["url"]] = entry
+        finally:
+            # Clean up the download whatever happened (it may be huge).
+            remove(text_file.name)
+
+        poms, srcs = entries["pom"], entries["jar"]
+        logger.info("Found %s poms and %s src items.", len(poms), len(srcs))
+
+        # Yield all src archives found.
+        for src in srcs.values():
+            logger.debug("* Yielding jar %s.", src["url"])
+            yield {key: src[key] for key in ("type", "url", "project", "version")}
+
+        # Now fetch pom files and scan them for scm info.
+        logger.info("Fetching poms..")
+        scms: Dict[str, RepoPage] = {}
+        for pom_url, entry in poms.items():
+            response = self.page_request(pom_url, {})
+            page = self._scm_page(pom_url, response.content, entry)
+            if page is not None:
+                scms[page["url"]] = page
+
+        # Yield all scm urls found.
+        for page in scms.values():
+            logger.debug("* Yielding scm %s.", page["url"])
+            yield page
+
+    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
+        """Convert a page of Maven repositories into a list of ListedOrigins.
+
+        A "jar" page gives an origin for the maven loader; a "scm" page gives an
+        origin only when its url can be matched to a repository type.
+        """
+        assert self.lister_obj.id is not None
+
+        if page["type"] != "scm":
+            # Origin is a source archive:
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=page["url"],
+                visit_type=page["type"],
+                extra_loader_arguments={
+                    "artifacts": [
+                        {"project": page["project"], "version": page["version"]}
+                    ]
+                },
+            )
+            return
+
+        # If origin is a scm url: detect scm type and yield.
+        # Note that the official format is:
+        # scm:git:git://github.com/openengsb/openengsb-framework.git
+        # but many, many projects directly put the repo url, so we have to
+        # detect the content to match it properly.
+        m = self._SCM_URL_RE.match(page["url"])
+        if m is not None:
+            scm_type, scm_url = m.groups()
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type,
+            )
+        elif self._GIT_URL_RE.match(page["url"]) is not None:
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id, url=page["url"], visit_type="git",
+            )
diff --git a/swh/lister/maven/tasks.py b/swh/lister/maven/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tasks.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2021 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict
+
+from celery import shared_task
+
+from .lister import MavenLister
+
+
+@shared_task(name=__name__ + ".FullMavenLister")
+def list_maven_full(**lister_args) -> Dict[str, int]:
+    """List a whole Maven repository; ``lister_args`` are passed to the lister."""
+    stats = MavenLister.from_configfile(**lister_args).run()
+    return stats.dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping() -> str:
+    return "OK"  # constant reply, which the ping test of this package checks for
diff --git a/swh/lister/maven/tests/__init__.py b/swh/lister/maven/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/maven/tests/data/http_indexes/export.fld b/swh/lister/maven/tests/data/http_indexes/export.fld
new file mode 100755
--- /dev/null
+++ b/swh/lister/maven/tests/data/http_indexes/export.fld
@@ -0,0 +1,113 @@
+doc 0
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.0|sources|jar
+ field 1
+ name m
+ type string
+ value 1626111735737
+ field 2
+ name i
+ type string
+ value jar|1626109619335|14316|2|2|0|jar
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 1
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.0|NA|pom
+ field 1
+ name m
+ type string
+ value 1626111735764
+ field 2
+ name i
+ type string
+ value jar|1626109636636|-1|1|0|0|pom
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 2
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.1|sources|jar
+ field 1
+ name m
+ type string
+ value 1626111784883
+ field 2
+ name i
+ type string
+ value jar|1626111425534|14510|2|2|0|jar
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 3
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.1|NA|pom
+ field 1
+ name m
+ type string
+ value 1626111784915
+ field 2
+ name i
+ type string
+ value jar|1626111437014|-1|1|0|0|pom
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 4
+ field 14
+ name DESCRIPTOR
+ type string
+ value NexusIndex
+ field 15
+ name IDXINFO
+ type string
+ value 1.0|index
+doc 5
+ field 16
+ name allGroups
+ type string
+ value allGroups
+ field 17
+ name allGroupsList
+ type string
+ value al.aldi
+doc 6
+ field 18
+ name rootGroups
+ type string
+ value rootGroups
+ field 19
+ name rootGroupsList
+ type string
+ value al
+END
+checksum 00000000003321211082
diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom
@@ -0,0 +1,86 @@
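+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>al.aldi</groupId>
+  <artifactId>sprova4j</artifactId>
+  <version>0.1.0</version>
+  <name>sprova4j</name>
+  <description>Java client for Sprova Test Management</description>
+  <url>https://github.com/aldialimucaj/sprova4j</url>
+  <inceptionYear>2018</inceptionYear>
+  <licenses>
+    <license>
+      <name>The Apache Software License, Version 2.0</name>
+      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      <distribution>repo</distribution>
+    </license>
+  </licenses>
+  <developers>
+    <developer>
+      <id>aldi</id>
+      <name>Aldi Alimucaj</name>
+      <email>aldi.alimucaj@gmail.com</email>
+    </developer>
+  </developers>
+  <scm>
+    <connection>scm:git:git://github.com/aldialimucaj/sprova4j.git</connection>
+    <developerConnection>scm:git:git://github.com/aldialimucaj/sprova4j.git</developerConnection>
+    <url>https://github.com/aldialimucaj/sprova4j</url>
+  </scm>
+  <dependencies>
+    <dependency>
+      <groupId>ch.qos.logback</groupId>
+      <artifactId>logback-classic</artifactId>
+      <version>1.2.3</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.code.gson</groupId>
+      <artifactId>gson</artifactId>
+      <version>2.8.3</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.squareup.okhttp3</groupId>
+      <artifactId>okhttp</artifactId>
+      <version>3.10.0</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.squareup.okio</groupId>
+      <artifactId>okio</artifactId>
+      <version>1.0.0</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.glassfish</groupId>
+      <artifactId>javax.json</artifactId>
+      <version>1.1.2</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>javax.json</groupId>
+      <artifactId>javax.json-api</artifactId>
+      <version>1.1.2</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>javax.validation</groupId>
+      <artifactId>validation-api</artifactId>
+      <version>2.0.1.Final</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.12</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.squareup.okhttp3</groupId>
+      <artifactId>mockwebserver</artifactId>
+      <version>3.10.0</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>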
diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom
@@ -0,0 +1,86 @@
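+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>al.aldi</groupId>
+  <artifactId>sprova4j</artifactId>
+  <version>0.1.1</version>
+  <name>sprova4j</name>
+  <description>Java client for Sprova Test Management</description>
+  <url>https://github.com/aldialimucaj/sprova4j</url>
+  <inceptionYear>2018</inceptionYear>
+  <licenses>
+    <license>
+      <name>The Apache Software License, Version 2.0</name>
+      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      <distribution>repo</distribution>
+    </license>
+  </licenses>
+  <developers>
+    <developer>
+      <id>aldi</id>
+      <name>Aldi Alimucaj</name>
+      <email>aldi.alimucaj@gmail.com</email>
+    </developer>
+  </developers>
+  <scm>
+    <connection>https://github.com/aldialimucaj/sprova4j.git</connection>
+    <developerConnection>https://github.com/aldialimucaj/sprova4j.git</developerConnection>
+    <url>https://github.com/aldialimucaj/sprova4j</url>
+  </scm>
+  <dependencies>
+    <dependency>
+      <groupId>ch.qos.logback</groupId>
+      <artifactId>logback-classic</artifactId>
+      <version>1.2.3</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.code.gson</groupId>
+      <artifactId>gson</artifactId>
+      <version>2.8.5</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.squareup.okhttp3</groupId>
+      <artifactId>okhttp</artifactId>
+      <version>3.10.0</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.squareup.okio</groupId>
+      <artifactId>okio</artifactId>
+      <version>1.14.1</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.glassfish</groupId>
+      <artifactId>javax.json</artifactId>
+      <version>1.1.2</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>javax.json</groupId>
+      <artifactId>javax.json-api</artifactId>
+      <version>1.1.2</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>javax.validation</groupId>
+      <artifactId>validation-api</artifactId>
+      <version>2.0.1.Final</version>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.12</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.squareup.okhttp3</groupId>
+      <artifactId>mockwebserver</artifactId>
+      <version>3.10.0</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>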
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/test_lister.py
@@ -0,0 +1,113 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, List
+
+import pytest
+import requests
+
+from swh.lister.maven.lister import MavenLister
+from swh.scheduler.model import ListedOrigin
+
+MVN_URL = "https://repo1.maven.org/maven2/"  # main maven repo url
+INDEX_URL = "http://indexes/export.fld"  # index directory url
+
+URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom"
+URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
+
+LIST_SRC = (
+    MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar",
+    MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar",
+)
+
+LIST_GIT = (
+    "git://github.com/aldialimucaj/sprova4j.git",
+    "https://github.com/aldialimucaj/sprova4j.git",
+)
+
+
+@pytest.fixture
+def maven_index(datadir) -> str:
+    """Text of the index export the lister is fed with."""
+    return Path(datadir, "http_indexes", "export.fld").read_text()
+
+
+@pytest.fixture
+def maven_pom_1(datadir) -> str:
+    """Text of the POM whose scm connection uses the scm:git: prefix."""
+    return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text()
+
+
+@pytest.fixture
+def maven_pom_2(datadir) -> str:
+    """Text of the POM whose scm connection is a plain .git url."""
+    return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text()
+
+
+@pytest.fixture
+def mock_index(mocker, requests_mock, maven_index) -> None:
+    """Serve the index export at INDEX_URL without touching the network.
+
+    requests_mock only intercepts the requests library, whereas the lister
+    downloads the index with urllib: urlopen has to be patched as well.
+    """
+    mocker.patch(
+        "urllib.request.urlopen",
+        side_effect=lambda *args, **kwargs: BytesIO(maven_index.encode()),
+    )
+    requests_mock.get(INDEX_URL, text=maven_index)
+
+
+def check_listed_origins(
+    lister_urls: Iterable[str], scheduler_origins: List[ListedOrigin]
+):
+    """Asserts that the two collections have the same origin URLs.
+
+    Does not test last_update."""
+    assert sorted(lister_urls) == sorted(origin.url for origin in scheduler_origins)
+
+
+def test_maven_full_listing(
+    swh_scheduler, requests_mock, mock_index, maven_pom_1, maven_pom_2,
+):
+    """Covers full listing of the index and of the POMs it references, checking
+    page results and listed origins, statelessness."""
+
+    lister = MavenLister(
+        scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL,
+    )
+
+    requests_mock.get(URL_POM_1, text=maven_pom_1)
+    requests_mock.get(URL_POM_2, text=maven_pom_2)
+
+    # end test setup
+
+    stats = lister.run()
+    # start test checks
+    assert stats.pages == 4
+    assert stats.origins == 4
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    check_listed_origins(LIST_GIT + LIST_SRC, scheduler_origins)
+
+    assert lister.get_state_from_scheduler() is None
+
+
+@pytest.mark.parametrize("http_code", [400, 500, 502])
+def test_maven_list_http_error(swh_scheduler, requests_mock, mock_index, http_code):
+    """Test handling of some HTTP errors commonly encountered"""
+
+    lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
+
+    requests_mock.get(URL_POM_1, status_code=http_code)
+
+    with pytest.raises(requests.HTTPError):
+        lister.run()
+
+    # The two source archives are yielded before the first POM gets fetched.
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    assert len(scheduler_origins) == 2
diff --git a/swh/lister/maven/tests/test_tasks.py b/swh/lister/maven/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task of the maven lister can be sent and answers "OK"."""
+    task = swh_scheduler_celery_app.send_task("swh.lister.maven.tasks.ping")
+    assert task
+    task.wait()
+    assert task.successful() and task.result == "OK"
+
+
+def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    """The full listing task builds the lister from its kwargs and runs it once."""
+    lister_mock = mocker.patch("swh.lister.maven.tasks.MavenLister")
+    lister_mock.from_configfile.return_value = lister_mock
+    lister_mock.run.return_value = ListerStats(pages=10, origins=500)
+
+    task_kwargs = {
+        "url": "https://repo1.maven.org/maven2/",
+        "index_url": "http://indexes/export.fld",
+    }
+    task_name = "swh.lister.maven.tasks.FullMavenLister"
+    task = swh_scheduler_celery_app.send_task(task_name, kwargs=task_kwargs)
+    assert task
+    task.wait()
+    assert task.successful()
+    lister_mock.from_configfile.assert_called_once_with(**task_kwargs)
+    lister_mock.run.assert_called_once_with()