Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9342066
D6133.id22590.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
28 KB
Subscribers
None
D6133.id22590.diff
View Options
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -36,3 +36,7 @@
[mypy-urllib3.util.*]
ignore_missing_imports = True
+
+[mypy-xmltodict.*]
+ignore_missing_imports = True
+
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
beautifulsoup4
launchpadlib
tenacity
+xmltodict
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@
lister.pypi=swh.lister.pypi:register
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register
+ lister.maven=swh.lister.maven:register
""",
classifiers=[
"Programming Language :: Python :: 3",
diff --git a/swh/lister/maven/__init__.py b/swh/lister/maven/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2021 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    """Describe the maven lister plugin: its lister class and its task modules."""
+    from .lister import MavenLister
+
+    task_module = f"{__name__}.tasks"
+    return {"lister": MavenLister, "task_modules": [task_module]}
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/lister.py
@@ -0,0 +1,291 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from collections import defaultdict
+import logging
+from os import remove
+import re
+from tempfile import NamedTemporaryFile
+from typing import Any, Dict, Iterator, Optional
+from urllib.parse import urljoin
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+from urllib3.util import parse_url
+import xmltodict
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+RepoPage = Dict[str, Any]
+
+
+class MavenLister(StatelessLister[RepoPage]):
+    """List origins from a Maven repository.
+
+    Maven Central provides artifacts for Java builds.
+    It includes POM files and source archives, which we download to get
+    the source code of artifacts and links to their scm repository.
+
+    This lister yields origins of types: git/svn/hg or whatever the Artifacts
+    use as repository type, plus maven types for the maven loader (tgz, jar)."""
+
+    LISTER_NAME = "maven"
+
+    # Index line holding "groupId|artifactId|version|classifier|extension".
+    _RE_VAL = re.compile(r"^\s{4}value ([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)$")
+    # Index line holding 7 values; the second one is used as the artifact time.
+    _RE_TIME = re.compile(
+        r"^\s{4}value ([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
+        + r"\|([^|]+)\|([^|]+)$"
+    )
+    # Official scm notation: scm:<type>:<url>.
+    _RE_SCM = re.compile(r"^scm:([^:]+):(.*)$")
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        url: str,
+        index_url: Optional[str] = None,
+        instance: Optional[str] = None,
+        credentials: CredentialsType = None,
+    ):
+        """Lister class for Maven repositories.
+
+        Args:
+            scheduler: scheduler interface handed over to the base lister.
+            url: main URL of the Maven repository, i.e. url of the base index
+                used to fetch maven artifacts. For Maven central use
+                https://repo1.maven.org/maven2/
+            index_url: the URL to download the exported text indexes from.
+                Would typically be a local host running the export docker image.
+                Needed as soon as pages are requested.
+            instance: Name of maven instance. Defaults to url's network location
+                if unset.
+            credentials: credentials handed over to the base lister.
+
+        """
+        self.BASE_URL = url
+        self.INDEX_URL = index_url
+
+        if instance is None:
+            instance = parse_url(url).host
+
+        super().__init__(
+            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
+        )
+
+        self.session = requests.Session()
+        self.session.headers.update(
+            {"Accept": "application/json", "User-Agent": USER_AGENT}
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+        """Fetch ``url`` through the lister session.
+
+        Args:
+            url: URL to fetch.
+            params: query parameters sent along with the request.
+
+        Returns:
+            The HTTP response, whose status code is 200.
+
+        Raises:
+            requests.HTTPError: the server answered with an error status.
+        """
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
+
+    def _artifact_url(
+        self,
+        gid: str,
+        aid: str,
+        version: str,
+        ext: str,
+        classifier: Optional[str] = None,
+    ) -> str:
+        """Build the URL of an artifact file below the repository base URL."""
+        path = "/".join(gid.split("."))
+        name = f"{aid}-{version}"
+        if classifier is not None:
+            name = f"{name}-{classifier}"
+        return urljoin(self.BASE_URL, f"{path}/{aid}/{version}/{name}.{ext}")
+
+    def _scm_page_from_pom(self, pom: str) -> Optional[RepoPage]:
+        """Fetch a pom file and return the ``scm`` page it declares, if any.
+
+        Args:
+            pom: URL of the pom file.
+
+        Returns:
+            A page of type ``scm``, or None when the pom cannot be parsed or
+            declares no scm connection.
+
+        Raises:
+            requests.HTTPError: the pom could not be fetched.
+        """
+        response = self.page_request(pom, {})
+        try:
+            # Hand raw bytes to the parser: a pom that is not valid UTF-8 is
+            # then left to the XML parser instead of raising UnicodeDecodeError.
+            parsed = xmltodict.parse(response.content)
+        except xmltodict.expat.ExpatError as error:
+            logger.info("Could not parse POM %s XML: %s. Next.", pom, error)
+            return None
+
+        project = parsed.get("project")
+        if not isinstance(project, dict):
+            logger.debug("No project in pom %s", pom)
+            return None
+
+        # An empty <scm/> element is parsed as None, hence the type checks.
+        scm = project.get("scm")
+        if not isinstance(scm, dict):
+            logger.debug("No scm in pom %s", pom)
+            return None
+
+        connection = scm.get("connection")
+        if not isinstance(connection, str):
+            logger.debug("No scm.connection in pom %s", pom)
+            return None
+
+        # A pom may omit groupId and inherit it from its <parent>.
+        gid = project.get("groupId")
+        parent = project.get("parent")
+        if gid is None and isinstance(parent, dict):
+            gid = parent.get("groupId")
+        aid = project.get("artifactId")
+
+        return {"type": "scm", "url": connection, "project": f"{gid}.{aid}"}
+
+    def get_pages(self) -> Iterator[RepoPage]:
+        """Retrieve and parse exported maven indexes to
+        identify all pom files and src archives.
+
+        Source archives found in the index are yielded first, as ``jar`` pages.
+        Every pom found in the index is then fetched, and each distinct scm
+        connection they declare is yielded as a ``scm`` page.
+
+        Raises:
+            ValueError: no ``index_url`` was given to the lister.
+            requests.HTTPError: the index or a pom could not be fetched.
+        """
+
+        # Example of returned RepoPage's:
+        # [
+        #   {
+        #     "type": "jar",
+        #     "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
+        #     "time": 1626109619335,
+        #     "gid": "org.xwiki.platform",
+        #     "aid": "xwiki-platform-wikistream-events-xwiki",
+        #     "version": "5.4.2"
+        #   },
+        #   {
+        #     "type": "scm",
+        #     "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
+        #     "project": "openengsb-framework",
+        #   },
+        #   ...
+        # ]
+
+        # Checked before anything is written to disk.
+        if self.INDEX_URL is None:
+            raise ValueError("index_url is required to list a Maven repository")
+
+        out_pom: Dict[str, str] = {}
+        out_src: Dict[str, Dict[str, Any]] = defaultdict(dict)
+
+        # Download the main text index file.
+        logger.info("Downloading text index file..")
+        text_file = NamedTemporaryFile(delete=False)
+        try:
+            with text_file:
+                response = requests.get(self.INDEX_URL, stream=True)
+                # Do not parse an HTTP error page as if it were an index.
+                response.raise_for_status()
+                for chunk in response.iter_content(chunk_size=1024):
+                    text_file.write(chunk)
+            logger.debug("File is %s", text_file.name)
+
+            # Read the index text export and get URLs and SCMs.
+            with open(text_file.name, mode="rt") as file_txt:
+                url_src = None
+                for line in file_txt:
+                    if line.startswith("doc "):
+                        # New index document: its time must not be attached
+                        # to a source archive of a previous document.
+                        url_src = None
+                        continue
+
+                    m_val = self._RE_VAL.match(line)
+                    if m_val is not None:
+                        gid, aid, version, classifier, ext = m_val.groups()
+                        # The last group may carry the end of line.
+                        ext = ext.strip()
+                        if classifier == "NA" and ext == "pom":
+                            url_pom = self._artifact_url(gid, aid, version, ext)
+                            out_pom[url_pom] = aid
+                        if (classifier == "sources" or "src" in classifier) and (
+                            ext in ("zip", "jar")
+                        ):
+                            url_src = self._artifact_url(
+                                gid, aid, version, ext, classifier
+                            )
+                            out_src[url_src].update(g=gid, a=aid, v=version)
+                        continue
+
+                    m_time = self._RE_TIME.match(line)
+                    if m_time is not None and url_src is not None:
+                        out_src[url_src]["t"] = int(m_time.group(2))
+                        url_src = None
+        finally:
+            # Clean up the download afterwards (may be huge), even on failure.
+            remove(text_file.name)
+
+        logger.info("Found %s poms and %s src items.", len(out_pom), len(out_src))
+
+        # Yield all src archives found.
+        for src, info in out_src.items():
+            logger.debug("* Yielding jar %s.", src)
+            yield {
+                "type": "jar",
+                "url": src,
+                # None when the index held no time line for this archive.
+                "time": info.get("t"),
+                "gid": info["g"],
+                "aid": info["a"],
+                "version": info["v"],
+            }
+
+        # Now fetch pom files and scan them for scm info.
+
+        logger.info("Fetching poms..")
+        # Keyed by scm url so that each url is yielded only once.
+        out_pom_src: Dict[str, RepoPage] = {}
+        for pom in out_pom:
+            scm_page = self._scm_page_from_pom(pom)
+            if scm_page is not None:
+                out_pom_src[scm_page["url"]] = scm_page
+
+        # Yield all scm urls found.
+        for scm_url, scm_page in out_pom_src.items():
+            logger.debug("* Yielding scm %s.", scm_url)
+            yield scm_page
+
+    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
+        """Convert a page of Maven repositories into a list of ListedOrigins.
+
+        A ``scm`` page yields one origin whose visit type is taken from the
+        scm url (or ``git`` for a bare url ending in ``.git``), and nothing
+        when the url is not recognised. Any other page yields one origin for
+        the source archive, with its artifact metadata as loader arguments.
+        """
+        assert self.lister_obj.id is not None
+        if page["type"] == "scm":
+            # If origin is a scm url: detect scm type and yield.
+            # Note that the official format is:
+            # scm:git:git://github.com/openengsb/openengsb-framework.git
+            # but many, many projects directly put the repo url, so we have to
+            # detect the content to match it properly.
+            m_scm = self._RE_SCM.match(page["url"])
+            if m_scm is not None:
+                scm_type, scm_url = m_scm.groups()
+                yield ListedOrigin(
+                    lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type,
+                )
+            elif page["url"].endswith(".git"):
+                yield ListedOrigin(
+                    lister_id=self.lister_obj.id, url=page["url"], visit_type="git",
+                )
+            else:
+                logger.debug("Unsupported scm url %s, skipping.", page["url"])
+        else:
+            # Origin is a source archive:
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=page["url"],
+                visit_type=page["type"],
+                extra_loader_arguments={
+                    "artifacts": [
+                        {
+                            "time": page["time"],
+                            "gid": page["gid"],
+                            "aid": page["aid"],
+                            "version": page["version"],
+                        }
+                    ]
+                },
+            )
diff --git a/swh/lister/maven/tasks.py b/swh/lister/maven/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tasks.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2021 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict
+
+from celery import shared_task
+
+from .lister import MavenLister
+
+
+@shared_task(name=f"{__name__}.FullMavenLister")
+def list_maven_full(**lister_args) -> Dict[str, int]:
+    """Run a full listing of a Maven repository instance.
+
+    The keyword arguments are forwarded to ``MavenLister.from_configfile``;
+    the statistics returned by the run are given back as a dict.
+    """
+    stats = MavenLister.from_configfile(**lister_args).run()
+    return stats.dict()
+
+
+@shared_task(name=f"{__name__}.ping")
+def _ping() -> str:
+    """Answer "OK"; lets callers check that the task module is reachable."""
+    return "OK"
diff --git a/swh/lister/maven/tests/__init__.py b/swh/lister/maven/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/maven/tests/data/https_indexes/export.fld b/swh/lister/maven/tests/data/https_indexes/export.fld
new file mode 100755
--- /dev/null
+++ b/swh/lister/maven/tests/data/https_indexes/export.fld
@@ -0,0 +1,113 @@
+doc 0
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.0|sources|jar
+ field 1
+ name m
+ type string
+ value 1626111735737
+ field 2
+ name i
+ type string
+ value jar|1626109619335|14316|2|2|0|jar
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 1
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.0|NA|pom
+ field 1
+ name m
+ type string
+ value 1626111735764
+ field 2
+ name i
+ type string
+ value jar|1626109636636|-1|1|0|0|pom
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 2
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.1|sources|jar
+ field 1
+ name m
+ type string
+ value 1626111784883
+ field 2
+ name i
+ type string
+ value jar|1626111425534|14510|2|2|0|jar
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 3
+ field 0
+ name u
+ type string
+ value al.aldi|sprova4j|0.1.1|NA|pom
+ field 1
+ name m
+ type string
+ value 1626111784915
+ field 2
+ name i
+ type string
+ value jar|1626111437014|-1|1|0|0|pom
+ field 10
+ name n
+ type string
+ value sprova4j
+ field 11
+ name d
+ type string
+ value Java client for Sprova Test Management
+doc 4
+ field 14
+ name DESCRIPTOR
+ type string
+ value NexusIndex
+ field 15
+ name IDXINFO
+ type string
+ value 1.0|index
+doc 5
+ field 16
+ name allGroups
+ type string
+ value allGroups
+ field 17
+ name allGroupsList
+ type string
+ value al.aldi
+doc 6
+ field 18
+ name rootGroups
+ type string
+ value rootGroups
+ field 19
+ name rootGroupsList
+ type string
+ value al
+END
+checksum 00000000003321211082
diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>al.aldi</groupId>
+ <artifactId>sprova4j</artifactId>
+ <version>0.1.0</version>
+ <name>sprova4j</name>
+ <description>Java client for Sprova Test Management</description>
+ <url>https://github.com/aldialimucaj/sprova4j</url>
+ <inceptionYear>2018</inceptionYear>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ <distribution>repo</distribution>
+ </license>
+ </licenses>
+ <developers>
+ <developer>
+ <id>aldi</id>
+ <name>Aldi Alimucaj</name>
+ <email>aldi.alimucaj@gmail.com</email>
+ </developer>
+ </developers>
+ <scm>
+ <connection>scm:git:git://github.com/aldialimucaj/sprova4j.git</connection>
+ <developerConnection>scm:git:git://github.com/aldialimucaj/sprova4j.git</developerConnection>
+ <url>https://github.com/aldialimucaj/sprova4j</url>
+ </scm>
+ <dependencies>
+ <dependency>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ <version>1.2.3</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ <version>2.8.3</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.squareup.okhttp3</groupId>
+ <artifactId>okhttp</artifactId>
+ <version>3.10.0</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.squareup.okio</groupId>
+ <artifactId>okio</artifactId>
+ <version>1.0.0</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.glassfish</groupId>
+ <artifactId>javax.json</artifactId>
+ <version>1.1.2</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>javax.json</groupId>
+ <artifactId>javax.json-api</artifactId>
+ <version>1.1.2</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>javax.validation</groupId>
+ <artifactId>validation-api</artifactId>
+ <version>2.0.1.Final</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.12</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.squareup.okhttp3</groupId>
+ <artifactId>mockwebserver</artifactId>
+ <version>3.10.0</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>al.aldi</groupId>
+ <artifactId>sprova4j</artifactId>
+ <version>0.1.1</version>
+ <name>sprova4j</name>
+ <description>Java client for Sprova Test Management</description>
+ <url>https://github.com/aldialimucaj/sprova4j</url>
+ <inceptionYear>2018</inceptionYear>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ <distribution>repo</distribution>
+ </license>
+ </licenses>
+ <developers>
+ <developer>
+ <id>aldi</id>
+ <name>Aldi Alimucaj</name>
+ <email>aldi.alimucaj@gmail.com</email>
+ </developer>
+ </developers>
+ <scm>
+ <connection>https://github.com/aldialimucaj/sprova4j.git</connection>
+ <developerConnection>https://github.com/aldialimucaj/sprova4j.git</developerConnection>
+ <url>https://github.com/aldialimucaj/sprova4j</url>
+ </scm>
+ <dependencies>
+ <dependency>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ <version>1.2.3</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ <version>2.8.5</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.squareup.okhttp3</groupId>
+ <artifactId>okhttp</artifactId>
+ <version>3.10.0</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.squareup.okio</groupId>
+ <artifactId>okio</artifactId>
+ <version>1.14.1</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.glassfish</groupId>
+ <artifactId>javax.json</artifactId>
+ <version>1.1.2</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>javax.json</groupId>
+ <artifactId>javax.json-api</artifactId>
+ <version>1.1.2</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>javax.validation</groupId>
+ <artifactId>validation-api</artifactId>
+ <version>2.0.1.Final</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.12</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.squareup.okhttp3</groupId>
+ <artifactId>mockwebserver</artifactId>
+ <version>3.10.0</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/test_lister.py
@@ -0,0 +1,152 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from pathlib import Path
+from typing import List
+
+import pytest
+import requests
+
+from swh.lister.maven.lister import MavenLister
+from swh.scheduler.model import ListedOrigin
+
+MVN_URL = "https://repo1.maven.org/maven2/"  # main maven repo url
+INDEX_URL = "https://indexes/export.fld"  # index directory url
+
+# Directory holding every version of the artifact used by the test data.
+_ARTIFACT_URL = MVN_URL + "al/aldi/sprova4j/"
+
+URL_POM_1 = _ARTIFACT_URL + "0.1.0/sprova4j-0.1.0.pom"
+URL_POM_2 = _ARTIFACT_URL + "0.1.1/sprova4j-0.1.1.pom"
+
+# Source archives referenced by the test index.
+LIST_SRC = (
+    _ARTIFACT_URL + "0.1.0/sprova4j-0.1.0-sources.jar",
+    _ARTIFACT_URL + "0.1.1/sprova4j-0.1.1-sources.jar",
+)
+
+# Pages expected for the source archives above.
+LIST_SRC_DATA = (
+    {
+        "type": "jar",
+        "url": LIST_SRC[0],
+        "time": 1626109619335,
+        "gid": "al.aldi",
+        "aid": "sprova4j",
+        "version": "0.1.0",
+    },
+    {
+        "type": "jar",
+        "url": LIST_SRC[1],
+        "time": 1626111425534,
+        "gid": "al.aldi",
+        "aid": "sprova4j",
+        "version": "0.1.1",
+    },
+)
+
+# Origin urls expected from the scm connections of the two test poms.
+LIST_GIT = (
+    "git://github.com/aldialimucaj/sprova4j.git",
+    "https://github.com/aldialimucaj/sprova4j.git",
+)
+
+
+@pytest.fixture
+def maven_index(datadir) -> str:
+    """Text of the index export stored in the test data directory."""
+    return Path(datadir, "https_indexes", "export.fld").read_text()
+
+
+@pytest.fixture
+def maven_pom_1(datadir) -> str:
+    """Text of the sprova4j 0.1.0 pom stored in the test data directory."""
+    return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text()
+
+
+@pytest.fixture
+def maven_pom_2(datadir) -> str:
+    """Text of the sprova4j 0.1.1 pom stored in the test data directory."""
+    return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text()
+
+
+def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
+    """Asserts that the two collections have the same origin URLs.
+
+    Both sides are compared in sorted order, so the listing order is
+    irrelevant. Does not test last_update."""
+
+    expected_urls = sorted(lister_urls)
+    listed_urls = sorted(origin.url for origin in scheduler_origins)
+
+    assert len(expected_urls) == len(listed_urls)
+    assert expected_urls == listed_urls
+
+
+def test_maven_full_listing(
+    swh_scheduler, requests_mock, mocker, maven_index, maven_pom_1, maven_pom_2,
+):
+    """Covers full listing of multiple pages, checking page results and listed
+    origins, statelessness."""
+
+    lister = MavenLister(
+        scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL,
+    )
+
+    requests_mock.get(INDEX_URL, text=maven_index)
+    requests_mock.get(URL_POM_1, text=maven_pom_1)
+    requests_mock.get(URL_POM_2, text=maven_pom_2)
+
+    # end test setup
+
+    stats = lister.run()
+
+    # start test checks
+    # 2 source archives from the index + 2 distinct scm urls from the poms.
+    assert stats.pages == 4
+    assert stats.origins == 4
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    check_listed_origins(LIST_GIT + LIST_SRC, scheduler_origins)
+
+    # Every jar origin must carry the artifact metadata read from the index.
+    expected_by_url = {data["url"]: data for data in LIST_SRC_DATA}
+    for origin in scheduler_origins:
+        if origin.visit_type != "jar":
+            continue
+        assert origin.url in expected_by_url
+        expected = expected_by_url[origin.url]
+        artifact = origin.extra_loader_arguments["artifacts"][0]
+        for key in ("time", "gid", "aid", "version"):
+            assert artifact[key] == expected[key]
+
+    assert lister.get_state_from_scheduler() is None
+
+
+@pytest.mark.parametrize("http_code", [400, 500, 502])
+def test_maven_list_http_error(
+    swh_scheduler, requests_mock, mocker, maven_index, http_code
+):
+    """A pom answering with an HTTP error makes the listing raise HTTPError."""
+
+    lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
+
+    # The index is served normally; only the first pom fails.
+    requests_mock.get(INDEX_URL, text=maven_index)
+    requests_mock.get(URL_POM_1, status_code=http_code)
+
+    with pytest.raises(requests.HTTPError):
+        lister.run()
+
+    # Two origins are still recorded despite the failure.
+    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id)
+    assert len(listed.results) == 2
diff --git a/swh/lister/maven/tests/test_tasks.py b/swh/lister/maven/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task can be sent to the worker and answers "OK"."""
+    result = swh_scheduler_celery_app.send_task("swh.lister.maven.tasks.ping")
+    assert result
+
+    result.wait()
+
+    assert result.successful()
+    assert result.result == "OK"
+
+
+def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    """The full listing task builds the lister from its kwargs and runs it once."""
+    # Replace the lister class so that no real listing takes place.
+    lister = mocker.patch("swh.lister.maven.tasks.MavenLister")
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=10, origins=500)
+
+    kwargs = {
+        "url": "https://repo1.maven.org/maven2/",
+        "index_url": "http://indexes/export.fld",
+    }
+    result = swh_scheduler_celery_app.send_task(
+        "swh.lister.maven.tasks.FullMavenLister", kwargs=kwargs,
+    )
+    assert result
+
+    result.wait()
+
+    assert result.successful()
+    lister.from_configfile.assert_called_once_with(**kwargs)
+    lister.run.assert_called_once_with()
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -18,6 +18,10 @@
"tuleap": {"url": "https://tuleap.net",},
"gitlab": {"url": "https://gitlab.ow2.org/api/v4", "instance": "ow2",},
"opam": {"url": "https://opam.ocaml.org", "instance": "opam"},
+ "maven": {
+ "url": "https://repo1.maven.org/maven2/",
+ "index_url": "https://indexes/export.fld",
+ },
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 12:28 PM (2 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225125
Attached To
D6133: maven-lister: initialise lister.
Event Timeline
Log In to Comment