diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` +- `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` @@ -36,7 +37,7 @@ ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) +`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -36,3 +36,7 @@ [mypy-urllib3.util.*] ignore_missing_imports = True + +[mypy-xmltodict.*] +ignore_missing_imports = True + diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ beautifulsoup4 launchpadlib tenacity +xmltodict diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -71,6 +71,7 @@ lister.pypi=swh.lister.pypi:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register + lister.maven=swh.lister.maven:register """, classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/lister/maven/README.md b/swh/lister/maven/README.md new file mode 100644 --- /dev/null +++ b/swh/lister/maven/README.md @@ -0,0 +1,227 @@ + +## The Maven lister + +This readme describes the design decisions made during development. + +More information can be found on the Software Heritage forge at +[https://forge.softwareheritage.org/T1724](https://forge.softwareheritage.org/T1724) and +on the diff of the lister at +[https://forge.softwareheritage.org/D6395](https://forge.softwareheritage.org/D6395). 
+ +## Execution sequence (TL;DR) + +The complete sequence of actions to list the source artifacts and scm urls is as +follows: + +On the `index_exporter` server (asynchronously): + +* Check the list of remote indexes, and compare it to the list of local index files. +* Retrieve the missing Maven Indexer indexes from the remote repository. + Example of index from Maven Central: + [https://repo1.maven.org/maven2/.index/](https://repo1.maven.org/maven2/.index/) +* Start execution of the Docker container: + * If the `indexes` directory doesn't exist, unpack the Lucene indexes from the Maven + Indexer indexes using `indexer-cli`. + This generates a set of binary files as shown below: + + ``` + boris@castalia:maven$ ls -lh /media/home2/work/indexes/ + total 5,2G + -rw-r--r-- 1 root root 500M juil. 7 22:06 _4m.fdt + -rw-r--r-- 1 root root 339K juil. 7 22:06 _4m.fdx + -rw-r--r-- 1 root root 2,2K juil. 7 22:07 _4m.fnm + -rw-r--r-- 1 root root 166M juil. 7 22:07 _4m_Lucene50_0.doc + -rw-r--r-- 1 root root 147M juil. 7 22:07 _4m_Lucene50_0.pos + -rw-r--r-- 1 root root 290M juil. 7 22:07 _4m_Lucene50_0.time + -rw-r--r-- 1 root root 3,1M juil. 7 22:07 _4m_Lucene50_0.tip + [SNIP] + -rw-r--r-- 1 root root 363 juil. 7 22:06 _e0.si + -rw-r--r-- 1 root root 1,7K juil. 7 22:07 segments_2 + -rw-r--r-- 1 root root 8 juil. 7 21:54 timestamp + -rw-r--r-- 1 root root 0 juil. 7 21:54 write.lock + ``` + * If the `export` directory doesn't exist, export the Lucene documents from the + Lucene indexes using `clue`. + This generates a set of text files as shown below: + + ``` + boris@castalia:~$ ls -lh /work/export/ + total 49G + -rw-r--r-- 1 root root 13G juil. 7 22:12 _p.fld + -rw-r--r-- 1 root root 7,0K juil. 7 22:21 _p.inf + -rw-r--r-- 1 root root 2,9G juil. 7 22:21 _p.len + -rw-r--r-- 1 root root 33G juil. 7 22:20 _p.pst + -rw-r--r-- 1 root root 799 juil. 7 22:21 _p.si + -rw-r--r-- 1 root root 138 juil. 7 22:21 segments_1 + -rw-r--r-- 1 root root 0 juil. 
7 22:07 write.lock + ``` +* On the host, copy export files to `/var/www/html/` to make them available on the +network. + +On the lister side: + +* Get the exports from the above local index server. +* Extract the list of all pom and source artefacts from the Lucene export. +* Yield the list of source artefacts to the Maven Loader as they are found. +* Download all poms from the above list. +* Parse all poms to extract the scm attribute, and yield the list of scm urls towards + the classic loaders (git, svn, hg..). + +The process has been optimised as much as it could be, scaling down from 140 GB on disk +/ 60 GB RAM / 90 mn exec time to 60 GB on disk / 2 GB (excl. docker) / 32 mn exec time. + +For the long read about how we got here, please continue to the "About the Maven +ecosystem" section. + + +## Incremental listing + +The lister is stateful, and stores 2 identifiers: `last_seen_doc` and `last_seen_pom`. +The latter was required to prevent the loss of `scm` entries in poms, as they are +computed after the full list of jar's (and thus *after* the last doc id is encountered) +is yielded. + +This enables us to manage two situation types: +* If the lister is interrupted during its execution: the second run simply picks up + where the last doc_id or pom_id was set. +* If the remote server has an updated index: doc_id's are always incremented, and the + lister will pick up where the last doc_id was set and start retrieving the updated + content. + +## About the Maven ecosystem + +Maven repositories are a loose, decentralised network of HTTP servers with a +well-defined hosted structure. They are used according to the Maven dependency +resolver[i](#sdendnote1sym), an inheritance-based mechanism used to identify and +locate artefacts required in Maven builds. + +There is no uniform, standardised way to list the contents of maven repositories, since +consumers are supposed to know what artefacts they need.
Instead, Maven repository +owners usually set up a Maven Indexer[ii](#sdendnote2sym) to enable source code +identification and listing in IDEs – for this reason, source jars usually don't have +build files and information, only providing pure sources. + +Maven Indexer is not a mandatory part of the maven repository stack, but it is the +*de facto* standard for maven repositories indexing and querying. All major Maven +repositories we have seen so far use it. Most artefacts are located in the main central +repository: Maven Central[iii](#sdendnote3sym), hosted and run by +Sonatype[iv](#sdendnote4sym). Other well-known repositories are listed on MVN +Repository[v](#sdendnote5sym). + +Maven repositories are mainly used for binary content (e.g. class jars), but the +following sources of information are relevant to our goal in the maven repositories and +ecosystem: + +* SCM attributes in pom XML files contain the **scm URL** of the associated source code. + They can be fed to standard Git/SVN/others loaders. +* **Source artefacts** contain pure source code (i.e. no build files) associated with the + artefact. There are two main naming conventions for them, although not always +enforced: + * ${artifactId}-${version}-source-release.zip + * ${artifactId}-${version}-src.zip + + They come in various archiving formats (jar, zip, tar.bz2, tar.gz) and require a +specific loader to attach the artefact metadata.
+ +[i](#sdendnote1anc)Maven dependency resolver: +[https://maven.apache.org/resolver/index.html](https://maven.apache.org/resolver/index.html) + +[ii](#sdendnote2anc)Maven Indexer: +[https://maven.apache.org/maven-indexer/](https://maven.apache.org/maven-indexer/) + +[iii](#sdendnote3anc)Maven Central: +[https://search.maven.org/](https://search.maven.org/) + +[iv](#sdendnote4anc)Sonatype Company: +[https://www.sonatype.com/](https://www.sonatype.com/) + +[v](#sdendnote5anc)MVN Repository: +[https://mvnrepository.com/repos](https://mvnrepository.com/repos) + +## Preliminary research + +Listing the full content of a Maven repository is very unusual, and the whole system +has not been built for this purpose. Instead, tools and build systems can easily fetch +individual artefacts according to their Maven coordinates (groupId, artifactId, +version, classifier, extension). Usual listing means (e.g. scraping) are highly +discouraged and will trigger banishment easily. There is no common API defined either. + +Once we have the artifactId/group we can easily get the list of versions (e.g. for +updates) by reading the [maven-metadata.xml file at the package +level](https://repo1.maven.org/maven2/ant/ant/maven-metadata.xml), although this is not +always reliable. The various options that were investigated to get the interesting +artefacts are: + +* **Scraping** could work but is explicitly forbidden[i](#sdendnote1sym). Pages could +easily be parsed through, and it would allow identifying \*all\* artifacts. +* Using **Maven indexes** is the "official" way to retrieve information from a maven +repository and most repositories provide this feature. It would also enable a smart +incremental listing. The Maven Indexer data format however is not well documented. +It relies under the hood on an old version (Lucene54) of lucene +indexes, and the only libraries that can access it are written in java.
This implies a +dedicated Docker container with a jvm and some specific tools (maven indexer and clue +for the lucene index), and thus would bring some complexity to the docker & prod setups. +* A third path could be to **parse all the pom.xml's** that we find and follow all +artifactId's recursively, building a graph of dependencies and parent poms. This is +more of a non-complete heuristic, and we would miss leaf nodes (i.e. artifacts that are +not used by others), but it could help setup a basic list. +* It should be noted also that there are two main implementations of maven +repositories: Nexus and Artifactory. By being more specific we could use the respective +APIs of these products to get information. But getting the full list of artefacts is +still not straightforward, and we'd lose any generic treatment doing so. + +The best option in our opinion is to go with the Maven Indexer, for it is the most +complete listing available (notably for the biggest repository by far: maven central). + +[i](#sdendnote1anc)Maven repository’s Terms of Service: +[https://repo1.maven.org/terms.html](https://repo1.maven.org/terms.html) + +## Maven indexes conversion + +[Maven-Indexer](https://maven.apache.org/maven-indexer/) is a (thick) wrapper around +lucene. It parses the repository and stores documents, fields and terms in an index. +One can extract the lucene index from a maven index using the command: `java -jar +indexer-cli-5.1.1.jar --unpack nexus-maven-repository-index.gz --destination test +--type full`. Note however that 5.1.1 is an old version of maven indexer; newer +versions of the maven indexer won't work on the central indexes. + +[Clue](https://maven.apache.org/maven-indexer/) is a CLI tool to read lucene indexes, +and version 6.2.0 works with our maven indexes. One can use the following command to +export the index to text: `java -jar clue-6.2.0-1.0.0.jar maven/central-lucene-index/ +export central_export text`. 
+ +The exported text file looks like this: + +``` +doc 0 + field 0 + name u + type string + value com.redhat.rhevm.api|rhevm-api-powershell-jaxrs|1.0-rc1.16|javadoc|jar + field 1 + name m + type string + value 1321264789727 + field 2 + name i + type string + value jar|1320743675000|768291|2|2|1|jar + field 10 + name n + type string + value RHEV-M API Powershell Wrapper Implementation JAX-RS + field 13 + name 1 + type string + value 454eb6762e5bb14a75a21ae611ce2048dd548550 +``` + +The execution of these two jars requires a Java virtual machine -- java execution in +python is not possible without a JVM. Docker is a good way to run both tools and +generate the exports independently, rather than add a JVM to the existing production +environment. + +We decided (2021-08-25) to install and execute a docker container on a separate server +so the lister would simply have to fetch it on the network and parse it (the latter +part in pure python). diff --git a/swh/lister/maven/__init__.py b/swh/lister/maven/__init__.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/__init__.py @@ -0,0 +1,12 @@ +# Copyright (C) 2021 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import MavenLister + + return { + "lister": MavenLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/lister.py @@ -0,0 +1,426 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import asdict, dataclass +from datetime import datetime +import logging +import re +from typing import Any, Dict, Iterator, List, Optional +from 
@dataclass
class RepoPage:
    """One origin candidate extracted from the exported Maven index.

    A page is either a source archive meant for the maven loader
    (``origin_type == "maven"``) or an scm URL read from a POM file
    (``origin_type == "scm"``). Fields that only make sense for one of the
    two kinds default to ``None``.
    """

    origin_type: str
    """The type of origin: maven or scm.
    """

    url: str
    """The URL to access the origin.
    """

    doc: int
    """The Doc ID in the Lucene index.
    """

    project: Optional[str] = None
    """A short string representation of the project.
    Used only for type 'scm' """

    time: Optional[str] = None
    """The time of publication of the artefact, as an iso8601 date str.
    Used only for type 'maven' """

    gid: Optional[str] = None
    """The Maven group ID coordinate.
    Used only for type 'maven' """

    aid: Optional[str] = None
    """The Maven artefact ID coordinate.
    Used only for type 'maven' """

    version: Optional[str] = None
    """The Maven version coordinate.
    Used only for type 'maven' """


@dataclass
class MavenListerState:
    """State of the MavenLister, persisted between incremental runs."""

    last_seen_doc: int = -1
    """Last doc ID ingested during an incremental pass; -1 means that no
    document has been ingested yet.
    """
class MavenLister(Lister[MavenListerState, RepoPage]):
    """List origins from a Maven repository.

    Maven Central provides artifacts for Java builds.
    It includes POM files and source archives, which we download to get
    the source code of artifacts and links to their scm repository.

    This lister yields origins of types: git/svn/hg or whatever the Artifacts
    use as repository type, plus maven types for the maven loader (tgz, jar).
    """

    LISTER_NAME = "maven"

    # Regexes used to parse the text export of the index; compiled once.

    # Doc id. Example line: "doc 13"
    _RE_DOC = re.compile(r"^doc (?P<doc>\d+)$")

    # gid, aid, version, classifier, extension.
    # Example line: "    value al.aldi|sprova4j|0.1.0|sources|jar"
    _RE_VAL = re.compile(
        r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
        r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
    )

    # Last modification time (second field, in milliseconds since the epoch).
    # Example line: "    value jar|1626109619335|14316|2|2|0|jar"
    _RE_TIME = re.compile(
        r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
        r"\|([^|]+)\|([^|]+)$"
    )

    # Official scm connection format, e.g.
    # "scm:git:git://github.com/openengsb/openengsb-framework.git"
    _RE_SCM = re.compile(r"^scm:(?P<type>[^:]+):(?P<url>.*)$")

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        incremental: bool = True,
    ):
        """Lister class for Maven repositories.

        Args:
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                See README.md in this directory for more information.
            instance: Name of maven instance. Defaults to url's network location
                if unset.
            incremental: bool, defaults to True. Defines if incremental listing
                is activated or not.

        """
        self.base_url = url
        self.index_url = index_url
        self.incremental = incremental

        super().__init__(
            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
        """Rebuild the lister state from its serialized dict form."""
        return MavenListerState(**d)

    def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
        """Serialize the lister state to a plain dict."""
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """GET ``url`` with the lister session and return the response.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def _parse_index(self, lines: Iterator[bytes]) -> Dict[int, Dict[str, str]]:
        """Parse the text export of the index into a dict keyed by doc id.

        Each value is either ``{"type": "scm", "url": <pom url>}`` or
        ``{"type": "maven", "url", "time", "gid", "aid", "version"}`` for a
        source archive.
        """
        # Function-scope import: the module only imports ``datetime`` itself.
        from datetime import timezone

        # urljoin() drops the last path segment of a base URL that does not
        # end with a slash ("…/maven2" + "al/…" -> "…/al/…"), so normalize it.
        base_url = self.base_url if self.base_url.endswith("/") else self.base_url + "/"

        content: Dict[int, Dict[str, str]] = {}
        # Source artefact seen in the current doc, waiting for its mtime line.
        pending: Optional[Dict[str, str]] = None
        doc_id = 0
        deleted_items = 0

        for line_bytes in lines:
            # FIXME: maybe we should raise a decode error here, see
            # https://forge.softwareheritage.org/D6395#anchor-inline-47955
            line = line_bytes.decode()

            m_doc = self._RE_DOC.match(line)
            if m_doc is not None:
                doc_id = int(m_doc.group("doc"))
                # Never let a source artefact leak into the next document:
                # its mtime line would otherwise overwrite that doc's entry.
                pending = None
                continue

            m_val = self._RE_VAL.match(line)
            if m_val is not None:
                gid, aid, version, classifier, ext = m_val.groups()
                ext = ext.strip()
                path = gid.replace(".", "/")
                if classifier == "NA" and ext.lower() == "pom":
                    url_pom = urljoin(
                        base_url, f"{path}/{aid}/{version}/{aid}-{version}.{ext}"
                    )
                    content[doc_id] = {"type": "scm", "url": url_pom}
                    logger.debug("- Storing scm %s %s.", doc_id, url_pom)
                elif (
                    classifier.lower() == "sources" or "src" in classifier
                ) and ext.lower() in ("zip", "jar"):
                    url_src = urljoin(
                        base_url,
                        f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}",
                    )
                    pending = {
                        "type": "maven",
                        "url": url_src,
                        "gid": gid,
                        "aid": aid,
                        "version": version,
                    }
                continue

            m_time = self._RE_TIME.match(line)
            if m_time is not None and pending is not None:
                # mtime is expressed in milliseconds. Build a timezone-aware
                # UTC datetime: a naive local time would later be parsed as if
                # it were UTC and be wrong on any non-UTC host.
                mtime = int(m_time.group("mtime")) / 1e3
                pending["time"] = str(datetime.fromtimestamp(mtime, tz=timezone.utc))
                content[doc_id] = pending
                logger.debug("- Storing maven %s %s.", doc_id, pending["url"])
                pending = None
            elif line.strip() == "name del":
                # NOTE(review): deleted entries are only counted; the state
                # counter is not decremented to account for them.
                deleted_items += 1

        logger.debug("Found %s deleted entries in the index.", deleted_items)
        return content

    def _scm_page_from_pom(self, doc_id: int, pom_url: str) -> Optional[RepoPage]:
        """Fetch and parse a POM file; return an scm page or None.

        None is returned when the POM cannot be fetched or parsed, or when it
        lacks the scm connection or the coordinates needed to name the project.
        """
        try:
            response = self.page_request(pom_url, {})
        except requests.HTTPError as error:
            # One unreachable POM must not abort the whole listing.
            logger.warning("Could not fetch POM %s: %s. Next.", pom_url, error)
            return None

        try:
            # Feed raw bytes so the parser honours the encoding declared in
            # the XML prolog instead of assuming UTF-8.
            pom = xmltodict.parse(response.content)
        except xmltodict.expat.ExpatError as error:
            logger.info("Could not parse POM %s XML: %s. Next.", pom_url, error)
            return None

        project = pom.get("project") if isinstance(pom, dict) else None
        if not isinstance(project, dict) or "scm" not in project:
            logger.debug("No scm in pom %s", pom_url)
            return None

        scm = project["scm"]
        # An empty <scm/> element is parsed as None, plain text as str.
        connection = scm.get("connection") if isinstance(scm, dict) else None
        if not isinstance(connection, str) or not connection:
            logger.debug("No scm.connection in pom %s", pom_url)
            return None

        gid = project.get("groupId")
        if gid is None:
            # The groupId may be inherited from the parent POM.
            parent = project.get("parent")
            gid = parent.get("groupId") if isinstance(parent, dict) else None
        aid = project.get("artifactId")
        if gid is None or aid is None:
            logger.debug("No groupId or artifactId in pom %s", pom_url)
            return None

        return RepoPage(
            origin_type="scm", url=connection, doc=doc_id, project=f"{gid}.{aid}"
        )

    def get_pages(self) -> Iterator[RepoPage]:
        """ Retrieve and parse exported maven indexes to identify all pom files and src
        archives. See the README.md file in this directory and the example of returned
        RepoPage below.

        Example of returned RepoPage's::

            [
                {
                    "origin_type": "maven",
                    "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
                    "time": "2021-07-12 19:37:05.534000+00:00",
                    "gid": "org.xwiki.platform",
                    "aid": "xwiki-platform-wikistream-events-xwiki",
                    "version": "5.4.2"
                },
                {
                    "origin_type": "scm",
                    "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
                    "project": "openengsb-framework",
                },
                ...
            ]

        """
        assert self.index_url is not None

        # Download the main text index file.
        logger.info("Downloading text index from %s.", self.index_url)
        # This returns a (possibly huge) text index as described in the lister
        # README: stream it, and release the connection once it is parsed.
        with requests.get(self.index_url, stream=True) as response:
            response.raise_for_status()
            logger.info("Parsing maven index.")
            content = self._parse_index(response.iter_lines(chunk_size=1024))

        logger.info("Found a grand total of %s artefacts.", len(content))

        # Now go through the content, starting from the last registered doc, and:
        # - for jars: yield them.
        # - for scms: fetch pom files and scan them for scm info.
        #   If the scm has already been registered during this run: skip,
        #   else: yield it and remember we yielded it in visited_scms.
        visited_scms: set = set()
        logger.info("Start processing entries, yield jars, fetch poms and yield scms..")
        for doc_id in sorted(content):
            # Compare explicitly: a plain truthiness test on last_seen_doc
            # would treat doc id 0 as "nothing seen yet" and list it again.
            if (
                self.incremental
                and self.state
                and self.state.last_seen_doc >= doc_id
            ):
                continue

            artefact = content[doc_id]
            if artefact["type"] == "scm":
                page = self._scm_page_from_pom(doc_id, artefact["url"])
                if page is None or page.url in visited_scms:
                    continue
                visited_scms.add(page.url)
                logger.debug("- Yielding scm %s %s.", doc_id, artefact["url"])
                yield page
            else:
                logger.debug("- Yielding maven %s %s.", doc_id, artefact["url"])
                yield RepoPage(
                    origin_type="maven",
                    url=artefact["url"],
                    doc=doc_id,
                    time=artefact["time"],
                    gid=artefact["gid"],
                    aid=artefact["aid"],
                    version=artefact["version"],
                )

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a page of Maven repositories into a list of ListedOrigins.

        Maven pages yield one origin for the maven loader. Scm pages yield one
        origin whose visit type is taken from the scm URL, or nothing when the
        type cannot be determined.
        """
        assert self.lister_obj.id is not None

        if page.origin_type == "maven":
            # Origin is considered a maven source archive:
            last_update: Optional[datetime] = None
            if page.time is not None:
                try:
                    last_update = iso8601.parse_date(page.time)
                except iso8601.ParseError:
                    last_update = None
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=page.url,
                visit_type=page.origin_type,
                last_update=last_update,
                extra_loader_arguments={
                    "artifacts": [
                        {
                            "time": str(last_update),
                            "gid": page.gid,
                            "aid": page.aid,
                            "version": page.version,
                        }
                    ]
                },
            )
            return

        # If origin is a scm url: detect scm type and yield.
        # Note that the official format is:
        # scm:git:git://github.com/openengsb/openengsb-framework.git
        # but many, many projects directly put the repo url, so we have to
        # detect the content to match it properly.
        m_scm = self._RE_SCM.match(page.url)
        if m_scm is not None:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=m_scm.group("url"),
                visit_type=m_scm.group("type"),
            )
        elif page.url.endswith(".git"):
            yield ListedOrigin(
                lister_id=self.lister_obj.id, url=page.url, visit_type="git",
            )
        else:
            logger.debug("Could not detect the scm type of %s, skipping.", page.url)

    def commit_page(self, page: RepoPage) -> None:
        """Update currently stored state using the latest listed doc.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            # Only ever move the counter forward.
            if page.doc > self.state.last_seen_doc:
                self.state.last_seen_doc = page.doc

    def finalize(self) -> None:
        """Finalize the lister state, set update if any progress has been made.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            scheduler_state = self.get_state_from_scheduler()
            # Explicit comparison so that doc id 0 counts as progress.
            if scheduler_state.last_seen_doc < self.state.last_seen_doc:
                self.updated = True
@shared_task(name=f"{__name__}.FullMavenLister")
def list_maven_full(**lister_args) -> Dict[str, int]:
    """Run a full (non-incremental) listing of a Maven repository instance."""
    return MavenLister.from_configfile(incremental=False, **lister_args).run().dict()


@shared_task(name=f"{__name__}.IncrementalMavenLister")
def list_maven_incremental(**lister_args) -> Dict[str, int]:
    """Run an incremental listing of a Maven repository instance."""
    return MavenLister.from_configfile(incremental=True, **lister_args).run().dict()


@shared_task(name=f"{__name__}.ping")
def _ping() -> str:
    """Liveness check task: always answers "OK"."""
    return "OK"
Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1626111735764 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1626111784883 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1626111784915 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 14 + name DESCRIPTOR + type string + value NexusIndex + field 15 + name IDXINFO + type string + value 1.0|index +doc 5 + field 16 + name allGroups + type string + value allGroups + field 17 + name allGroupsList + type string + value al.aldi +doc 6 + field 18 + name rootGroups + type string + value rootGroups + field 19 + name rootGroupsList + type string + value al +END +checksum 00000000003321211082 diff --git a/swh/lister/maven/tests/data/http_indexes/export_incr.1.fld b/swh/lister/maven/tests/data/http_indexes/export_incr.1.fld new file mode 100755 --- /dev/null +++ b/swh/lister/maven/tests/data/http_indexes/export_incr.1.fld @@ -0,0 +1,134 @@ +doc 0 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|sources|jar + field 1 + name m + type string + value 1633786348254 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + 
value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1633786348271 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1633786370818 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1633786370857 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 0 + name u + type string + value com.arangodb|arangodb-graphql|1.2|NA|pom + field 1 + name m + type string + value 1634498235946 + field 2 + name i + type string + value jar|1624265143830|-1|0|0|0|pom + field 10 + name n + type string + value arangodb-graphql + field 11 + name d + type string + value ArangoDB Graphql +doc 5 + field 14 + name DESCRIPTOR + type string + value NexusIndex + field 15 + name IDXINFO + type string + value 1.0|index_1 +doc 6 + field 16 + name allGroups + type string + value allGroups + field 17 + name allGroupsList + type string + value com.arangodb|al.aldi +doc 7 + field 18 + name rootGroups + type string + value rootGroups + field 19 + name rootGroupsList + type string + value com|al +END +checksum 00000000004102281591 diff --git a/swh/lister/maven/tests/data/http_indexes/export_incr.2.fld b/swh/lister/maven/tests/data/http_indexes/export_incr.2.fld new file mode 100755 --- 
/dev/null +++ b/swh/lister/maven/tests/data/http_indexes/export_incr.2.fld @@ -0,0 +1,173 @@ +doc 0 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1635889236213 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1635889236233 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|sources|jar + field 1 + name m + type string + value 1635889236235 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1635889236239 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 0 + name u + type string + value com.arangodb|arangodb-graphql|1.2|NA|pom + field 1 + name m + type string + value 1635889248456 + field 2 + name i + type string + value jar|1624265143830|-1|0|0|0|pom + field 10 + name n + type string + value arangodb-graphql + field 11 + name d + type string + value ArangoDB Graphql +doc 5 + field 0 + name u + type string + value com.jolira|wicket-guicier-parent|2.0.12|NA|pom + field 1 + name m + type string + value 1635889260544 + field 2 + name i + type string + value 
pom|1320803683000|2940|0|0|0|pom + field 10 + name n + type string + value Wicket Guicier Parent + field 11 + name d + type string + value A resplacement for wicket-guice that uses constructor injection as an alternative to the excessive use of PageParameters. +doc 6 + field 14 + name del + type string + value al.aldi|sprova4j|0.1.0|NA|null + field 1 + name m + type string + value 1635889260677 +doc 7 + field 14 + name del + type string + value al.aldi|sprova4j|0.1.0|sources|null + field 1 + name m + type string + value 1635889260679 +doc 8 + field 15 + name DESCRIPTOR + type string + value NexusIndex + field 16 + name IDXINFO + type string + value 1.0|index_x +doc 9 + field 17 + name allGroups + type string + value allGroups + field 18 + name allGroupsList + type string + value com.arangodb|com.jolira|al.aldi +doc 10 + field 19 + name rootGroups + type string + value rootGroups + field 20 + name rootGroupsList + type string + value com|al +END +checksum 00000000002844832287 diff --git a/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom b/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom new file mode 100755 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom @@ -0,0 +1,208 @@ + + + + + 4.0.0 + + com.arangodb + arangodb-graphql + 1.2 + + arangodb-graphql + ArangoDB Graphql + https://github.com/ArangoDB-Community/arangodb-graphql-java + + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0 + repo + + + + + + Colin Findlay + + + Michele Rastelli + https://github.com/rashtao + + + + + UTF-8 + 1.8 + 1.8 + 1.8 + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.8 + true + + ossrh + https://oss.sonatype.org/ + 84aff6e87e214c + false + + + + org.apache.maven.plugins + maven-resources-plugin + 3.1.0 + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + 3.1.0 + + + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.1.1 + + + 
attach-javadocs + + jar + + + + + + maven-deploy-plugin + 2.8.2 + + false + 10 + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + assembly + package + + single + + + + + + ${project.artifactId}-${project.version}-standalone + + false + false + + jar-with-dependencies + + + + + + + + + com.graphql-java + graphql-java + 11.0 + + + com.arangodb + arangodb-java-driver + 6.5.0 + + + junit + junit + 4.12 + test + + + org.mockito + mockito-core + 2.15.0 + test + + + org.hamcrest + hamcrest-library + 1.3 + test + + + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + https://github.com/ArangoDB-Community/arangodb-graphql-java + scm:git:git://github.com/ArangoDB-Community/arangodb-graphql-java.git + scm:git:git://github.com/ArangoDB-Community/arangodb-graphql-java.git + + + + ArangoDB GmbH + https://www.arangodb.com + + + \ No newline at end of file diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:git:git://github.com/aldialimucaj/sprova4j.git + scm:git:git://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + 
runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.1 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.5 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.14.1 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/data/https_maven.org/wicket-guicier-parent-2.0.12.pom b/swh/lister/maven/tests/data/https_maven.org/wicket-guicier-parent-2.0.12.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/wicket-guicier-parent-2.0.12.pom @@ -0,0 +1,90 @@ + + + + superpom + com.jolira + 1.1.4 + + 4.0.0 + wicket-guicier-parent + pom + 2.0.12 + Wicket Guicier Parent + A resplacement for wicket-guice that uses constructor injection as an alternative to the excessive use of 
PageParameters. + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + 1.4.17 + 3.0 + 7.3.1.v20110307 + 4.8.1 + 1.6.1 + + + guicier + demo + + + + + false + src/main/resources + + + false + src/main/java + + ** + + + **/*.java + + + + + + false + src/test/java + + ** + + + **/*.java + + + + + + true + org.apache.maven.plugins + maven-compiler-plugin + 2.3 + + 1.6 + 1.6 + true + true + + + + org.apache.maven.plugins + maven-eclipse-plugin + 2.8 + + true + + + + + + scm:git:git@github.com:jolira/wicket-guicier.git + scm:git:git@github.com:jolira/wicket-guicier.git + https://github.com/jolira/wicket-guicier + + + \ No newline at end of file diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/test_lister.py @@ -0,0 +1,406 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from pathlib import Path + +import pytest +import requests + +from swh.lister.maven.lister import MavenLister + +MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url +INDEX_URL = "http://indexes/export.fld" # index directory url + +URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" +URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" +URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" +URL_POM_4 = ( + MVN_URL + "com/jolira/wicket-guicier-parent/2.0.12/wicket-guicier-parent-2.0.12.pom" +) + +LIST_GIT = ( + "git://github.com/aldialimucaj/sprova4j.git", + "https://github.com/aldialimucaj/sprova4j.git", +) + +LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) + +LIST_GIT_INCR_3 = ("git@github.com:jolira/wicket-guicier.git",) + +LIST_SRC = ( + 
MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", + MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", +) + +LIST_SRC_DATA = ( + { + "type": "maven", + "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + + "/0.1.0/sprova4j-0.1.0-sources.jar", + "time": "2021-07-12 19:06:59.335000+00:00", + "gid": "al.aldi", + "aid": "sprova4j", + "version": "0.1.0", + }, + { + "type": "maven", + "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + + "/0.1.1/sprova4j-0.1.1-sources.jar", + "time": "2021-07-12 19:37:05.534000+00:00", + "gid": "al.aldi", + "aid": "sprova4j", + "version": "0.1.1", + }, +) + + +@pytest.fixture +def maven_index(datadir) -> str: + text = Path(datadir, "http_indexes", "export.fld").read_text() + return text + + +@pytest.fixture +def maven_index_incr(datadir) -> str: + text = Path(datadir, "http_indexes", "export_incr.1.fld").read_text() + return text + + +@pytest.fixture +def maven_index_incr_3(datadir) -> str: + text = Path(datadir, "http_indexes", "export_incr.2.fld").read_text() + return text + + +@pytest.fixture +def maven_pom_1(datadir) -> str: + text = Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text() + return text + + +@pytest.fixture +def maven_pom_2(datadir) -> str: + text = Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text() + return text + + +@pytest.fixture +def maven_pom_3(datadir) -> str: + text = Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() + return text + + +@pytest.fixture +def maven_pom_4(datadir) -> str: + text = Path( + datadir, "https_maven.org", "wicket-guicier-parent-2.0.12.pom" + ).read_text() + return text + + +def test_maven_full_listing( + swh_scheduler, requests_mock, mocker, maven_index, maven_pom_1, maven_pom_2, +): + """Covers full listing of multiple pages, checking page results and listed + origins, statelessness.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + 
index_url=INDEX_URL, + incremental=False, + ) + + # Set up test. + index_text = maven_index + requests_mock.get(INDEX_URL, text=index_text) + requests_mock.get(URL_POM_1, text=maven_pom_1) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. + assert stats.pages == 4 + assert stats.origins == 4 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + break + else: + raise AssertionError + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == -1 + + +def test_maven_incremental_listing( + swh_scheduler, + requests_mock, + mocker, + maven_index, + maven_index_incr, + maven_pom_1, + maven_pom_2, + maven_pom_3, +): + """Covers full listing of multiple pages, checking page results and listed + origins, with a second updated run for statefulness.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=True, + ) + + # Set up test. + requests_mock.get(INDEX_URL, text=maven_index) + requests_mock.get(URL_POM_1, text=maven_pom_1) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
+ assert lister.incremental + assert lister.updated + assert stats.pages == 4 + assert stats.origins == 4 + + # Second execution of the lister, incremental mode + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=True, + ) + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == 3 + + # Set up test. + requests_mock.get(INDEX_URL, text=maven_index_incr) + requests_mock.get(URL_POM_3, text=maven_pom_3) + + # Then run the lister. + stats = lister.run() + + # Start test checks. + assert lister.incremental + assert lister.updated + assert stats.pages == 1 + assert stats.origins == 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + break + else: + raise AssertionError + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state + assert scheduler_state.last_seen_doc == 4 + + +def test_maven_incremental_listing_3( + swh_scheduler, + requests_mock, + mocker, + maven_index, + maven_index_incr, + maven_index_incr_3, + maven_pom_1, + maven_pom_2, + maven_pom_3, + maven_pom_4, +): + """Covers full listing of multiple pages, checking page results and listed + origins, with a second updated run for statefulness, and a third updated run + with deleted artefacts and a parent pom (i.e. 
without groupId).""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=True, + ) + + # Set up test. + requests_mock.get(INDEX_URL, text=maven_index) + requests_mock.get(URL_POM_1, text=maven_pom_1) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. + assert lister.incremental + assert lister.updated + assert stats.pages == 4 + assert stats.origins == 4 + + # Second execution of the lister, incremental mode + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=True, + ) + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == 3 + + # Set up test. + requests_mock.get(INDEX_URL, text=maven_index_incr) + requests_mock.get(URL_POM_3, text=maven_pom_3) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
+ assert lister.incremental + assert lister.updated + assert stats.pages == 1 + assert stats.origins == 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + break + else: + raise AssertionError + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state + assert scheduler_state.last_seen_doc == 4 + + # Third execution of the lister, incremental mode + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=True, + ) + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == 4 + + # Set up test. + requests_mock.get(INDEX_URL, text=maven_index_incr_3) + requests_mock.get(URL_POM_4, text=maven_pom_4) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
+ assert lister.incremental + assert lister.updated + assert stats.pages == 1 + assert stats.origins == 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + assert sorted(origin_urls) == sorted( + LIST_SRC + LIST_GIT + LIST_GIT_INCR + LIST_GIT_INCR_3 + ) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + break + else: + raise AssertionError + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state + assert scheduler_state.last_seen_doc == 5 + + +@pytest.mark.parametrize("http_code", [400, 404, 500, 502]) +def test_maven_list_http_error( + swh_scheduler, requests_mock, mocker, maven_index, http_code +): + """Test handling of some common HTTP errors: + - 400: Bad request. + - 404: Resource not found. + - 500: Internal server error. + - 502: Bad gateway or proxy error. + """ + + lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) + + # Test failure of index retrieval. + + requests_mock.get(INDEX_URL, status_code=http_code) + + with pytest.raises(requests.HTTPError): + lister.run() + + # Test failure of artefacts retrieval. + + requests_mock.get(INDEX_URL, text=maven_index) + requests_mock.get(URL_POM_1, status_code=http_code) + + with pytest.raises(requests.HTTPError): + lister.run() + + # If the maven_index step succeeded but not the get_pom step, + # we get only the first maven-jar entry and then we fail when trying to fetch the + # second entry (and first pom). + # Hence a single origin is recorded (one of the src origins).
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins) == 1 diff --git a/swh/lister/maven/tests/test_tasks.py b/swh/lister/maven/tests/test_tasks.py new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/test_tasks.py @@ -0,0 +1,45 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.lister.pattern import ListerStats + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.maven.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +@pytest.mark.parametrize( + "task_name,incremental", + [("IncrementalMavenLister", True), ("FullMavenLister", False)], +) +def test_task_lister_maven( + task_name, + incremental, + swh_scheduler_celery_app, + swh_scheduler_celery_worker, + mocker, +): + lister = mocker.patch("swh.lister.maven.tasks.MavenLister") + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://repo1.maven.org/maven2/", index_url="http://indexes/export.fld" + ) + res = swh_scheduler_celery_app.send_task( + f"swh.lister.maven.tasks.{task_name}", kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(incremental=incremental, **kwargs) + lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -18,6 +18,10 @@ "tuleap": {"url": "https://tuleap.net",}, "gitlab": {"url": "https://gitlab.ow2.org/api/v4", "instance": "ow2",}, "opam": {"url": "https://opam.ocaml.org", "instance": "opam"}, + 
"maven": { + "url": "https://repo1.maven.org/maven2/", + "index_url": "http://indexes/export.fld", + }, }