diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 6461da2..c4c4b76 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,347 +1,349 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin import requests from tenacity.before_sleep import before_sleep_log from urllib3.util import parse_url import xmltodict from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. 
For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. Defaults to url's network location if unset. incremental: bool, defaults to True. Defines if incremental listing is activated or not. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental if instance is None: instance = parse_url(url).host super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT,} ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[RepoPage]: """ Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. 
logger.info("Downloading text index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P<doc>\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|" + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) if ( self.incremental and self.state and self.state.last_seen_doc and self.state.last_seen_doc >= doc_id ): # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = None else: jar_src["doc"] = doc_id else: # If incremental mode, we don't record any line that is # before our last recorded doc id. if self.incremental and jar_src["doc"] is None: continue m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. 
if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin(self.BASE_URL, url_path,) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. logger.info("Fetching poms..") for pom in out_pom: text = self.page_request(pom, {}) try: project = xmltodict.parse(text.content.decode()) if "scm" in project["project"]: if "connection" in project["project"]["scm"]: scm = project["project"]["scm"]["connection"] gid = project["project"]["groupId"] aid = project["project"]["artifactId"] artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], "url": scm, "project": f"{gid}.{aid}", } logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom) else: logger.debug("No scm in pom %s", pom) except xmltodict.expat.ExpatError as error: logger.info("Could not parse POM %s XML: %s. Next.", pom, error) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Maven repositories into a list of ListedOrigins. 
""" assert self.lister_obj.id is not None + scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": # If origin is a scm url: detect scm type and yield. # Note that the official format is: # scm:git:git://github.com/openengsb/openengsb-framework.git # but many, many projects directly put the repo url, so we have to # detect the content to match it properly. m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"]) if m_scm is not None: scm_type = m_scm.group("type") - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, - ) - yield origin + if scm_type in scm_types_ok: + scm_url = m_scm.group("url") + origin = ListedOrigin( + lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, + ) + yield origin else: if page["url"].endswith(".git"): origin = ListedOrigin( lister_id=self.lister_obj.id, url=page["url"], visit_type="git", ) yield origin else: # Origin is a source archive: origin = ListedOrigin( lister_id=self.lister_obj.id, url=page["url"], visit_type=page["type"], extra_loader_arguments={ "artifacts": [ { "time": page["time"], "gid": page["gid"], "aid": page["aid"], "version": page["version"], "base_url": self.BASE_URL, } ] }, ) yield origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom new file mode 100644 index 0000000..8234786 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:https://github.com/aldialimucaj/sprova4j.git + scm:ghttps://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index 7ed5cdb..7c4b635 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,253 +1,316 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU 
General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" LIST_GIT = ( "git://github.com/aldialimucaj/sprova4j.git", "https://github.com/aldialimucaj/sprova4j.git", ) LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) LIST_SRC = ( MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", ) LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", "time": 1626109619335, "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", "time": 1626111425534, "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", }, ) @pytest.fixture def maven_index(datadir) -> str: text = Path(datadir, "http_indexes", "export.fld").read_text() return text @pytest.fixture def maven_index_incr(datadir) -> str: text = Path(datadir, "http_indexes", "export_incr.fld").read_text() return text @pytest.fixture def maven_pom_1(datadir) -> str: text = Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text() return text +@pytest.fixture +def maven_pom_1_malformed(datadir) -> str: + text = Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_text() + return text + + @pytest.fixture def maven_pom_2(datadir) -> str: text = Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text() return text 
@pytest.fixture def maven_pom_3(datadir) -> str: text = Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() return text def test_maven_full_listing( swh_scheduler, requests_mock, mocker, maven_index, maven_pom_1, maven_pom_2, ): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. index_text = maven_index requests_mock.get(INDEX_URL, text=index_text) requests_mock.get(URL_POM_1, text=maven_pom_1) requests_mock.get(URL_POM_2, text=maven_pom_2) # Then run the lister. stats = lister.run() # Start test checks. assert stats.pages == 4 assert stats.origins == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: if src.get("url") == origin.url: artifact = origin.extra_loader_arguments["artifacts"][0] assert src.get("time") == artifact["time"] assert src.get("gid") == artifact["gid"] assert src.get("aid") == artifact["aid"] assert src.get("version") == artifact["version"] assert MVN_URL == artifact["base_url"] break else: raise AssertionError scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 +def test_maven_full_listing_malformed( + swh_scheduler, + requests_mock, + mocker, + maven_index, + maven_pom_1_malformed, + maven_pom_2, +): + """Covers full listing of multiple pages, checking page results with a malformed + scm entry in pom.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=False, + ) + + # Set up test. 
+ index_text = maven_index + requests_mock.get(INDEX_URL, text=index_text) + requests_mock.get(URL_POM_1, text=maven_pom_1_malformed) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. + assert stats.pages == 4 + assert stats.origins == 3 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + LIST_SRC_1 = ("https://github.com/aldialimucaj/sprova4j.git",) + assert sorted(origin_urls) == sorted(LIST_SRC_1 + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + assert MVN_URL == artifact["base_url"] + break + else: + raise AssertionError + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == -1 + assert scheduler_state.last_seen_pom == -1 + + def test_maven_incremental_listing( swh_scheduler, requests_mock, mocker, maven_index, maven_index_incr, maven_pom_1, maven_pom_2, maven_pom_3, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, text=maven_index) requests_mock.get(URL_POM_1, text=maven_pom_1) requests_mock.get(URL_POM_2, text=maven_pom_2) # Then run the lister. stats = lister.run() # Start test checks. 
assert lister.incremental assert lister.updated assert stats.pages == 4 assert stats.origins == 4 # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 3 assert scheduler_state.last_seen_pom == 3 # Set up test. requests_mock.get(INDEX_URL, text=maven_index_incr) requests_mock.get(URL_POM_3, text=maven_pom_3) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 1 assert stats.origins == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: if src.get("url") == origin.url: artifact = origin.extra_loader_arguments["artifacts"][0] assert src.get("time") == artifact["time"] assert src.get("gid") == artifact["gid"] assert src.get("aid") == artifact["aid"] assert src.get("version") == artifact["version"] break else: raise AssertionError scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error( swh_scheduler, requests_mock, mocker, maven_index, http_code ): """Test handling of some common HTTP errors: - 400: Bad request. - 404: Resource no found. - 500: Internal server error. - 502: Bad gateway ou proxy Error. """ lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # Test failure of index retrieval. 
requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): lister.run() # Test failure of artefacts retrieval. requests_mock.get(INDEX_URL, text=maven_index) requests_mock.get(URL_POM_1, status_code=http_code) with pytest.raises(requests.HTTPError): lister.run() # If the maven_index step succeeded but not the get_pom step, # then we get only the 2 maven-jar origins (and not the 2 additional # src origins). scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2