diff --git a/mypy.ini b/mypy.ini index 51c1c65..42c58d8 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,47 +1,44 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-bs4.*] ignore_missing_imports = True [mypy-celery.*] ignore_missing_imports = True [mypy-debian.*] ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True [mypy-launchpadlib.*] ignore_missing_imports = True [mypy-lazr.*] ignore_missing_imports = True [mypy-lxml.*] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True [mypy-pytest_postgresql.*] ignore_missing_imports = True [mypy-requests_mock.*] ignore_missing_imports = True [mypy-urllib3.util.*] ignore_missing_imports = True -[mypy-xmltodict.*] -ignore_missing_imports = True - [mypy-dulwich.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 5021815..a909c6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,9 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity >= 6.2 -xmltodict lxml dulwich diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 2dc6cc5..2560feb 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,425 +1,428 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin +from bs4 import BeautifulSoup +import lxml import requests from tenacity.before_sleep import before_sleep_log -import xmltodict from swh.core.github.utils import GitHubSession from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. Defaults to url's network location if unset. incremental: bool, defaults to True. Defines if incremental listing is activated or not. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) self.jar_origins: Dict[str, ListedOrigin] = {} self.github_session = GitHubSession( credentials=self.credentials, user_agent=USER_AGENT ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[RepoPage]: """Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) if response.status_code != 200: logger.error("Index %s not found, stopping", self.INDEX_URL) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P[^|]+)\|(?P[^|]+)\|(?P[^|]+)\|" + r"(?P[^|]+)\|(?P[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin( self.BASE_URL, url_path, ) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. logger.info("Fetching poms..") for pom in out_pom: try: response = self.page_request(pom, {}) - project = xmltodict.parse(response.content) - project_d = project.get("project", {}) - scm_d = project_d.get("scm") - if scm_d is not None: - connection = scm_d.get("connection") + parsed_pom = BeautifulSoup(response.content, "xml") + project = parsed_pom.find("project") + if project is None: + continue + scm = project.find("scm") + if scm is not None: + connection = scm.find("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], - "url": connection, + "url": connection.text, } logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom) else: logger.debug("No scm in pom %s", pom) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", pom, ) - except xmltodict.expat.ExpatError as error: - logger.info("Could not parse POM %s XML: %s. Next.", pom, error) + except lxml.etree.Error as error: + logger.info("Could not parse POM %s XML: %s.", pom, error) def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: """Retrieve scm origin out of the page information. Only called when type of the page is scm. Try and detect an scm/vcs repository. Note that official format is in the form: scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put the repo url (without the "scm:type"), so we have to check against the content to extract the type and url properly. Raises AssertionError when the type of the page is not 'scm' Returns ListedOrigin with proper canonical scm url (for github) if any is found, None otherwise. """ assert page["type"] == "scm" visit_type: Optional[str] = None url: Optional[str] = None m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) if m_scm is None: return None scm_type = m_scm.group("type") if scm_type and scm_type in SUPPORTED_SCM_TYPES: url = m_scm.group("url") visit_type = scm_type elif page["url"].endswith(".git"): url = page["url"].lstrip("scm:") visit_type = "git" else: return None if url and visit_type == "git": # Non-github urls will be returned as is, github ones will be canonical ones url = self.github_session.get_canonical_url(url) if not url: return None assert visit_type is not None assert self.lister_obj.id is not None return ListedOrigin( lister_id=self.lister_obj.id, url=url, visit_type=visit_type, ) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Maven repositories into a list of ListedOrigins.""" if page["type"] == "scm": listed_origin = self.get_scm(page) if listed_origin: yield listed_origin else: # Origin is gathering source archives: last_update_dt = None last_update_iso = "" try: last_update_seconds = str(page["time"])[:-3] last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) last_update_dt = last_update_dt.astimezone(timezone.utc) except (OverflowError, ValueError): logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: last_update_iso = last_update_dt.isoformat() # Origin URL will target page holding sources for all versions of # an artifactId (package name) inside a groupId (namespace) path = "/".join(page["gid"].split(".")) origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") artifact = { **{k: v for k, v in page.items() if k != "doc"}, "time": last_update_iso, "base_url": self.BASE_URL, } if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=page["type"], last_update=last_update_dt, extra_loader_arguments={"artifacts": [artifact]}, ) self.jar_origins[origin_url] = jar_origin else: # Update list of source artifacts for that origin otherwise jar_origin = self.jar_origins[origin_url] artifacts = jar_origin.extra_loader_arguments["artifacts"] if artifact not in artifacts: artifacts.append(artifact) if ( jar_origin.last_update and last_update_dt and last_update_dt > jar_origin.last_update ): jar_origin.last_update = last_update_dt if not self.incremental or ( self.state and page["doc"] > self.state.last_seen_doc ): # Yield origin with updated source artifacts, multiple instances of # ListedOrigin for the same origin URL but with different artifacts # list will be sent to the scheduler but it will deduplicate them and # take the latest one to upsert in database yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom b/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom new file mode 100644 index 0000000..fa9648b --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom @@ -0,0 +1,769 @@ + + + + org.sonatype.oss + oss-parent + 7 + + 4.0.0 + com.alibaba.citrus + citrus-parent + pom + Citrus Parent Project + 3.0.7 + Another Java-based WEB Framework + http://www.openwebx.org/ + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + scm:git:https://github.com/webx/citrus + scm:git:git@github.com:webx/citrus.git + http://github.com/webx/citrus + + + + baobao + Michael Zhou + yizhi@taobao.com + + architect + developer + + + http://i54.tinypic.com/2jewmjr.jpg + + -6 + + + + 1.5 + GBK + 3.0.7 + 2.5.6.SEC03 + 1.0 + 1.0 + 6.1.22 + ${java.encoding} + + + + spring3 + + 3.0.6.RELEASE + + + + + dist/webx + dist/test + common/expr + common/logconfig + common/util + common/springext + common/generictype + common/asm + common/hessian + common/codegen + test/util + test/webx + service/base + service/dataresolver + service/form + service/resource + service/upload + service/requestcontext + service/pipeline + service/configuration + service/mappingrule + service/moduleloader + service/pull + service/template + service/jsp + service/velocity + service/freemarker + service/uribroker + service/mail + webx/framework + webx/turbine + webx/dev + + + + org.slf4j + slf4j-api + + + org.slf4j + jcl-over-slf4j + + + + + + + + + ${project.groupId} + citrus-webx-all + ${webx-version} + + + ${project.groupId} + citrus-test-all + ${webx-version} + test + + + ${project.groupId} + citrus-common-logconfig + ${webx-version} + + + ${project.groupId} + citrus-common-expr + ${webx-version} + + + ${project.groupId} + citrus-common-util + ${webx-version} + + + ${project.groupId} + citrus-common-springext + ${webx-version} + + + ${project.groupId} + citrus-common-generictype + ${webx-version} + + + ${project.groupId} + citrus-common-asm + ${webx-version} + + + ${project.groupId} + citrus-common-hessian + ${webx-version} + + + ${project.groupId} + citrus-common-codegen + ${webx-version} + + + ${project.groupId} + citrus-test-util + ${webx-version} + test + + + ${project.groupId} + citrus-test-webx + ${webx-version} + test + + + ${project.groupId} + citrus-service-base + ${webx-version} + + + ${project.groupId} + citrus-service-dataresolver + ${webx-version} + + + ${project.groupId} + citrus-service-form + ${webx-version} + + + ${project.groupId} + citrus-service-resource + ${webx-version} + + + ${project.groupId} + citrus-service-upload + ${webx-version} + + + ${project.groupId} + citrus-service-requestcontext + ${webx-version} + + + ${project.groupId} + citrus-service-pipeline + ${webx-version} + + + ${project.groupId} + citrus-service-configuration + ${webx-version} + + + ${project.groupId} + citrus-service-mappingrule + ${webx-version} + + + ${project.groupId} + citrus-service-moduleloader + ${webx-version} + + + ${project.groupId} + citrus-service-pull + ${webx-version} + + + ${project.groupId} + citrus-service-template + ${webx-version} + + + ${project.groupId} + citrus-service-jsp + ${webx-version} + + + ${project.groupId} + citrus-service-velocity + ${webx-version} + + + ${project.groupId} + citrus-service-freemarker + ${webx-version} + + + ${project.groupId} + citrus-service-uribroker + ${webx-version} + + + ${project.groupId} + citrus-service-mail + ${webx-version} + + + ${project.groupId} + citrus-webx-framework + ${webx-version} + + + ${project.groupId} + citrus-webx-turbine + ${webx-version} + + + ${project.groupId} + citrus-webx-dev + ${webx-version} + + + + + + org.slf4j + slf4j-api + 1.6.1 + + + + org.slf4j + jcl-over-slf4j + 1.6.1 + + + + commons-logging + commons-logging + 1.1.1 + provided + + + + ch.qos.logback + logback-classic + 0.9.24 + runtime + + + + org.slf4j + slf4j-log4j12 + 1.6.1 + runtime + + + log4j + log4j + 1.2.16 + runtime + + + + + + junit + junit + 4.8.2 + test + + + org.hamcrest + hamcrest-library + 1.1 + test + + + httpunit + httpunit + 1.7 + test + + + jtidy + jtidy + + + + + rhino + js + 1.7R1 + test + + + nekohtml + nekohtml + 1.9.6 + test + + + xerces + xercesImpl + 2.9.1 + test + + + xml-apis + xml-apis + + + + + xalan + xalan + 2.7.1 + test + + + xml-apis + xml-apis + + + + + org.easymock + easymockclassextension + 3.0 + test + + + org.apache.tomcat + jasper + 6.0.33 + test + + + org.jvnet.mock-javamail + mock-javamail + 1.7 + test + + + oro + oro + 2.0.8 + + + + + + ecs + ecs + 1.4.2 + + + org.apache.commons + commons-jexl + 2.0.1 + + + org.apache.velocity + velocity + 1.6.4 + + + org.freemarker + freemarker + 2.3.16 + + + commons-fileupload + commons-fileupload + 1.2.1 + + + commons-io + commons-io + 1.4 + + + commons-codec + commons-codec + 1.3 + + + org.codehaus.groovy + groovy-all + 1.6.3 + runtime + + + org.apache.ant + ant + + + org.apache.ant + ant-launcher + + + jline + jline + + + + + dom4j + dom4j + 1.6.1 + + + xml-apis + xml-apis + + + + + cglib + cglib-nodep + 2.2 + + + javax.servlet + servlet-api + 2.5 + provided + + + javax.mail + mail + 1.4.1 + provided + + + javax.activation + activation + 1.1 + provided + + + janino + janino + 2.5.10 + test + + + xml-apis + xml-apis + 1.3.04 + + + + + + org.springframework + spring-core + ${spring-version} + + + org.springframework + spring-beans + ${spring-version} + + + org.springframework + spring-aop + ${spring-version} + + + org.springframework + spring-context + ${spring-version} + + + org.springframework + spring-context-support + ${spring-version} + + + org.springframework + spring-tx + ${spring-version} + + + org.springframework + spring-jdbc + ${spring-version} + + + org.springframework + spring-orm + ${spring-version} + + + org.springframework + spring-web + ${spring-version} + + + org.springframework + spring-webmvc + ${spring-version} + + + org.springframework + spring-test + ${spring-version} + test + + + + + + + maven-deploy-plugin + false + + false + + + + maven-compiler-plugin + + ${java.version} + ${java.version} + + + + maven-jar-plugin + + + + true + true + + + + + + maven-antrun-plugin + + + compile + + + + + + + + + + + run + + + + + + maven-surefire-plugin + + + **/*Tests.java + + -Xmx256m + + + + maven-eclipse-plugin + + true + + org.eclipse.jdt.launching.JRE_CONTAINER + + + + + maven-source-plugin + + + attach-sources + + jar-no-fork + + + + + + org.mortbay.jetty + maven-jetty-plugin + + citrus + 9999 + + + productionMode + false + + + + + + com.alibaba.citrus.tool + maven-springext-plugin + + + maven-gpg-plugin + + + sign-artifacts + verify + + sign + + + + + + + + + maven-antrun-plugin + 1.6 + + + maven-compiler-plugin + + 2.3.2 + + + maven-jar-plugin + 2.3.2 + + + maven-deploy-plugin + 2.7 + + true + + + + maven-eclipse-plugin + 2.8 + + + maven-shade-plugin + 1.4 + + + maven-source-plugin + 2.1.2 + + + maven-javadoc-plugin + 2.8 + + + maven-surefire-plugin + 2.10 + + + org.mortbay.jetty + maven-jetty-plugin + ${jetty-version} + + + com.alibaba.citrus.tool + maven-springext-plugin + ${springext-plugin-version} + + + maven-gpg-plugin + 1.4 + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.maven.plugins + maven-antrun-plugin + [1.0,) + + run + + + + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + [1.0,) + + enforce + + + + + + + + + + + + + + \ No newline at end of file diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index b2a88f9..6a75a99 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,353 +1,379 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import iso8601 import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" USER_REPO0 = "aldialimucaj/sprova4j" GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}" GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}" LIST_GIT = (GIT_REPO_URL0_HTTPS,) USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java" GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}" GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git" GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}" LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,) +USER_REPO2 = "webx/citrus" +GIT_REPO_URL2_HTTPS = f"https://github.com/{USER_REPO2}" +GIT_REPO_URL2_API = f"https://api.github.com/repos/{USER_REPO2}" + LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", "base_url": MVN_URL, }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", "time": "2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", "base_url": MVN_URL, }, ) @pytest.fixture def maven_index_full(datadir) -> bytes: return Path(datadir, "http_indexes", "export_full.fld").read_bytes() @pytest.fixture def maven_index_incr_first(datadir) -> bytes: return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes() @pytest.fixture def maven_pom_1(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() @pytest.fixture def maven_index_null_mtime(datadir) -> bytes: return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes() @pytest.fixture def maven_pom_1_malformed(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_bytes() @pytest.fixture def maven_pom_2(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() @pytest.fixture def maven_pom_3(datadir) -> bytes: return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes() +@pytest.fixture +def maven_pom_multi_byte_encoding(datadir) -> bytes: + return Path(datadir, "https_maven.org", "citrus-parent-3.0.7.pom").read_bytes() + + @pytest.fixture def requests_mock(requests_mock): """If github api calls for the configured scm repository, returns its canonical url.""" for url_api, url_html in [ (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS), (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS), + (GIT_REPO_URL2_API, GIT_REPO_URL2_HTTPS), ]: requests_mock.get( url_api, json={"html_url": url_html}, ) yield requests_mock @pytest.fixture(autouse=True) def network_requests_mock( requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 ): requests_mock.get(INDEX_URL, content=maven_index_full) requests_mock.get(URL_POM_1, content=maven_pom_1) requests_mock.get(URL_POM_2, content=maven_pom_2) requests_mock.get(URL_POM_3, content=maven_pom_3) def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 3 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_full_listing_malformed( swh_scheduler, requests_mock, maven_pom_1_malformed, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. requests_mock.get(URL_POM_1, content=maven_pom_1_malformed) # Then run the lister. stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 2 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, maven_index_full, maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, content=maven_index_incr_first) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 1 git origins + 1 maven origin with 1 release (one per jar) assert len(origin_urls) == 2 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) assert last_update_src == origin.last_update assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 1 assert scheduler_state.last_seen_pom == 1 # Set up test. requests_mock.get(INDEX_URL, content=maven_index_full) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( swh_scheduler, requests_mock, http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. requests_mock.get(URL_POM_1, status_code=http_code) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() # If the maven_index_full step succeeded but not the get_pom step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime): requests_mock.get(INDEX_URL, content=maven_index_null_mtime) # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].last_update is None def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1): """should continue listing when failing to decode pom file.""" # Test failure of pom parsing by reencoding a UTF-8 pom file to a not expected one requests_mock.get(URL_POM_1, content=maven_pom_1.decode("utf-8").encode("utf-32")) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() # If the maven_index_full step succeeded but not the pom parsing step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 + + +def test_maven_list_pom_multi_byte_encoding( + swh_scheduler, requests_mock, maven_pom_multi_byte_encoding +): + """should parse POM file with multi-byte encoding.""" + + # replace pom file with a multi-byte encoding one + requests_mock.get(URL_POM_1, content=maven_pom_multi_byte_encoding) + + lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) + + lister.run() + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins) == 3