diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from dataclasses import asdict, dataclass +from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional @@ -274,6 +275,7 @@ """ assert self.lister_obj.id is not None + scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": # If origin is a scm url: detect scm type and yield. # Note that the official format is: @@ -283,11 +285,12 @@ m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"]) if m_scm is not None: scm_type = m_scm.group("type") - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, - ) - yield origin + if scm_type in scm_types_ok: + scm_url = m_scm.group("url") + origin = ListedOrigin( + lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, + ) + yield origin else: if page["url"].endswith(".git"): origin = ListedOrigin( @@ -296,14 +299,25 @@ yield origin else: # Origin is a source archive: + last_update_dt = None + last_update_iso = "" + last_update_seconds = str(page["time"])[:-3] + try: + last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) + last_update_dt_tz = last_update_dt.astimezone(timezone.utc) + except OverflowError: + logger.warning("- Failed to convert datetime %s.", last_update_seconds) + if last_update_dt: + last_update_iso = last_update_dt_tz.isoformat() origin = ListedOrigin( lister_id=self.lister_obj.id, url=page["url"], visit_type=page["type"], + last_update=last_update_dt, extra_loader_arguments={ "artifacts": [ { - "time": page["time"], + "time": last_update_iso, "gid": page["gid"], "aid": page["aid"], "version": page["version"], diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom 
b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:https://github.com/aldialimucaj/sprova4j.git + scm:ghttps://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -3,8 +3,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import timezone from pathlib import Path +import iso8601 import pytest import requests @@ -34,7 +36,7 @@ "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", - "time": 1626109619335, + "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", @@ -43,7 +45,7 @@ "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", - "time": 1626111425534, + "time": 
"2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", @@ -53,32 +55,32 @@ @pytest.fixture def maven_index(datadir) -> str: - text = Path(datadir, "http_indexes", "export.fld").read_text() - return text + return Path(datadir, "http_indexes", "export.fld").read_text() @pytest.fixture def maven_index_incr(datadir) -> str: - text = Path(datadir, "http_indexes", "export_incr.fld").read_text() - return text + return Path(datadir, "http_indexes", "export_incr.fld").read_text() @pytest.fixture def maven_pom_1(datadir) -> str: - text = Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text() - return text + return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text() + + +@pytest.fixture +def maven_pom_1_malformed(datadir) -> str: + return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_text() @pytest.fixture def maven_pom_2(datadir) -> str: - text = Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text() - return text + return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text() @pytest.fixture def maven_pom_3(datadir) -> str: - text = Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() - return text + return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() def test_maven_full_listing( @@ -117,6 +119,10 @@ if origin.visit_type == "maven": for src in LIST_SRC_DATA: if src.get("url") == origin.url: + last_update_src = iso8601.parse_date(src.get("time")).astimezone( + tz=timezone.utc + ) + assert last_update_src == origin.last_update artifact = origin.extra_loader_arguments["artifacts"][0] assert src.get("time") == artifact["time"] assert src.get("gid") == artifact["gid"] @@ -125,7 +131,68 @@ assert MVN_URL == artifact["base_url"] break else: - raise AssertionError + raise AssertionError( + "Could not find scheduler origin in referenced origins." 
+ ) + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == -1 + assert scheduler_state.last_seen_pom == -1 + + +def test_maven_full_listing_malformed( + swh_scheduler, + requests_mock, + mocker, + maven_index, + maven_pom_1_malformed, + maven_pom_2, +): + """Covers full listing of multiple pages, checking page results with a malformed + scm entry in pom.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=False, + ) + + # Set up test. + index_text = maven_index + requests_mock.get(INDEX_URL, text=index_text) + requests_mock.get(URL_POM_1, text=maven_pom_1_malformed) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. + assert stats.pages == 4 + assert stats.origins == 3 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + LIST_SRC_1 = ("https://github.com/aldialimucaj/sprova4j.git",) + assert sorted(origin_urls) == sorted(LIST_SRC_1 + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + assert MVN_URL == artifact["base_url"] + break + else: + raise AssertionError( + "Could not find scheduler origin in referenced origins." + ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1