diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -274,6 +274,7 @@ """ assert self.lister_obj.id is not None + scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": # If origin is a scm url: detect scm type and yield. # Note that the official format is: @@ -283,11 +284,12 @@ m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) if m_scm is not None: scm_type = m_scm.group("type") - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, - ) - yield origin + if scm_type in scm_types_ok: + scm_url = m_scm.group("url") + origin = ListedOrigin( + lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, + ) + yield origin else: if page["url"].endswith(".git"): origin = ListedOrigin( diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:https://github.com/aldialimucaj/sprova4j.git + scm:ghttps://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime 
@pytest.fixture
def maven_pom_1_malformed(datadir) -> str:
    """Return the text of the sprova4j 0.1.0 pom whose scm entries are malformed."""
    pom_path = Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom")
    return pom_path.read_text()


def test_maven_full_listing_malformed(
    swh_scheduler,
    requests_mock,
    mocker,
    maven_index,
    maven_pom_1_malformed,
    maven_pom_2,
):
    """Run a full (non-incremental) listing in which the first pom carries a
    malformed scm entry, then check the page/origin counts, the listed origin
    urls, the maven artifacts and the recorded scheduler state."""

    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )

    # Mock the index export and both pom files; the first pom is the
    # malformed variant.
    requests_mock.get(INDEX_URL, text=maven_index)
    requests_mock.get(URL_POM_1, text=maven_pom_1_malformed)
    requests_mock.get(URL_POM_2, text=maven_pom_2)

    stats = lister.run()

    # Counters reported by the run.
    assert stats.pages == 4
    assert stats.origins == 3

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    # The scm origin from the pom must be listed next to the maven sources.
    expected_urls = ("https://github.com/aldialimucaj/sprova4j.git",) + LIST_SRC
    assert sorted(origin.url for origin in listed) == sorted(expected_urls)

    for origin in listed:
        if origin.visit_type != "maven":
            continue
        # Each maven origin has to match one reference entry; the first
        # artifact of the origin is compared field by field against it.
        for src in LIST_SRC_DATA:
            if src.get("url") != origin.url:
                continue
            artifact = origin.extra_loader_arguments["artifacts"][0]
            for key in ("time", "gid", "aid", "version"):
                assert src.get(key) == artifact[key]
            assert MVN_URL == artifact["base_url"]
            break
        else:
            # No reference entry carries this origin url.
            raise AssertionError

    state = lister.get_state_from_scheduler()
    assert state is not None
    assert state.last_seen_doc == -1
    assert state.last_seen_pom == -1