Page MenuHomeSoftware Heritage

test_lister.py
No OneTemporary

test_lister.py

# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
import iso8601
import pytest
import requests
from swh.lister.maven.lister import MavenLister
# Maven repository and index locations used throughout the tests.
MVN_URL = "https://repo1.maven.org/maven2/"  # main maven repo url
INDEX_URL = "http://indexes/export.fld"  # index directory url

# POM files located below MVN_URL.
URL_POM_1 = f"{MVN_URL}al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom"
URL_POM_2 = f"{MVN_URL}al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
URL_POM_3 = f"{MVN_URL}com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"

# GitHub repositories: "user/repo" slug, then the URLs derived from it.
USER_REPO0 = "aldialimucaj/sprova4j"
GIT_REPO_URL0_HTTPS = "https://github.com/" + USER_REPO0
GIT_REPO_URL0_API = "https://api.github.com/repos/" + USER_REPO0
ORIGIN_GIT = GIT_REPO_URL0_HTTPS

USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java"
GIT_REPO_URL1_HTTPS = "https://github.com/" + USER_REPO1
GIT_REPO_URL1_GIT = "git://github.com/" + USER_REPO1 + ".git"
GIT_REPO_URL1_API = "https://api.github.com/repos/" + USER_REPO1
ORIGIN_GIT_INCR = GIT_REPO_URL1_HTTPS

USER_REPO2 = "webx/citrus"
GIT_REPO_URL2_HTTPS = "https://github.com/" + USER_REPO2
GIT_REPO_URL2_API = "https://api.github.com/repos/" + USER_REPO2

# URL of the maven origin (group "al.aldi", artifact "sprova4j").
ORIGIN_SRC = f"{MVN_URL}al/aldi/sprova4j"

# Expected artifact descriptions of the maven origin, one per released
# sources jar, in version order.
LIST_SRC_DATA = tuple(
    {
        "type": "maven",
        "url": f"{ORIGIN_SRC}/{version}/sprova4j-{version}-sources.jar",
        "time": timestamp,
        "gid": "al.aldi",
        "aid": "sprova4j",
        "version": version,
        "base_url": MVN_URL,
    }
    for version, timestamp in (
        ("0.1.0", "2021-07-12T17:06:59+00:00"),
        ("0.1.1", "2021-07-12T17:37:05+00:00"),
    )
)
@pytest.fixture
def maven_index_full(datadir) -> bytes:
    """Raw bytes of ``http_indexes/export_full.fld`` found under ``datadir``."""
    index_file = Path(datadir) / "http_indexes" / "export_full.fld"
    return index_file.read_bytes()
@pytest.fixture
def maven_index_incr_first(datadir) -> bytes:
    """Raw bytes of ``http_indexes/export_incr_first.fld`` found under
    ``datadir``."""
    index_file = Path(datadir) / "http_indexes" / "export_incr_first.fld"
    return index_file.read_bytes()
@pytest.fixture
def maven_pom_1(datadir) -> bytes:
    """Raw bytes of ``https_maven.org/sprova4j-0.1.0.pom`` found under
    ``datadir``."""
    pom_file = Path(datadir) / "https_maven.org" / "sprova4j-0.1.0.pom"
    return pom_file.read_bytes()
@pytest.fixture
def maven_index_null_mtime(datadir) -> bytes:
    """Raw bytes of ``http_indexes/export_null_mtime.fld`` found under
    ``datadir``."""
    index_file = Path(datadir) / "http_indexes" / "export_null_mtime.fld"
    return index_file.read_bytes()
@pytest.fixture
def maven_pom_1_malformed(datadir) -> bytes:
    """Raw bytes of ``https_maven.org/sprova4j-0.1.0.malformed.pom`` found under
    ``datadir``."""
    pom_file = Path(datadir) / "https_maven.org" / "sprova4j-0.1.0.malformed.pom"
    return pom_file.read_bytes()
@pytest.fixture
def maven_pom_2(datadir) -> bytes:
    """Raw bytes of ``https_maven.org/sprova4j-0.1.1.pom`` found under
    ``datadir``."""
    pom_file = Path(datadir) / "https_maven.org" / "sprova4j-0.1.1.pom"
    return pom_file.read_bytes()
@pytest.fixture
def maven_pom_3(datadir) -> bytes:
    """Raw bytes of ``https_maven.org/arangodb-graphql-1.2.pom`` found under
    ``datadir``."""
    pom_file = Path(datadir) / "https_maven.org" / "arangodb-graphql-1.2.pom"
    return pom_file.read_bytes()
@pytest.fixture
def maven_pom_multi_byte_encoding(datadir) -> bytes:
    """Raw bytes of ``https_maven.org/citrus-parent-3.0.7.pom`` found under
    ``datadir``."""
    pom_file = Path(datadir) / "https_maven.org" / "citrus-parent-3.0.7.pom"
    return pom_file.read_bytes()
@pytest.fixture
def requests_mock(requests_mock):
    """Wrap the ``requests_mock`` fixture so that a GET on each GitHub API
    repository URL answers with a JSON body whose ``html_url`` is the matching
    https repository URL."""
    # API endpoint -> canonical repository URL returned in the JSON answer.
    canonical_urls = {
        GIT_REPO_URL0_API: GIT_REPO_URL0_HTTPS,
        GIT_REPO_URL1_API: GIT_REPO_URL1_HTTPS,
        GIT_REPO_URL2_API: GIT_REPO_URL2_HTTPS,
    }
    for api_url, html_url in canonical_urls.items():
        requests_mock.get(api_url, json={"html_url": html_url})

    yield requests_mock
@pytest.fixture(autouse=True)
def network_requests_mock(
    requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3
):
    """Register, for every test, the full index and the three POM files as the
    content served for their respective URLs. Individual tests may register
    another answer for the same URL afterwards."""
    served_content = (
        (INDEX_URL, maven_index_full),
        (URL_POM_1, maven_pom_1),
        (URL_POM_2, maven_pom_2),
        (URL_POM_3, maven_pom_3),
    )
    for url, payload in served_content:
        requests_mock.get(url, content=payload)
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
    """Patch, for every test, the ``sleep`` attribute of the retry object attached
    to ``MavenLister.http_request`` (presumably so retried requests do not
    actually wait)."""
    retry_controller = MavenLister.http_request.retry
    mocker.patch.object(retry_controller, "sleep")
def test_maven_full_listing(swh_scheduler):
    """Full (non-incremental) listing over several pages: check the page count,
    the listed origins and that the stored state keeps ``-1`` for both markers."""
    lister_config = {
        "scheduler": swh_scheduler,
        "url": MVN_URL,
        "instance": "maven.org",
        "index_url": INDEX_URL,
        "incremental": False,
    }
    maven_lister = MavenLister(**lister_config)

    run_stats = maven_lister.run()

    assert run_stats.pages == 5

    listed = swh_scheduler.get_listed_origins(maven_lister.lister_obj.id).results
    listed_urls = [entry.url for entry in listed]

    # 2 git origins + 1 maven origin with 2 releases (one per jar)
    assert set(listed_urls) == {ORIGIN_GIT, ORIGIN_GIT_INCR, ORIGIN_SRC}
    # No origin is listed twice.
    assert len(set(listed_urls)) == len(listed_urls)

    expected_artifacts = list(LIST_SRC_DATA)
    for entry in listed:
        if entry.visit_type != "maven":
            continue
        # The origin is at least as recent as each of its artifacts.
        for artifact in LIST_SRC_DATA:
            artifact_time = iso8601.parse_date(artifact["time"])
            assert artifact_time <= entry.last_update
        assert entry.extra_loader_arguments["artifacts"] == expected_artifacts

    state = maven_lister.get_state_from_scheduler()
    assert state is not None
    assert state.last_seen_doc == -1
    assert state.last_seen_pom == -1
def test_maven_full_listing_malformed(
    swh_scheduler,
    requests_mock,
    maven_pom_1_malformed,
):
    """Full listing where the first POM is replaced by one with a malformed scm
    entry: page count, listed origins and stored state are checked."""
    lister_config = {
        "scheduler": swh_scheduler,
        "url": MVN_URL,
        "instance": "maven.org",
        "index_url": INDEX_URL,
        "incremental": False,
    }
    maven_lister = MavenLister(**lister_config)

    # Serve the malformed POM instead of the regular one.
    requests_mock.get(URL_POM_1, content=maven_pom_1_malformed)

    run_stats = maven_lister.run()

    assert run_stats.pages == 5

    listed = swh_scheduler.get_listed_origins(maven_lister.lister_obj.id).results
    listed_urls = [entry.url for entry in listed]

    # 2 git origins + 1 maven origin with 2 releases (one per jar)
    assert set(listed_urls) == {ORIGIN_GIT, ORIGIN_GIT_INCR, ORIGIN_SRC}
    # No origin is listed twice.
    assert len(listed_urls) == len(set(listed_urls))

    expected_artifacts = list(LIST_SRC_DATA)
    for entry in listed:
        if entry.visit_type != "maven":
            continue
        # The origin is at least as recent as each of its artifacts.
        for artifact in LIST_SRC_DATA:
            artifact_time = iso8601.parse_date(artifact["time"])
            assert artifact_time <= entry.last_update
        assert entry.extra_loader_arguments["artifacts"] == expected_artifacts

    state = maven_lister.get_state_from_scheduler()
    assert state is not None
    assert state.last_seen_doc == -1
    assert state.last_seen_pom == -1
def test_maven_incremental_listing(
    swh_scheduler,
    requests_mock,
    maven_index_full,
    maven_index_incr_first,
):
    """Incremental listing in two runs: a first run against a partial index,
    then a second run against the full index, checking pages, origins and the
    state stored in the scheduler after each run."""
    # Both runs use a lister built from the very same configuration.
    lister_config = {
        "scheduler": swh_scheduler,
        "url": MVN_URL,
        "instance": "maven.org",
        "index_url": INDEX_URL,
        "incremental": True,
    }

    # --- First run: only the first part of the index is served. ---
    first_lister = MavenLister(**lister_config)
    requests_mock.get(INDEX_URL, content=maven_index_incr_first)

    first_stats = first_lister.run()

    assert first_lister.incremental
    assert first_lister.updated
    assert first_stats.pages == 2

    listed = swh_scheduler.get_listed_origins(first_lister.lister_obj.id).results
    listed_urls = [entry.url for entry in listed]

    # 1 git origin + 1 maven origin with 1 release (one per jar)
    assert set(listed_urls) == {ORIGIN_GIT, ORIGIN_SRC}
    assert len(listed_urls) == len(set(listed_urls))

    first_artifact = LIST_SRC_DATA[0]
    for entry in listed:
        if entry.visit_type != "maven":
            continue
        first_artifact_time = iso8601.parse_date(first_artifact["time"])
        assert first_artifact_time == entry.last_update
        assert entry.extra_loader_arguments["artifacts"] == [first_artifact]

    # --- Second run, incremental mode: the full index is now served. ---
    second_lister = MavenLister(**lister_config)

    # State left behind by the first run.
    state = second_lister.get_state_from_scheduler()
    assert state is not None
    assert state.last_seen_doc == 1
    assert state.last_seen_pom == 1

    requests_mock.get(INDEX_URL, content=maven_index_full)

    second_stats = second_lister.run()

    assert second_lister.incremental
    assert second_lister.updated
    assert second_stats.pages == 4

    listed = swh_scheduler.get_listed_origins(second_lister.lister_obj.id).results
    listed_urls = [entry.url for entry in listed]

    assert set(listed_urls) == {ORIGIN_SRC, ORIGIN_GIT, ORIGIN_GIT_INCR}
    assert len(listed_urls) == len(set(listed_urls))

    expected_artifacts = list(LIST_SRC_DATA)
    for entry in listed:
        if entry.visit_type != "maven":
            continue
        # The origin is at least as recent as each of its artifacts.
        for artifact in LIST_SRC_DATA:
            artifact_time = iso8601.parse_date(artifact["time"])
            assert artifact_time <= entry.last_update
        assert entry.extra_loader_arguments["artifacts"] == expected_artifacts

    state = second_lister.get_state_from_scheduler()
    assert state is not None
    assert state.last_seen_doc == 4
    assert state.last_seen_pom == 4
@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code):
    """Listing stops with an ``HTTPError`` when the main index URL answers with an
    error status, and no origin gets recorded."""
    maven_lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        index_url=INDEX_URL,
    )

    # The index itself answers with the error status under test.
    requests_mock.get(INDEX_URL, status_code=http_code)

    # The listing cannot continue without the index, so the run must raise.
    with pytest.raises(requests.HTTPError):
        maven_lister.run()

    listed = swh_scheduler.get_listed_origins(maven_lister.lister_obj.id).results
    assert len(listed) == 0
@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_artifacts(
    swh_scheduler,
    requests_mock,
    http_code,
):
    """Listing goes on when the retrieval of one POM answers with an error
    status; the origins not depending on that POM are still recorded."""
    # The first POM answers with the error status under test.
    requests_mock.get(URL_POM_1, status_code=http_code)

    maven_lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        index_url=INDEX_URL,
    )

    # A failure on an artifact must not abort the run.
    maven_lister.run()

    # The index was read but the first POM was not, so only the maven-jar origin
    # and one git origin are expected.
    listed = swh_scheduler.get_listed_origins(maven_lister.lister_obj.id).results
    listed_urls = [entry.url for entry in listed]

    assert set(listed_urls) == {ORIGIN_SRC, ORIGIN_GIT_INCR}
    assert len(listed_urls) == len(set(listed_urls))
def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime):
    """Listing the ``export_null_mtime.fld`` index yields one page and a single
    origin whose ``last_update`` is ``None``."""
    # Serve the null-mtime index instead of the full one.
    requests_mock.get(INDEX_URL, content=maven_index_null_mtime)

    lister_config = {
        "scheduler": swh_scheduler,
        "url": MVN_URL,
        "instance": "maven.org",
        "index_url": INDEX_URL,
        "incremental": False,
    }
    maven_lister = MavenLister(**lister_config)

    run_stats = maven_lister.run()

    assert run_stats.pages == 1

    listed = swh_scheduler.get_listed_origins(maven_lister.lister_obj.id).results
    assert len(listed) == 1
    assert listed[0].last_update is None
def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1):
    """Listing goes on when a POM file cannot be decoded."""
    # Re-encode the UTF-8 POM as UTF-32 so that its parsing is expected to fail.
    pom_text = maven_pom_1.decode("utf-8")
    requests_mock.get(URL_POM_1, content=pom_text.encode("utf-32"))

    maven_lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        index_url=INDEX_URL,
    )
    maven_lister.run()

    # The index was read but the first POM was not parsed, so only the maven-jar
    # origin and one git origin are expected.
    listed = swh_scheduler.get_listed_origins(maven_lister.lister_obj.id).results
    assert len(listed) == 2
def test_maven_list_pom_multi_byte_encoding(
    swh_scheduler, requests_mock, maven_pom_multi_byte_encoding
):
    """A POM file using a multi-byte encoding is parsed and all three origins are
    listed."""
    # Serve the multi-byte encoded POM in place of the first one.
    requests_mock.get(URL_POM_1, content=maven_pom_multi_byte_encoding)

    maven_lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        index_url=INDEX_URL,
    )
    maven_lister.run()

    listed = swh_scheduler.get_listed_origins(maven_lister.lister_obj.id).results
    assert len(listed) == 3

File Metadata

Mime Type
text/x-python
Expires
Fri, Jul 4, 1:21 PM (6 d, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3356608

Event Timeline