diff --git a/swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom b/swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom
new file mode 100644
index 0000000..28284e6
--- /dev/null
+++ b/swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>al.aldi</groupId>
+  <artifactId>sprova4j</artifactId>
+  <version>0.1.0</version>
+  <name>sprova4j</name>
+  <description>Java client for Sprova Test Management</description>
+  <url>https://github.com/aldialimucaj/sprova4j</url>
+  <inceptionYear>2018</inceptionYear>
+  <licenses>
+    <license>
+      <name>The Apache Software License, Version 2.0</name>
+      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      <distribution>repo</distribution>
+    </license>
+  </licenses>
+  <developers>
+    <developer>
+      <id>aldi</id>
+      <name>Aldi Alimucaj</name>
+      <email>aldi.alimucaj@gmail.com</email>
+    </developer>
+  </developers>
+  <scm>
+    <connection>scm:git@github.com/aldialimucaj/sprova4j.git</connection>
+    <url>git@github.com/aldialimucaj/sprova4j</url>
+  </scm>
+</project>
+
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
index 9bacd4e..18cde65 100644
--- a/swh/lister/maven/tests/test_lister.py
+++ b/swh/lister/maven/tests/test_lister.py
@@ -1,348 +1,395 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from pathlib import Path

import iso8601
import pytest
import requests

from swh.lister.maven.lister import MavenLister

MVN_URL = "https://repo1.maven.org/maven2/"  # main maven repo url
INDEX_URL = "http://indexes/export.fld"  # index directory url

URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom"
URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"

USER_REPO0 = "aldialimucaj/sprova4j"
GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}"
GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}"
ORIGIN_GIT = GIT_REPO_URL0_HTTPS

USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java"
GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}"
GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git"
GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}"
ORIGIN_GIT_INCR = GIT_REPO_URL1_HTTPS

USER_REPO2 = "webx/citrus"
GIT_REPO_URL2_HTTPS = f"https://github.com/{USER_REPO2}"
GIT_REPO_URL2_API = f"https://api.github.com/repos/{USER_REPO2}"

ORIGIN_SRC = MVN_URL + "al/aldi/sprova4j"

LIST_SRC_DATA = (
    {
        "type": "maven",
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j"
        + "/0.1.0/sprova4j-0.1.0-sources.jar",
        "time": "2021-07-12T17:06:59+00:00",
        "gid": "al.aldi",
        "aid": "sprova4j",
        "version": "0.1.0",
        "base_url": MVN_URL,
    },
    {
        "type": "maven",
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j"
        + "/0.1.1/sprova4j-0.1.1-sources.jar",
        "time": "2021-07-12T17:37:05+00:00",
        "gid": "al.aldi",
        "aid": "sprova4j",
        "version": "0.1.1",
        "base_url": MVN_URL,
    },
)


@pytest.fixture
def maven_index_full(datadir) -> bytes:
    return Path(datadir, "http_indexes", "export_full.fld").read_bytes()


@pytest.fixture
def maven_index_incr_first(datadir) -> bytes:
    return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes()


@pytest.fixture
def maven_index_null_mtime(datadir) -> bytes:
    return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes()


@pytest.fixture(autouse=True)
def network_requests_mock(requests_mock, requests_mock_datadir, maven_index_full):
    requests_mock.get(INDEX_URL, content=maven_index_full)


@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
    mocker.patch.object(MavenLister.http_request.retry, "sleep")


def test_maven_full_listing(swh_scheduler):
    """Covers full listing of multiple pages, checking page results and
    listed origins, as well as statelessness."""

    # Run the lister.
    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )

    stats = lister.run()

    # Start test checks.
    assert stats.pages == 5

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    origin_urls = [origin.url for origin in scheduler_origins]

    # 2 git origins + 1 maven origin with 2 releases (one per jar)
    assert set(origin_urls) == {ORIGIN_GIT, ORIGIN_GIT_INCR, ORIGIN_SRC}
    assert len(set(origin_urls)) == len(origin_urls)

    for origin in scheduler_origins:
        if origin.visit_type == "maven":
            for src in LIST_SRC_DATA:
                last_update_src = iso8601.parse_date(src["time"])
                assert last_update_src <= origin.last_update
            assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)

    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state is not None
    assert scheduler_state.last_seen_doc == -1
    assert scheduler_state.last_seen_pom == -1


def test_maven_full_listing_malformed(
    swh_scheduler,
    requests_mock,
    datadir,
):
    """Covers full listing of multiple pages, checking page results with a
    malformed scm entry in the pom."""

    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )

    # Set up test.
    requests_mock.get(
        URL_POM_1, content=Path(datadir, "sprova4j-0.1.0.malformed.pom").read_bytes()
    )

    # Then run the lister.
    stats = lister.run()

    # Start test checks.
    assert stats.pages == 5

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    origin_urls = [origin.url for origin in scheduler_origins]

    # 2 git origins + 1 maven origin with 2 releases (one per jar)
    assert set(origin_urls) == {ORIGIN_GIT, ORIGIN_GIT_INCR, ORIGIN_SRC}
    assert len(origin_urls) == len(set(origin_urls))

    for origin in scheduler_origins:
        if origin.visit_type == "maven":
            for src in LIST_SRC_DATA:
                last_update_src = iso8601.parse_date(src["time"])
                assert last_update_src <= origin.last_update
            assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)

    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state is not None
    assert scheduler_state.last_seen_doc == -1
    assert scheduler_state.last_seen_pom == -1


+def test_maven_ignore_invalid_url(
+    swh_scheduler,
+    requests_mock,
+    datadir,
+):
+    """Covers full listing of multiple pages, checking page results with an
+    invalid URL in the pom's scm entry."""
+
+    lister = MavenLister(
+        scheduler=swh_scheduler,
+        url=MVN_URL,
+        instance="maven.org",
+        index_url=INDEX_URL,
+        incremental=False,
+    )
+
+    # Set up test.
+    requests_mock.get(
+        URL_POM_1, content=Path(datadir, "sprova4j-0.1.0.invalidurl.pom").read_bytes()
+    )
+
+    # Then run the lister.
+    stats = lister.run()
+
+    # Start test checks.
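+    # All 5 index pages are still processed: the invalid scm URL in the pom
+    # only causes the corresponding git origin to be dropped, not the listing
+    # to abort.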
+    assert stats.pages == 5
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    origin_urls = [origin.url for origin in scheduler_origins]
+
+    # 1 git origin (the other one is ignored) + 1 maven origin with 2 releases
+    # (one per jar)
+    assert set(origin_urls) == {ORIGIN_GIT_INCR, ORIGIN_SRC}
+    assert len(origin_urls) == len(set(origin_urls))
+
+    for origin in scheduler_origins:
+        if origin.visit_type == "maven":
+            for src in LIST_SRC_DATA:
+                last_update_src = iso8601.parse_date(src["time"])
+                assert last_update_src <= origin.last_update
+            assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)
+
+    scheduler_state = lister.get_state_from_scheduler()
+    assert scheduler_state is not None
+    assert scheduler_state.last_seen_doc == -1
+    assert scheduler_state.last_seen_pom == -1
+
+
def test_maven_incremental_listing(
    swh_scheduler,
    requests_mock,
    maven_index_full,
    maven_index_incr_first,
):
    """Covers full listing of multiple pages, checking page results and
    listed origins, with a second updated run for statefulness."""

    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=True,
    )

    # Set up test.
    requests_mock.get(INDEX_URL, content=maven_index_incr_first)

    # Then run the lister.
    stats = lister.run()

    # Start test checks.
    assert lister.incremental
    assert lister.updated
    assert stats.pages == 2

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    origin_urls = [origin.url for origin in scheduler_origins]

    # 1 git origin + 1 maven origin with 1 release (one per jar)
    assert set(origin_urls) == {ORIGIN_GIT, ORIGIN_SRC}
    assert len(origin_urls) == len(set(origin_urls))

    for origin in scheduler_origins:
        if origin.visit_type == "maven":
            last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"])
            assert last_update_src == origin.last_update
            assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]]

    # Second execution of the lister, incremental mode
    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=True,
    )

    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state is not None
    assert scheduler_state.last_seen_doc == 1
    assert scheduler_state.last_seen_pom == 1

    # Set up test.
    requests_mock.get(INDEX_URL, content=maven_index_full)

    # Then run the lister.
    stats = lister.run()

    # Start test checks.
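    # The second run resumes from the state stored by the first one
    # (last_seen_doc == 1), so only the remaining index documents are processed.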
    assert lister.incremental
    assert lister.updated
    assert stats.pages == 4

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    origin_urls = [origin.url for origin in scheduler_origins]

    assert set(origin_urls) == {ORIGIN_SRC, ORIGIN_GIT, ORIGIN_GIT_INCR}
    assert len(origin_urls) == len(set(origin_urls))

    for origin in scheduler_origins:
        if origin.visit_type == "maven":
            for src in LIST_SRC_DATA:
                last_update_src = iso8601.parse_date(src["time"])
                assert last_update_src <= origin.last_update
            assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)

    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state is not None
    assert scheduler_state.last_seen_doc == 4
    assert scheduler_state.last_seen_pom == 4


@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code):
    """should stop listing if the lister fails to retrieve the main index url."""
    lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)
    requests_mock.get(INDEX_URL, status_code=http_code)

    with pytest.raises(requests.HTTPError):
        # listing cannot continue, so it stops
        lister.run()

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == 0


@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_artifacts(
    swh_scheduler,
    requests_mock,
    http_code,
):
    """should continue listing when failing to retrieve artifacts."""
    # Test failure of artifact retrieval.
    requests_mock.get(URL_POM_1, status_code=http_code)

    lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)

    # retrieving this artifact raises, but the listing continues
    lister.run()

    # If the maven_index_full step succeeded but not the get_pom step,
    # then we get only one maven-jar origin and one git origin.
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    origin_urls = [origin.url for origin in scheduler_origins]
    assert set(origin_urls) == {ORIGIN_SRC, ORIGIN_GIT_INCR}
    assert len(origin_urls) == len(set(origin_urls))


def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime):
    requests_mock.get(INDEX_URL, content=maven_index_null_mtime)

    # Run the lister.
    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )

    stats = lister.run()

    # Start test checks.
    assert stats.pages == 1
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == 1
    assert scheduler_origins[0].last_update is None


def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock):
    """should continue listing when failing to decode pom file."""
    # Test failure of pom parsing by re-encoding a UTF-8 pom file into an
    # unexpected encoding
    requests_mock.get(
        URL_POM_1,
        content=requests.get(URL_POM_1).content.decode("utf-8").encode("utf-32"),
    )

    lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)

    lister.run()

    # If the maven_index_full step succeeded but not the pom parsing step,
    # then we get only one maven-jar origin and one git origin.
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 def test_maven_list_pom_multi_byte_encoding(swh_scheduler, requests_mock, datadir): """should parse POM file with multi-byte encoding.""" # replace pom file with a multi-byte encoding one requests_mock.get( URL_POM_1, content=Path(datadir, "citrus-parent-3.0.7.pom").read_bytes() ) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 3 diff --git a/swh/lister/opam/tests/test_lister.py b/swh/lister/opam/tests/test_lister.py index 26dc753..26526ba 100644 --- a/swh/lister/opam/tests/test_lister.py +++ b/swh/lister/opam/tests/test_lister.py @@ -1,170 +1,188 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import os from tempfile import mkdtemp from unittest.mock import MagicMock import pytest from swh.lister.opam.lister import OpamLister, opam_init module_name = "swh.lister.opam.lister" @pytest.fixture def mock_opam(mocker): """Fixture to bypass the actual opam calls within the test context.""" # inhibits the real `subprocess.call` which prepares the required internal opam # state mock_init = mocker.patch(f"{module_name}.call", return_value=None) # replaces the real Popen with a fake one (list origins command) mocked_popen = MagicMock() mocked_popen.stdout = io.BytesIO(b"bar\nbaz\nfoo\n") mock_open = mocker.patch(f"{module_name}.Popen", return_value=mocked_popen) return mock_init, mock_open def test_mock_init_repository_init(mock_opam, tmp_path, datadir): """Initializing opam root directory with an instance should be ok""" mock_init, mock_popen = mock_opam instance = "fake" instance_url = f"file://{datadir}/{instance}" opam_root = str(tmp_path / "test-opam") assert not os.path.exists(opam_root) # This will initialize an opam directory with the instance opam_init(opam_root, instance, instance_url, {}) assert mock_init.called def test_mock_init_repository_update(mock_opam, tmp_path, datadir): """Updating opam root directory with another instance should be ok""" mock_init, mock_popen = mock_opam instance = "fake_opam_repo" - instance_url = f"file://{datadir}/{instance}" + instance_url = f"http://example.org/{instance}" opam_root = str(tmp_path / "test-opam") os.makedirs(opam_root, exist_ok=True) with open(os.path.join(opam_root, "opam"), "w") as f: f.write("one file to avoid empty folder") assert os.path.exists(opam_root) assert os.listdir(opam_root) == ["opam"] # not empty # This will update the repository opam with another instance opam_init(opam_root, instance, instance_url, {}) assert mock_init.called def test_lister_opam_optional_instance(swh_scheduler): """Instance name should be optional and default to be built out of the netloc.""" netloc = "opam.ocaml.org" instance_url = f"https://{netloc}" lister = OpamLister( swh_scheduler, url=instance_url, ) assert lister.instance == netloc assert lister.opam_root == "/tmp/opam/" def test_urls(swh_scheduler, mock_opam, tmp_path): mock_init, mock_popen = mock_opam instance_url = "https://opam.ocaml.org" tmp_folder = mkdtemp(dir=tmp_path, prefix="swh_opam_lister") lister = OpamLister( swh_scheduler, url=instance_url, instance="opam", opam_root=tmp_folder, ) assert lister.instance 
== "opam" assert lister.opam_root == tmp_folder # call the lister and get all listed origins urls stats = lister.run() assert mock_init.called assert mock_popen.called assert stats.pages == 3 assert stats.origins == 3 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/bar/", f"opam+{instance_url}/packages/baz/", f"opam+{instance_url}/packages/foo/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls -def test_opam_binary(datadir, swh_scheduler, tmp_path): - instance_url = f"file://{datadir}/fake_opam_repo" +def test_opam_binary(datadir, swh_scheduler, tmp_path, mocker): + from swh.lister.opam.lister import opam_init + + instance_url = "http://example.org/fake_opam_repo" + + def mock_opam_init(opam_root, instance, url, env): + assert url == instance_url + return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env) + + # Patch opam_init to use the local directory + mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init) lister = OpamLister( swh_scheduler, url=instance_url, instance="fake", opam_root=mkdtemp(dir=tmp_path, prefix="swh_opam_lister"), ) stats = lister.run() assert stats.pages == 4 assert stats.origins == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/agrid/", f"opam+{instance_url}/packages/calculon/", f"opam+{instance_url}/packages/directories/", f"opam+{instance_url}/packages/ocb/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls -def test_opam_multi_instance(datadir, swh_scheduler, tmp_path): - instance_url = f"file://{datadir}/fake_opam_repo" +def test_opam_multi_instance(datadir, swh_scheduler, tmp_path, mocker): + from swh.lister.opam.lister import opam_init + + instance_url = "http://example.org/fake_opam_repo" + + def mock_opam_init(opam_root, instance, url, env): + assert url == instance_url + return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env) + + # Patch opam_init to use the local directory + mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init) lister = OpamLister( swh_scheduler, url=instance_url, instance="fake", opam_root=mkdtemp(dir=tmp_path, prefix="swh_opam_lister"), ) stats = lister.run() assert stats.pages == 4 assert stats.origins == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/agrid/", f"opam+{instance_url}/packages/calculon/", f"opam+{instance_url}/packages/directories/", f"opam+{instance_url}/packages/ocb/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls diff --git a/swh/lister/packagist/tests/data/payrix_payrix-php.json b/swh/lister/packagist/tests/data/payrix_payrix-php.json new file mode 100644 index 0000000..43a6c77 --- /dev/null +++ b/swh/lister/packagist/tests/data/payrix_payrix-php.json @@ -0,0 +1,151 @@ +{ + "packages": { + "payrix/payrix-php": { + "dev-master": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "git@gitlab.com:payrix/public/payrix-php.git", + "type": "git", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "dist": { + 
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703", + "type": "zip", + "shasum": "", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "type": "library", + "time": "2021-05-25T14:12:28+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "default-branch": true, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 4416889 + }, + "v2.0.0": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.0", + "version_normalized": "2.0.0.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68", + "type": "zip", + "shasum": "", + "reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68" + }, + "type": "library", + "time": "2020-09-03T11:26:52+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 4416947 + }, + "v2.0.1": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.1", + "version_normalized": "2.0.1.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "9693f2dff0a589e16c88a9bf838069ab89166103" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=9693f2dff0a589e16c88a9bf838069ab89166103", + "type": "zip", + "shasum": "", + "reference": "9693f2dff0a589e16c88a9bf838069ab89166103" + }, + "type": "library", + "time": "2021-05-10T02:32:57+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 5183918 + }, + "v2.0.2": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.2", + "version_normalized": "2.0.2.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703", + "type": "zip", + "shasum": "", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "type": "library", + "time": "2021-05-25T10:12:28+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 5232658 + } + } + } +} diff --git a/swh/lister/packagist/tests/data/with_invalid_url.json b/swh/lister/packagist/tests/data/with_invalid_url.json new file mode 100644 index 0000000..4b281ea --- /dev/null +++ b/swh/lister/packagist/tests/data/with_invalid_url.json @@ -0,0 +1,24 @@ +{ + "packages": { + "ycms/module-main": { + "dev-master": { + "name": "with/invalid_url", + "description": "", + 
"keywords": [], + "homepage": "", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [], + "authors": [], + "source": { + "type": "git", + "url": "git@example.org/invalid/url.git", + "reference": "0000000000000000000000000000000000000000" + }, + "time": "2015-08-23T04:42:33+00:00", + "default-branch": true, + "uid": 4064797 + } + } + } +} diff --git a/swh/lister/packagist/tests/test_lister.py b/swh/lister/packagist/tests/test_lister.py index e2782ee..4f512a2 100644 --- a/swh/lister/packagist/tests/test_lister.py +++ b/swh/lister/packagist/tests/test_lister.py @@ -1,199 +1,201 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json from pathlib import Path from swh.lister.packagist.lister import PackagistLister _packages_list = { "packageNames": [ "ljjackson/linnworks", "lky/wx_article", "spryker-eco/computop-api", - "idevlab/essential", + "idevlab/essential", # Git SSH URL + "payrix/payrix-php", + "with/invalid_url", # invalid URL ] } def _package_metadata(datadir, package_name): return json.loads( Path(datadir, f"{package_name.replace('/', '_')}.json").read_text() ) def _request_without_if_modified_since(request): return request.headers.get("If-Modified-Since") is None def _request_with_if_modified_since(request): return request.headers.get("If-Modified-Since") is not None def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_datadir): # first listing, should return one origin per package lister = PackagistLister(scheduler=swh_scheduler) requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list) packages_metadata = {} for package_name in _packages_list["packageNames"]: metadata = _package_metadata(datadir, package_name) packages_metadata[package_name] = metadata requests_mock.get( f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", json=metadata, additional_matcher=_request_without_if_modified_since, ) stats = lister.run() assert stats.pages == 1 - assert stats.origins == len(_packages_list["packageNames"]) + assert stats.origins == len(_packages_list["packageNames"]) - 2 assert lister.updated expected_origins = { ( "https://github.com/gitlky/wx_article", # standard case "git", datetime.datetime.fromisoformat("2018-08-30T07:37:09+00:00"), ), ( "https://github.com/ljjackson/linnworks.git", # API goes 404 "git", datetime.datetime.fromisoformat("2018-11-01T21:45:50+00:00"), ), ( "https://github.com/spryker-eco/computop-api", # SSH URL in manifest "git", datetime.datetime.fromisoformat("2020-06-22T15:50:29+00:00"), ), ( - "git@gitlab.com:idevlab/Essential.git", # not GitHub + "https://gitlab.com/payrix/public/payrix-php.git", # not GitHub "git", - datetime.datetime.fromisoformat("2022-10-12T10:34:29+00:00"), + datetime.datetime.fromisoformat("2021-05-25T14:12:28+00:00"), ), } assert expected_origins == { (o.url, o.visit_type, o.last_update) for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results } # second listing, should return 0 origins as no package metadata # has been updated since first listing lister = PackagistLister(scheduler=swh_scheduler) for package_name in _packages_list["packageNames"]: requests_mock.get( f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", additional_matcher=_request_with_if_modified_since, status_code=304, ) assert 
lister.get_state_from_scheduler().last_listing_date is not None stats = lister.run() assert stats.pages == 1 assert stats.origins == 0 assert lister.updated assert expected_origins == { (o.url, o.visit_type, o.last_update) for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results } def test_packagist_lister_missing_metadata(swh_scheduler, requests_mock, datadir): lister = PackagistLister(scheduler=swh_scheduler) requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list) for package_name in _packages_list["packageNames"]: requests_mock.get( f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", additional_matcher=_request_without_if_modified_since, status_code=404, ) stats = lister.run() assert stats.pages == 1 assert stats.origins == 0 def test_packagist_lister_empty_metadata(swh_scheduler, requests_mock, datadir): lister = PackagistLister(scheduler=swh_scheduler) requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list) for package_name in _packages_list["packageNames"]: requests_mock.get( f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", additional_matcher=_request_without_if_modified_since, json={"packages": {}}, ) stats = lister.run() assert stats.pages == 1 assert stats.origins == 0 def test_packagist_lister_package_with_bitbucket_hg_origin( swh_scheduler, requests_mock, datadir ): package_name = "den1n/contextmenu" lister = PackagistLister(scheduler=swh_scheduler) requests_mock.get( lister.PACKAGIST_PACKAGES_LIST_URL, json={"packageNames": [package_name]} ) requests_mock.get( f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", additional_matcher=_request_without_if_modified_since, json=_package_metadata(datadir, package_name), ) stats = lister.run() assert stats.pages == 1 assert stats.origins == 0 def test_packagist_lister_package_normalize_github_origin( swh_scheduler, requests_mock, datadir, requests_mock_datadir ): package_name = "ycms/module-main" lister = PackagistLister(scheduler=swh_scheduler) requests_mock.get( lister.PACKAGIST_PACKAGES_LIST_URL, json={"packageNames": [package_name]} ) requests_mock.get( f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", additional_matcher=_request_without_if_modified_since, json=_package_metadata(datadir, package_name), ) stats = lister.run() assert stats.pages == 1 assert stats.origins == 1 expected_origins = { ( "https://github.com/GameCHN/module-main", "git", datetime.datetime.fromisoformat("2015-08-23T04:42:33+00:00"), ), } assert expected_origins == { (o.url, o.visit_type, o.last_update) for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results } def test_lister_from_configfile(swh_scheduler_config, mocker): load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config}, "credentials": {}, } lister = PackagistLister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 5b3a33d..8a1b497 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -1,332 +1,339 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from dataclasses import dataclass import logging from typing import Any, Dict, Generic, Iterable, Iterator, List, 
Optional, Set, TypeVar from urllib.parse import urlparse import requests from tenacity.before_sleep import before_sleep_log from swh.core.config import load_from_envvar from swh.core.github.utils import GitHubSession from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface from . import USER_AGENT_TEMPLATE -from .utils import http_retry +from .utils import http_retry, is_valid_origin_url logger = logging.getLogger(__name__) @dataclass class ListerStats: pages: int = 0 origins: int = 0 def __add__(self, other: ListerStats) -> ListerStats: return self.__class__(self.pages + other.pages, self.origins + other.origins) def __iadd__(self, other: ListerStats): self.pages += other.pages self.origins += other.origins def dict(self) -> Dict[str, int]: return {"pages": self.pages, "origins": self.origins} StateType = TypeVar("StateType") PageType = TypeVar("PageType") BackendStateType = Dict[str, Any] CredentialsType = Optional[Dict[str, Dict[str, List[Dict[str, str]]]]] class Lister(Generic[StateType, PageType]): """The base class for a Software Heritage lister. A lister scrapes a page by page list of origins from an upstream (a forge, the API of a package manager, ...), and massages the results of that scrape into a list of origins that are recorded by the scheduler backend. The main loop of the lister, :meth:`run`, basically revolves around the :meth:`get_pages` iterator, which sets up the lister state, then yields the scrape results page by page. The :meth:`get_origins_from_page` method converts the pages into a list of :class:`model.ListedOrigin`, sent to the scheduler at every page. The :meth:`commit_page` method can be used to update the lister state after a page of origins has been recorded in the scheduler backend. The :func:`finalize` method is called at lister teardown (whether the run has been successful or not) to update the local :attr:`state` object before it's sent to the database. This method must set the :attr:`updated` attribute if an updated state needs to be sent to the scheduler backend. This method can call :func:`get_state_from_scheduler` to refresh and merge the lister state from the scheduler before it's finalized (and potentially minimize the risk of race conditions between concurrent runs of the lister). The state of the lister is serialized and deserialized from the dict stored in the scheduler backend, using the :meth:`state_from_dict` and :meth:`state_to_dict` methods. Args: scheduler: the instance of the Scheduler being used to register the origins listed by this lister url: a URL representing this lister, e.g. the API's base URL instance: the instance name, to uniquely identify this lister instance, if not provided the URL network location will be used credentials: dictionary of credentials for all listers. The first level identifies the :attr:`LISTER_NAME`, the second level the lister :attr:`instance`. The final level is a list of dicts containing the expected credentials for the given instance of that lister. 
    Generic types:

    - *StateType*: concrete lister type; should usually be a :class:`dataclass` for
      stricter typing
    - *PageType*: type of scrape results; can usually be a
      :class:`requests.Response`, or a :class:`dict`

    """

    LISTER_NAME: str = ""
    github_session: Optional[GitHubSession] = None

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        with_github_session: bool = False,
    ):
        if not self.LISTER_NAME:
            raise ValueError("Must set the LISTER_NAME attribute on Lister classes")
        self.url = url
        if instance is not None:
            self.instance = instance
        else:
            self.instance = urlparse(url).netloc
        self.scheduler = scheduler

        if not credentials:
            credentials = {}
        self.credentials = list(
            credentials.get(self.LISTER_NAME, {}).get(self.instance, [])
        )

        # store the initial state of the lister
        self.state = self.get_state_from_scheduler()
        self.updated = False

        self.session = requests.Session()
        # Declaring the USER_AGENT is more sysadmin-friendly for the forges we list
        self.session.headers.update(
            {"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME}
        )

        self.github_session: Optional[GitHubSession] = (
            GitHubSession(
                credentials=credentials.get("github", {}).get("github", []),
                user_agent=str(self.session.headers["User-Agent"]),
            )
            if with_github_session
            else None
        )

        self.recorded_origins: Set[str] = set()

    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
        logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))

        response = self.session.request(method, url, **kwargs)
        if response.status_code not in (200, 304):
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def run(self) -> ListerStats:
        """Run the lister.

        Returns:
          A counter with the number of pages and origins seen for this run
          of the lister.

        """
        full_stats = ListerStats()
        self.recorded_origins = set()

        try:
            for page in self.get_pages():
                full_stats.pages += 1
                origins = self.get_origins_from_page(page)
                sent_origins = self.send_origins(origins)
                self.recorded_origins.update(sent_origins)
                full_stats.origins = len(self.recorded_origins)
                self.commit_page(page)
        finally:
            self.finalize()
            if self.updated:
                self.set_state_in_scheduler()

        return full_stats

    def get_state_from_scheduler(self) -> StateType:
        """Update the state in the current instance from the state in the scheduler
        backend.

        This updates :attr:`lister_obj`, and returns its (deserialized) current state,
        to allow for comparison with the local state.

        Returns:
          the state retrieved from the scheduler backend
        """
        self.lister_obj = self.scheduler.get_or_create_lister(
            name=self.LISTER_NAME, instance_name=self.instance
        )
        return self.state_from_dict(self.lister_obj.current_state)

    def set_state_in_scheduler(self) -> None:
        """Update the state in the scheduler backend from the state of the current
        instance.

        Raises:
          swh.scheduler.exc.StaleData: in case of a race condition between
            concurrent listers (from :meth:`swh.scheduler.Scheduler.update_lister`).
""" self.lister_obj.current_state = self.state_to_dict(self.state) self.lister_obj = self.scheduler.update_lister(self.lister_obj) # State management to/from the scheduler def state_from_dict(self, d: BackendStateType) -> StateType: """Convert the state stored in the scheduler backend (as a dict), to the concrete StateType for this lister.""" raise NotImplementedError def state_to_dict(self, state: StateType) -> BackendStateType: """Convert the StateType for this lister to its serialization as dict for storage in the scheduler. Values must be JSON-compatible as that's what the backend database expects. """ raise NotImplementedError def finalize(self) -> None: """Custom hook to finalize the lister state before returning from the main loop. This method must set :attr:`updated` if the lister has done some work. If relevant, this method can use :meth`get_state_from_scheduler` to merge the current lister state with the one from the scheduler backend, reducing the risk of race conditions if we're running concurrent listings. This method is called in a `finally` block, which means it will also run when the lister fails. """ pass # Actual listing logic def get_pages(self) -> Iterator[PageType]: """Retrieve a list of pages of listed results. This is the main loop of the lister. Returns: an iterator of raw pages fetched from the platform currently being listed. """ raise NotImplementedError def get_origins_from_page(self, page: PageType) -> Iterator[model.ListedOrigin]: """Extract a list of :class:`model.ListedOrigin` from a raw page of results. Args: page: a single page of results Returns: an iterator for the origins present on the given page of results """ raise NotImplementedError def commit_page(self, page: PageType) -> None: """Custom hook called after the current page has been committed in the scheduler backend. This method can be used to update the state after a page of origins has been successfully recorded in the scheduler backend. If the new state should be recorded at the point the lister completes, the :attr:`updated` attribute must be set. """ pass def send_origins(self, origins: Iterable[model.ListedOrigin]) -> List[str]: """Record a list of :class:`model.ListedOrigin` in the scheduler. Returns: the list of origin URLs recorded in scheduler database """ + valid_origins = [] + for origin in origins: + if is_valid_origin_url(origin.url): + valid_origins.append(origin) + else: + logger.warning("Skipping invalid origin: %s", origin.url) + recorded_origins = [] - for batch_origins in grouper(origins, n=1000): + for batch_origins in grouper(valid_origins, n=1000): ret = self.scheduler.record_listed_origins(batch_origins) recorded_origins += [origin.url for origin in ret] return recorded_origins @classmethod def from_config(cls, scheduler: Dict[str, Any], **config: Any): """Instantiate a lister from a configuration dict. This is basically a backwards-compatibility shim for the CLI. Args: scheduler: instantiation config for the scheduler config: the configuration dict for the lister, with the following keys: - credentials (optional): credentials list for the scheduler - any other kwargs passed to the lister. Returns: the instantiated lister """ # Drop the legacy config keys which aren't used for this generation of listers. 
for legacy_key in ("storage", "lister", "celery"): config.pop(legacy_key, None) # Instantiate the scheduler scheduler_instance = get_scheduler(**scheduler) return cls(scheduler=scheduler_instance, **config) @classmethod def from_configfile(cls, **kwargs: Any): """Instantiate a lister from the configuration loaded from the SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their value is not None. Args: kwargs: kwargs passed to the lister instantiation """ config = dict(load_from_envvar()) config.update({k: v for k, v in kwargs.items() if v is not None}) return cls.from_config(**config) class StatelessLister(Lister[None, PageType], Generic[PageType]): def state_from_dict(self, d: BackendStateType) -> None: """Always return empty state""" return None def state_to_dict(self, state: None) -> BackendStateType: """Always set empty state""" return {} diff --git a/swh/lister/utils.py b/swh/lister/utils.py index 125b31b..3220d4d 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,113 +1,161 @@ # Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Callable, Iterator, Tuple +from typing import Callable, Iterator, Optional, Tuple +import urllib.parse from requests.exceptions import ConnectionError, HTTPError from requests.status_codes import codes from tenacity import retry as tenacity_retry from tenacity.stop import stop_after_attempt from tenacity.wait import wait_exponential def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]: """Split `total_pages` into mostly `nb_pages` ranges. In some cases, the last range can have one more element. >>> list(split_range(19, 10)) [(0, 9), (10, 19)] >>> list(split_range(20, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)] >>> list(split_range(21, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21)] """ prev_index = None for index in range(0, total_pages, nb_pages): if index is not None and prev_index is not None: yield prev_index, index - 1 prev_index = index if index != total_pages: yield index, total_pages def is_throttling_exception(e: Exception) -> bool: """ Checks if an exception is a requests.exception.HTTPError for a response with status code 429 (too many requests). """ return ( isinstance(e, HTTPError) and e.response.status_code == codes.too_many_requests ) def is_retryable_exception(e: Exception) -> bool: """ Checks if an exception is worth retrying (connection, throttling or a server error). """ is_connection_error = isinstance(e, ConnectionError) is_500_error = isinstance(e, HTTPError) and e.response.status_code >= 500 return is_connection_error or is_throttling_exception(e) or is_500_error def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> bool: """ Custom tenacity retry predicate for handling exceptions with the given predicate. """ attempt = retry_state.outcome if attempt.failed: exception = attempt.exception() return predicate(exception) return False def retry_policy_generic(retry_state) -> bool: """ Custom tenacity retry predicate for handling failed requests: - ConnectionError - Server errors (status >= 500) - Throttling errors (status == 429) This does not handle 404, 403 or other status codes. 
""" return retry_if_exception(retry_state, is_retryable_exception) WAIT_EXP_BASE = 10 MAX_NUMBER_ATTEMPTS = 5 def http_retry( retry=retry_policy_generic, wait=wait_exponential(exp_base=WAIT_EXP_BASE), stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS), **retry_args, ): """ Decorator based on `tenacity` for retrying a function possibly raising requests.exception.HTTPError for status code 429 (too many requests). It provides a default configuration that should work properly in most cases but all `tenacity.retry` parameters can also be overridden in client code. When the mmaximum of attempts is reached, the HTTPError exception will then be reraised. Args: retry: function defining request retry condition (default to 429 status code) https://tenacity.readthedocs.io/en/latest/#whether-to-retry wait: function defining wait strategy before retrying (default to exponential backoff) https://tenacity.readthedocs.io/en/latest/#waiting-before-retrying stop: function defining when to stop retrying (default after 5 attempts) https://tenacity.readthedocs.io/en/latest/#stopping """ return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args) + + +def is_valid_origin_url(url: Optional[str]) -> bool: + """Returns whether the given string is a valid origin URL. + This excludes Git SSH URLs and pseudo-URLs (eg. ``ssh://git@example.org:foo`` + and ``git@example.org:foo``), as they are not supported by the Git loader + and usually require authentication. + + All HTTP URLs are allowed: + + >>> is_valid_origin_url("http://example.org/repo.git") + True + >>> is_valid_origin_url("http://example.org/repo") + True + >>> is_valid_origin_url("https://example.org/repo") + True + >>> is_valid_origin_url("https://foo:bar@example.org/repo") + True + + Scheme-less URLs are rejected; + + >>> is_valid_origin_url("example.org/repo") + False + >>> is_valid_origin_url("example.org:repo") + False + + Git SSH URLs and pseudo-URLs are rejected: + + >>> is_valid_origin_url("git@example.org:repo") + False + >>> is_valid_origin_url("ssh://git@example.org:repo") + False + """ + if not url: + # Empty or None + return False + + parsed = urllib.parse.urlparse(url) + if not parsed.netloc: + # Is parsed as a relative URL + return False + + if parsed.scheme == "ssh": + # Git SSH URL + return False + + return True