diff --git a/swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom b/swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom new file mode 100644 --- /dev/null +++ b/swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom @@ -0,0 +1,30 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:git@github.com/aldialimucaj/sprova4j.git + git@github.com/aldialimucaj/sprova4j + + + diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -170,6 +170,53 @@ assert scheduler_state.last_seen_pom == -1 +def test_maven_ignore_invalid_url( + swh_scheduler, + requests_mock, + datadir, +): + """Covers full listing of multiple pages, checking page results with a malformed + scm entry in pom.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=False, + ) + + # Set up test. + requests_mock.get( + URL_POM_1, content=Path(datadir, "sprova4j-0.1.0.invalidurl.pom").read_bytes() + ) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
+ assert stats.pages == 5 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + origin_urls = [origin.url for origin in scheduler_origins] + + # 1 git origins (the other ignored) + 1 maven origin with 2 releases (one per jar) + assert set(origin_urls) == {ORIGIN_GIT_INCR, ORIGIN_SRC} + assert len(origin_urls) == len(set(origin_urls)) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + last_update_src = iso8601.parse_date(src["time"]) + assert last_update_src <= origin.last_update + assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) + + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == -1 + assert scheduler_state.last_seen_pom == -1 + + def test_maven_incremental_listing( swh_scheduler, requests_mock, diff --git a/swh/lister/opam/tests/test_lister.py b/swh/lister/opam/tests/test_lister.py --- a/swh/lister/opam/tests/test_lister.py +++ b/swh/lister/opam/tests/test_lister.py @@ -48,7 +48,7 @@ mock_init, mock_popen = mock_opam instance = "fake_opam_repo" - instance_url = f"file://{datadir}/{instance}" + instance_url = f"http://example.org/{instance}" opam_root = str(tmp_path / "test-opam") os.makedirs(opam_root, exist_ok=True) @@ -112,8 +112,17 @@ assert expected_urls == result_urls -def test_opam_binary(datadir, swh_scheduler, tmp_path): - instance_url = f"file://{datadir}/fake_opam_repo" +def test_opam_binary(datadir, swh_scheduler, tmp_path, mocker): + from swh.lister.opam.lister import opam_init + + instance_url = "http://example.org/fake_opam_repo" + + def mock_opam_init(opam_root, instance, url, env): + assert url == instance_url + return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env) + + # Patch opam_init to use the local directory + mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init) lister = OpamLister( swh_scheduler, @@ -141,8 
+150,17 @@ assert expected_urls == result_urls -def test_opam_multi_instance(datadir, swh_scheduler, tmp_path): - instance_url = f"file://{datadir}/fake_opam_repo" +def test_opam_multi_instance(datadir, swh_scheduler, tmp_path, mocker): + from swh.lister.opam.lister import opam_init + + instance_url = "http://example.org/fake_opam_repo" + + def mock_opam_init(opam_root, instance, url, env): + assert url == instance_url + return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env) + + # Patch opam_init to use the local directory + mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init) lister = OpamLister( swh_scheduler, diff --git a/swh/lister/packagist/tests/data/payrix_payrix-php.json b/swh/lister/packagist/tests/data/payrix_payrix-php.json new file mode 100644 --- /dev/null +++ b/swh/lister/packagist/tests/data/payrix_payrix-php.json @@ -0,0 +1,151 @@ +{ + "packages": { + "payrix/payrix-php": { + "dev-master": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "git@gitlab.com:payrix/public/payrix-php.git", + "type": "git", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703", + "type": "zip", + "shasum": "", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "type": "library", + "time": "2021-05-25T14:12:28+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "default-branch": true, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 4416889 + }, + "v2.0.0": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": 
"https://portal.payrix.com", + "version": "v2.0.0", + "version_normalized": "2.0.0.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68", + "type": "zip", + "shasum": "", + "reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68" + }, + "type": "library", + "time": "2020-09-03T11:26:52+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 4416947 + }, + "v2.0.1": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.1", + "version_normalized": "2.0.1.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "9693f2dff0a589e16c88a9bf838069ab89166103" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=9693f2dff0a589e16c88a9bf838069ab89166103", + "type": "zip", + "shasum": "", + "reference": "9693f2dff0a589e16c88a9bf838069ab89166103" + }, + "type": "library", + "time": "2021-05-10T02:32:57+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 5183918 + }, + "v2.0.2": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.2", + "version_normalized": "2.0.2.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + 
"type": "git", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703", + "type": "zip", + "shasum": "", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "type": "library", + "time": "2021-05-25T10:12:28+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 5232658 + } + } + } +} diff --git a/swh/lister/packagist/tests/data/with_invalid_url.json b/swh/lister/packagist/tests/data/with_invalid_url.json new file mode 100644 --- /dev/null +++ b/swh/lister/packagist/tests/data/with_invalid_url.json @@ -0,0 +1,24 @@ +{ + "packages": { + "ycms/module-main": { + "dev-master": { + "name": "with/invalid_url", + "description": "", + "keywords": [], + "homepage": "", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [], + "authors": [], + "source": { + "type": "git", + "url": "git@example.org/invalid/url.git", + "reference": "0000000000000000000000000000000000000000" + }, + "time": "2015-08-23T04:42:33+00:00", + "default-branch": true, + "uid": 4064797 + } + } + } +} diff --git a/swh/lister/packagist/tests/test_lister.py b/swh/lister/packagist/tests/test_lister.py --- a/swh/lister/packagist/tests/test_lister.py +++ b/swh/lister/packagist/tests/test_lister.py @@ -14,7 +14,9 @@ "ljjackson/linnworks", "lky/wx_article", "spryker-eco/computop-api", - "idevlab/essential", + "idevlab/essential", # Git SSH URL + "payrix/payrix-php", + "with/invalid_url", # invalid URL ] } @@ -49,7 +51,7 @@ stats = lister.run() assert stats.pages == 1 - assert stats.origins == len(_packages_list["packageNames"]) + assert stats.origins == len(_packages_list["packageNames"]) - 2 assert lister.updated expected_origins = { @@ -69,9 +71,9 @@ 
datetime.datetime.fromisoformat("2020-06-22T15:50:29+00:00"), ), ( - "git@gitlab.com:idevlab/Essential.git", # not GitHub + "https://gitlab.com/payrix/public/payrix-php.git", # not GitHub "git", - datetime.datetime.fromisoformat("2022-10-12T10:34:29+00:00"), + datetime.datetime.fromisoformat("2021-05-25T14:12:28+00:00"), ), } diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -20,7 +20,7 @@ from swh.scheduler.interface import SchedulerInterface from . import USER_AGENT_TEMPLATE -from .utils import http_retry +from .utils import http_retry, is_valid_origin_url logger = logging.getLogger(__name__) @@ -277,8 +277,15 @@ Returns: the list of origin URLs recorded in scheduler database """ + valid_origins = [] + for origin in origins: + if is_valid_origin_url(origin.url): + valid_origins.append(origin) + else: + logger.warning("Skipping invalid origin: %s", origin.url) + recorded_origins = [] - for batch_origins in grouper(origins, n=1000): + for batch_origins in grouper(valid_origins, n=1000): ret = self.scheduler.record_listed_origins(batch_origins) recorded_origins += [origin.url for origin in ret] diff --git a/swh/lister/utils.py b/swh/lister/utils.py --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -2,7 +2,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Callable, Iterator, Tuple +from typing import Callable, Iterator, Optional, Tuple +import urllib.parse from requests.exceptions import ConnectionError, HTTPError from requests.status_codes import codes @@ -111,3 +112,50 @@ """ return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args) + + +def is_valid_origin_url(url: Optional[str]) -> bool: + """Returns whether the given string is a valid origin URL. + This excludes Git SSH URLs and pseudo-URLs (eg. 
``ssh://git@example.org:foo`` + and ``git@example.org:foo``), as they are not supported by the Git loader + and usually require authentication. + + All HTTP URLs are allowed: + + >>> is_valid_origin_url("http://example.org/repo.git") + True + >>> is_valid_origin_url("http://example.org/repo") + True + >>> is_valid_origin_url("https://example.org/repo") + True + >>> is_valid_origin_url("https://foo:bar@example.org/repo") + True + + Scheme-less URLs are rejected: + + >>> is_valid_origin_url("example.org/repo") + False + >>> is_valid_origin_url("example.org:repo") + False + + Git SSH URLs and pseudo-URLs are rejected: + + >>> is_valid_origin_url("git@example.org:repo") + False + >>> is_valid_origin_url("ssh://git@example.org:repo") + False + """ + if not url: + # Empty or None + return False + + parsed = urllib.parse.urlparse(url) + if not parsed.netloc: + # Is parsed as a relative URL + return False + + if parsed.scheme == "ssh": + # Git SSH URL + return False + + return True