diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -32,6 +32,7 @@ from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT from swh.lister import TARBALL_EXTENSIONS from swh.lister.pattern import CredentialsType, StatelessLister +from swh.lister.utils import is_valid_origin_url from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) @@ -348,7 +349,7 @@ if self.github_session else artifact_url ) - if not origin: + if not origin or not is_valid_origin_url(origin): return None return ArtifactType.VCS, VCS( origin=origin, type=artifact_type, ref=artifact_ref @@ -402,7 +403,7 @@ urls = [] for url in origin_urls: urlparsed = urlparse(url) - if urlparsed.scheme == "": + if urlparsed.scheme == "" and not re.match(r"^\w+@[^/]+:", url): logger.warning("Missing scheme for <%s>: fallback to http", url) fixed_url = f"http://{url}" else: diff --git a/swh/lister/nixguix/tests/data/sources-failure.json b/swh/lister/nixguix/tests/data/sources-failure.json --- a/swh/lister/nixguix/tests/data/sources-failure.json +++ b/swh/lister/nixguix/tests/data/sources-failure.json @@ -53,6 +53,16 @@ "urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ], "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" }, + { + "type": "url", + "urls": [ "ssh://git@example.org:wrong-scheme-so-skipped.txt" ], + "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" + }, + { + "type": "url", + "urls": [ "git@example.org:git-pseudourl/so-skipped" ], + "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" + }, { "type": "url", "urls": [ "https://code.9front.org/hg/plan9front" ], diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -353,13 +353,20 @@ ) listed_result = lister.run() - # only the origin upstream is 
listed, every other entries are unsupported or incomplete - assert listed_result == ListerStats(pages=1, origins=1) + expected_origins = ["https://github.com/NixOS/nixpkgs"] scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results - assert len(scheduler_origins) == 1 + scheduler_origin_urls = [orig.url for orig in scheduler_origins] + + assert scheduler_origin_urls == expected_origins + + # only the origin upstream is listed, all other entries are unsupported or incomplete + assert listed_result == ListerStats(pages=1, origins=1), ( + f"Expected origins: {' '.join(expected_origins)}, got: " + f"{' '.join(scheduler_origin_urls)}" + ) assert scheduler_origins[0].visit_type == "git"