diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -293,6 +293,21 @@ else None ) + def build_artifact( + self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None + ) -> Optional[Tuple[ArtifactType, VCS]]: + """Build a canonicalized vcs artifact when possible.""" + origin = ( + self.github_session.get_canonical_url(artifact_url) + if self.github_session + else artifact_url + ) + if not origin: + return None + return ArtifactType.VCS, VCS( + origin=origin, type=artifact_type, ref=artifact_ref + ) + def get_pages(self) -> Iterator[PageResult]: """Yield one page per "typed" origin referenced in manifest.""" # fetch and parse the manifest... @@ -321,16 +336,12 @@ if artifact_type in VCS_SUPPORTED: plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]] plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]] - artifact_url = ( - self.github_session.get_canonical_url(plain_url) - if self.github_session - else plain_url + built_artifact = self.build_artifact( + plain_url, artifact_type, plain_ref ) - if not artifact_url: + if not built_artifact: continue - yield ArtifactType.VCS, VCS( - origin=artifact_url, type=artifact_type, ref=plain_ref - ) + yield built_artifact elif artifact_type == "url": # It's either a tarball or a file origin_urls = artifact.get("urls") @@ -354,6 +365,13 @@ origin, *fallback_urls = urls + if origin.endswith(".git"): + built_artifact = self.build_artifact(origin, "git") + if not built_artifact: + continue + yield built_artifact + continue + integrity = artifact.get("integrity") if integrity is None: logger.warning("Skipping url <%s>: missing integrity field", origin) @@ -367,17 +385,12 @@ ) urlparsed = urlparse(origin) artifact_type = urlparsed.scheme + if artifact_type in VCS_SUPPORTED: - artifact_url = ( - self.github_session.get_canonical_url(origin) - if self.github_session - else origin - ) - if not artifact_url: + built_artifact = self.build_artifact(origin, artifact_type) + if not built_artifact: continue - yield ArtifactType.VCS, VCS( - origin=artifact_url, type=artifact_type - ) + yield built_artifact else: logger.warning( "Skipping url <%s>: undetected remote artifact type", origin diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json --- a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json +++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json @@ -57,6 +57,13 @@ "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=", "inferredFetcher": "fetchzip" }, + { + "type": "url", + "urls": [ + "https://github.com/trie/trie.git" + ], + "integrity": "sha256-j7xp1svMeYIm+WScVe/B7w0jNjMtvkp9a1hLLLlO92g=" + }, { "type": "git", "git_url": "https://example.org/pali/0xffff", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -169,6 +169,10 @@ url, [{"json": response}], ) + requests_mock.get( + "https://api.github.com/repos/trie/trie", + [{"json": {"html_url": "https://github.com/trie/trie.git"}}], + ) expected_visit_types = defaultdict(int) # origin upstream is added as origin @@ -186,7 +190,9 @@ expected_visit_types[artifact_type] += 1 elif artifact_type == "url": url = artifact["urls"][0] - if url.endswith(".c") or url.endswith(".txt"): + if url.endswith(".git"): + expected_visit_types["git"] += 1 + elif url.endswith(".c") or url.endswith(".txt"): expected_visit_types["content"] += 1 elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless expected_visit_types["svn"] += 1