diff --git a/swh/loader/exception.py b/swh/loader/exception.py new file mode 100644 --- /dev/null +++ b/swh/loader/exception.py @@ -0,0 +1,12 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +class NotFound(ValueError): + """An exception raised when an origin or artifact is not found + + """ + + pass diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -29,6 +29,7 @@ from swh.core.config import load_from_envvar from swh.core.tarball import uncompress +from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk from swh.model.collections import ImmutableDict @@ -413,7 +414,18 @@ load_exceptions: List[Exception] = [] - for version in self.get_versions(): # for each + try: + versions = self.get_versions() + except NotFound: + status_visit = "not_found" + status_load = "failed" + return finalize_visit() + except Exception: + status_visit = "failed" + status_load = "failed" + return finalize_visit() + + for version in versions: logger.debug("version: %s", version) tmp_revisions[version] = [] # `p_` stands for `package_` diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,12 +11,13 @@ import attr +from swh.loader.exception import NotFound from swh.loader.package.loader 
import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info +from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method from swh.model import hashutil from swh.model.collections import ImmutableDict from swh.model.model import ( @@ -60,29 +61,31 @@ def __init__(self, url): super().__init__(url=url) - unsupported_file_extensions = self.config.get("unsupported_file_extensions", []) - self.raw_sources = retrieve_sources(url) - clean = clean_sources( - parse_sources(self.raw_sources), unsupported_file_extensions - ) - self.sources = clean["sources"] self.provider_url = url - self._integrityByUrl = {s["urls"][0]: s["integrity"] for s in self.sources} - - # The revision used to create the sources.json file. For Nix, - # this revision belongs to the github.com/nixos/nixpkgs - # repository - self.revision = clean["revision"] - # Note: this could be renamed get_artifacts in the PackageLoader # base class. - def get_versions(self): + @cached_method + def raw_sources(self): + return retrieve_sources(self.url) + + @cached_method + def supported_sources(self): + raw_sources = self.raw_sources() + unsupported_file_extensions = self.config.get("unsupported_file_extensions", []) + return clean_sources(parse_sources(raw_sources), unsupported_file_extensions) + + @cached_method + def integrity_by_url(self) -> Dict[str, Any]: + sources = self.supported_sources() + return {s["urls"][0]: s["integrity"] for s in sources["sources"]} + + def get_versions(self) -> List[str]: """The first mirror of the mirror list is used as branch name in the snapshot. 
""" - return self._integrityByUrl.keys() + return list(self.integrity_by_url().keys()) def get_metadata_authority(self): return MetadataAuthority( @@ -92,7 +95,7 @@ def get_extrinsic_snapshot_metadata(self): return [ RawExtrinsicMetadataCore( - format="nixguix-sources-json", metadata=self.raw_sources, + format="nixguix-sources-json", metadata=self.raw_sources(), ), ] @@ -103,7 +106,7 @@ # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. - integrity = self._integrityByUrl[url] + integrity = self.integrity_by_url()[url] p_info = NixGuixPackageInfo.from_metadata({"url": url, "integrity": integrity}) yield url, p_info @@ -178,10 +181,14 @@ a Nix/Guix evaluation. """ + # The revision used to create the sources.json file. For Nix, + # this revision belongs to the github.com/nixos/nixpkgs + # repository + revision = self.supported_sources()["revision"] return { b"evaluation": { "target_type": "revision", - "target": hashutil.hash_to_bytes(self.revision), + "target": hashutil.hash_to_bytes(revision), } } @@ -209,7 +216,11 @@ def retrieve_sources(url: str) -> bytes: - return api_info(url, allow_redirects=True) + """Retrieve sources. 
Potentially raise NotFound error.""" + try: + return api_info(url, allow_redirects=True) + except Exception as e: + raise NotFound(e) def parse_sources(raw_sources: bytes) -> Dict[str, Any]: diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -1,10 +1,9 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json -from json.decoder import JSONDecodeError import logging import os from typing import Dict, Optional, Tuple @@ -107,14 +106,28 @@ assert len(j["sources"]) == 2 -def test_retrieve_non_existing(swh_config, requests_mock_datadir): - with pytest.raises(ValueError): - NixGuixLoader("https://non-existing-url") +def test_nixguix_url_not_found(swh_config, requests_mock_datadir): + sources_url = "https://non-existing-url" + loader = NixGuixLoader(sources_url) + load_status = loader.load() + + assert load_status == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, sources_url, status="not_found", type="nixguix", snapshot=None + ) -def test_retrieve_non_json(swh_config, requests_mock_datadir): - with pytest.raises(JSONDecodeError): - NixGuixLoader("https://example.com/file.txt") +def test_nixguix_url_with_decoding_error(swh_config, requests_mock_datadir): + sources_url = "https://example.com/file.txt" + loader = NixGuixLoader(sources_url) + load_status = loader.load() + + assert load_status == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, sources_url, status="failed", type="nixguix", snapshot=None + ) def test_clean_sources_invalid_schema(swh_config, requests_mock_datadir): @@ -304,7 +317,8 @@ 
loader = NixGuixLoader(sources_url) loader_status = loader.load() - urls = [s["urls"][0] for s in loader.sources] + sources = loader.supported_sources()["sources"] + urls = [s["urls"][0] for s in sources] assert "https://example.com/file.txt" in urls assert loader_status["status"] == "eventful" diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -13,6 +13,7 @@ import attr import chardet +from swh.loader.exception import NotFound from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, @@ -101,7 +102,10 @@ @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + try: + return api_info(self.provider_url) + except ValueError as e: + raise NotFound(e) @cached_method def info(self) -> Dict: diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -701,3 +701,14 @@ } assert_last_visit_matches(loader.storage, url, status="failed", type="npm") + + +def test_npm_origin_not_found(swh_config, requests_mock_datadir): + url = package_url("non-existent-url") + loader = NpmLoader(url) + + assert loader.load() == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, url, status="not_found", type="npm", snapshot=None + ) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# 
Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -12,6 +12,7 @@ import attr from pkginfo import UnpackedSDist +from swh.loader.exception import NotFound from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, @@ -69,7 +70,10 @@ @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + try: + return api_info(self.provider_url) + except ValueError as e: + raise NotFound(e) @cached_method def info(self) -> Dict: diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -904,3 +904,14 @@ assert_last_visit_matches( loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) + + +def test_pypi_origin_not_found(swh_config, requests_mock_datadir): + url = "https://pypi.org/project/unknown" + loader = PyPILoader(url) + + assert loader.load() == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, url, status="not_found", type="pypi", snapshot=None + )