diff --git a/swh/loader/exception.py b/swh/loader/exception.py new file mode 100644 --- /dev/null +++ b/swh/loader/exception.py @@ -0,0 +1,13 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +class NotFound(ValueError): + """An exception raised when some information to retrieve is not found (e.g. origin, + artifact, ...) + + """ + + pass diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -29,6 +29,7 @@ from swh.core.config import load_from_envvar from swh.core.tarball import uncompress +from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk from swh.model.collections import ImmutableDict @@ -157,6 +158,10 @@ def get_versions(self) -> Sequence[str]: """Return the list of all published package versions. + Raises: + :class:`swh.loader.exception.NotFound` error when failing to read the + published package versions. 
+ Returns: Sequence of published versions @@ -413,7 +418,18 @@ load_exceptions: List[Exception] = [] - for version in self.get_versions(): # for each + try: + versions = self.get_versions() + except NotFound: + status_visit = "not_found" + status_load = "failed" + return finalize_visit() + except Exception: + status_visit = "failed" + status_load = "failed" + return finalize_visit() + + for version in versions: logger.debug("version: %s", version) tmp_revisions[version] = [] # `p_` stands for `package_` diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -16,7 +16,7 @@ PackageLoader, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info +from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method from swh.model import hashutil from swh.model.collections import ImmutableDict from swh.model.model import ( @@ -60,29 +60,31 @@ def __init__(self, url): super().__init__(url=url) - unsupported_file_extensions = self.config.get("unsupported_file_extensions", []) - self.raw_sources = retrieve_sources(url) - clean = clean_sources( - parse_sources(self.raw_sources), unsupported_file_extensions - ) - self.sources = clean["sources"] self.provider_url = url - self._integrityByUrl = {s["urls"][0]: s["integrity"] for s in self.sources} - - # The revision used to create the sources.json file. For Nix, - # this revision belongs to the github.com/nixos/nixpkgs - # repository - self.revision = clean["revision"] - # Note: this could be renamed get_artifacts in the PackageLoader # base class. 
- def get_versions(self): + @cached_method + def raw_sources(self): + return retrieve_sources(self.url) + + @cached_method + def supported_sources(self): + raw_sources = self.raw_sources() + unsupported_file_extensions = self.config.get("unsupported_file_extensions", []) + return clean_sources(parse_sources(raw_sources), unsupported_file_extensions) + + @cached_method + def integrity_by_url(self) -> Dict[str, Any]: + sources = self.supported_sources() + return {s["urls"][0]: s["integrity"] for s in sources["sources"]} + + def get_versions(self) -> List[str]: """The first mirror of the mirror list is used as branch name in the snapshot. """ - return self._integrityByUrl.keys() + return list(self.integrity_by_url().keys()) def get_metadata_authority(self): return MetadataAuthority( @@ -92,7 +94,7 @@ def get_extrinsic_snapshot_metadata(self): return [ RawExtrinsicMetadataCore( - format="nixguix-sources-json", metadata=self.raw_sources, + format="nixguix-sources-json", metadata=self.raw_sources(), ), ] @@ -103,7 +105,7 @@ # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. - integrity = self._integrityByUrl[url] + integrity = self.integrity_by_url()[url] p_info = NixGuixPackageInfo.from_metadata({"url": url, "integrity": integrity}) yield url, p_info @@ -178,10 +180,14 @@ a Nix/Guix evaluation. """ + # The revision used to create the sources.json file. For Nix, + # this revision belongs to the github.com/nixos/nixpkgs + # repository + revision = self.supported_sources()["revision"] return { b"evaluation": { "target_type": "revision", - "target": hashutil.hash_to_bytes(self.revision), + "target": hashutil.hash_to_bytes(revision), } } @@ -209,6 +215,7 @@ def retrieve_sources(url: str) -> bytes: + """Retrieve sources. 
Potentially raise NotFound error.""" return api_info(url, allow_redirects=True) diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -1,10 +1,9 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json -from json.decoder import JSONDecodeError import logging import os from typing import Dict, Optional, Tuple @@ -107,14 +106,33 @@ assert len(j["sources"]) == 2 -def test_retrieve_non_existing(swh_config, requests_mock_datadir): - with pytest.raises(ValueError): - NixGuixLoader("https://non-existing-url") +def test_nixguix_url_not_found(swh_config, requests_mock_datadir): + """When failing to read from the url, the visit is marked as not_found. 
+ + """ + unknown_url = "https://non-existing-url" + loader = NixGuixLoader(unknown_url) + # during the retrieval step + load_status = loader.load() + + assert load_status == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, unknown_url, status="not_found", type="nixguix", snapshot=None + ) -def test_retrieve_non_json(swh_config, requests_mock_datadir): - with pytest.raises(JSONDecodeError): - NixGuixLoader("https://example.com/file.txt") +def test_nixguix_url_with_decoding_error(swh_config, requests_mock_datadir): + """On other errors communicating with the url, the visit is marked as failed""" + sources_url = "https://example.com/file.txt" + loader = NixGuixLoader(sources_url) + load_status = loader.load() + + assert load_status == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, sources_url, status="failed", type="nixguix", snapshot=None + ) def test_clean_sources_invalid_schema(swh_config, requests_mock_datadir): @@ -304,7 +322,8 @@ loader = NixGuixLoader(sources_url) loader_status = loader.load() - urls = [s["urls"][0] for s in loader.sources] + sources = loader.supported_sources()["sources"] + urls = [s["urls"][0] for s in sources] assert "https://example.com/file.txt" in urls assert loader_status["status"] == "eventful" diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -701,3 +701,14 @@ } 
assert_last_visit_matches(loader.storage, url, status="failed", type="npm") + + +def test_npm_origin_not_found(swh_config, requests_mock_datadir): + url = package_url("non-existent-url") + loader = NpmLoader(url) + + assert loader.load() == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, url, status="not_found", type="npm", snapshot=None + ) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -904,3 +904,14 @@ assert_last_visit_matches( loader.storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) + + +def test_pypi_origin_not_found(swh_config, requests_mock_datadir): + url = "https://pypi.org/project/unknown" + loader = PyPILoader(url) + + assert loader.load() == {"status": "failed"} + + assert_last_visit_matches( + loader.storage, url, status="not_found", type="pypi", snapshot=None + ) diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -9,6 
+9,7 @@ import pytest +from swh.loader.exception import NotFound import swh.loader.package from swh.loader.package.utils import api_info, download, release_name @@ -135,7 +136,7 @@ status_code = 400 requests_mock.get(url, status_code=status_code) - with pytest.raises(ValueError) as e0: + with pytest.raises(NotFound) as e0: api_info(url) assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (url, status_code) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,6 +11,7 @@ import requests +from swh.loader.exception import NotFound from swh.loader.package import DEFAULT_PARAMS from swh.model.hashutil import HASH_BLOCK_SIZE, MultiHash from swh.model.model import Person @@ -32,7 +33,7 @@ url (str): The api url (e.g PyPI, npm, etc...) Raises: - ValueError in case of query failures (for some reasons: 404, ...) + NotFound in case of query failures (for some reasons: 404, ...) Returns: The associated response's information @@ -40,7 +41,7 @@ """ response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) if response.status_code != 200: - raise ValueError("Fail to query '%s'. Reason: %s" % (url, response.status_code)) + raise NotFound(f"Fail to query '{url}'. Reason: {response.status_code}") return response.content