diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -196,7 +196,7 @@ # received datetime without timezone: 2001-06-08 00:00:00 dt = dt.replace(tzinfo=timezone.utc) except Exception as e: - logger.warning("Fail to parse date %s. Reason: %s", (date, e)) + logger.warning("Fail to parse date %s. Reason: %s", date, e) if dt: return TimestampWithTimezone.from_datetime(dt) else: diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -28,7 +28,7 @@ PackageLoader, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import download +from swh.loader.package.utils import cached_method, download logger = logging.getLogger(__name__) @@ -86,7 +86,6 @@ RawExtrinsicMetadataCore( format="sword-v2-atom-codemeta-v2-in-json", metadata=raw_metadata_from_origin, - discovery_date=None, ), ], ) @@ -112,7 +111,6 @@ config_deposit = self.config["deposit"] self.deposit_id = deposit_id self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) - self.metadata: Dict[str, Any] = {} def get_versions(self) -> Sequence[str]: # only 1 branch 'HEAD' with no alias since we only have 1 snapshot @@ -120,7 +118,7 @@ return ["HEAD"] def get_metadata_authority(self) -> MetadataAuthority: - provider = self.metadata["origin_metadata"]["provider"] + provider = self.metadata()["origin_metadata"]["provider"] assert provider["provider_type"] == "deposit_client" return MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, @@ -132,7 +130,7 @@ ) def get_metadata_fetcher(self) -> MetadataFetcher: - tool = self.metadata["origin_metadata"]["tool"] + tool = self.metadata()["origin_metadata"]["tool"] return MetadataFetcher( name=tool["name"], version=tool["version"], metadata=tool["configuration"], ) @@ -141,7 +139,7 @@ self, version: str ) -> 
Iterator[Tuple[str, DepositPackageInfo]]: p_info = DepositPackageInfo.from_metadata( - self.metadata, url=self.url, filename="archive.zip", + self.metadata(), url=self.url, filename="archive.zip", ) yield "HEAD", p_info @@ -180,19 +178,23 @@ ) def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: - origin_metadata = self.metadata["origin_metadata"] + origin_metadata = self.metadata()["origin_metadata"] return [ RawExtrinsicMetadataCore( format="sword-v2-atom-codemeta-v2-in-json", metadata=json.dumps(origin_metadata["metadata"]).encode(), - discovery_date=None, ) ] + @cached_method + def metadata(self): + """Returns metadata from the deposit server""" + return self.client.metadata_get(self.deposit_id) + def load(self) -> Dict: # First making sure the deposit is known prior to trigger a loading try: - self.metadata = self.client.metadata_get(self.deposit_id) + self.metadata() except ValueError: logger.error(f"Unknown deposit {self.deposit_id}, ignoring") return {"status": "failed"} diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -64,7 +64,7 @@ format = attr.ib(type=str) metadata = attr.ib(type=bytes) - discovery_date = attr.ib(type=Optional[datetime.datetime]) + discovery_date = attr.ib(type=Optional[datetime.datetime], default=None) """Defaults to the visit date.""" diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -89,9 +89,7 @@ def get_extrinsic_snapshot_metadata(self): return [ RawExtrinsicMetadataCore( - format="nixguix-sources-json", - metadata=self.raw_sources, - discovery_date=None, + format="nixguix-sources-json", metadata=self.raw_sources, ), ] diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -15,6 +15,8 
@@
 from urllib.parse import quote
 
 from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
     Person,
     RevisionType,
     Revision,
@@ -22,8 +24,12 @@
     Sha1Git,
 )
 
-from swh.loader.package.loader import BasePackageInfo, PackageLoader
-from swh.loader.package.utils import api_info, release_name
+from swh.loader.package.loader import (
+    BasePackageInfo,
+    PackageLoader,
+    RawExtrinsicMetadataCore,
+)
+from swh.loader.package.utils import api_info, cached_method, release_name
 
 logger = logging.getLogger(__name__)
 
@@ -90,24 +96,40 @@
-        self._info: Dict[str, Any] = {}
         self._versions = None
 
-    @property
-    def info(self) -> Dict[str, Any]:
+    @cached_method
+    def _raw_info(self) -> bytes:
+        """Return the raw response fetched from ``self.provider_url`` (cached)."""
+        return api_info(self.provider_url)
+
+    @cached_method
+    def info(self) -> Dict:
         """Return the project metadata information (fetched from npm registry)
 
         """
-        if not self._info:
-            self._info = json.loads(api_info(self.provider_url))
-        return self._info
+        return json.loads(self._raw_info())
 
     def get_versions(self) -> Sequence[str]:
-        return sorted(list(self.info["versions"].keys()))
+        return sorted(self.info()["versions"])
 
     def get_default_version(self) -> str:
-        return self.info["dist-tags"].get("latest", "")
+        return self.info()["dist-tags"].get("latest", "")
+
+    def get_metadata_authority(self):
+        """Return the FORGE-type metadata authority for https://npmjs.com/."""
+        return MetadataAuthority(
+            type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={},
+        )
+
+    def get_extrinsic_snapshot_metadata(self):
+        """Return the cached raw registry response as extrinsic metadata."""
+        return [
+            RawExtrinsicMetadataCore(
+                format="replicate-npm-package-json", metadata=self._raw_info(),
+            ),
+        ]
 
     def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]:
         p_info = NpmPackageInfo.from_metadata(
-            project_metadata=self.info, version=version
+            project_metadata=self.info(), version=version
         )
         yield release_name(version), p_info
 
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ 
b/swh/loader/package/npm/tests/test_npm.py @@ -7,9 +7,23 @@ import os import pytest -from swh.model.hashutil import hash_to_bytes -from swh.model.model import Person, Snapshot, SnapshotBranch, TargetType +from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.identifiers import SWHID +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Person, + RawExtrinsicMetadata, + Snapshot, + SnapshotBranch, + TargetType, +) + +from swh.storage.interface import PagedResult +from swh.loader.package import __version__ from swh.loader.package.npm.loader import ( _author_str, NpmLoader, @@ -24,6 +38,12 @@ ) +@pytest.fixture +def org_api_info(datadir) -> bytes: + with open(os.path.join(datadir, "https_replicate.npmjs.com", "org"), "rb",) as f: + return f.read() + + def test_npm_author_str(): for author, expected_author in [ ("author", "author"), @@ -322,7 +342,7 @@ ) -def test_npm_loader_first_visit(swh_config, requests_mock_datadir): +def test_npm_loader_first_visit(swh_config, requests_mock_datadir, org_api_info): package = "org" url = package_url(package) loader = NpmLoader(url) @@ -387,6 +407,32 @@ ) check_snapshot(expected_snapshot, loader.storage) + snapshot_swhid = SWHID( + object_type="snapshot", object_id=hash_to_hex(expected_snapshot_id) + ) + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", + ) + expected_metadata = [ + RawExtrinsicMetadata( + type=MetadataTargetType.SNAPSHOT, + id=snapshot_swhid, + authority=metadata_authority, + fetcher=MetadataFetcher( + name="swh.loader.package.npm.loader.NpmLoader", version=__version__, + ), + discovery_date=loader.visit_date, + format="replicate-npm-package-json", + metadata=org_api_info, + origin="https://www.npmjs.com/package/org", + ) + ] + assert loader.storage.raw_extrinsic_metadata_get( + type=MetadataTargetType.SNAPSHOT, + id=snapshot_swhid, + authority=metadata_authority, + ) == 
PagedResult(next_page_token=None, results=expected_metadata,) + def test_npm_loader_incremental_visit(swh_config, requests_mock_datadir_visits): package = "org" @@ -417,7 +463,10 @@ "snapshot": 1, } == stats - loader._info = None # reset loader internal state + # reset loader internal state + del loader._cached_info + del loader._cached__raw_info + actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" snap_id2 = actual_load_status2["snapshot_id"] diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -13,6 +13,8 @@ from pkginfo import UnpackedSDist from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, Person, Sha1Git, TimestampWithTimezone, @@ -20,8 +22,17 @@ RevisionType, ) -from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import api_info, release_name, EMPTY_AUTHOR +from swh.loader.package.loader import ( + BasePackageInfo, + PackageLoader, + RawExtrinsicMetadataCore, +) +from swh.loader.package.utils import ( + api_info, + cached_method, + release_name, + EMPTY_AUTHOR, +) logger = logging.getLogger(__name__) @@ -55,27 +66,43 @@ def __init__(self, url): super().__init__(url=url) - self._info = None self.provider_url = pypi_api_url(self.url) - @property + @cached_method + def _raw_info(self) -> bytes: + return api_info(self.provider_url) + + @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry) """ - if not self._info: - self._info = json.loads(api_info(self.provider_url)) - return self._info + return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: - return self.info["releases"].keys() + return self.info()["releases"].keys() def get_default_version(self) -> str: - return self.info["info"]["version"] + return self.info()["info"]["version"] + + def 
get_metadata_authority(self): + p_url = urlparse(self.url) + return MetadataAuthority( + type=MetadataAuthorityType.FORGE, + url=f"{p_url.scheme}://{p_url.netloc}/", + metadata={}, + ) + + def get_extrinsic_snapshot_metadata(self): + return [ + RawExtrinsicMetadataCore( + format="pypi-project-json", metadata=self._raw_info(), + ), + ] def get_package_info(self, version: str) -> Iterator[Tuple[str, PyPIPackageInfo]]: res = [] - for meta in self.info["releases"][version]: + for meta in self.info()["releases"][version]: if meta["packagetype"] != "sdist": continue p_info = PyPIPackageInfo.from_metadata(meta) diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -13,9 +13,22 @@ from swh.core.tarball import uncompress from swh.core.pytest_plugin import requests_mock_datadir_factory -from swh.model.hashutil import hash_to_bytes -from swh.model.model import Person, Snapshot, SnapshotBranch, TargetType +from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.identifiers import SWHID +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Person, + RawExtrinsicMetadata, + Snapshot, + SnapshotBranch, + TargetType, +) +from swh.storage.interface import PagedResult +from swh.loader.package import __version__ from swh.loader.package.pypi.loader import ( PyPILoader, pypi_api_url, @@ -31,6 +44,14 @@ ) +@pytest.fixture +def _0805nexter_api_info(datadir) -> bytes: + with open( + os.path.join(datadir, "https_pypi.org", "pypi_0805nexter_json"), "rb", + ) as f: + return f.read() + + def test_author_basic(): data = { "author": "i-am-groot", @@ -315,6 +336,41 @@ ) +def test_snapshot_metadata(swh_config, requests_mock_datadir, _0805nexter_api_info): + url = "https://pypi.org/project/0805nexter" + loader = PyPILoader(url) + + actual_load_status = loader.load() + 
assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + snapshot_swhid = SWHID( + object_type="snapshot", object_id=hash_to_hex(actual_load_status["snapshot_id"]) + ) + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://pypi.org/", + ) + expected_metadata = [ + RawExtrinsicMetadata( + type=MetadataTargetType.SNAPSHOT, + id=snapshot_swhid, + authority=metadata_authority, + fetcher=MetadataFetcher( + name="swh.loader.package.pypi.loader.PyPILoader", version=__version__, + ), + discovery_date=loader.visit_date, + format="pypi-project-json", + metadata=_0805nexter_api_info, + origin=url, + ) + ] + assert loader.storage.raw_extrinsic_metadata_get( + type=MetadataTargetType.SNAPSHOT, + id=snapshot_swhid, + authority=metadata_authority, + ) == PagedResult(next_page_token=None, results=expected_metadata,) + + def test_visit_with_missing_artifact(swh_config, requests_mock_datadir_missing_one): """Load a pypi project with some missing artifacts ends up with 1 snapshot @@ -582,12 +638,13 @@ } == visit1_stats # Reset internal state - loader._info = None + del loader._cached__raw_info + del loader._cached_info visit2_actual_load_status = loader.load() visit2_stats = get_stats(loader.storage) - assert visit2_actual_load_status["status"] == "eventful" + assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status expected_snapshot_id2 = hash_to_bytes("2e5149a7b0725d18231a37b342e9b7c4e121f283") assert visit2_actual_load_status == { "status": "eventful", diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -4,11 +4,12 @@ # See top-level LICENSE file for more information import copy +import functools import logging import os import requests -from typing import Dict, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple, TypeVar from swh.model.hashutil import MultiHash, 
HASH_BLOCK_SIZE
 from swh.model.model import Person
@@ -121,3 +122,30 @@
     if filename:
         return "releases/%s/%s" % (version, filename)
     return "releases/%s" % version
+
+
+TReturn = TypeVar("TReturn")
+TSelf = TypeVar("TSelf")
+
+_UNDEFINED = object()
+
+
+def cached_method(f: Callable[[TSelf], TReturn]) -> Callable[[TSelf], TReturn]:
+    """Memoize a method taking only ``self`` on the instance itself.
+
+    On first call the result of ``f(self)`` is stored on the instance as the
+    attribute ``_cached_<f.__name__>``; later calls return that stored value
+    without calling ``f`` again. Deleting the attribute clears the cache.
+    """
+    cache_name = f"_cached_{f.__name__}"
+
+    @functools.wraps(f)
+    def wrapper(self):
+        cached = getattr(self, cache_name, _UNDEFINED)
+        if cached is not _UNDEFINED:
+            return cached
+        result = f(self)
+        setattr(self, cache_name, result)
+        return result
+
+    return wrapper