diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py index a860cdf..2cea265 100644 --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,345 +1,351 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from codecs import BOM_UTF8 import json import logging import os from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote import attr import chardet from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import api_info, cached_method, release_name from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, Person, Revision, RevisionType, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EMPTY_PERSON = Person(fullname=b"", name=None, email=None) @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" version = attr.ib(type=str) @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str ) -> "NpmPackageInfo": package_metadata = project_metadata["versions"][version] url = package_metadata["dist"]["tarball"] # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. extrinsic_version = package_metadata["version"] if "time" in project_metadata: date = project_metadata["time"][extrinsic_version] elif "mtime" in package_metadata: date = package_metadata["mtime"] else: date = None return cls( url=url, filename=os.path.basename(url), date=date, shasum=package_metadata["dist"]["shasum"], version=extrinsic_version, raw_info=package_metadata, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="replicate-npm-package-json", metadata=json.dumps(package_metadata).encode(), ) ], ) class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive. """ visit_type = "npm" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, ): """Constructor Args str: origin url (e.g. https://www.npmjs.com/package/) """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) package_name = url.split("https://www.npmjs.com/package/")[1] safe_name = quote(package_name, safe="") self.provider_url = f"https://replicate.npmjs.com/{safe_name}/" self._info: Dict[str, Any] = {} self._versions = None @cached_method def _raw_info(self) -> bytes: return api_info(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from npm registry) """ return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return sorted(list(self.info()["versions"].keys())) def get_default_version(self) -> str: return self.info()["dist-tags"].get("latest", "") def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]: p_info = NpmPackageInfo.from_metadata( project_metadata=self.info(), version=version ) yield release_name(version), p_info def resolve_revision_from( self, known_artifacts: Dict, p_info: NpmPackageInfo ) -> Optional[bytes]: return artifact_to_revision_id(known_artifacts, p_info) def build_revision( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None author = extract_npm_package_author(i_metadata) message = i_metadata["version"].encode("ascii") if p_info.date is None: url = p_info.url artifact_name = os.path.basename(url) raise ValueError( "Origin %s: Cannot determine upload time for artifact %s." % (p_info.url, artifact_name) ) date = TimestampWithTimezone.from_iso8601(p_info.date) # FIXME: this is to remain bug-compatible with earlier versions: date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0)) r = Revision( type=RevisionType.TAR, message=message, author=author, date=date, committer=author, committer_date=date, parents=(), directory=directory, synthetic=True, metadata={ "intrinsic": {"tool": "package.json", "raw": i_metadata,}, "extrinsic": { "provider": self.provider_url, "when": self.visit_date.isoformat(), "raw": p_info.raw_info, }, }, ) return r def artifact_to_revision_id( known_artifacts: Dict, p_info: NpmPackageInfo ) -> Optional[bytes]: """Given metadata artifact, solves the associated revision id. The following code allows to deal with 2 metadata formats: - old format sample:: { 'package_source': { 'sha1': '05181c12cd8c22035dd31155656826b85745da37', } } - new format sample:: { 'original_artifact': [{ 'checksums': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa ... }, }], ... } """ shasum = p_info.shasum for rev_id, known_artifact in known_artifacts.items(): - known_original_artifact = known_artifact.get("original_artifact") - if not known_original_artifact: - # previous loader-npm version kept original artifact elsewhere - known_original_artifact = known_artifact.get("package_source") - if not known_original_artifact: - continue - original_hash = known_original_artifact["sha1"] - else: - assert isinstance(known_original_artifact, list) - original_hash = known_original_artifact[0]["checksums"]["sha1"] + original_hash = _artifact_to_sha1(known_artifact) if shasum == original_hash: return rev_id + return None +def _artifact_to_sha1(known_artifact: Dict) -> Optional[str]: + """Returns the sha1 from an NPM 'original_artifact' dict""" + known_original_artifact = known_artifact.get("original_artifact") + if not known_original_artifact: + # previous loader-npm version kept original artifact elsewhere + known_original_artifact = known_artifact.get("package_source") + if not known_original_artifact: + return None + return known_original_artifact["sha1"] + else: + assert isinstance(known_original_artifact, list) + return known_original_artifact[0]["checksums"]["sha1"] + + def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields """ if isinstance(author_data, dict): author_str = "" name = author_data.get("name") if name is not None: if isinstance(name, str): author_str += name elif isinstance(name, list): author_str += _author_str(name[0]) if len(name) > 0 else "" email = author_data.get("email") if email is not None: author_str += f" <{email}>" result = author_str elif isinstance(author_data, list): result = _author_str(author_data[0]) if len(author_data) > 0 else "" else: result = author_data return result def extract_npm_package_author(package_json: Dict[str, Any]) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format. Args: package_json: Dict holding the content of parsed ``package.json`` file Returns: Person """ for author_key in ("author", "authors"): if author_key in package_json: author_data = package_json[author_key] if author_data is None: return EMPTY_PERSON author_str = _author_str(author_data) return Person.from_fullname(author_str.encode()) return EMPTY_PERSON def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom) :] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding failed, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode("utf-8") except UnicodeDecodeError: encoding = chardet.detect(json_bytes)["encoding"] if encoding: json_str = json_bytes.decode(encoding, "replace") try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, "package.json") if not os.path.exists(package_json_path): return {} with open(package_json_path, "rb") as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py index 942547f..ddede87 100644 --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,290 +1,301 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import os from typing import Any, Dict, Iterator, Optional, Sequence, Tuple from urllib.parse import urlparse import attr from pkginfo import UnpackedSDist from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method, release_name from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, Person, Revision, RevisionType, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @attr.s class PyPIPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) comment_text = attr.ib(type=Optional[str]) sha256 = attr.ib(type=str) upload_time = attr.ib(type=str) @classmethod def from_metadata(cls, metadata: Dict[str, Any]) -> "PyPIPackageInfo": return cls( url=metadata["url"], filename=metadata["filename"], raw_info=metadata, comment_text=metadata.get("comment_text"), sha256=metadata["digests"]["sha256"], upload_time=metadata["upload_time"], directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="pypi-project-json", metadata=json.dumps(metadata).encode(), ) ], ) class PyPILoader(PackageLoader[PyPIPackageInfo]): """Load pypi origin's artifact releases into swh archive. """ visit_type = "pypi" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, ): super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.provider_url = pypi_api_url(self.url) @cached_method def _raw_info(self) -> bytes: return api_info(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry) """ return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return self.info()["releases"].keys() def get_default_version(self) -> str: return self.info()["info"]["version"] def get_metadata_authority(self): p_url = urlparse(self.url) return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=f"{p_url.scheme}://{p_url.netloc}/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, PyPIPackageInfo]]: res = [] for meta in self.info()["releases"][version]: if meta["packagetype"] != "sdist": continue p_info = PyPIPackageInfo.from_metadata(meta) res.append((version, p_info)) if len(res) == 1: version, p_info = res[0] yield release_name(version), p_info else: for version, p_info in res: yield release_name(version, p_info.filename), p_info def resolve_revision_from( self, known_artifacts: Dict, p_info: PyPIPackageInfo ) -> Optional[bytes]: return artifact_to_revision_id(known_artifacts, p_info) def build_revision( self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None # from intrinsic metadata name = i_metadata["version"] _author = author(i_metadata) # from extrinsic metadata message = p_info.comment_text or "" message = "%s: %s" % (name, message) if message else name date = TimestampWithTimezone.from_iso8601(p_info.upload_time) return Revision( type=RevisionType.TAR, message=message.encode("utf-8"), author=_author, date=date, committer=_author, committer_date=date, parents=(), directory=directory, synthetic=True, metadata={ "intrinsic": {"tool": "PKG-INFO", "raw": i_metadata,}, "extrinsic": { "provider": self.provider_url, "when": self.visit_date.isoformat(), "raw": p_info.raw_info, }, }, ) def artifact_to_revision_id( known_artifacts: Dict, p_info: PyPIPackageInfo ) -> Optional[bytes]: """Given metadata artifact, solves the associated revision id. The following code allows to deal with 2 metadata formats (column metadata in 'revision') - old format sample:: { 'original_artifact': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa ... }, ... } - new format sample:: { 'original_artifact': [{ 'checksums': { 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa ... }, }], ... } """ sha256 = p_info.sha256 for rev_id, known_artifact in known_artifacts.items(): - original_artifact = known_artifact["original_artifact"] - if isinstance(original_artifact, dict): - # previous loader-pypi version stored metadata as dict - original_sha256 = original_artifact["sha256"] - if sha256 == original_sha256: - return rev_id - continue - # new pypi loader actually store metadata dict differently... - assert isinstance(original_artifact, list) - # current loader-pypi stores metadata as list of dict - for original_artifact in known_artifact["original_artifact"]: - if sha256 == original_artifact["checksums"]["sha256"]: - return rev_id + original_sha256 = _artifact_to_sha256(known_artifact) + if sha256 == original_sha256: + return rev_id + return None +def _artifact_to_sha256(known_artifact: Dict) -> Optional[str]: + """Returns the sha256 from a PyPI 'original_artifact' dict""" + original_artifact = known_artifact["original_artifact"] + if isinstance(original_artifact, dict): + # previous loader-pypi version stored metadata as dict + return original_artifact["sha256"] + # new pypi loader actually store metadata dict differently... + assert isinstance(original_artifact, list) + # current loader-pypi stores metadata as list of dict + if len(known_artifact["original_artifact"]) == 0: + return None + elif len(known_artifact["original_artifact"]) == 1: + return original_artifact[0]["checksums"]["sha256"] + else: + raise ValueError( + f"Expected exactly one PyPI original_artifact, got " + f"{len(known_artifact['original_artifact'])}" + ) + + def pypi_api_url(url: str) -> str: """Compute api url from a project url Args: url (str): PyPI instance's url (e.g: https://pypi.org/project/requests) This deals with correctly transforming the project's api url (e.g https://pypi.org/pypi/requests/json) Returns: api url """ p_url = urlparse(url) project_name = p_url.path.rstrip("/").split("/")[-1] url = "%s://%s/pypi/%s/json" % (p_url.scheme, p_url.netloc, project_name) return url def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) != 1: return {} project_dirname = lst[0] pkginfo_path = os.path.join(dir_path, project_dirname, "PKG-INFO") if not os.path.exists(pkginfo_path): return {} pkginfo = UnpackedSDist(pkginfo_path) raw = pkginfo.__dict__ raw.pop("filename") # this gets added with the ondisk location return raw def author(data: Dict) -> Person: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. Args: data (dict): Representing either artifact information or release information. Returns: swh-model dict representing a person. """ name = data.get("author") email = data.get("author_email") fullname = None # type: Optional[str] if email: fullname = "%s <%s>" % (name, email) else: fullname = name if not fullname: return EMPTY_AUTHOR if name is not None: name = name.encode("utf-8") if email is not None: email = email.encode("utf-8") return Person(fullname=fullname.encode("utf-8"), name=name, email=email) diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py index 5415f78..ed478e8 100644 --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -1,902 +1,916 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os from os import path from unittest.mock import patch import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.core.tarball import uncompress from swh.loader.package import __version__ from swh.loader.package.pypi.loader import ( PyPILoader, artifact_to_revision_id, author, extract_intrinsic_metadata, pypi_api_url, ) from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.identifiers import ( CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType, ) from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Person, RawExtrinsicMetadata, Snapshot, SnapshotBranch, TargetType, ) from swh.storage.interface import PagedResult @pytest.fixture def _0805nexter_api_info(datadir) -> bytes: with open( os.path.join(datadir, "https_pypi.org", "pypi_0805nexter_json"), "rb", ) as f: return f.read() def test_pypi_author_basic(): data = { "author": "i-am-groot", "author_email": "iam@groot.org", } actual_author = author(data) expected_author = Person( fullname=b"i-am-groot ", name=b"i-am-groot", email=b"iam@groot.org", ) assert actual_author == expected_author def test_pypi_author_empty_email(): data = { "author": "i-am-groot", "author_email": "", } actual_author = author(data) expected_author = Person(fullname=b"i-am-groot", name=b"i-am-groot", email=b"",) assert actual_author == expected_author def test_pypi_author_empty_name(): data = { "author": "", "author_email": "iam@groot.org", } actual_author = author(data) expected_author = Person( fullname=b" ", name=b"", email=b"iam@groot.org", ) assert actual_author == expected_author def test_pypi_author_malformed(): data = { "author": "['pierre', 'paul', 'jacques']", "author_email": None, } actual_author = author(data) expected_author = Person( fullname=b"['pierre', 'paul', 'jacques']", name=b"['pierre', 'paul', 'jacques']", email=None, ) assert actual_author == expected_author def test_pypi_author_malformed_2(): data = { "author": "[marie, jeanne]", "author_email": "[marie@some, jeanne@thing]", } actual_author = author(data) expected_author = Person( fullname=b"[marie, jeanne] <[marie@some, jeanne@thing]>", name=b"[marie, jeanne]", email=b"[marie@some, jeanne@thing]", ) assert actual_author == expected_author def test_pypi_author_malformed_3(): data = { "author": "[marie, jeanne, pierre]", "author_email": "[marie@somewhere.org, jeanne@somewhere.org]", } actual_author = author(data) expected_author = Person( fullname=( b"[marie, jeanne, pierre] " b"<[marie@somewhere.org, jeanne@somewhere.org]>" ), name=b"[marie, jeanne, pierre]", email=b"[marie@somewhere.org, jeanne@somewhere.org]", ) actual_author == expected_author # configuration error # def test_pypi_api_url(): """Compute pypi api url from the pypi project url should be ok""" url = pypi_api_url("https://pypi.org/project/requests") assert url == "https://pypi.org/pypi/requests/json" def test_pypi_api_url_with_slash(): """Compute pypi api url from the pypi project url should be ok""" url = pypi_api_url("https://pypi.org/project/requests/") assert url == "https://pypi.org/pypi/requests/json" @pytest.mark.fs def test_pypi_extract_intrinsic_metadata(tmp_path, datadir): """Parsing existing archive's PKG-INFO should yield results""" uncompressed_archive_path = str(tmp_path) archive_path = path.join( datadir, "https_files.pythonhosted.org", "0805nexter-1.1.0.zip" ) uncompress(archive_path, dest=uncompressed_archive_path) actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path) expected_metadata = { "metadata_version": "1.0", "name": "0805nexter", "version": "1.1.0", "summary": "a simple printer of nested lest", "home_page": "http://www.hp.com", "author": "hgtkpython", "author_email": "2868989685@qq.com", "platforms": ["UNKNOWN"], } assert actual_metadata == expected_metadata @pytest.mark.fs def test_pypi_extract_intrinsic_metadata_failures(tmp_path): """Parsing inexistent path/archive/PKG-INFO yield None""" tmp_path = str(tmp_path) # py3.5 work around (PosixPath issue) # inexistent first level path assert extract_intrinsic_metadata("/something-inexistent") == {} # inexistent second level path (as expected by pypi archives) assert extract_intrinsic_metadata(tmp_path) == {} # inexistent PKG-INFO within second level path existing_path_no_pkginfo = path.join(tmp_path, "something") os.mkdir(existing_path_no_pkginfo) assert extract_intrinsic_metadata(tmp_path) == {} # LOADER SCENARIO # # "edge" cases (for the same origin) # # no release artifact: # {visit full, status: uneventful, no contents, etc...} requests_mock_datadir_missing_all = requests_mock_datadir_factory( ignore_urls=[ "https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip", # noqa "https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip", # noqa ] ) def test_pypi_no_release_artifact(swh_storage, requests_mock_datadir_missing_all): """Load a pypi project with all artifacts missing ends up with no snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None stats = get_stats(swh_storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches(swh_storage, url, status="partial", type="pypi") def test_pypi_fail__load_snapshot(swh_storage, requests_mock_datadir): """problem during loading: {visit: failed, status: failed, no snapshot} """ url = "https://pypi.org/project/0805nexter" with patch( "swh.loader.package.pypi.loader.PyPILoader._load_snapshot", side_effect=ValueError("Fake problem to fail visit"), ): loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 0, "revision": 2, "skipped_content": 0, "snapshot": 0, } == stats assert_last_visit_matches(swh_storage, url, status="failed", type="pypi") # problem during loading: # {visit: partial, status: uneventful, no snapshot} def test_pypi_release_with_traceback(swh_storage, requests_mock_datadir): url = "https://pypi.org/project/0805nexter" with patch( "swh.loader.package.pypi.loader.PyPILoader.last_snapshot", side_effect=ValueError("Fake problem to fail the visit"), ): loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(swh_storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats assert_last_visit_matches(swh_storage, url, status="failed", type="pypi") # problem during loading: failure early enough in between swh contents... # some contents (contents, directories, etc...) have been written in storage # {visit: partial, status: eventful, no snapshot} # problem during loading: failure late enough we can have snapshots (some # revisions are written in storage already) # {visit: partial, status: eventful, snapshot} # "normal" cases (for the same origin) # requests_mock_datadir_missing_one = requests_mock_datadir_factory( ignore_urls=[ "https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip", # noqa ] ) # some missing release artifacts: # {visit partial, status: eventful, 1 snapshot} def test_pypi_revision_metadata_structure( swh_storage, requests_mock_datadir, _0805nexter_api_info ): url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_revision_id = hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21") revision = swh_storage.revision_get([expected_revision_id])[0] assert revision is not None check_metadata_paths( revision.metadata, paths=[ ("intrinsic.tool", str), ("intrinsic.raw", dict), ("extrinsic.provider", str), ("extrinsic.when", str), ("extrinsic.raw", dict), ("original_artifact", list), ], ) for original_artifact in revision.metadata["original_artifact"]: check_metadata_paths( original_artifact, paths=[("filename", str), ("length", int), ("checksums", dict),], ) revision_swhid = CoreSWHID( object_type=ObjectType.REVISION, object_id=expected_revision_id ) directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=revision.directory ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://pypi.org/", ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.pypi.loader.PyPILoader", version=__version__, ), discovery_date=loader.visit_date, format="pypi-project-json", metadata=json.dumps( json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0] ).encode(), origin=url, revision=revision_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) def test_pypi_visit_with_missing_artifact( swh_storage, requests_mock_datadir_missing_one ): """Load a pypi project with some missing artifacts ends up with 1 snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("dd0e4201a232b1c104433741dbf45895b8ac9355") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } stats = get_stats(swh_storage) assert { "content": 3, "directory": 2, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats expected_contents = map( hash_to_bytes, [ "405859113963cb7a797642b45f171d6360425d16", "e5686aa568fdb1d19d7f1329267082fe40482d31", "83ecf6ec1114fd260ca7a833a2d165e71258c338", ], ) assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map( hash_to_bytes, [ "b178b66bd22383d5f16f4f5c923d39ca798861b4", "c3a58f8b57433a4b56caaa5033ae2e0931405338", ], ) assert list(swh_storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21"): hash_to_bytes( "b178b66bd22383d5f16f4f5c923d39ca798861b4" ), # noqa } assert list(swh_storage.revision_missing(expected_revs)) == [] expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"releases/1.2.0": SnapshotBranch( target=hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21"), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch( target=b"releases/1.2.0", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, storage=swh_storage) assert_last_visit_matches( swh_storage, url, status="partial", type="pypi", snapshot=expected_snapshot_id, ) def test_pypi_visit_with_1_release_artifact(swh_storage, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } stats = get_stats(swh_storage) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 0, "revision": 2, "skipped_content": 0, "snapshot": 1, } == stats expected_contents = map( hash_to_bytes, [ "a61e24cdfdab3bb7817f6be85d37a3e666b34566", "938c33483285fd8ad57f15497f538320df82aeb8", "a27576d60e08c94a05006d2e6d540c0fdb5f38c8", "405859113963cb7a797642b45f171d6360425d16", "e5686aa568fdb1d19d7f1329267082fe40482d31", "83ecf6ec1114fd260ca7a833a2d165e71258c338", ], ) assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map( hash_to_bytes, [ "05219ba38bc542d4345d5638af1ed56c7d43ca7d", "cf019eb456cf6f78d8c4674596f1c9a97ece8f44", "b178b66bd22383d5f16f4f5c923d39ca798861b4", "c3a58f8b57433a4b56caaa5033ae2e0931405338", ], ) assert list(swh_storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes("4c99891f93b81450385777235a37b5e966dd1571"): hash_to_bytes( "05219ba38bc542d4345d5638af1ed56c7d43ca7d" ), # noqa hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21"): hash_to_bytes( "b178b66bd22383d5f16f4f5c923d39ca798861b4" ), # noqa } assert list(swh_storage.revision_missing(expected_revs)) == [] expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"releases/1.1.0": SnapshotBranch( target=hash_to_bytes("4c99891f93b81450385777235a37b5e966dd1571"), target_type=TargetType.REVISION, ), b"releases/1.2.0": SnapshotBranch( target=hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21"), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch( target=b"releases/1.2.0", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) def test_pypi_multiple_visits_with_no_change(swh_storage, requests_mock_datadir): """Multiple visits with no changes results in 1 same snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a") assert actual_load_status == { "status": "eventful", "snapshot_id": snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=snapshot_id ) stats = get_stats(swh_storage) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 0, "revision": 2, "skipped_content": 0, "snapshot": 1, } == stats expected_snapshot = Snapshot( id=snapshot_id, branches={ b"releases/1.1.0": SnapshotBranch( target=hash_to_bytes("4c99891f93b81450385777235a37b5e966dd1571"), target_type=TargetType.REVISION, ), b"releases/1.2.0": SnapshotBranch( target=hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21"), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch( target=b"releases/1.2.0", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) actual_load_status2 = loader.load() assert actual_load_status2 == { "status": "uneventful", "snapshot_id": actual_load_status2["snapshot_id"], } visit_status2 = assert_last_visit_matches( swh_storage, url, status="full", type="pypi" ) stats2 = get_stats(swh_storage) expected_stats2 = stats.copy() expected_stats2["origin_visit"] = 1 + 1 assert expected_stats2 == stats2 # same snapshot assert visit_status2.snapshot == snapshot_id def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits): """With prior visit, 2nd load will result with a different snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) visit1_actual_load_status = loader.load() visit1_stats = get_stats(swh_storage) expected_snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a") assert visit1_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 0, "revision": 2, "skipped_content": 0, "snapshot": 1, } == visit1_stats # Reset internal state del loader._cached__raw_info del loader._cached_info visit2_actual_load_status = loader.load() visit2_stats = get_stats(swh_storage) assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status expected_snapshot_id2 = hash_to_bytes("2e5149a7b0725d18231a37b342e9b7c4e121f283") assert visit2_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id2.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2 ) assert { "content": 6 + 1, # 1 more content "directory": 4 + 2, # 2 more directories "origin": 1, "origin_visit": 1 + 1, "release": 0, "revision": 2 + 1, # 1 more revision "skipped_content": 0, "snapshot": 1 + 1, # 1 more snapshot } == visit2_stats expected_contents = map( hash_to_bytes, [ "a61e24cdfdab3bb7817f6be85d37a3e666b34566", "938c33483285fd8ad57f15497f538320df82aeb8", "a27576d60e08c94a05006d2e6d540c0fdb5f38c8", "405859113963cb7a797642b45f171d6360425d16", "e5686aa568fdb1d19d7f1329267082fe40482d31", "83ecf6ec1114fd260ca7a833a2d165e71258c338", "92689fa2b7fb4d4fc6fb195bf73a50c87c030639", ], ) assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map( hash_to_bytes, [ "05219ba38bc542d4345d5638af1ed56c7d43ca7d", "cf019eb456cf6f78d8c4674596f1c9a97ece8f44", "b178b66bd22383d5f16f4f5c923d39ca798861b4", "c3a58f8b57433a4b56caaa5033ae2e0931405338", "e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a", "52604d46843b898f5a43208045d09fcf8731631b", ], ) assert list(swh_storage.directory_missing(expected_dirs)) == [] # {revision hash: directory hash} expected_revs = { hash_to_bytes("4c99891f93b81450385777235a37b5e966dd1571"): hash_to_bytes( "05219ba38bc542d4345d5638af1ed56c7d43ca7d" ), # noqa hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21"): hash_to_bytes( "b178b66bd22383d5f16f4f5c923d39ca798861b4" ), # noqa hash_to_bytes("51247143b01445c9348afa9edfae31bf7c5d86b1"): hash_to_bytes( "e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a" ), # noqa } assert list(swh_storage.revision_missing(expected_revs)) == [] expected_snapshot = Snapshot( id=expected_snapshot_id2, branches={ b"releases/1.1.0": SnapshotBranch( target=hash_to_bytes("4c99891f93b81450385777235a37b5e966dd1571"), target_type=TargetType.REVISION, ), b"releases/1.2.0": SnapshotBranch( target=hash_to_bytes("e445da4da22b31bfebb6ffc4383dbf839a074d21"), target_type=TargetType.REVISION, ), b"releases/1.3.0": SnapshotBranch( target=hash_to_bytes("51247143b01445c9348afa9edfae31bf7c5d86b1"), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch( target=b"releases/1.3.0", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith("https://files.pythonhosted.org") ] # visited each artifact once across 2 visits assert len(urls) == len(set(urls)) # release artifact, no new artifact # {visit full, status uneventful, same snapshot as before} # release artifact, old artifact with different checksums # {visit full, status full, new snapshot with shared history and some new # different history} # release with multiple sdist artifacts per pypi "version" # snapshot branch output is different def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = "https://pypi.org/project/nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"releases/1.1.0/nexter-1.1.0.zip": SnapshotBranch( target=hash_to_bytes("4c99891f93b81450385777235a37b5e966dd1571"), target_type=TargetType.REVISION, ), b"releases/1.1.0/nexter-1.1.0.tar.gz": SnapshotBranch( target=hash_to_bytes("0bf88f5760cca7665d0af4d6575d9301134fe11a"), target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) def test_pypi_artifact_to_revision_id_none(): """Current loader version should stop soon if nothing can be found """ class artifact_metadata: sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" assert artifact_to_revision_id({}, artifact_metadata) is None known_artifacts = { "b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92": { "original_artifact": {"sha256": "something-irrelevant",}, }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None def test_pypi_artifact_to_revision_id_old_loader_version(): """Current loader version should solve old metadata scheme """ class artifact_metadata: sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" known_artifacts = { hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): { "original_artifact": {"sha256": "something-wrong",}, }, hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): { "original_artifact": { "sha256": "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec", # noqa }, }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes( "845673bfe8cbd31b1eaf757745a964137e6f9116" ) def test_pypi_artifact_to_revision_id_current_loader_version(): """Current loader version should be able to solve current metadata scheme """ class artifact_metadata: sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" known_artifacts = { hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): { "original_artifact": [ { "checksums": { "sha256": "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec", # noqa }, } ], }, hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): { "original_artifact": [{"checksums": {"sha256": "something-wrong"},}], }, } assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes( "b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92" ) + # there should not be more than one artifact + with pytest.raises(ValueError): + artifact_to_revision_id( + { + hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): { + "original_artifact": [ + {"checksums": {"sha256": artifact_metadata.sha256,},}, + {"checksums": {"sha256": artifact_metadata.sha256,},}, + ], + }, + }, + artifact_metadata, + ) + def test_pypi_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ url = "https://pypi.org/project/upymenu" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot(id=expected_snapshot_id, branches={}) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) def test_pypi_origin_not_found(swh_storage, requests_mock_datadir): url = "https://pypi.org/project/unknown" loader = PyPILoader(swh_storage, url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="pypi", snapshot=None )