diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
index 85d30ea..2fabd6a 100644
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -1,245 +1,245 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import requests
import types

from typing import (
    Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union
)

from swh.model.hashutil import hash_to_hex, hash_to_bytes
from swh.model.model import (
    Person, Revision, RevisionType, TimestampWithTimezone, Sha1Git,
)

from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download

logger = logging.getLogger(__name__)


class DepositLoader(PackageLoader):
    """Load a deposited artifact into the swh archive.

    """
    visit_type = 'deposit'

    def __init__(self, url: str, deposit_id: str):
        """Constructor

        Args:
            url: Origin url to associate the artifacts/metadata to
            deposit_id: Deposit identifier

        """
        super().__init__(url=url)

        config_deposit = self.config['deposit']
        self.deposit_id = deposit_id
        self.client = ApiClient(url=config_deposit['url'],
                                auth=config_deposit['auth'])
-        self._metadata = None
-
-    @property
-    def metadata(self):
-        if self._metadata is None:
-            self._metadata = self.client.metadata_get(self.deposit_id)
-        return self._metadata
+        self.metadata: Dict[str, Any] = {}

    def get_versions(self) -> Sequence[str]:
        # only 1 branch 'HEAD' with no alias since we only have 1 snapshot
        # branch
        return ['HEAD']

    def get_package_info(self, version: str) -> Generator[
            Tuple[str, Mapping[str, Any]], None, None]:
        p_info = {
            'filename': 'archive.zip',
            'raw': self.metadata,
        }
        yield 'HEAD', p_info

    def download_package(self, p_info: Mapping[str, Any],
                         tmpdir: str) -> List[Tuple[str, Mapping]]:
        """Override to allow use of the dedicated deposit client

        """
        return [self.client.archive_get(
            self.deposit_id, tmpdir, p_info['filename'])]

    def build_revision(
            self, a_metadata: Dict, uncompressed_path: str,
            directory: Sha1Git) -> Optional[Revision]:
        revision_data = a_metadata.pop('revision')
        # FIXME: the deposit no longer needs to build the revision
        date = TimestampWithTimezone.from_dict(revision_data['date'])
        metadata = revision_data['metadata']
        metadata.update({
            'extrinsic': {
                'provider': self.client.metadata_url(self.deposit_id),
                'when': self.visit_date.isoformat(),
                'raw': a_metadata,
            },
        })

        return Revision(
            type=RevisionType.TAR,
            message=revision_data['message'].encode('utf-8'),
            author=parse_author(revision_data['author']),
            date=date,
            committer=parse_author(revision_data['committer']),
            committer_date=date,
            parents=[hash_to_bytes(p)
                     for p in revision_data.get('parents', [])],
            directory=directory,
            synthetic=True,
            metadata=metadata,
        )

    def load(self) -> Dict:
-        # Usual loading
+        # First, make sure the deposit is known before triggering a load
+        try:
+            self.metadata = self.client.metadata_get(self.deposit_id)
+        except ValueError:
+            logger.error(f'Unknown deposit {self.deposit_id}, ignoring')
+            return {'status': 'failed'}
+        # Then do the usual loading
        r = super().load()
        success = r['status'] != 'failed'

        if success:
            # Update archive with metadata information
            origin_metadata = self.metadata['origin_metadata']
            logger.debug('origin_metadata: %s', origin_metadata)
            tools = self.storage.tool_add([origin_metadata['tool']])
            logger.debug('tools: %s', tools)
            tool_id = tools[0]['id']

            provider = origin_metadata['provider']
            # FIXME: Shall we delete this info?
            provider_id = self.storage.metadata_provider_add(
                provider['provider_name'],
                provider['provider_type'],
                provider['provider_url'],
                metadata=None)

            metadata = origin_metadata['metadata']
            self.storage.origin_metadata_add(
                self.url, self.visit_date, provider_id, tool_id, metadata)

        # Update deposit status
        try:
            if not success:
                self.client.status_update(self.deposit_id, status='failed')
                return r

            snapshot_id = hash_to_bytes(r['snapshot_id'])
            branches = self.storage.snapshot_get(snapshot_id)['branches']
            logger.debug('branches: %s', branches)
            if not branches:
                return r
            rev_id = branches[b'HEAD']['target']

            revisions = self.storage.revision_get([rev_id])
            # FIXME: inconsistency between tests and production code
            if isinstance(revisions, types.GeneratorType):
                revisions = list(revisions)
            revision = revisions[0]

            # Retrieve the revision identifier
            dir_id = revision['directory']

            # update the deposit's status to success with its
            # revision-id and directory-id
            self.client.status_update(
                self.deposit_id,
                status='done',
                revision_id=hash_to_hex(rev_id),
                directory_id=hash_to_hex(dir_id),
                origin_url=self.url)
        except Exception:
            logger.exception(
                'Problem when trying to update the deposit\'s status')
            return {'status': 'failed'}
        return r


def parse_author(author) -> Person:
    """See prior fixme

    """
    return Person(
        fullname=author['fullname'].encode('utf-8'),
        name=author['name'].encode('utf-8'),
        email=author['email'].encode('utf-8'),
    )


class ApiClient:
    """Private Deposit API client

    """
    def __init__(self, url, auth: Optional[Mapping[str, str]]):
        self.base_url = url.rstrip('/')
        self.auth = None if not auth else (auth['username'], auth['password'])

    def do(self, method: str, url: str, *args, **kwargs):
        """Internal method to deal with requests, possibly with basic HTTP
           authentication.

        Args:
            method (str): supported http methods as in get/post/put

        Returns:
            The request's execution output

        """
        method_fn = getattr(requests, method)
        if self.auth:
            kwargs['auth'] = self.auth
        return method_fn(url, *args, **kwargs)

    def archive_get(
            self, deposit_id: Union[int, str], tmpdir: str,
            filename: str) -> Tuple[str, Dict]:
        """Retrieve deposit's archive artifact locally

        """
        url = f'{self.base_url}/{deposit_id}/raw/'
        return download(url, dest=tmpdir, filename=filename, auth=self.auth)

    def metadata_url(self, deposit_id: Union[int, str]) -> str:
        return f'{self.base_url}/{deposit_id}/meta/'

    def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]:
        """Retrieve deposit's metadata artifact as JSON

        """
        url = self.metadata_url(deposit_id)
        r = self.do('get', url)
        if r.ok:
            return r.json()

        msg = f'Problem when retrieving deposit metadata at {url}'
        logger.error(msg)
        raise ValueError(msg)

    def status_update(self, deposit_id: Union[int, str], status: str,
                      revision_id: Optional[str] = None,
                      directory_id: Optional[str] = None,
                      origin_url: Optional[str] = None):
        """Update the deposit's information, including its status and the
           persistent identifiers resulting from the loading.

        """
        url = f'{self.base_url}/{deposit_id}/update/'
        payload = {'status': status}
        if revision_id:
            payload['revision_id'] = revision_id
        if directory_id:
            payload['directory_id'] = directory_id
        if origin_url:
            payload['origin_url'] = origin_url

        self.do('put', url, json=payload)
diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py
index 0e6b32e..0ec8922 100644
--- a/swh/loader/package/deposit/tests/test_deposit.py
+++ b/swh/loader/package/deposit/tests/test_deposit.py
@@ -1,207 +1,205 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re

from swh.model.hashutil import hash_to_bytes
from swh.loader.package.deposit.loader import DepositLoader

from swh.loader.package.tests.common import (
    check_snapshot, check_metadata_paths, get_stats
)

from swh.core.pytest_plugin import requests_mock_datadir_factory


def test_deposit_init_ok(swh_config, swh_loader_config):
    url = 'some-url'
    deposit_id = 999
    loader = DepositLoader(url, deposit_id)  # Something that does not exist

    assert loader.url == url
    assert loader.client is not None
    assert loader.client.base_url == swh_loader_config['deposit']['url']


-def test_deposit_loading_failure_to_fetch_metadata(swh_config):
-    """Error during fetching artifact ends us with failed/partial visit
+def test_deposit_loading_unknown_deposit(
+        swh_config, requests_mock_datadir):
+    """Loading an unknown deposit should fail
+
+    no origin, no visit, no snapshot
    """
    # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
    url = 'some-url'
-    unknown_deposit_id = 666
+    unknown_deposit_id = 667
    loader = DepositLoader(url, unknown_deposit_id)  # does not exist

    actual_load_status = loader.load()
    assert actual_load_status == {'status': 'failed'}

    stats = get_stats(loader.storage)

    assert {
        'content': 0,
        'directory': 0,
-        'origin': 1,
-        'origin_visit': 1,
+        'origin': 0,
+        'origin_visit': 0,
        'person': 0,
        'release': 0,
        'revision': 0,
        'skipped_content': 0,
        'snapshot': 0,
    } == stats

-    origin_visit = next(loader.storage.origin_visit_get(url))
-    assert origin_visit['status'] == 'partial'
-    assert origin_visit['type'] == 'deposit'
-

requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[
    'https://deposit.softwareheritage.org/1/private/666/raw/',
])


def test_deposit_loading_failure_to_retrieve_1_artifact(
        swh_config, requests_mock_datadir_missing_one):
    """Deposit with missing artifact ends up with an uneventful/partial visit
    """
    # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
    url = 'some-url-2'
    deposit_id = 666
    loader = DepositLoader(url, deposit_id)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'uneventful'
    assert actual_load_status['snapshot_id'] is not None

    stats = get_stats(loader.storage)
    assert {
        'content': 0,
        'directory': 0,
        'origin': 1,
        'origin_visit': 1,
        'person': 0,
        'release': 0,
        'revision': 0,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'partial'
    assert origin_visit['type'] == 'deposit'


def test_revision_metadata_structure(swh_config, requests_mock_datadir):
    # do not care for deposit update query
    requests_mock_datadir.put(re.compile('https'))

    url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
    deposit_id = 666
    loader = DepositLoader(url, deposit_id)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    assert actual_load_status['snapshot_id'] is not None

    expected_revision_id = hash_to_bytes(
        '637318680351f5d78856d13264faebbd91efe9bb')
    revision = list(loader.storage.revision_get([expected_revision_id]))[0]

    assert revision is not None

    check_metadata_paths(revision['metadata'], paths=[
        ('extrinsic.provider', str),
        ('extrinsic.when', str),
        ('extrinsic.raw', dict),
        ('original_artifact', list),
    ])

    for original_artifact in revision['metadata']['original_artifact']:
        check_metadata_paths(original_artifact, paths=[
            ('filename', str),
            ('length', int),
            ('checksums', dict),
        ])


def test_deposit_loading_ok(swh_config, requests_mock_datadir):
    requests_mock_datadir.put(re.compile('https'))  # do not care for put

    url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
    deposit_id = 666
    loader = DepositLoader(url, deposit_id)

    actual_load_status = loader.load()
    expected_snapshot_id = 'b2b327b33dc85818bd23c3ccda8b7e675a66ecbd'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id,
    }

    stats = get_stats(loader.storage)
    assert {
        'content': 303,
        'directory': 12,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 1,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'deposit'

    expected_branches = {
        'HEAD': {
            'target': '637318680351f5d78856d13264faebbd91efe9bb',
            'target_type': 'revision',
        },
    }

    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, storage=loader.storage)

    # check metadata

    tool = {
        "name": "swh-deposit",
        "version": "0.0.1",
        "configuration": {
            "sword_version": "2",
        }
    }

    tool = loader.storage.tool_get(tool)
    assert tool is not None
    assert tool['id'] is not None

    provider = {
        "provider_name": "hal",
        "provider_type": "deposit_client",
        "provider_url": "https://hal-test.archives-ouvertes.fr/",
        "metadata": None,
    }

    provider = loader.storage.metadata_provider_get_by(provider)
    assert provider is not None
    assert provider['id'] is not None

    metadata = list(loader.storage.origin_metadata_get_by(
        url, provider_type='deposit_client'))
    assert metadata is not None
    assert isinstance(metadata, list)
    assert len(metadata) == 1
    metadata0 = metadata[0]

    assert metadata0['provider_id'] == provider['id']
    assert metadata0['provider_type'] == 'deposit_client'
    assert metadata0['tool_id'] == tool['id']
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index fee6f11..174df8d 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,448 +1,452 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import logging
import tempfile
import os

from typing import (
    Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
)

import attr

from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
from swh.model import from_disk
from swh.model.hashutil import hash_to_hex
from swh.model.model import (
    BaseModel, Sha1Git, Content, SkippedContent, Directory, Revision,
    TargetType, Snapshot, Origin
)
from swh.storage import get_storage
from swh.storage.algos.snapshot import snapshot_get_all_branches
from swh.loader.package.utils import download

logger = logging.getLogger(__name__)


class PackageLoader:
    # Origin visit type (str) set by the loader
    visit_type = ''

    def __init__(self, url):
        """Loader's constructor. This raises an exception if the minimal
           required configuration is missing (cf. the `_check_configuration`
           method).

        Args:
            url (str): Origin url to load data from

        """
        # This expects to use the environment variable SWH_CONFIG_FILENAME
        self.config = SWHConfig.parse_config_file()
        self._check_configuration()
        self.storage = get_storage(**self.config['storage'])
        self.url = url
        self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
        self.max_content_size = self.config['max_content_size']

    def _check_configuration(self):
        """Checks the minimal configuration required is set for the loader.

        If some required configuration is missing, an exception detailing the
        issue is raised.

        """
        if 'storage' not in self.config:
            raise ValueError(
                'Misconfiguration, at least the storage key should be set')

    def get_versions(self) -> Sequence[str]:
        """Return the list of all published package versions.

        Returns:
            Sequence of published versions
        """
        return []

    def get_package_info(self, version: str) -> Generator[
            Tuple[str, Mapping[str, Any]], None, None]:
        """Given a release version of a package, retrieve the associated
           package information for that version.

        Args:
            version: Package version

        Returns:
            (branch name, package metadata)

        """
        yield from {}

    def build_revision(
            self, a_metadata: Dict, uncompressed_path: str,
            directory: Sha1Git) -> Optional[Revision]:
        """Build the revision from the archive metadata (extrinsic
        artifact metadata) and the intrinsic metadata.

        Args:
            a_metadata: Artifact metadata
            uncompressed_path: Artifact uncompressed path on disk

        Returns:
            Revision object, or None

        """
        raise NotImplementedError('build_revision')

    def get_default_version(self) -> str:
        """Retrieve the latest release version if any.

        Returns:
            Latest version

        """
        return ''

    def last_snapshot(self) -> Optional[Snapshot]:
        """Retrieve the last snapshot

        """
        snapshot = None
        visit = self.storage.origin_visit_get_latest(
            self.url, require_snapshot=True)
        if visit and visit.get('snapshot'):
            snapshot = Snapshot.from_dict(snapshot_get_all_branches(
                self.storage, visit['snapshot']))
        return snapshot

    def known_artifacts(
            self, snapshot: Optional[Snapshot]) -> Dict[Sha1Git, BaseModel]:
        """Retrieve the known releases/artifacts for the origin.

        Args:
            snapshot: snapshot for the visit

        Returns:
            Dict whose keys are revision ids (bytes) and values are
            metadata dicts.

        """
        if not snapshot:
            return {}

        # retrieve only revisions (e.g. we do not want aliases here)
        revs = [rev.target
                for rev in snapshot.branches.values()
                if rev and rev.target_type == TargetType.REVISION]
        known_revisions = self.storage.revision_get(revs)

        ret = {}
        for revision in known_revisions:
            if not revision:  # revision_get can return None
                continue
            ret[revision['id']] = revision['metadata']
        return ret

    def resolve_revision_from(
            self, known_artifacts: Dict, artifact_metadata: Dict) \
            -> Optional[bytes]:
        """Resolve the revision from a snapshot and an artifact metadata dict.

        If the artifact has already been downloaded, this will return the
        existing revision targeting that uncompressed artifact directory.
        Otherwise, this returns None.

        Args:
            known_artifacts: Artifacts already known for the origin, as
                returned by known_artifacts
            artifact_metadata: Information dict

        Returns:
            None or revision identifier

        """
        return None

    def download_package(self, p_info: Mapping[str, Any],
                         tmpdir: str) -> List[Tuple[str, Mapping]]:
        """Download artifacts for a specific package. All downloads happen
        in the tmpdir folder.

        Default implementation expects the artifacts package info to be
        about one artifact per package.

        Note that most implementations have 1 artifact per package. But some
        implementations have multiple artifacts per package (debian); some
        have none, where the package is the artifact (gnu).

        Args:
            p_info: Information on the package artifacts to
                download (url, filename, etc.)
            tmpdir: Location to retrieve such artifacts

        Returns:
            List of (path, computed hashes)

        """
        a_uri = p_info['url']
        filename = p_info.get('filename')
        return [download(a_uri, dest=tmpdir, filename=filename)]

    def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]],
                   dest: str) -> str:
        """Uncompress the artifact(s) in the destination folder dest.

        Optionally, this could need to use the p_info dict for some more
        information (debian).

        """
        uncompressed_path = os.path.join(dest, 'src')
        for a_path, _ in dl_artifacts:
            uncompress(a_path, dest=uncompressed_path)
        return uncompressed_path

    def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
        """Return an extra dict of branches that are used to update the set
        of branches.

        """
        return {}
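To make the hook methods above concrete, here is a minimal sketch of a subclass that load() (below) could drive end to end. The class name, visit type, URL and hard-coded version are assumptions made up for this example, not part of the patch:

    from swh.loader.package.loader import PackageLoader


    class ExampleLoader(PackageLoader):
        visit_type = 'example'  # hypothetical visit type

        def get_versions(self):
            return ['1.0.0']

        def get_default_version(self):
            return '1.0.0'

        def get_package_info(self, version):
            # one branch per version; 'raw' is what resolve_revision_from()
            # and build_revision() will receive
            yield 'releases/%s' % version, {
                'url': 'https://example.org/pkg-%s.tar.gz' % version,  # assumed
                'filename': 'pkg-%s.tar.gz' % version,
                'raw': {'version': version},
            }

        def build_revision(self, a_metadata, uncompressed_path, directory):
            # returning None makes _load_revision() skip the artifact (cf.
            # "missing intrinsic metadata" below); a real loader would build
            # a Revision here, as the deposit loader above does
            return None
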
    def load(self) -> Dict:
        """Load for a specific origin the associated contents.

        for each package version of the origin

        1. Fetch the files for one package version. By default, this can be
           implemented as a simple HTTP request. Loaders with more specific
           requirements can override this, e.g.: the PyPI loader checks the
           integrity of the downloaded files; the Debian loader has to
           download and check several files for one package version.

        2. Extract the downloaded files. By default, this would be a
           universal archive/tarball extraction.

           Loaders for specific formats can override this method (for
           instance, the Debian loader uses dpkg-source -x).

        3. Convert the extracted directory to a set of Software Heritage
           objects, using swh.model.from_disk.

        4. Extract the metadata from the unpacked directories. This would
           only be applicable for "smart" loaders like npm (parsing the
           package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing
           debian/changelog and debian/control).

           On "minimal-metadata" sources such as the GNU archive, the lister
           should provide the minimal set of metadata needed to populate the
           revision/release objects (authors, dates) as an argument to the
           task.

        5. Generate the revision/release objects for the given version, from
           the data generated at steps 3 and 4.

        end for each

        6. Generate and load the snapshot for the visit

           Using the revisions/releases collected at step 5., and the extra
           branch information from extra_branches(), generate a snapshot and
           load it into the Software Heritage archive

        """
        status_load = 'uneventful'  # either: eventful, uneventful, failed
        status_visit = 'full'  # either: partial, full
        tmp_revisions = {}  # type: Dict[str, List]
        snapshot = None

        # Prepare origin and origin_visit
        origin = Origin(url=self.url)
        try:
            self.storage.origin_add_one(origin)
            visit = self.storage.origin_visit_add(
                self.url, date=self.visit_date, type=self.visit_type)
        except Exception:
            logger.exception('Failed to create origin/origin_visit:')
            return {'status': 'failed'}

        try:
            last_snapshot = self.last_snapshot()
            logger.debug('last snapshot: %s', last_snapshot)
            known_artifacts = self.known_artifacts(last_snapshot)
            logger.debug('known artifacts: %s', known_artifacts)

-            # Retrieve the default release version (the "latest" one)
-            default_version = self.get_default_version()
-            logger.debug('default version: %s', default_version)
-
            for version in self.get_versions():  # for each
                logger.debug('version: %s', version)
                tmp_revisions[version] = []
                # `p_` stands for `package_`
                for branch_name, p_info in self.get_package_info(version):
                    logger.debug('package_info: %s', p_info)
                    revision_id = self.resolve_revision_from(
                        known_artifacts, p_info['raw'])
                    if revision_id is None:
                        (revision_id, loaded) = \
                            self._load_revision(p_info, origin)
                        if loaded:
                            status_load = 'eventful'
                        else:
                            status_visit = 'partial'

                    if revision_id is None:
                        continue

                    tmp_revisions[version].append((branch_name, revision_id))

-            snapshot = self._load_snapshot(default_version, tmp_revisions)
-            if hasattr(self.storage, 'flush'):
-                self.storage.flush()
        except Exception:
            logger.exception('Failed to load %s', self.url)
            status_visit = 'partial'
            status_load = 'failed'
        finally:
+            # Retrieve the default release version (the "latest" one)
+            default_version = self.get_default_version()
+            logger.debug('default version: %s', default_version)
+            extra_branches = self.extra_branches()
+            logger.debug('extra branches: %s', extra_branches)
+            snapshot = self._load_snapshot(
+                default_version, tmp_revisions, extra_branches)
+            if hasattr(self.storage, 'flush'):
+                self.storage.flush()
            self.storage.origin_visit_update(
                origin=self.url, visit_id=visit.visit, status=status_visit,
                snapshot=snapshot and snapshot.id)

-        result = {
+        result: Dict[str, Any] = {
            'status': status_load,
-        }  # type: Dict[str, Any]
+        }
        if snapshot:
            result['snapshot_id'] = hash_to_hex(snapshot.id)
        return result

    def _load_revision(self, p_info, origin) -> Tuple[Optional[Sha1Git], bool]:
        """Does all the loading of a revision itself:

        * downloads a package and uncompresses it
        * loads it from disk
        * adds contents, directories, and revision to self.storage
        * returns (revision_id, loaded)

        """
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                dl_artifacts = self.download_package(p_info, tmpdir)
            except Exception:
                logger.exception('Unable to retrieve %s', p_info)
                return (None, False)

            try:
                uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir)
                logger.debug('uncompressed_path: %s', uncompressed_path)
            except ValueError:
                logger.exception('Failed to uncompress %s', p_info['url'])
                return (None, False)

            directory = from_disk.Directory.from_disk(
                path=uncompressed_path.encode('utf-8'),
                max_content_length=self.max_content_size)

            contents: List[Content] = []
            skipped_contents: List[SkippedContent] = []
            directories: List[Directory] = []

            for obj in directory.iter_tree():
                obj = obj.to_model()
                if isinstance(obj, Content):
                    # FIXME: read the data from disk later (when the
                    # storage buffer is flushed).
                    obj = obj.with_data()
                    contents.append(obj)
                elif isinstance(obj, SkippedContent):
                    skipped_contents.append(obj)
                elif isinstance(obj, Directory):
                    directories.append(obj)
                else:
                    raise TypeError(
                        f'Unexpected content type from disk: {obj}')

            logger.debug('Number of skipped contents: %s',
                         len(skipped_contents))
            self.storage.skipped_content_add(skipped_contents)
            logger.debug('Number of contents: %s', len(contents))
            self.storage.content_add(contents)

            logger.debug('Number of directories: %s', len(directories))
            self.storage.directory_add(directories)

            # FIXME: This should be release. cf. D409
            revision = self.build_revision(
                p_info['raw'], uncompressed_path, directory=directory.hash)
            if not revision:
                # Some artifacts are missing intrinsic metadata
                # skipping those
                return (None, True)

            metadata = revision.metadata or {}
            metadata.update({
                'original_artifact': [
                    hashes for _, hashes in dl_artifacts
                ],
            })
            revision = attr.evolve(revision, metadata=metadata)

            logger.debug('Revision: %s', revision)
            self.storage.revision_add([revision])
            return (revision.id, True)

    def _load_snapshot(
            self, default_version: str,
-            revisions: Dict[str, List[Tuple[str, bytes]]]) -> Snapshot:
-        """Build snapshot out of the current revisions stored. Then load it in
-        the storage.
+            revisions: Dict[str, List[Tuple[str, bytes]]],
+            extra_branches: Dict[bytes, Mapping[str, Any]]
+    ) -> Optional[Snapshot]:
+        """Build snapshot out of the current revisions stored and extra
+        branches. Then load it in the storage.

        """
        logger.debug('revisions: %s', revisions)
        # Build and load the snapshot
        branches = {}  # type: Dict[bytes, Mapping[str, Any]]
        for version, branch_name_revisions in revisions.items():
            if version == default_version and \
                    len(branch_name_revisions) == 1:
                # only 1 branch (no ambiguity), we can create an alias
                # branch 'HEAD'
                branch_name, _ = branch_name_revisions[0]
                # except for some corner case (deposit)
                if branch_name != 'HEAD':
                    branches[b'HEAD'] = {
                        'target_type': 'alias',
                        'target': branch_name.encode('utf-8'),
                    }

            for branch_name, target in branch_name_revisions:
                branches[branch_name.encode('utf-8')] = {
                    'target_type': 'revision',
                    'target': target,
                }

        # Deal with extra-branches
-        for name, branch_target in self.extra_branches().items():
+        for name, branch_target in extra_branches.items():
            if name in branches:
                logger.error("Extra branch '%s' has been ignored", name)
            else:
                branches[name] = branch_target

        snapshot_data = {
            'branches': branches
        }
        logger.debug('snapshot: %s', snapshot_data)
        snapshot = Snapshot.from_dict(snapshot_data)
        logger.debug('snapshot: %s', snapshot)
        self.storage.snapshot_add([snapshot])

        return snapshot
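For illustration, with a default version of '1.2.0', one revision per version, and one extra branch, the loop above would hand Snapshot.from_dict() a branches mapping shaped like the sketch below (targets and the extra branch name are made-up values, not from this patch):

    branches = {
        b'releases/1.1.0': {'target_type': 'revision', 'target': b'\x01' * 20},
        b'releases/1.2.0': {'target_type': 'revision', 'target': b'\x02' * 20},
        # alias added because the default version has exactly one branch
        b'HEAD': {'target_type': 'alias', 'target': b'releases/1.2.0'},
        # taken verbatim from extra_branches(), since the name does not collide
        b'refs/heads/master': {'target_type': 'revision', 'target': b'\x03' * 20},
    }
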
diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py
index 116eede..7f68b1d 100644
--- a/swh/loader/package/pypi/tests/test_pypi.py
+++ b/swh/loader/package/pypi/tests/test_pypi.py
@@ -1,834 +1,836 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
from os import path
import pytest

from unittest.mock import patch

from swh.core.tarball import uncompress
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Person

from swh.loader.package.pypi.loader import (
    PyPILoader, pypi_api_url, author, extract_intrinsic_metadata,
    artifact_to_revision_id
)
from swh.loader.package.tests.common import (
    check_snapshot, check_metadata_paths, get_stats
)


def test_author_basic():
    data = {
        'author': "i-am-groot",
        'author_email': 'iam@groot.org',
    }
    actual_author = author(data)

    expected_author = Person(
        fullname=b'i-am-groot <iam@groot.org>',
        name=b'i-am-groot',
        email=b'iam@groot.org',
    )

    assert actual_author == expected_author


def test_author_empty_email():
    data = {
        'author': 'i-am-groot',
        'author_email': '',
    }
    actual_author = author(data)

    expected_author = Person(
        fullname=b'i-am-groot',
        name=b'i-am-groot',
        email=b'',
    )

    assert actual_author == expected_author


def test_author_empty_name():
    data = {
        'author': "",
        'author_email': 'iam@groot.org',
    }
    actual_author = author(data)

    expected_author = Person(
        fullname=b' <iam@groot.org>',
        name=b'',
        email=b'iam@groot.org',
    )

    assert actual_author == expected_author


def test_author_malformed():
    data = {
        'author': "['pierre', 'paul', 'jacques']",
        'author_email': None,
    }

    actual_author = author(data)

    expected_author = Person(
        fullname=b"['pierre', 'paul', 'jacques']",
        name=b"['pierre', 'paul', 'jacques']",
        email=None,
    )

    assert actual_author == expected_author


def test_author_malformed_2():
    data = {
        'author': '[marie, jeanne]',
        'author_email': '[marie@some, jeanne@thing]',
    }

    actual_author = author(data)

    expected_author = Person(
        fullname=b'[marie, jeanne] <[marie@some, jeanne@thing]>',
        name=b'[marie, jeanne]',
        email=b'[marie@some, jeanne@thing]',
    )

    assert actual_author == expected_author


def test_author_malformed_3():
    data = {
        'author': '[marie, jeanne, pierre]',
        'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
    }

    actual_author = author(data)

    expected_author = Person(
        fullname=(
            b'[marie, jeanne, pierre] '
            b'<[marie@somewhere.org, jeanne@somewhere.org]>'
        ),
        name=b'[marie, jeanne, pierre]',
        email=b'[marie@somewhere.org, jeanne@somewhere.org]',
    )

    assert actual_author == expected_author
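The tests above pin down author()'s fullname convention: name and email are joined as 'name <email>' whenever the email is non-empty. A rough equivalent of that composition, as a sketch only (the real implementation lives in swh.loader.package.pypi.loader, and the helper name here is made up):

    def make_fullname(name: str, email: str) -> bytes:
        # 'name <email>' when an email is present, bare name otherwise
        if email:
            return ('%s <%s>' % (name, email)).encode('utf-8')
        return name.encode('utf-8')
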
# configuration error #


def test_badly_configured_loader_raise(monkeypatch):
    """Badly configured loader should raise"""
    monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False)
    with pytest.raises(ValueError) as e:
        PyPILoader(url='some-url')
    assert 'Misconfiguration' in e.value.args[0]


def test_pypi_api_url():
    """Computing the pypi api url from the pypi project url should work"""
    url = pypi_api_url('https://pypi.org/project/requests')
    assert url == 'https://pypi.org/pypi/requests/json'


def test_pypi_api_url_with_slash():
    """Computing the pypi api url from a pypi project url with a trailing
       slash should work"""
    url = pypi_api_url('https://pypi.org/project/requests/')
    assert url == 'https://pypi.org/pypi/requests/json'


@pytest.mark.fs
def test_extract_intrinsic_metadata(tmp_path, datadir):
    """Parsing existing archive's PKG-INFO should yield results"""
    uncompressed_archive_path = str(tmp_path)
    archive_path = path.join(
        datadir, 'https_files.pythonhosted.org', '0805nexter-1.1.0.zip')
    uncompress(archive_path, dest=uncompressed_archive_path)

    actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)
    expected_metadata = {
        'metadata_version': '1.0',
        'name': '0805nexter',
        'version': '1.1.0',
        'summary': 'a simple printer of nested lest',
        'home_page': 'http://www.hp.com',
        'author': 'hgtkpython',
        'author_email': '2868989685@qq.com',
        'platforms': ['UNKNOWN'],
    }

    assert actual_metadata == expected_metadata


@pytest.mark.fs
def test_extract_intrinsic_metadata_failures(tmp_path):
    """Parsing a nonexistent path/archive/PKG-INFO yields an empty dict"""
    tmp_path = str(tmp_path)  # py3.5 work around (PosixPath issue)
    # nonexistent first level path
    assert extract_intrinsic_metadata('/something-inexistent') == {}
    # nonexistent second level path (as expected by pypi archives)
    assert extract_intrinsic_metadata(tmp_path) == {}
    # nonexistent PKG-INFO within second level path
    existing_path_no_pkginfo = path.join(tmp_path, 'something')
    os.mkdir(existing_path_no_pkginfo)
    assert extract_intrinsic_metadata(tmp_path) == {}


# LOADER SCENARIO #

# "edge" cases (for the same origin) #

# no release artifact:
# {visit full, status: uneventful, no contents, etc...}
requests_mock_datadir_missing_all = requests_mock_datadir_factory(ignore_urls=[
    'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip',  # noqa
    'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip',  # noqa
])


def test_no_release_artifact(swh_config, requests_mock_datadir_missing_all):
    """Loading a pypi project with all artifacts missing ends up with an
       empty snapshot
    """
    url = 'https://pypi.org/project/0805nexter'
    loader = PyPILoader(url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'uneventful'
    assert actual_load_status['snapshot_id'] is not None

    stats = get_stats(loader.storage)
    assert {
        'content': 0,
        'directory': 0,
        'origin': 1,
        'origin_visit': 1,
        'person': 0,
        'release': 0,
        'revision': 0,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'partial'
    assert origin_visit['type'] == 'pypi'


# problem during loading:
# {visit: partial, status: failed, empty snapshot}


-def test_release_with_traceback(swh_config):
+def test_release_with_traceback(swh_config, requests_mock_datadir):
    url = 'https://pypi.org/project/0805nexter'
-    with patch('swh.loader.package.pypi.loader.PyPILoader.get_default_version',
-               side_effect=ValueError('Problem')):
+    with patch('swh.loader.package.pypi.loader.PyPILoader.last_snapshot',
+               side_effect=ValueError('Fake problem to fail the visit')):
        loader = PyPILoader(url)

        actual_load_status = loader.load()
-        assert actual_load_status == {'status': 'failed'}
+        assert actual_load_status['status'] == 'failed'
+        assert actual_load_status[
+            'snapshot_id'] == '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e'

        stats = get_stats(loader.storage)

        assert {
            'content': 0,
            'directory': 0,
            'origin': 1,
            'origin_visit': 1,
            'person': 0,
            'release': 0,
            'revision': 0,
            'skipped_content': 0,
-            'snapshot': 0,
+            'snapshot': 1,
        } == stats

        origin_visit = next(loader.storage.origin_visit_get(url))
        assert origin_visit['status'] == 'partial'
        assert origin_visit['type'] == 'pypi'


# problem during loading: failure early enough in between swh contents...
# some contents (contents, directories, etc...) have been written in storage
# {visit: partial, status: eventful, no snapshot}

# problem during loading: failure late enough we can have snapshots (some
# revisions are written in storage already)
# {visit: partial, status: eventful, snapshot}

# "normal" cases (for the same origin) #


requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[
    'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip',  # noqa
])

# some missing release artifacts:
# {visit partial, status: eventful, 1 snapshot}


def test_revision_metadata_structure(swh_config, requests_mock_datadir):
    url = 'https://pypi.org/project/0805nexter'
    loader = PyPILoader(url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    assert actual_load_status['snapshot_id'] is not None

    expected_revision_id = hash_to_bytes(
        'e445da4da22b31bfebb6ffc4383dbf839a074d21')
    revision = list(loader.storage.revision_get([expected_revision_id]))[0]
    assert revision is not None

    check_metadata_paths(revision['metadata'], paths=[
        ('intrinsic.tool', str),
        ('intrinsic.raw', dict),
        ('extrinsic.provider', str),
        ('extrinsic.when', str),
        ('extrinsic.raw', dict),
        ('original_artifact', list),
    ])

    for original_artifact in revision['metadata']['original_artifact']:
        check_metadata_paths(original_artifact, paths=[
            ('filename', str),
            ('length', int),
            ('checksums', dict),
        ])


def test_visit_with_missing_artifact(
        swh_config, requests_mock_datadir_missing_one):
    """Loading a pypi project with some missing artifacts ends up with
       1 snapshot
    """
    url = 'https://pypi.org/project/0805nexter'
    loader = PyPILoader(url)

    actual_load_status = loader.load()
    expected_snapshot_id = 'dd0e4201a232b1c104433741dbf45895b8ac9355'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }

    stats = get_stats(loader.storage)

    assert {
        'content': 3,
        'directory': 2,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 1,
        'skipped_content': 0,
        'snapshot': 1
    } == stats

    expected_contents = map(hash_to_bytes, [
        '405859113963cb7a797642b45f171d6360425d16',
        'e5686aa568fdb1d19d7f1329267082fe40482d31',
        '83ecf6ec1114fd260ca7a833a2d165e71258c338',
    ])

    assert list(loader.storage.content_missing_per_sha1(expected_contents))\
        == []

    expected_dirs = map(hash_to_bytes, [
        'b178b66bd22383d5f16f4f5c923d39ca798861b4',
        'c3a58f8b57433a4b56caaa5033ae2e0931405338',
    ])

    assert list(loader.storage.directory_missing(expected_dirs)) == []

    # {revision hash: directory hash}
    expected_revs = {
        hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'),  # noqa
    }
    assert list(loader.storage.revision_missing(expected_revs)) == []

    expected_branches = {
        'releases/1.2.0': {
            'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
            'target_type': 'revision',
        },
        'HEAD': {
            'target': 'releases/1.2.0',
            'target_type': 'alias',
        },
    }

    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, storage=loader.storage)

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'partial'
    assert origin_visit['type'] == 'pypi'


def test_visit_with_1_release_artifact(swh_config, requests_mock_datadir):
    """With no prior visit, loading a pypi project ends up with 1 snapshot
    """
    url = 'https://pypi.org/project/0805nexter'
    loader = PyPILoader(url)

    actual_load_status = loader.load()
    expected_snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }

    stats = get_stats(loader.storage)
    assert {
        'content': 6,
        'directory': 4,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 2,
        'skipped_content': 0,
        'snapshot': 1
    } == stats

    expected_contents = map(hash_to_bytes, [
        'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
        '938c33483285fd8ad57f15497f538320df82aeb8',
        'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
        '405859113963cb7a797642b45f171d6360425d16',
        'e5686aa568fdb1d19d7f1329267082fe40482d31',
        '83ecf6ec1114fd260ca7a833a2d165e71258c338',
    ])

    assert list(loader.storage.content_missing_per_sha1(expected_contents))\
        == []

    expected_dirs = map(hash_to_bytes, [
        '05219ba38bc542d4345d5638af1ed56c7d43ca7d',
        'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
        'b178b66bd22383d5f16f4f5c923d39ca798861b4',
        'c3a58f8b57433a4b56caaa5033ae2e0931405338',
    ])

    assert list(loader.storage.directory_missing(expected_dirs)) == []

    # {revision hash: directory hash}
    expected_revs = {
        hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'),  # noqa
        hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'),  # noqa
    }
    assert list(loader.storage.revision_missing(expected_revs)) == []

    expected_branches = {
        'releases/1.1.0': {
            'target': '4c99891f93b81450385777235a37b5e966dd1571',
            'target_type': 'revision',
        },
        'releases/1.2.0': {
            'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
            'target_type': 'revision',
        },
        'HEAD': {
            'target': 'releases/1.2.0',
            'target_type': 'alias',
        },
    }

    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'pypi'


def test_multiple_visits_with_no_change(swh_config, requests_mock_datadir):
    """Multiple visits with no changes result in the same snapshot
    """
    url = 'https://pypi.org/project/0805nexter'
    loader = PyPILoader(url)

    actual_load_status = loader.load()
    snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': snapshot_id,
    }

    stats = get_stats(loader.storage)

    assert {
        'content': 6,
        'directory': 4,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 2,
        'skipped_content': 0,
        'snapshot': 1
    } == stats

    expected_branches = {
        'releases/1.1.0': {
            'target': '4c99891f93b81450385777235a37b5e966dd1571',
            'target_type': 'revision',
        },
        'releases/1.2.0': {
            'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
            'target_type': 'revision',
        },
        'HEAD': {
            'target': 'releases/1.2.0',
            'target_type': 'alias',
        },
    }

    expected_snapshot = {
        'id': snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'pypi'

    actual_load_status2 = loader.load()
    assert actual_load_status2 == {
        'status': 'uneventful',
        'snapshot_id': actual_load_status2['snapshot_id']
    }

    stats2 = get_stats(loader.storage)
    expected_stats2 = stats.copy()
    expected_stats2['origin_visit'] = 1 + 1
    assert expected_stats2 == stats2

    # same snapshot
    actual_snapshot_id = origin_visit['snapshot']
    assert actual_snapshot_id == hash_to_bytes(snapshot_id)


def test_incremental_visit(swh_config, requests_mock_datadir_visits):
    """With a prior visit, the second load results in a different snapshot
    """
    url = 'https://pypi.org/project/0805nexter'
    loader = PyPILoader(url)

    visit1_actual_load_status = loader.load()
    visit1_stats = get_stats(loader.storage)
    expected_snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a'
    assert visit1_actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }

    origin_visit1 = next(loader.storage.origin_visit_get(url))
    assert origin_visit1['status'] == 'full'
    assert origin_visit1['type'] == 'pypi'

    assert {
        'content': 6,
        'directory': 4,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 2,
        'skipped_content': 0,
        'snapshot': 1
    } == visit1_stats

    # Reset internal state
    loader._info = None

    visit2_actual_load_status = loader.load()
    visit2_stats = get_stats(loader.storage)

    assert visit2_actual_load_status['status'] == 'eventful'
    expected_snapshot_id2 = '2e5149a7b0725d18231a37b342e9b7c4e121f283'
    assert visit2_actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id2
    }

    visits = list(loader.storage.origin_visit_get(url))
    assert len(visits) == 2
    assert visits[1]['status'] == 'full'
    assert visits[1]['type'] == 'pypi'

    assert {
        'content': 6 + 1,          # 1 more content
        'directory': 4 + 2,        # 2 more directories
        'origin': 1,
        'origin_visit': 1 + 1,
        'person': 1,
        'release': 0,
        'revision': 2 + 1,         # 1 more revision
        'skipped_content': 0,
        'snapshot': 1 + 1,         # 1 more snapshot
    } == visit2_stats

    expected_contents = map(hash_to_bytes, [
        'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
        '938c33483285fd8ad57f15497f538320df82aeb8',
        'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
        '405859113963cb7a797642b45f171d6360425d16',
        'e5686aa568fdb1d19d7f1329267082fe40482d31',
        '83ecf6ec1114fd260ca7a833a2d165e71258c338',
        '92689fa2b7fb4d4fc6fb195bf73a50c87c030639'
    ])

    assert list(loader.storage.content_missing_per_sha1(expected_contents))\
        == []

    expected_dirs = map(hash_to_bytes, [
        '05219ba38bc542d4345d5638af1ed56c7d43ca7d',
        'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
        'b178b66bd22383d5f16f4f5c923d39ca798861b4',
        'c3a58f8b57433a4b56caaa5033ae2e0931405338',
        'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a',
        '52604d46843b898f5a43208045d09fcf8731631b',
    ])

    assert list(loader.storage.directory_missing(expected_dirs)) == []

    # {revision hash: directory hash}
    expected_revs = {
        hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'),  # noqa
        hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'),  # noqa
        hash_to_bytes('51247143b01445c9348afa9edfae31bf7c5d86b1'): hash_to_bytes('e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a'),  # noqa
    }

    assert list(loader.storage.revision_missing(expected_revs)) == []

    expected_branches = {
        'releases/1.1.0': {
            'target': '4c99891f93b81450385777235a37b5e966dd1571',
            'target_type': 'revision',
        },
        'releases/1.2.0': {
            'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
            'target_type': 'revision',
        },
        'releases/1.3.0': {
            'target': '51247143b01445c9348afa9edfae31bf7c5d86b1',
            'target_type': 'revision',
        },
        'HEAD': {
            'target': 'releases/1.3.0',
            'target_type': 'alias',
        },
    }

    expected_snapshot = {
        'id': expected_snapshot_id2,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'pypi'

    urls = [
        m.url for m in requests_mock_datadir_visits.request_history
        if m.url.startswith('https://files.pythonhosted.org')
    ]
    # visited each artifact once across 2 visits
    assert len(urls) == len(set(urls))


# release artifact, no new artifact
# {visit full, status uneventful, same snapshot as before}

# release artifact, old artifact with different checksums
# {visit full, status full, new snapshot with shared history and some new
# different history}

# release with multiple sdist artifacts per pypi "version"
# snapshot branch output is different


def test_visit_1_release_with_2_artifacts(swh_config, requests_mock_datadir):
    """With no prior visit, loading a pypi project ends up with 1 snapshot
    """
    url = 'https://pypi.org/project/nexter'
    loader = PyPILoader(url)

    actual_load_status = loader.load()
    expected_snapshot_id = 'a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id,
    }

    expected_branches = {
        'releases/1.1.0/nexter-1.1.0.zip': {
            'target': '4c99891f93b81450385777235a37b5e966dd1571',
            'target_type': 'revision',
        },
        'releases/1.1.0/nexter-1.1.0.tar.gz': {
            'target': '0bf88f5760cca7665d0af4d6575d9301134fe11a',
            'target_type': 'revision',
        },
    }

    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': expected_branches,
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'pypi'


def test_pypi_artifact_to_revision_id_none():
    """Current loader version should stop soon if nothing can be found
    """
    artifact_metadata = {
        'digests': {
            'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec',  # noqa
        },
    }

    assert artifact_to_revision_id({}, artifact_metadata) is None

    known_artifacts = {
        'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92': {
            'original_artifact': {
                'sha256': 'something-irrelevant',
            },
        },
    }

    assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None


def test_pypi_artifact_to_revision_id_old_loader_version():
    """Current loader version should resolve the old metadata scheme
    """
    artifact_metadata = {
        'digests': {
            'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec',  # noqa
        }
    }
    known_artifacts = {
        hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): {
            'original_artifact': {
                'sha256': "something-wrong",
            },
        },
        hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): {
            'original_artifact': {
                'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec',  # noqa
            },
        }
    }

    assert artifact_to_revision_id(known_artifacts, artifact_metadata) \
        == hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116')


def test_pypi_artifact_to_revision_id_current_loader_version():
    """Current loader version should be able to resolve the current
       metadata scheme
    """
    artifact_metadata = {
        'digests': {
            'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec',  # noqa
        }
    }
    known_artifacts = {
        hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): {
            'original_artifact': [{
                'checksums': {
                    'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec',  # noqa
                },
            }],
        },
        hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): {
            'original_artifact': [{
                'checksums': {
                    'sha256': 'something-wrong'
                },
            }],
        },
    }

    assert artifact_to_revision_id(known_artifacts, artifact_metadata) \
        == hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92')


def test_pypi_artifact_to_revision_id_failures():
    with pytest.raises(KeyError, match='sha256'):
        artifact_metadata = {
            'digests': {},
        }
        assert artifact_to_revision_id({}, artifact_metadata)

    with pytest.raises(KeyError, match='digests'):
        artifact_metadata = {
            'something': 'wrong',
        }
        assert artifact_to_revision_id({}, artifact_metadata)
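The four artifact_to_revision_id tests above encode the lookup rules: match on the sha256 digest, accepting both the old metadata scheme (a single digests dict under 'original_artifact') and the current one (a list of artifacts with a 'checksums' dict). A sketch consistent with those tests, for illustration only (the real function lives in swh.loader.package.pypi.loader; the name here carries a _sketch suffix to mark it as hypothetical):

    from typing import Dict, Optional


    def artifact_to_revision_id_sketch(
            known_artifacts: Dict, artifact_metadata: Dict) -> Optional[bytes]:
        sha256 = artifact_metadata['digests']['sha256']  # KeyError if absent
        for rev_id, metadata in known_artifacts.items():
            original = metadata['original_artifact']
            if isinstance(original, dict):
                # old loader metadata scheme: a single dict of digests
                if original['sha256'] == sha256:
                    return rev_id
            else:
                # current scheme: a list of artifacts with a 'checksums' dict
                for artifact in original:
                    if artifact['checksums']['sha256'] == sha256:
                        return rev_id
        return None
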
def test_pypi_artifact_with_no_intrinsic_metadata(
        swh_config, requests_mock_datadir):
    """Skip artifacts with no intrinsic metadata during ingestion
    """
    url = 'https://pypi.org/project/upymenu'
    loader = PyPILoader(url)

    actual_load_status = loader.load()
    expected_snapshot_id = '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id,
    }

    # no branch as one artifact without any intrinsic metadata
    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': {}
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = next(loader.storage.origin_visit_get(url))
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'pypi'