diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit.py
index a7863cd..ed8c97c 100644
--- a/swh/loader/package/deposit.py
+++ b/swh/loader/package/deposit.py
@@ -1,79 +1,138 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import copy
+import logging
+
 from typing import Generator, Dict, Tuple, Sequence
 
 from swh.loader.package.loader import PackageLoader
 from swh.deposit.client import PrivateApiDepositClient as ApiClient
 
 
+logger = logging.getLogger(__name__)
+
+
 class DepositLoader(PackageLoader):
-    """Load pypi origin's artifact releases into swh archive.
+    """Load a deposit origin's artifact into swh archive.
 
     """
     visit_type = 'deposit'
 
     def __init__(self, url: str, deposit_id: str):
         """Constructor
 
         Args:
             url: Origin url to associate the artifacts/metadata to
             deposit_id: Deposit identity
 
         """
         super().__init__(url=url)
 
         # For now build back existing api urls
         # archive_url: Private api url to retrieve archive artifact
         self.archive_url = '/%s/raw/' % deposit_id
         # metadata_url: Private api url to retrieve the deposit metadata
         self.metadata_url = '/%s/meta/' % deposit_id
         # deposit_update_url: Private api to push pids and status update on the
         # deposit id
         self.deposit_update_url = '/%s/update/' % deposit_id
         self.client = ApiClient()
+        # Deposit metadata cache, filled by the `metadata` property
+        self._metadata = None
+
+    @property
+    def metadata(self) -> Dict:
+        """Deposit metadata, fetched on first access and then cached.
+
+        """
+        if self._metadata is None:
+            self._metadata = self.client.metadata_get(self.metadata_url)
+        return self._metadata
 
     def get_versions(self) -> Sequence[str]:
         # only 1 branch 'HEAD' with no alias since we only have 1 snapshot
         # branch
         return ['HEAD']
 
     def get_artifacts(self, version: str) -> Generator[
             Tuple[str, str, Dict], None, None]:
-        meta = self.client.metadata_get(self.metadata_url)
         filename = 'archive.zip'  # do not care about it here
         url = self.client.base_url + self.archive_url
-        yield filename, url, meta
+        # build_revision modifies the dict it receives: yield a copy so the
+        # cached metadata stays intact for load()
+        yield filename, url, copy.deepcopy(self.metadata)
 
     def build_revision(
             self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
         revision = a_metadata.pop('revision')
         metadata = {
             'extrinsic': {
                 'provider': '%s/%s' % (
                     self.client.base_url, self.metadata_url),
                 'when': self.visit_date.isoformat(),
                 'raw': a_metadata,
             },
         }
 
         # FIXME: the deposit no longer needs to build the revision
         revision['metadata'].update(metadata)
         revision['author'] = parse_author(revision['author'])
         revision['committer'] = parse_author(revision['committer'])
         revision['message'] = revision['message'].encode('utf-8')
 
         return revision
 
+    def load(self) -> Dict:
+        """Load the deposit, then register its origin metadata.
+
+        The generic package loading runs first. Unless it reports a 'failed'
+        status, the tool and the provider described by the 'origin_metadata'
+        entry of the deposit metadata are added to the storage, and the
+        origin metadata is recorded for the origin url.
+
+        Returns:
+            The status dict of the generic package loading, unchanged
+
+        """
+        load_status = super().load()
+        if load_status['status'] == 'failed':
+            # Loading failed: do not register any metadata
+            return load_status
+
+        origin_metadata = self.metadata['origin_metadata']
+        logger.debug('origin_metadata: %s', origin_metadata)
+
+        tools = self.storage.tool_add([origin_metadata['tool']])
+        logger.debug('tools: %s', tools)
+        tool_id = tools[0]['id']
+
+        provider = origin_metadata['provider']
+        # FIXME: Shall we delete this info?
+        provider_id = self.storage.metadata_provider_add(
+            provider['provider_name'],
+            provider['provider_type'],
+            provider['provider_url'],
+            metadata=None)
+
+        self.storage.origin_metadata_add(
+            self.url, self.visit_date, provider_id, tool_id,
+            origin_metadata['metadata'])
+
+        # TODO: push pids and status update to the deposit through
+        # deposit_update_url
+
+        return load_status
+
 
 def parse_author(author):
     """See prior fixme
 
     """
     return {
         'fullname': author['fullname'].encode('utf-8'),
         'name': author['name'].encode('utf-8'),
         'email': author['email'].encode('utf-8'),
     }
diff --git a/swh/loader/package/tests/test_deposit.py b/swh/loader/package/tests/test_deposit.py
index 13099d6..aea4171 100644
--- a/swh/loader/package/tests/test_deposit.py
+++ b/swh/loader/package/tests/test_deposit.py
@@ -1,159 +1,195 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.loader.package.deposit import DepositLoader
 
 from swh.model.hashutil import hash_to_bytes
 
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths
 )
 
 from swh.loader.package.tests.conftest import local_get_factory
 
 
 def test_deposit_init_ok(swh_config):
     url = 'some-url'
     deposit_id = 999
     loader = DepositLoader(url, deposit_id)  # Something that does not exist
 
     assert loader.url == url
     assert loader.archive_url == '/%s/raw/' % deposit_id
     assert loader.metadata_url == '/%s/meta/' % deposit_id
     assert loader.deposit_update_url == '/%s/update/' % deposit_id
     assert loader.client is not None
 
 
 def test_deposit_loading_failure_to_fetch_metadata(swh_config):
     """Error during fetching artifact ends us with failed/partial visit
 
     """
     # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
     url = 'some-url'
     unknown_deposit_id = 666
     loader = DepositLoader(url, unknown_deposit_id)  # does not exist
 
     actual_load_status = loader.load()
     assert actual_load_status == {'status': 'failed'}
 
     stats = loader.storage.stat_counters()
 
     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 0,
     } == stats
 
     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'
 
 
 local_get_missing_one = local_get_factory(ignore_urls=[
     'https://deposit.softwareheritage.org/1/private/666/raw/',
 ])
 
 
 def test_deposit_loading_failure_to_retrieve_1_artifact(
         swh_config, local_get_missing_one):
     """Deposit with missing artifact ends up with an uneventful/partial visit
 
     """
     # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
     url = 'some-url-2'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)
 
     assert loader.archive_url
     actual_load_status = loader.load()
     assert actual_load_status == {'status': 'uneventful'}
 
     stats = loader.storage.stat_counters()
 
     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats
 
     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'
 
 
 def test_revision_metadata_structure(swh_config, local_get):
     url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)
 
     assert loader.archive_url
     actual_load_status = loader.load()
     assert actual_load_status == {'status': 'eventful'}
 
     expected_revision_id = hash_to_bytes(
         '9471c606239bccb1f269564c9ea114e1eeab9eb4')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]
 
     assert revision is not None
 
     check_metadata_paths(revision['metadata'], paths=[
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact.filename', str),
         ('original_artifact.length', int),
         ('original_artifact.checksums', dict),
     ])
 
 
 def test_deposit_loading_ok(swh_config, local_get):
     url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
     deposit_id = 666
     loader = DepositLoader(url, deposit_id)
 
     assert loader.archive_url
     actual_load_status = loader.load()
     assert actual_load_status == {'status': 'eventful'}
 
     stats = loader.storage.stat_counters()
 
     assert {
         'content': 303,
         'directory': 12,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats
 
     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'full'
 
     expected_branches = {
         'HEAD': {
             'target': '9471c606239bccb1f269564c9ea114e1eeab9eb4',
             'target_type': 'revision',
         },
     }
 
     expected_snapshot = {
         'id': '453f455d0efb69586143cd6b6e5897f9906b53a7',
         'branches': expected_branches,
     }
 
     check_snapshot(expected_snapshot, storage=loader.storage)
+
+    # check metadata
+
+    tool = {
+        "name": "swh-deposit",
+        "version": "0.0.1",
+        "configuration": {
+            "sword_version": "2",
+        }
+    }
+
+    tool = loader.storage.tool_get(tool)
+    assert tool is not None
+    assert tool['id'] is not None
+
+    provider = {
+        "provider_name": "hal",
+        "provider_type": "deposit_client",
+        "provider_url": "https://hal-test.archives-ouvertes.fr/",
+        "metadata": None,
+    }
+
+    provider = loader.storage.metadata_provider_get_by(provider)
+    assert provider is not None
+    assert provider['id'] is not None
+
+    metadata = loader.storage.origin_metadata_get_by(
+        url, provider_type='deposit_client')
+    assert metadata is not None
+    assert isinstance(metadata, list)
+    assert len(metadata) == 1
+    metadata0 = metadata[0]
+
+    assert metadata0['provider_id'] == provider['id']
+    assert metadata0['provider_type'] == 'deposit_client'
+    assert metadata0['tool_id'] == tool['id']