Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/deposit.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
import requests | |||||
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple | from typing import ( | ||||
Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union | |||||
) | |||||
from swh.model.hashutil import hash_to_hex, hash_to_bytes | from swh.model.hashutil import hash_to_hex, hash_to_bytes | ||||
from swh.loader.package.loader import PackageLoader | from swh.loader.package.loader import PackageLoader | ||||
from swh.deposit.client import PrivateApiDepositClient as ApiClient | from swh.loader.package.utils import download | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class DepositLoader(PackageLoader): | class DepositLoader(PackageLoader): | ||||
"""Load pypi origin's artifact releases into swh archive. | """Load pypi origin's artifact releases into swh archive. | ||||
""" | """ | ||||
visit_type = 'deposit' | visit_type = 'deposit' | ||||
def __init__(self, url: str, deposit_id: str): | def __init__(self, url: str, deposit_id: str): | ||||
"""Constructor | """Constructor | ||||
Args: | Args: | ||||
url: Origin url to associate the artifacts/metadata to | url: Origin url to associate the artifacts/metadata to | ||||
deposit_id: Deposit identity | deposit_id: Deposit identity | ||||
""" | """ | ||||
super().__init__(url=url) | super().__init__(url=url) | ||||
# For now build back existing api urls | config_deposit = self.config['deposit'] | ||||
# archive_url: Private api url to retrieve archive artifact | self.deposit_id = deposit_id | ||||
self.archive_url = '/%s/raw/' % deposit_id | self.client = ApiClient(url=config_deposit['url'], | ||||
# metadata_url: Private api url to retrieve the deposit metadata | auth=config_deposit['auth']) | ||||
self.metadata_url = '/%s/meta/' % deposit_id | |||||
# deposit_update_url: Private api to push pids and status update on the | |||||
# deposit id | |||||
self.deposit_update_url = '/%s/update/' % deposit_id | |||||
self.client = ApiClient() | |||||
self._metadata = None | self._metadata = None | ||||
@property | @property | ||||
def metadata(self): | def metadata(self): | ||||
if self._metadata is None: | if self._metadata is None: | ||||
self._metadata = self.client.metadata_get(self.metadata_url) | self._metadata = self.client.metadata_get(self.deposit_id) | ||||
return self._metadata | return self._metadata | ||||
def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | ||||
# only 1 branch 'HEAD' with no alias since we only have 1 snapshot | # only 1 branch 'HEAD' with no alias since we only have 1 snapshot | ||||
# branch | # branch | ||||
return ['HEAD'] | return ['HEAD'] | ||||
def get_package_info(self, version: str) -> Generator[ | def get_package_info(self, version: str) -> Generator[ | ||||
Tuple[str, Mapping[str, Any]], None, None]: | Tuple[str, Mapping[str, Any]], None, None]: | ||||
p_info = { | p_info = { | ||||
'url': self.client.base_url + self.archive_url, | |||||
'filename': 'archive.zip', | 'filename': 'archive.zip', | ||||
'raw': self.metadata, | 'raw': self.metadata, | ||||
} | } | ||||
yield 'HEAD', p_info | yield 'HEAD', p_info | ||||
def download_package(self, p_info: Mapping[str, Any], | |||||
anlambert: Based on what @zack told us during their talk on mypy, concrete types should be used for…
Done Inline ActionsRight, thx for reminding me! ardumont: Right, thx for reminding me! | |||||
tmpdir: str) -> List[Tuple[str, Mapping]]: | |||||
"""Override to allow use of the dedicated deposit client | |||||
""" | |||||
return [self.client.archive_get( | |||||
self.deposit_id, tmpdir, p_info['filename'])] | |||||
def build_revision( | def build_revision( | ||||
self, a_metadata: Dict, uncompressed_path: str) -> Dict: | self, a_metadata: Dict, uncompressed_path: str) -> Dict: | ||||
revision = a_metadata.pop('revision') | revision = a_metadata.pop('revision') | ||||
metadata = { | metadata = { | ||||
'extrinsic': { | 'extrinsic': { | ||||
'provider': '%s/%s' % ( | 'provider': self.client.metadata_url(self.deposit_id), | ||||
self.client.base_url, self.metadata_url), | |||||
'when': self.visit_date.isoformat(), | 'when': self.visit_date.isoformat(), | ||||
'raw': a_metadata, | 'raw': a_metadata, | ||||
}, | }, | ||||
} | } | ||||
# FIXME: the deposit no longer needs to build the revision | # FIXME: the deposit no longer needs to build the revision | ||||
revision['metadata'].update(metadata) | revision['metadata'].update(metadata) | ||||
revision['author'] = parse_author(revision['author']) | revision['author'] = parse_author(revision['author']) | ||||
Show All 27 Lines | def load(self) -> Dict: | ||||
metadata = origin_metadata['metadata'] | metadata = origin_metadata['metadata'] | ||||
self.storage.origin_metadata_add( | self.storage.origin_metadata_add( | ||||
self.url, self.visit_date, provider_id, tool_id, metadata) | self.url, self.visit_date, provider_id, tool_id, metadata) | ||||
# Update deposit status | # Update deposit status | ||||
try: | try: | ||||
if not success: | if not success: | ||||
self.client.status_update( | self.client.status_update(self.deposit_id, status='failed') | ||||
self.deposit_update_url, status='failed') | |||||
return r | return r | ||||
snapshot_id = hash_to_bytes(r['snapshot_id']) | snapshot_id = hash_to_bytes(r['snapshot_id']) | ||||
branches = self.storage.snapshot_get(snapshot_id)['branches'] | branches = self.storage.snapshot_get(snapshot_id)['branches'] | ||||
logger.debug('branches: %s', branches) | logger.debug('branches: %s', branches) | ||||
if not branches: | if not branches: | ||||
return r | return r | ||||
rev_id = branches[b'HEAD']['target'] | rev_id = branches[b'HEAD']['target'] | ||||
revision = next(self.storage.revision_get([rev_id])) | revision = next(self.storage.revision_get([rev_id])) | ||||
# Retrieve the revision identifier | # Retrieve the revision identifier | ||||
dir_id = revision['directory'] | dir_id = revision['directory'] | ||||
# update the deposit's status to success with its | # update the deposit's status to success with its | ||||
# revision-id and directory-id | # revision-id and directory-id | ||||
self.client.status_update( | self.client.status_update( | ||||
self.deposit_update_url, | self.deposit_id, | ||||
status='done', | status='done', | ||||
revision_id=hash_to_hex(rev_id), | revision_id=hash_to_hex(rev_id), | ||||
directory_id=hash_to_hex(dir_id), | directory_id=hash_to_hex(dir_id), | ||||
origin_url=self.url) | origin_url=self.url) | ||||
except Exception: | except Exception: | ||||
logger.exception( | logger.exception( | ||||
'Problem when trying to update the deposit\'s status') | 'Problem when trying to update the deposit\'s status') | ||||
return {'status': 'failed'} | return {'status': 'failed'} | ||||
return r | return r | ||||
def parse_author(author):
    """Encode an author's textual fields as utf-8 bytes.

    Temporary helper, see the FIXME in DepositLoader.build_revision.

    Args:
        author: mapping providing the 'fullname', 'name' and 'email' keys,
            each value supporting ``.encode('utf-8')``

    Returns:
        A new dict holding only these three keys, with encoded values

    """
    return {
        field: author[field].encode('utf-8')
        for field in ('fullname', 'name', 'email')
    }
class ApiClient:
    """Private Deposit Api client

    Every endpoint url is built from ``base_url`` (the configured url without
    its trailing slashes) followed by the deposit identifier and the endpoint
    name (``raw``, ``meta`` or ``update``).

    """

    def __init__(self, url: str, auth: Optional[Mapping[str, str]]) -> None:
        """Constructor

        Args:
            url: Base url of the private deposit api
            auth: Optional credentials providing the 'username' and
                'password' keys; any falsy value disables authentication

        """
        self.base_url = url.rstrip('/')
        self.auth = None if not auth else (auth['username'], auth['password'])

    def do(self, method: str, url: str,
           *args: Any, **kwargs: Any) -> 'requests.Response':
        """Internal method to deal with requests, possibly with basic http
        authentication.

        Args:
            method: supported http methods as in get/post/put
            url: url to query
            *args: forwarded as is to the requests function
            **kwargs: forwarded as is to the requests function; 'auth' is
                overridden when credentials are configured

        Returns:
            The request's execution output

        """
        method_fn = getattr(requests, method)
        if self.auth:
            kwargs['auth'] = self.auth
        return method_fn(url, *args, **kwargs)

    def archive_get(
            self, deposit_id: Union[int, str], tmpdir: str,
            filename: str) -> Tuple[str, Dict]:
        """Retrieve deposit's archive artifact locally

        Args:
            deposit_id: Deposit identity
            tmpdir: Directory passed to ``download`` as destination
            filename: Name passed to ``download`` for the retrieved archive

        Returns:
            The output of ``download`` for the deposit's raw archive url

        """
        url = f'{self.base_url}/{deposit_id}/raw/'
        return download(url, dest=tmpdir, filename=filename, auth=self.auth)

    def metadata_url(self, deposit_id: Union[int, str]) -> str:
        """Build the url of the deposit's metadata endpoint

        """
        return f'{self.base_url}/{deposit_id}/meta/'

    def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]:
        """Retrieve deposit's metadata artifact as json

        Raises:
            ValueError: when the server's response is not ok

        """
        url = self.metadata_url(deposit_id)
        r = self.do('get', url)
        if r.ok:
            return r.json()

        msg = f'Problem when retrieving deposit metadata at {url}'
        logger.error(msg)
        raise ValueError(msg)

    def status_update(self, deposit_id: Union[int, str], status: str,
                      revision_id: Optional[str] = None,
                      directory_id: Optional[str] = None,
                      origin_url: Optional[str] = None) -> None:
        """Update deposit's information including status, and persistent
        identifiers result of the loading.

        Args:
            deposit_id: Deposit identity
            status: New status of the deposit
            revision_id: Optional revision identifier to push
            directory_id: Optional directory identifier to push
            origin_url: Optional origin url to push

        """
        # base_url is already a full url: it must not be prefixed with '/'
        url = f'{self.base_url}/{deposit_id}/update/'
        payload = {'status': status}
        # only the identifiers actually provided are sent to the server
        if revision_id:
            payload['revision_id'] = revision_id
        if directory_id:
            payload['directory_id'] = directory_id
        if origin_url:
            payload['origin_url'] = origin_url

        self.do('put', url, json=payload)
Based on what @zack told us during their talk on mypy, concrete types should be used for annotated return values and abstract ones for parameters. So List[Tuple[str, Dict]] should be used here.