diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,3 @@ swh.core >= 0.0.75 swh.model >= 0.0.18 swh.storage >= 0.0.153 -swh.deposit diff --git a/swh/loader/package/__init__.py b/swh/loader/package/__init__.py --- a/swh/loader/package/__init__.py +++ b/swh/loader/package/__init__.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information -from typing import Any, Mapping +from typing import Any, Dict, Mapping try: from swh.loader.core._version import __version__ # type: ignore @@ -12,7 +12,7 @@ __version__ = 'devel' -DEFAULT_PARAMS = { +DEFAULT_PARAMS: Dict[str, Any] = { 'headers': { 'User-Agent': 'Software Heritage Loader (%s)' % ( __version__ diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit.py --- a/swh/loader/package/deposit.py +++ b/swh/loader/package/deposit.py @@ -4,12 +4,15 @@ # See top-level LICENSE file for more information import logging +import requests -from typing import Any, Dict, Generator, Mapping, Sequence, Tuple +from typing import ( + Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union +) from swh.model.hashutil import hash_to_hex, hash_to_bytes from swh.loader.package.loader import PackageLoader -from swh.deposit.client import PrivateApiDepositClient as ApiClient +from swh.loader.package.utils import download logger = logging.getLogger(__name__) @@ -31,21 +34,16 @@ """ super().__init__(url=url) - # For now build back existing api urls - # archive_url: Private api url to retrieve archive artifact - self.archive_url = '/%s/raw/' % deposit_id - # metadata_url: Private api url to retrieve the deposit metadata - self.metadata_url = '/%s/meta/' % deposit_id - # deposit_update_url: Private api to push pids and status update on the - # deposit id - self.deposit_update_url = '/%s/update/' % deposit_id - self.client = ApiClient() + config_deposit = self.config['deposit'] + self.deposit_id = deposit_id + self.client = 
class ApiClient:
    """Private Deposit Api client

    Thin wrapper around ``requests`` which knows how to build the private
    deposit api urls (``raw``, ``meta``, ``update``) out of a base url and a
    deposit identifier, optionally using basic http authentication.

    """
    def __init__(self, url: str, auth: Optional[Mapping[str, str]]):
        """
        Args:
            url: Base url of the private deposit api (trailing slashes are
                stripped)
            auth: Optional mapping with the 'username' and 'password' keys;
                when empty or None, requests are sent unauthenticated

        """
        self.base_url = url.rstrip('/')
        self.auth = None if not auth else (auth['username'], auth['password'])

    def _endpoint_url(self, deposit_id: Union[int, str], endpoint: str) -> str:
        """Build the absolute url of a deposit's private api endpoint.

        Single place where urls are assembled so that every endpoint is
        derived from base_url the same way.

        """
        return f'{self.base_url}/{deposit_id}/{endpoint}/'

    def do(self, method: str, url: str, *args, **kwargs):
        """Internal method to deal with requests, possibly with basic http
           authentication.

        Args:
            method (str): supported http methods as in get/post/put
            url (str): absolute url to query

        Returns:
            The request's execution output

        """
        method_fn = getattr(requests, method)
        if self.auth:
            kwargs['auth'] = self.auth
        return method_fn(url, *args, **kwargs)

    def archive_get(
            self, deposit_id: Union[int, str], tmpdir: str,
            filename: str) -> Tuple[str, Dict]:
        """Retrieve deposit's archive artifact locally

        Args:
            deposit_id: Identifier of the deposit whose archive to fetch
            tmpdir: Directory passed to download as destination
            filename: Filename passed to download for the retrieved archive

        Returns:
            The result of the download call for that archive

        """
        url = self._endpoint_url(deposit_id, 'raw')
        return download(url, dest=tmpdir, filename=filename, auth=self.auth)

    def metadata_url(self, deposit_id: Union[int, str]) -> str:
        """Compute the private api url of the deposit's metadata."""
        return self._endpoint_url(deposit_id, 'meta')

    def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]:
        """Retrieve deposit's metadata artifact as json

        Raises:
            ValueError when the server response is not ok

        """
        url = self.metadata_url(deposit_id)
        r = self.do('get', url)
        if r.ok:
            return r.json()

        msg = f'Problem when retrieving deposit metadata at {url}'
        logger.error(msg)
        raise ValueError(msg)

    def status_update(self, deposit_id: Union[int, str], status: str,
                      revision_id: Optional[str] = None,
                      directory_id: Optional[str] = None,
                      origin_url: Optional[str] = None):
        """Update deposit's information including status, and persistent
           identifiers result of the loading.

        Args:
            deposit_id: Identifier of the deposit to update
            status: New status of the deposit
            revision_id: Optional revision identifier to attach
            directory_id: Optional directory identifier to attach
            origin_url: Optional origin url to attach

        """
        # base_url is already absolute: no leading '/' must be prepended,
        # otherwise the resulting url ('/https://...') is invalid.
        url = self._endpoint_url(deposit_id, 'update')
        payload = {'status': status}
        # Only the identifiers actually provided are sent to the server
        if revision_id:
            payload['revision_id'] = revision_id
        if directory_id:
            payload['directory_id'] = directory_id
        if origin_url:
            payload['origin_url'] = origin_url

        self.do('put', url, json=payload)
'/%s/meta/' % deposit_id - assert loader.deposit_update_url == '/%s/update/' % deposit_id assert loader.client is not None + assert loader.client.base_url == swh_loader_config['deposit']['url'] def test_deposit_loading_failure_to_fetch_metadata(swh_config): @@ -74,7 +72,6 @@ deposit_id = 666 loader = DepositLoader(url, deposit_id) - assert loader.archive_url actual_load_status = loader.load() assert actual_load_status['status'] == 'uneventful' assert actual_load_status['snapshot_id'] is not None @@ -105,7 +102,6 @@ deposit_id = 666 loader = DepositLoader(url, deposit_id) - assert loader.archive_url actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['snapshot_id'] is not None @@ -138,7 +134,6 @@ deposit_id = 666 loader = DepositLoader(url, deposit_id) - assert loader.archive_url actual_load_status = loader.load() expected_snapshot_id = '453f455d0efb69586143cd6b6e5897f9906b53a7' assert actual_load_status == { diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import logging import os import requests @@ -38,16 +39,18 @@ def download(url: str, dest: str, hashes: Dict = {}, - filename: Optional[str] = None) -> Tuple[str, Dict]: + filename: Optional[str] = None, + auth: Optional[Tuple[str, str]] = None) -> Tuple[str, Dict]: """Download a remote tarball from url, uncompresses and computes swh hashes on it. Args: url: Artifact uri to fetch, uncompress and hash dest: Directory to write the archive to - hashes: Dict of expected hashes (key is the hash algo) for the artifact to download (those hashes are expected to be hex string) + auth: Optional tuple of login/password (for http authentication + service, e.g. 
deposit) Raises: ValueError in case of any error when fetching/computing (length, @@ -57,7 +60,10 @@ Tuple of local (filepath, hashes of filepath) """ - response = requests.get(url, **DEFAULT_PARAMS, stream=True) + params = copy.deepcopy(DEFAULT_PARAMS) + if auth is not None: + params['auth'] = auth + response = requests.get(url, **params, stream=True) logger.debug('headers: %s', response.headers) if response.status_code != 200: raise ValueError("Fail to query '%s'. Reason: %s" % (