diff --git a/conftest.py b/conftest.py index e5217e3..516e510 100644 --- a/conftest.py +++ b/conftest.py @@ -1,64 +1,65 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import yaml from typing import Any, Dict from swh.storage.tests.conftest import * # noqa from swh.scheduler.tests.conftest import * # noqa @pytest.fixture def swh_loader_config(swh_storage_postgresql) -> Dict[str, Any]: return { 'storage': { 'cls': 'local', 'args': { 'db': swh_storage_postgresql.dsn, 'objstorage': { 'cls': 'memory', 'args': {} }, }, }, 'deposit': { 'url': 'https://deposit.softwareheritage.org/1/private', 'auth': { 'username': 'user', 'password': 'pass', } }, } @pytest.fixture def swh_config(swh_loader_config, monkeypatch, tmp_path): conffile = os.path.join(str(tmp_path), 'loader.yml') with open(conffile, 'w') as f: f.write(yaml.dump(swh_loader_config)) monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile) return conffile @pytest.fixture(autouse=True, scope='session') def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. """ os.environ['http_proxy'] = 'http://localhost:999' os.environ['https_proxy'] = 'http://localhost:999' @pytest.fixture(scope='session') # type: ignore # expected redefinition def celery_includes(): return [ 'swh.loader.package.tasks', 'swh.loader.package.archive.tasks', 'swh.loader.package.debian.tasks', + 'swh.loader.package.deposit.tasks', ] diff --git a/setup.py b/setup.py index 115a66d..1bf53d5 100755 --- a/setup.py +++ b/setup.py @@ -1,70 +1,71 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = 'requirements-%s.txt' % name else: reqf = 'requirements.txt' requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.loader.core', description='Software Heritage Base Loader', long_description=long_description, long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDBASE', packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, entry_points=''' [swh.workers] loader.archive=swh.loader.package.archive:register loader.debian=swh.loader.package.debian:register + loader.deposit=swh.loader.package.deposit:register ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', 'Funding': 'https://www.softwareheritage.org/donate', 'Source': 'https://forge.softwareheritage.org/source/swh-loader-core', }, ) diff --git a/swh/loader/package/deposit/__init__.py b/swh/loader/package/deposit/__init__.py new file mode 100644 index 0000000..d39d5f5 --- /dev/null +++ b/swh/loader/package/deposit/__init__.py @@ -0,0 +1,14 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + return { + 'task_modules': [f'{__name__}.tasks'], + } diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit/loader.py similarity index 100% rename from swh/loader/package/deposit.py rename to swh/loader/package/deposit/loader.py diff --git a/swh/loader/package/deposit/tasks.py b/swh/loader/package/deposit/tasks.py new file mode 100644 index 0000000..08dc376 --- /dev/null +++ b/swh/loader/package/deposit/tasks.py @@ -0,0 +1,14 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.loader.package.deposit.loader import DepositLoader + + +@shared_task(name=__name__ + '.LoadDeposit') +def load_deposit(*, url, deposit_id): + """Load Deposit artifacts""" + return DepositLoader(url, deposit_id).load() diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_meta b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta similarity index 100% rename from swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_meta rename to swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_raw b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_raw similarity index 100% rename from swh/loader/package/tests/data/https_deposit.softwareheritage.org/1_private_666_raw rename to swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_raw diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json similarity index 100% rename from swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.json rename to swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json diff --git a/swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.orig.tar.gz b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.orig.tar.gz similarity index 100% rename from swh/loader/package/tests/data/https_deposit.softwareheritage.org/hello_2.10.orig.tar.gz rename to swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.orig.tar.gz diff --git a/swh/loader/package/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py similarity index 99% rename from swh/loader/package/tests/test_deposit.py rename to swh/loader/package/deposit/tests/test_deposit.py index 9f00c41..e8e3fbf 100644 --- a/swh/loader/package/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -1,208 +1,208 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re from swh.model.hashutil import hash_to_bytes -from swh.loader.package.deposit import DepositLoader +from swh.loader.package.deposit.loader import DepositLoader from swh.loader.package.tests.common import ( check_snapshot, check_metadata_paths, get_stats ) from swh.core.pytest_plugin import requests_mock_datadir_factory def test_deposit_init_ok(swh_config, swh_loader_config): url = 'some-url' deposit_id = 999 loader = DepositLoader(url, deposit_id) # Something that does not exist assert loader.url == url assert loader.client is not None assert loader.client.base_url == swh_loader_config['deposit']['url'] def test_deposit_loading_failure_to_fetch_metadata(swh_config): """Error during fetching artifact ends us with failed/partial visit """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = 'some-url' unknown_deposit_id = 666 loader = DepositLoader(url, unknown_deposit_id) # does not exist actual_load_status = loader.load() assert actual_load_status == {'status': 'failed'} stats = get_stats(loader.storage) assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 0, } == stats origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'deposit' requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[ 'https://deposit.softwareheritage.org/1/private/666/raw/', ]) def test_deposit_loading_failure_to_retrieve_1_artifact( swh_config, requests_mock_datadir_missing_one): """Deposit with missing artifact ends up with an uneventful/partial visit """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = 'some-url-2' deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() assert actual_load_status['status'] == 'uneventful' assert actual_load_status['snapshot_id'] is not None stats = get_stats(loader.storage) assert { 'content': 0, 'directory': 0, 'origin': 1, 'origin_visit': 1, 'person': 0, 'release': 0, 'revision': 0, 'skipped_content': 0, 'snapshot': 1, } == stats origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'partial' assert origin_visit['type'] == 'deposit' def test_revision_metadata_structure(swh_config, requests_mock_datadir): # do not care for deposit update query requests_mock_datadir.put(re.compile('https')) url = 'https://hal-test.archives-ouvertes.fr/some-external-id' deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() assert actual_load_status['status'] == 'eventful' assert actual_load_status['snapshot_id'] is not None expected_revision_id = hash_to_bytes( '9471c606239bccb1f269564c9ea114e1eeab9eb4') revision = list(loader.storage.revision_get([expected_revision_id]))[0] assert revision is not None check_metadata_paths(revision['metadata'], paths=[ ('extrinsic.provider', str), ('extrinsic.when', str), ('extrinsic.raw', dict), ('original_artifact', list), ]) for original_artifact in revision['metadata']['original_artifact']: check_metadata_paths(original_artifact, paths=[ ('filename', str), ('length', int), ('checksums', dict), ]) def test_deposit_loading_ok(swh_config, requests_mock_datadir): requests_mock_datadir.put(re.compile('https')) # do not care for put url = 'https://hal-test.archives-ouvertes.fr/some-external-id' deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() expected_snapshot_id = '453f455d0efb69586143cd6b6e5897f9906b53a7' assert actual_load_status == { 'status': 'eventful', 'snapshot_id': expected_snapshot_id, } stats = get_stats(loader.storage) assert { 'content': 303, 'directory': 12, 'origin': 1, 'origin_visit': 1, 'person': 1, 'release': 0, 'revision': 1, 'skipped_content': 0, 'snapshot': 1, } == stats origin_visit = next(loader.storage.origin_visit_get(url)) assert origin_visit['status'] == 'full' assert origin_visit['type'] == 'deposit' expected_branches = { 'HEAD': { 'target': '9471c606239bccb1f269564c9ea114e1eeab9eb4', 'target_type': 'revision', }, } expected_snapshot = { 'id': expected_snapshot_id, 'branches': expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) # check metadata tool = { "name": "swh-deposit", "version": "0.0.1", "configuration": { "sword_version": "2", } } tool = loader.storage.tool_get(tool) assert tool is not None assert tool['id'] is not None provider = { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": None, } provider = loader.storage.metadata_provider_get_by(provider) assert provider is not None assert provider['id'] is not None metadata = list(loader.storage.origin_metadata_get_by( url, provider_type='deposit_client')) assert metadata is not None assert isinstance(metadata, list) assert len(metadata) == 1 metadata0 = metadata[0] assert metadata0['provider_id'] == provider['id'] assert metadata0['provider_type'] == 'deposit_client' assert metadata0['tool_id'] == tool['id'] diff --git a/swh/loader/package/deposit/tests/test_tasks.py b/swh/loader/package/deposit/tests/test_tasks.py new file mode 100644 index 0000000..4c161cb --- /dev/null +++ b/swh/loader/package/deposit/tests/test_tasks.py @@ -0,0 +1,21 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from unittest.mock import patch + + +@patch('swh.loader.package.deposit.loader.DepositLoader.load') +def test_deposit_loader( + mock_loader, swh_app, celery_session_worker, swh_config): + mock_loader.return_value = {'status': 'eventful'} + + res = swh_app.send_task( + 'swh.loader.package.deposit.tasks.LoadDeposit', + (), dict(url='some-url', deposit_id='some-d-id')) + assert res + res.wait() + assert res.successful() + + assert res.result == {'status': 'eventful'} diff --git a/swh/loader/package/tasks.py b/swh/loader/package/tasks.py index 79c6b03..520dc73 100644 --- a/swh/loader/package/tasks.py +++ b/swh/loader/package/tasks.py @@ -1,28 +1,21 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.deposit import DepositLoader from swh.loader.package.npm import NpmLoader from swh.loader.package.pypi import PyPILoader -@shared_task(name=__name__ + '.LoadDeposit') -def load_deposit(*, url, deposit_id): - """Load Deposit artifacts""" - return DepositLoader(url, deposit_id).load() - - @shared_task(name=__name__ + '.LoadNpm') def load_npm(*, package_name, package_url, package_metadata_url): """Load Npm package""" return NpmLoader(package_name, package_url, package_metadata_url).load() @shared_task(name=__name__ + '.LoadPyPI') def load_pypi(*, url=None): """Load PyPI package""" return PyPILoader(url).load() diff --git a/swh/loader/package/tests/test_tasks.py b/swh/loader/package/tests/test_tasks.py index ff8edf2..743f866 100644 --- a/swh/loader/package/tests/test_tasks.py +++ b/swh/loader/package/tests/test_tasks.py @@ -1,53 +1,38 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch -@patch('swh.loader.package.deposit.DepositLoader.load') -def test_deposit_loader( - mock_loader, swh_app, celery_session_worker, swh_config): - mock_loader.return_value = {'status': 'eventful'} - - res = swh_app.send_task( - 'swh.loader.package.tasks.LoadDeposit', - (), dict(url='some-url', deposit_id='some-d-id')) - assert res - res.wait() - assert res.successful() - - assert res.result == {'status': 'eventful'} - - @patch('swh.loader.package.npm.NpmLoader.load') def test_npm_loader( mock_loader, swh_app, celery_session_worker, swh_config): mock_loader.return_value = {'status': 'eventful'} res = swh_app.send_task( 'swh.loader.package.tasks.LoadNpm', (), dict(package_name='some-package', package_url='some', package_metadata_url='something')) assert res res.wait() assert res.successful() assert res.result == {'status': 'eventful'} @patch('swh.loader.package.pypi.PyPILoader.load') def test_pypi_loader( mock_loader, swh_app, celery_session_worker, swh_config): mock_loader.return_value = {'status': 'eventful'} res = swh_app.send_task( 'swh.loader.package.tasks.LoadPyPI', (), dict(url='some-url')) assert res res.wait() assert res.successful() assert res.result == {'status': 'eventful'}