diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,2 +1,3 @@ pytest +pytest-mock swh.scheduler[testing] diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -50,6 +50,10 @@ extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, + entry_points=''' + [swh.workers] + loader.git=swh.loader.git:register + ''', classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", diff --git a/swh/loader/git/__init__.py b/swh/loader/git/__init__.py --- a/swh/loader/git/__init__.py +++ b/swh/loader/git/__init__.py @@ -0,0 +1,14 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Any, Dict + + +def register() -> Dict[str, Any]: + from swh.loader.git.loader import GitLoader + return { + 'task_modules': ['%s.tasks' % __name__], + 'loader': GitLoader, + } diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -17,7 +17,6 @@ def origin_url_to_origin(origin_url): """Format a pygit2.Repository as an origin suitable for swh.storage""" return { - 'type': 'git', 'url': origin_url, } diff --git a/swh/loader/git/from_disk.py b/swh/loader/git/from_disk.py --- a/swh/loader/git/from_disk.py +++ b/swh/loader/git/from_disk.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -25,19 +25,17 @@ visit_type = 'git' - def __init__(self, config=None): + def __init__(self, url, visit_date=None, directory=None, config=None): super().__init__(logging_class='swh.loader.git.Loader', config=config) - - def _prepare_origin_visit(self, origin_url, visit_date): - self.origin_url = origin_url - self.origin = converters.origin_url_to_origin(self.origin_url) + self.origin_url = url self.visit_date = visit_date + self.directory = directory - def prepare_origin_visit(self, origin_url, directory, visit_date): - self._prepare_origin_visit(origin_url, visit_date) + def prepare_origin_visit(self, *args, **kwargs): + self.origin = converters.origin_url_to_origin(self.origin_url) - def prepare(self, origin_url, directory, visit_date): - self.repo = dulwich.repo.Repo(directory) + def prepare(self, *args, **kwargs): + self.repo = dulwich.repo.Repo(self.directory) def iter_objects(self): object_store = self.repo.object_store @@ -305,9 +303,10 @@ ... """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, archive_path, **kwargs): super().__init__(*args, **kwargs) self.temp_dir = self.repo_path = None + self.archive_path = archive_path def project_name_from_archive(self, archive_path): """Compute the project name from the archive's path. @@ -320,22 +319,21 @@ break return archive_name - def prepare_origin_visit(self, origin_url, archive_path, visit_date): - self._prepare_origin_visit(origin_url, visit_date) - - def prepare(self, origin_url, archive_path, visit_date): + def prepare(self, *args, **kwargs): """1. Uncompress the archive in temporary location. 2. Prepare as the GitLoaderFromDisk does 3. Load as GitLoaderFromDisk does """ - project_name = self.project_name_from_archive(archive_path) + project_name = self.project_name_from_archive(self.archive_path) self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( - project_name, archive_path) + project_name, self.archive_path) - self.log.info('Project %s - Uncompressing archive %s at %s' % ( - origin_url, os.path.basename(archive_path), self.repo_path)) - super().prepare(origin_url, self.repo_path, visit_date) + self.log.info('Project %s - Uncompressing archive %s at %s', + self.origin_url, os.path.basename(self.archive_path), + self.repo_path) + self.directory = self.repo_path + super().prepare(*args, **kwargs) def cleanup(self): """Cleanup the temporary location (if it exists). diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2018 The Software Heritage developers +# Copyright (C) 2016-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -194,7 +194,8 @@ visit_type = 'git' - def __init__(self, repo_representation=RepoRepresentation, config=None): + def __init__(self, url, base_url=None, ignore_history=False, + repo_representation=RepoRepresentation, config=None): """Initialize the bulk updater. Args: @@ -205,6 +206,9 @@ """ super().__init__(logging_class='swh.loader.git.BulkLoader', config=config) + self.origin_url = url + self.base_url = base_url + self.ignore_history = ignore_history self.repo_representation = repo_representation def fetch_pack_from_origin(self, origin_url, @@ -273,9 +277,9 @@ return id_to_type, type_to_ids - def prepare_origin_visit(self, origin_url, **kwargs): + def prepare_origin_visit(self, *args, **kwargs): self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) - self.origin = converters.origin_url_to_origin(origin_url) + self.origin = converters.origin_url_to_origin(self.origin_url) def get_full_snapshot(self, origin_url): prev_snapshot = self.storage.snapshot_get_latest(origin_url) @@ -284,16 +288,16 @@ return prev_snapshot - def prepare(self, origin_url, base_url=None, ignore_history=False): + def prepare(self, *args, **kwargs): base_origin_url = origin_url = self.origin['url'] prev_snapshot = None - if not ignore_history: + if not self.ignore_history: prev_snapshot = self.get_full_snapshot(origin_url) - if base_url and not prev_snapshot: - base_origin = converters.origin_url_to_origin(base_url) + if self.base_url and not prev_snapshot: + base_origin = converters.origin_url_to_origin(self.base_url) base_origin = self.storage.origin_get(base_origin) if base_origin: base_origin_url = base_origin['url'] @@ -301,7 +305,6 @@ self.base_snapshot = prev_snapshot self.base_origin_url = base_origin_url - self.ignore_history = ignore_history def fetch_data(self): def do_progress(msg): @@ -512,10 +515,11 @@ @click.option('--ignore-history/--no-ignore-history', help='Ignore the repository history', default=False) def main(origin_url, base_url, ignore_history): - return GitLoader().load( + loader = GitLoader( origin_url, base_url=base_url, ignore_history=ignore_history, ) + return loader.load() main() diff --git a/swh/loader/git/tasks.py b/swh/loader/git/tasks.py --- a/swh/loader/git/tasks.py +++ b/swh/loader/git/tasks.py @@ -1,24 +1,24 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import dateutil.parser -from celery import current_app as app +from celery import shared_task from swh.loader.git.from_disk import GitLoaderFromDisk, GitLoaderFromArchive from swh.loader.git.loader import GitLoader -@app.task(name=__name__ + '.UpdateGitRepository') -def update_git_repository(repo_url, base_url=None): +@shared_task(name=__name__ + '.UpdateGitRepository') +def load_git(repo_url, base_url=None): """Import a git repository from a remote location""" - loader = GitLoader() - return loader.load(repo_url, base_url=base_url) + loader = GitLoader(repo_url, base_url=base_url) + return loader.load() -@app.task(name=__name__ + '.LoadDiskGitRepository') +@shared_task(name=__name__ + '.LoadDiskGitRepository') def load_git_from_dir(origin_url, directory, date): """Import a git repository from a local repository @@ -26,11 +26,13 @@ `date`. """ - loader = GitLoaderFromDisk() - return loader.load(origin_url, directory, dateutil.parser.parse(date)) + visit_date = dateutil.parser.parse(date) + loader = GitLoaderFromDisk( + origin_url, directory=directory, visit_date=visit_date) + return loader.load() -@app.task(name=__name__ + '.UncompressAndLoadDiskGitRepository') +@shared_task(name=__name__ + '.UncompressAndLoadDiskGitRepository') def load_git_from_zip(origin_url, archive_path, date): """Import a git repository from a zip archive @@ -38,6 +40,7 @@ 2. Load it through the git disk loader 3. Clean up the temporary folder """ - loader = GitLoaderFromArchive() - return loader.load( - origin_url, archive_path, dateutil.parser.parse(date)) + visit_date = dateutil.parser.parse(date) + loader = GitLoaderFromArchive( + origin_url, archive_path=archive_path, visit_date=visit_date) + return loader.load() diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -119,14 +119,15 @@ """ def setUp(self): super().setUp('testrepo.tgz', uncompress_archive=True) - self.loader = GitLoaderFromDiskTest() + self.loader = GitLoaderFromDiskTest( + url=self.repo_url, + visit_date='2016-05-03 15:16:32+00', + directory=self.destination_path + ) self.storage = self.loader.storage def load(self): - return self.loader.load( - origin_url=self.repo_url, - visit_date='2016-05-03 15:16:32+00', - directory=self.destination_path) + return self.loader.load() class BaseGitLoaderFromArchiveTest(BaseGitLoaderFromDiskTest): @@ -138,14 +139,15 @@ """ def setUp(self): super().setUp('testrepo.tgz', uncompress_archive=False) - self.loader = GitLoaderFromArchive() + self.loader = GitLoaderFromArchive( + url=self.repo_url, + visit_date='2016-05-03 15:16:32+00', + archive_path=self.destination_path, + ) self.storage = self.loader.storage def load(self): - return self.loader.load( - origin_url=self.repo_url, - visit_date='2016-05-03 15:16:32+00', - archive_path=self.destination_path) + return self.loader.load() class GitLoaderFromDiskTests: diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -20,9 +20,8 @@ """Same tests as for the GitLoaderFromDisk, but running on GitLoader.""" def setUp(self): super().setUp() - self.loader = GitLoaderTest() + self.loader = GitLoaderTest(self.repo_url) self.storage = self.loader.storage def load(self): - return self.loader.load( - origin_url=self.repo_url) + return self.loader.load() diff --git a/swh/loader/git/tests/test_tasks.py b/swh/loader/git/tests/test_tasks.py --- a/swh/loader/git/tests/test_tasks.py +++ b/swh/loader/git/tests/test_tasks.py @@ -1,14 +1,11 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime -from unittest.mock import patch - -@patch('swh.loader.git.loader.GitLoader.load') -def test_git_loader(mock_loader, swh_app, celery_session_worker): +def test_git_loader(mocker, swh_app, celery_session_worker): + mock_loader = mocker.patch('swh.loader.git.loader.GitLoader.load') mock_loader.return_value = {'status': 'eventful'} res = swh_app.send_task( @@ -19,11 +16,12 @@ assert res.successful() assert res.result == {'status': 'eventful'} - mock_loader.assert_called_once_with('origin_url', base_url=None) + mock_loader.assert_called_once_with() -@patch('swh.loader.git.from_disk.GitLoaderFromDisk.load') -def test_git_loader_from_disk(mock_loader, swh_app, celery_session_worker): +def test_git_loader_from_disk(mocker, swh_app, celery_session_worker): + mock_loader = mocker.patch( + 'swh.loader.git.from_disk.GitLoaderFromDisk.load') mock_loader.return_value = {'status': 'uneventful'} res = swh_app.send_task( @@ -34,12 +32,13 @@ assert res.successful() assert res.result == {'status': 'uneventful'} - mock_loader.assert_called_once_with( - 'origin_url2', '/some/repo', datetime.datetime(2018, 12, 10, 0, 0)) + mock_loader.assert_called_once_with() + +def test_git_loader_from_archive(mocker, swh_app, celery_session_worker): + mock_loader = mocker.patch( + 'swh.loader.git.from_disk.GitLoaderFromArchive.load') -@patch('swh.loader.git.from_disk.GitLoaderFromArchive.load') -def test_git_loader_from_archive(mock_loader, swh_app, celery_session_worker): mock_loader.return_value = {'status': 'failed'} res = swh_app.send_task( @@ -50,5 +49,4 @@ assert res.successful() assert res.result == {'status': 'failed'} - mock_loader.assert_called_once_with( - 'origin_url3', '/some/repo', datetime.datetime(2017, 1, 10, 0, 0)) + mock_loader.assert_called_once_with()