diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py index 62fc969..9250b68 100644 --- a/swh/loader/pypi/loader.py +++ b/swh/loader/pypi/loader.py @@ -1,199 +1,206 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import arrow import os import shutil from tempfile import mkdtemp from swh.loader.core.utils import clean_dangling_folders from swh.loader.core.loader import SWHLoader from swh.model.from_disk import Directory from swh.model.identifiers import ( revision_identifier, snapshot_identifier, identifier_to_bytes, normalize_timestamp ) from .client import PyPiClient from .model import PyPiProject TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.' DEBUG_MODE = '** DEBUG MODE **' class PyPiLoader(SWHLoader): CONFIG_BASE_FILENAME = 'loader/pypi' ADDITIONAL_CONFIG = { 'temp_directory': ('str', '/tmp/swh.loader.pypi/'), 'cache': ('bool', False), 'cache_dir': ('str', ''), 'debug': ('bool', False), # NOT FOR PRODUCTION } - def __init__(self): + def __init__(self, client=None): super().__init__(logging_class='swh.loader.pypi.PyPiLoader') self.origin_id = None - - temp_directory = self.config['temp_directory'] - os.makedirs(temp_directory, exist_ok=True) - self.temp_directory = mkdtemp( - suffix='-%s' % os.getpid(), - prefix=TEMPORARY_DIR_PREFIX_PATTERN, - dir=temp_directory) - self.pypi_client = PyPiClient( - temp_directory=self.temp_directory, - cache=self.config['cache'], - cache_dir=self.config['cache_dir']) + if not client: + temp_directory = self.config['temp_directory'] + os.makedirs(temp_directory, exist_ok=True) + self.temp_directory = mkdtemp( + suffix='-%s' % os.getpid(), + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + dir=temp_directory) + self.pypi_client = PyPiClient( + temp_directory=self.temp_directory, + cache=self.config['cache'], + 
cache_dir=self.config['cache_dir']) + else: + self.temp_directory = client.temp_directory + self.pypi_client = client self.debug = self.config['debug'] def pre_cleanup(self): """(override) To prevent disk explosion if some other workers exploded in mid-air (OOM killed), we try and clean up dangling files. """ if self.debug: self.log.warn('%s Will not pre-clean up temp dir %s' % ( DEBUG_MODE, self.temp_directory )) return clean_dangling_folders(self.config['temp_directory'], pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log) def cleanup(self): """(override) Clean up temporary disk use """ if self.debug: self.log.warn('%s Will not clean up temp dir %s' % ( DEBUG_MODE, self.temp_directory )) return if os.path.exists(self.temp_directory): self.log.debug('Clean up %s' % self.temp_directory) shutil.rmtree(self.temp_directory) def prepare_origin_visit(self, project_name, origin_url, origin_metadata_url=None): """(override) Prepare the origin visit information Args: project_name (str): Project's simple name origin_url (str): Project's main url origin_metadata_url (str): Project's metadata url """ self.origin = { 'url': origin_url, 'type': 'pypi', } self.visit_date = None # loader core will populate it def prepare(self, project_name, origin_url, origin_metadata_url=None): """(override) Keep reference to the origin url (project) and the project metadata url Args: project_name (str): Project's simple name origin_url (str): Project's main url origin_metadata_url (str): Project's metadata url """ self.project_name = project_name self.origin_url = origin_url self.origin_metadata_url = origin_metadata_url self.project = PyPiProject(self.pypi_client, self.project_name, self.origin_metadata_url) def _known_releases(self, _last_snapshot): """Retrieve the known releases/artifact for the origin_id. 
Returns tuple artifact's filename, artifact's sha256 """ _revs = [rev['target'] for rev in _last_snapshot['branches'].values()] _known_revisions = self.storage.revision_get(_revs) for _rev in _known_revisions: _artifact = _rev['metadata']['original_artifact'] yield _artifact['filename'], _artifact['sha256'] + def _last_snapshot(self): + """Retrieve the last snapshot + + """ + return self.storage.snapshot_get_latest(self.origin_id) + def fetch_data(self): """(override) Fetch and collect swh objects. """ - _last_snapshot = self.storage.snapshot_get_latest(self.origin_id) + _last_snapshot = self._last_snapshot() if _last_snapshot: self._snapshot = _last_snapshot.copy() _known_releases = self._known_releases(self._snapshot) else: self._snapshot = { 'branches': {} } _known_releases = [] self._contents = [] self._directories = [] self._revisions = [] - self._releases = [] for release_info, author, release, dirpath in self.project.releases( _known_releases): dirpath = dirpath.encode('utf-8') directory = Directory.from_disk(path=dirpath, data=True) _objects = directory.collect() self._contents.extend(_objects['content'].values()) self._directories.extend(_objects['directory'].values()) date = normalize_timestamp( int(arrow.get(release['date']).timestamp)) name = release['name'].encode('utf-8') message = release['message'].encode('utf-8') if message: message = b'%s: %s' % (name, message) else: message = name _revision = { 'synthetic': True, 'metadata': { 'original_artifact': release, 'project': release_info, }, 'author': author, 'date': date, 'committer': author, 'committer_date': date, 'message': message, 'directory': directory.hash, 'parents': [], 'type': 'tar', } _revision['id'] = identifier_to_bytes( revision_identifier(_revision)) self._revisions.append(_revision) branch_name = release['filename'].encode('utf-8') self._snapshot['branches'][branch_name] = { 'target': _revision['id'], 'target_type': 'revision', } self._snapshot['id'] = identifier_to_bytes( 
snapshot_identifier(self._snapshot)) def store_data(self): """(override) This sends collected objects to storage. """ self.maybe_load_contents(self._contents) self.maybe_load_directories(self._directories) self.maybe_load_revisions(self._revisions) - self.maybe_load_releases(self._releases) self.maybe_load_snapshot(self._snapshot) diff --git a/swh/loader/pypi/tests/__init__.py b/swh/loader/pypi/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/pypi/tests/common.py b/swh/loader/pypi/tests/common.py new file mode 100644 index 0000000..2159d13 --- /dev/null +++ b/swh/loader/pypi/tests/common.py @@ -0,0 +1,116 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from swh.loader.pypi.client import PyPiClient + + +RESOURCES_PATH = './swh/loader/pypi/tests/resources' + + +class PyPiClientWithCache(PyPiClient): + """Force the use of the cache to bypass pypi calls + + """ + def __init__(self, temp_directory, cache_dir): + super().__init__(temp_directory=temp_directory, + cache=True, cache_dir=cache_dir) + + +class LoaderNoStorage: + """Mixin class to inhibit the persistence and keep in memory the data + sent for storage. + + cf. 
LoaderNoStorage + + """ + CONFIG_BASE_FILENAME = '' # do not provide a real path + ADDITIONAL_CONFIG = { + 'storage': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://nowhere:5002/', # do not provide a real storage + } + }), + + # do not send any data to the storage + 'send_contents': ('bool', False), + 'send_directories': ('bool', False), + 'send_revisions': ('bool', False), + 'send_releases': ('bool', False), + 'send_snapshot': ('bool', False), + 'debug': ('bool', False), + } + + def __init__(self, client=None): + super().__init__(client=client) + self.all_contents = [] + self.all_directories = [] + self.all_revisions = [] + self.all_releases = [] + self.all_snapshots = [] + + # typed data + self.objects = { + 'content': self.all_contents, + 'directory': self.all_directories, + 'revision': self.all_revisions, + 'release': self.all_releases, + 'snapshot': self.all_snapshots + } + + def _add(self, type, l): + """Add without duplicates and keeping the insertion order. + + Args: + type (str): Type of objects concerned by the action + l ([object]): List of 'type' object + + """ + col = self.objects[type] + for o in l: + if o in col: + continue + col.extend([o]) + + def maybe_load_contents(self, all_contents): + self._add('content', all_contents) + + def maybe_load_directories(self, all_directories): + self._add('directory', all_directories) + + def maybe_load_revisions(self, all_revisions): + self._add('revision', all_revisions) + + def maybe_load_releases(self, releases): + raise ValueError('If called, the test must break.') + + def maybe_load_snapshot(self, snapshot): + self.objects['snapshot'].append(snapshot) + + def _store_origin_visit(self): + pass + + def open_fetch_history(self): + pass + + def close_fetch_history_success(self, fetch_history_id): + pass + + def close_fetch_history_failure(self, fetch_history_id): + pass + + def update_origin_visit(self, origin_id, visit, status): + pass + + # Override to do nothing at the end + def 
close_failure(self): + pass + + def close_success(self): + pass + + def pre_cleanup(self): + pass diff --git a/swh/loader/pypi/tests/test_loader.py b/swh/loader/pypi/tests/test_loader.py new file mode 100644 index 0000000..6c1de34 --- /dev/null +++ b/swh/loader/pypi/tests/test_loader.py @@ -0,0 +1,185 @@ +# Copyright (C) 2016-2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import shutil +import tempfile + +from nose.plugins.attrib import attr +from nose.tools import istest +from unittest import TestCase + +from swh.model import hashutil + +from swh.loader.pypi.model import PyPiProject +from swh.loader.pypi.loader import PyPiLoader +from .common import PyPiClientWithCache, RESOURCES_PATH, LoaderNoStorage + + +class TestPyPiLoader(LoaderNoStorage, PyPiLoader): + """Real PyPiLoader for test purposes (storage and pypi interactions + inhibited) + + """ + def __init__(self, project_name, ): + project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project_name) + project_metadata_url = 'https://pypi.org/pypi/%s/json' % project_name + with open(project_metadata_file) as f: + data = json.load(f) + + temp_dir = tempfile.mkdtemp( + dir='/tmp/', prefix='swh.loader.pypi.tests-') + # Will use the pypi with cache + client = PyPiClientWithCache( + temp_directory=temp_dir, cache_dir=RESOURCES_PATH) + super().__init__(client=client) + self.project = PyPiProject( + client=client, + project=project_name, + project_metadata_url=project_metadata_url, + data=data) + + def prepare(self, project_name, origin_url, + origin_metadata_url=None): + self.project_name = project_name + self.origin_url = origin_url + self.origin_metadata_url = origin_metadata_url + self.visit = 1 # first visit + + +class PyPiLoaderNoSnapshot(TestPyPiLoader): + """Same as TestPyPiLoader with no prior snapshot seen + + 
""" + def _last_snapshot(self): + return None + def _last_snapshot(self): + return None + + +@attr('fs') +class BaseLoaderITest(TestCase): + """Loader Test Mixin to prepare the pypi to 'load' in a test context. + + In this setup, the loader uses the cache to load data so no + network interaction (no storage, no pypi). + + """ + def setUp(self, project_name='0805nexter', + dummy_pypi_instance='https://dummy.org'): + self.tmp_root_path = tempfile.mkdtemp() + self.loader = PyPiLoaderNoSnapshot(project_name=project_name) + self._project = project_name + self._origin_url = '%s/pypi/%s/' % (dummy_pypi_instance, project_name) + self._project_metadata_url = '%s/pypi/%s/json' % ( + dummy_pypi_instance, project_name) + + def tearDown(self): + shutil.rmtree(self.tmp_root_path) + + def assertContentsOk(self, expected_contents): + contents = self.loader.all_contents + self.assertEquals(len(contents), len(expected_contents)) + + for content in contents: + content_id = hashutil.hash_to_hex(content['sha1']) + self.assertIn(content_id, expected_contents) + + def assertDirectoriesOk(self, expected_directories): + directories = self.loader.all_directories + self.assertEquals(len(directories), len(expected_directories)) + + for _dir in directories: + _dir_id = hashutil.hash_to_hex(_dir['id']) + self.assertIn(_dir_id, expected_directories) + + def assertSnapshotOk(self, expected_snapshot, expected_revisions): + snapshots = self.loader.all_snapshots + self.assertEqual(len(snapshots), 1) + + snap = snapshots[0] + snap_id = hashutil.hash_to_hex(snap['id']) + self.assertEqual(snap_id, expected_snapshot) + + branches = snap['branches'] + self.assertEqual(len(expected_revisions), len(branches)) + + for branch, target in branches.items(): + rev_id = hashutil.hash_to_hex(target['target']) + self.assertIn(rev_id, expected_revisions) + self.assertEqual('revision', target['target_type']) + + def assertRevisionsOk(self, expected_revisions): # noqa: N802 + """Check the loader's revisions match the 
expected revisions. + + Expects self.loader to be instantiated and ready to be + inspected (meaning the loading took place). + + Args: + expected_revisions (dict): Dict with key revision id, + value the targeted directory id. + + """ + # The last revision being the one used later to start back from + for rev in self.loader.all_revisions: + rev_id = hashutil.hash_to_hex(rev['id']) + directory_id = hashutil.hash_to_hex(rev['directory']) + + self.assertEquals(expected_revisions[rev_id], directory_id) + + +# Define loaders with no storage +# They'll just accumulate the data in place +# Only for testing purposes. + + +class LoaderITest(BaseLoaderITest): + @istest + def load(self): + """Load a pypi origin + + """ + # when + self.loader.load( + self._project, self._origin_url, self._project_metadata_url) + + # then + self.assertEquals(len(self.loader.all_contents), 6, + '3 contents per release artifact files (2)') + self.assertEquals(len(self.loader.all_directories), 4) + self.assertEquals(len(self.loader.all_revisions), 2, + '2 releases so 2 revisions should be created') + self.assertEquals(len(self.loader.all_releases), 0, + 'No release is created in the pypi loader') + self.assertEquals(len(self.loader.all_snapshots), 1, + 'Only 1 snapshot targetting all revisions') + + expected_contents = [ + 'a61e24cdfdab3bb7817f6be85d37a3e666b34566', + '938c33483285fd8ad57f15497f538320df82aeb8', + 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8', + '405859113963cb7a797642b45f171d6360425d16', + 'e5686aa568fdb1d19d7f1329267082fe40482d31', + '83ecf6ec1114fd260ca7a833a2d165e71258c338', + ] + + self.assertContentsOk(expected_contents) + + expected_directories = [ + '05219ba38bc542d4345d5638af1ed56c7d43ca7d', + 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44', + 'b178b66bd22383d5f16f4f5c923d39ca798861b4', + 'c3a58f8b57433a4b56caaa5033ae2e0931405338', + ] + self.assertDirectoriesOk(expected_directories) + + # {revision hash: directory hash} + expected_revisions = { + 
'4c99891f93b81450385777235a37b5e966dd1571': '05219ba38bc542d4345d5638af1ed56c7d43ca7d', # noqa + 'e445da4da22b31bfebb6ffc4383dbf839a074d21': 'b178b66bd22383d5f16f4f5c923d39ca798861b4', # noqa + } + self.assertRevisionsOk(expected_revisions) + + self.assertSnapshotOk('f456b03e8bf1920d64b00df234b1efedc25b6c93', expected_revisions) diff --git a/swh/loader/pypi/tests/test_model.py b/swh/loader/pypi/tests/test_model.py index 480b69b..972b74a 100644 --- a/swh/loader/pypi/tests/test_model.py +++ b/swh/loader/pypi/tests/test_model.py @@ -1,217 +1,200 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import json import tempfile import shutil from unittest import TestCase from nose.tools import istest from swh.loader.pypi.model import PyPiProject, author -from swh.loader.pypi.client import PyPiClient, _project_pkginfo +from swh.loader.pypi.client import _project_pkginfo - -class PyPiClientWithCache(PyPiClient): - """Force the use of the cache to bypass pypi calls - - """ - def __init__(self, temp_directory, cache_dir): - super().__init__(temp_directory=temp_directory, - cache=True, cache_dir=cache_dir) - - -RESOURCES_PATH = './swh/loader/pypi/tests/resources' +from .common import PyPiClientWithCache, RESOURCES_PATH class ModelTest(TestCase): def setUp(self): project = '0805nexter' project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project) with open(project_metadata_file) as f: data = json.load(f) - release_files = [] - for f in os.listdir(RESOURCES_PATH): - if f.endswith('.json'): - continue - file = os.path.join(RESOURCES_PATH, f) - release_files.append(file) - temp_dir = tempfile.mkdtemp( dir='/tmp/', prefix='swh.loader.pypi.tests-') project_metadata_url = 'https://pypi.org/pypi/%s/json' % project # Will use the pypi with cache client = PyPiClientWithCache( 
temp_directory=temp_dir, cache_dir=RESOURCES_PATH) self.project = PyPiProject( client=client, project=project, project_metadata_url=project_metadata_url, data=data) self.data = data self.temp_dir = temp_dir self.project_name = project def tearDown(self): if os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) @istest def info(self): actual_info = self.project.info() expected_info = { 'home_page': self.data['info']['home_page'], 'description': self.data['info']['description'], 'summary': self.data['info']['summary'], 'license': self.data['info']['license'], 'package_url': self.data['info']['package_url'], 'project_url': self.data['info']['project_url'], 'upstream': self.data['info']['project_urls']['Homepage'], } self.assertEqual(expected_info, actual_info) @istest def author(self): info = self.data['info'] actual_author = author(info) name = info['author'].encode('utf-8') email = info['author_email'].encode('utf-8') expected_author = { 'fullname': b'%s <%s>' % (name, email), 'name': name, 'email': email, } self.assertEqual(expected_author, actual_author) @istest def releases(self): actual_releases = self.project.releases([]) expected_releases = { '1.1.0': { 'archive_type': 'zip', 'blake2s256': 'df9413bde66e6133b10cadefad6fcf9cbbc369b47831089112c846d79f14985a', # noqa 'date': '2016-01-31T05:28:42', 'filename': '0805nexter-1.1.0.zip', 'message': '', 'name': '1.1.0', 'sha1': '127d8697db916ba1c67084052196a83319a25000', 'sha1_git': '4b8f1350e6d9fa00256e974ae24c09543d85b196', 'sha256': '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035', # noqa 'size': 862, 'url': 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa }, '1.2.0': { 'archive_type': 'zip', 'blake2s256': '67010586b5b9a4aaa3b1c386f9dc8b4c99e6e40f37732a717a5f9b9b1185e588', # noqa 'date': '2016-01-31T05:51:25', 'filename': '0805nexter-1.2.0.zip', 'message': '', 'name': '1.2.0', 'sha1': 
'd55238554b94da7c5bf4a349ece0fe3b2b19f79c', 'sha1_git': '8638d33a96cb25d8319af21417f00045ec6ee810', 'sha256': '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709', # noqa 'size': 898, 'url': 'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa } } dir_paths = [] for _release_info, _author, _release, _dir_path in actual_releases: version = _release_info['version'] expected_pkginfo = _project_pkginfo(_dir_path) self.assertEquals(_release_info, expected_pkginfo) expected_author = author(expected_pkginfo) self.assertEqual(_author, expected_author) expected_release = expected_releases[version] self.assertEqual(_release, expected_release) self.assertTrue(version in _dir_path) self.assertTrue(self.project_name in _dir_path) # path still exists self.assertTrue(os.path.exists(_dir_path)) dir_paths.append(_dir_path) # Ensure uncompressed paths have been destroyed for _dir_path in dir_paths: # path no longer exists self.assertFalse(os.path.exists(_dir_path)) class ParseAuthorTest(TestCase): @istest def author_basic(self): data = { 'author': "i-am-groot", 'author_email': 'iam@groot.org', } actual_author = author(data) expected_author = { 'fullname': b'i-am-groot <iam@groot.org>', 'name': b'i-am-groot', 'email': b'iam@groot.org', } self.assertEquals(actual_author, expected_author) @istest def author_malformed(self): data = { 'author': "['pierre', 'paul', 'jacques']", 'author_email': None, } actual_author = author(data) expected_author = { 'fullname': b"['pierre', 'paul', 'jacques']", 'name': b"['pierre', 'paul', 'jacques']", 'email': None, } self.assertEquals(actual_author, expected_author) @istest def author_malformed_2(self): data = { 'author': '[marie, jeanne]', 'author_email': '[marie@some, jeanne@thing]', } actual_author = author(data) expected_author = { 'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>', 'name': b'[marie, jeanne]', 'email': b'[marie@some, jeanne@thing]', }
self.assertEquals(actual_author, expected_author) @istest def author_malformed_3(self): data = { 'author': '[marie, jeanne, pierre]', 'author_email': '[marie@somewhere.org, jeanne@somewhere.org]', } actual_author = author(data) expected_author = { 'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa 'name': b'[marie, jeanne, pierre]', 'email': b'[marie@somewhere.org, jeanne@somewhere.org]', } self.assertEquals(actual_author, expected_author)