
diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py
index 62fc969..9250b68 100644
--- a/swh/loader/pypi/loader.py
+++ b/swh/loader/pypi/loader.py
@@ -1,199 +1,206 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import arrow
import os
import shutil
from tempfile import mkdtemp
from swh.loader.core.utils import clean_dangling_folders
from swh.loader.core.loader import SWHLoader
from swh.model.from_disk import Directory
from swh.model.identifiers import (
revision_identifier, snapshot_identifier,
identifier_to_bytes, normalize_timestamp
)
from .client import PyPiClient
from .model import PyPiProject
TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.'
DEBUG_MODE = '** DEBUG MODE **'
class PyPiLoader(SWHLoader):
CONFIG_BASE_FILENAME = 'loader/pypi'
ADDITIONAL_CONFIG = {
'temp_directory': ('str', '/tmp/swh.loader.pypi/'),
'cache': ('bool', False),
'cache_dir': ('str', ''),
'debug': ('bool', False), # NOT FOR PRODUCTION
}
- def __init__(self):
+ def __init__(self, client=None):
super().__init__(logging_class='swh.loader.pypi.PyPiLoader')
self.origin_id = None
-
- temp_directory = self.config['temp_directory']
- os.makedirs(temp_directory, exist_ok=True)
- self.temp_directory = mkdtemp(
- suffix='-%s' % os.getpid(),
- prefix=TEMPORARY_DIR_PREFIX_PATTERN,
- dir=temp_directory)
- self.pypi_client = PyPiClient(
- temp_directory=self.temp_directory,
- cache=self.config['cache'],
- cache_dir=self.config['cache_dir'])
+ if not client:
+ temp_directory = self.config['temp_directory']
+ os.makedirs(temp_directory, exist_ok=True)
+ self.temp_directory = mkdtemp(
+ suffix='-%s' % os.getpid(),
+ prefix=TEMPORARY_DIR_PREFIX_PATTERN,
+ dir=temp_directory)
+ self.pypi_client = PyPiClient(
+ temp_directory=self.temp_directory,
+ cache=self.config['cache'],
+ cache_dir=self.config['cache_dir'])
+ else:
+ self.temp_directory = client.temp_directory
+ self.pypi_client = client
self.debug = self.config['debug']
def pre_cleanup(self):
"""(override) To prevent disk explosion if some other workers exploded
in mid-air (OOM killed), we try and clean up dangling files.
"""
if self.debug:
self.log.warn('%s Will not pre-clean up temp dir %s' % (
DEBUG_MODE, self.temp_directory
))
return
clean_dangling_folders(self.config['temp_directory'],
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
log=self.log)
def cleanup(self):
"""(override) Clean up temporary disk use
"""
if self.debug:
self.log.warn('%s Will not clean up temp dir %s' % (
DEBUG_MODE, self.temp_directory
))
return
if os.path.exists(self.temp_directory):
self.log.debug('Clean up %s' % self.temp_directory)
shutil.rmtree(self.temp_directory)
def prepare_origin_visit(self, project_name, origin_url,
origin_metadata_url=None):
"""(override) Prepare the origin visit information
Args:
project_name (str): Project's simple name
origin_url (str): Project's main url
origin_metadata_url (str): Project's metadata url
"""
self.origin = {
'url': origin_url,
'type': 'pypi',
}
self.visit_date = None # loader core will populate it
def prepare(self, project_name, origin_url,
origin_metadata_url=None):
"""(override) Keep reference to the origin url (project) and the
project metadata url
Args:
project_name (str): Project's simple name
origin_url (str): Project's main url
origin_metadata_url (str): Project's metadata url
"""
self.project_name = project_name
self.origin_url = origin_url
self.origin_metadata_url = origin_metadata_url
self.project = PyPiProject(self.pypi_client, self.project_name,
self.origin_metadata_url)
def _known_releases(self, _last_snapshot):
"""Retrieve the known releases/artifact for the origin_id.
Returns
tuple artifact's filename, artifact's sha256
"""
_revs = [rev['target'] for rev in _last_snapshot['branches'].values()]
_known_revisions = self.storage.revision_get(_revs)
for _rev in _known_revisions:
_artifact = _rev['metadata']['original_artifact']
yield _artifact['filename'], _artifact['sha256']
+ def _last_snapshot(self):
+ """Retrieve the last snapshot
+
+ """
+ return self.storage.snapshot_get_latest(self.origin_id)
+
def fetch_data(self):
"""(override) Fetch and collect swh objects.
"""
- _last_snapshot = self.storage.snapshot_get_latest(self.origin_id)
+ _last_snapshot = self._last_snapshot()
if _last_snapshot:
self._snapshot = _last_snapshot.copy()
_known_releases = self._known_releases(self._snapshot)
else:
self._snapshot = {
'branches': {}
}
_known_releases = []
self._contents = []
self._directories = []
self._revisions = []
- self._releases = []
for release_info, author, release, dirpath in self.project.releases(
_known_releases):
dirpath = dirpath.encode('utf-8')
directory = Directory.from_disk(path=dirpath, data=True)
_objects = directory.collect()
self._contents.extend(_objects['content'].values())
self._directories.extend(_objects['directory'].values())
date = normalize_timestamp(
int(arrow.get(release['date']).timestamp))
name = release['name'].encode('utf-8')
message = release['message'].encode('utf-8')
if message:
message = b'%s: %s' % (name, message)
else:
message = name
_revision = {
'synthetic': True,
'metadata': {
'original_artifact': release,
'project': release_info,
},
'author': author,
'date': date,
'committer': author,
'committer_date': date,
'message': message,
'directory': directory.hash,
'parents': [],
'type': 'tar',
}
_revision['id'] = identifier_to_bytes(
revision_identifier(_revision))
self._revisions.append(_revision)
branch_name = release['filename'].encode('utf-8')
self._snapshot['branches'][branch_name] = {
'target': _revision['id'],
'target_type': 'revision',
}
self._snapshot['id'] = identifier_to_bytes(
snapshot_identifier(self._snapshot))
def store_data(self):
"""(override) This sends collected objects to storage.
"""
self.maybe_load_contents(self._contents)
self.maybe_load_directories(self._directories)
self.maybe_load_revisions(self._revisions)
- self.maybe_load_releases(self._releases)
self.maybe_load_snapshot(self._snapshot)
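
Note on the constructor change above: making the client injectable is what the new tests below rely on. A minimal usage sketch, assuming the constructor and PyPiClient arguments shown in the diff (the directory values are made-up examples):

    from swh.loader.pypi.client import PyPiClient
    from swh.loader.pypi.loader import PyPiLoader

    # Build a client up front and hand it to the loader; the loader then
    # skips its own client/temp-dir setup and reuses client.temp_directory.
    client = PyPiClient(temp_directory='/tmp/swh.loader.pypi.example',
                        cache=True,
                        cache_dir='/tmp/swh.loader.pypi.example/cache')
    loader = PyPiLoader(client=client)
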
diff --git a/swh/loader/pypi/tests/__init__.py b/swh/loader/pypi/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/loader/pypi/tests/common.py b/swh/loader/pypi/tests/common.py
new file mode 100644
index 0000000..2159d13
--- /dev/null
+++ b/swh/loader/pypi/tests/common.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from swh.loader.pypi.client import PyPiClient
+
+
+RESOURCES_PATH = './swh/loader/pypi/tests/resources'
+
+
+class PyPiClientWithCache(PyPiClient):
+ """Force the use of the cache to bypass pypi calls
+
+ """
+ def __init__(self, temp_directory, cache_dir):
+ super().__init__(temp_directory=temp_directory,
+ cache=True, cache_dir=cache_dir)
+
+
+class LoaderNoStorage:
+ """Mixin class to inhibit the persistence and keep in memory the data
+ sent for storage.
+
+ cf. LoaderNoStorage
+
+ """
+ CONFIG_BASE_FILENAME = '' # do not provide a real path
+ ADDITIONAL_CONFIG = {
+ 'storage': ('dict', {
+ 'cls': 'remote',
+ 'args': {
+ 'url': 'http://nowhere:5002/', # do not provide a real storage
+ }
+ }),
+
+ # do not send any data to the storage
+ 'send_contents': ('bool', False),
+ 'send_directories': ('bool', False),
+ 'send_revisions': ('bool', False),
+ 'send_releases': ('bool', False),
+ 'send_snapshot': ('bool', False),
+ 'debug': ('bool', False),
+ }
+
+ def __init__(self, client=None):
+ super().__init__(client=client)
+ self.all_contents = []
+ self.all_directories = []
+ self.all_revisions = []
+ self.all_releases = []
+ self.all_snapshots = []
+
+ # typed data
+ self.objects = {
+ 'content': self.all_contents,
+ 'directory': self.all_directories,
+ 'revision': self.all_revisions,
+ 'release': self.all_releases,
+ 'snapshot': self.all_snapshots
+ }
+
+ def _add(self, type, l):
+ """Add without duplicates and keeping the insertion order.
+
+ Args:
+ type (str): Type of objects concerned by the action
+ l ([object]): List of 'type' object
+
+ """
+ col = self.objects[type]
+ for o in l:
+ if o in col:
+ continue
+ col.extend([o])
+
+ def maybe_load_contents(self, all_contents):
+ self._add('content', all_contents)
+
+ def maybe_load_directories(self, all_directories):
+ self._add('directory', all_directories)
+
+ def maybe_load_revisions(self, all_revisions):
+ self._add('revision', all_revisions)
+
+ def maybe_load_releases(self, releases):
+ raise ValueError('If called, the test must break.')
+
+ def maybe_load_snapshot(self, snapshot):
+ self.objects['snapshot'].append(snapshot)
+
+ def _store_origin_visit(self):
+ pass
+
+ def open_fetch_history(self):
+ pass
+
+ def close_fetch_history_success(self, fetch_history_id):
+ pass
+
+ def close_fetch_history_failure(self, fetch_history_id):
+ pass
+
+ def update_origin_visit(self, origin_id, visit, status):
+ pass
+
+ # Override to do nothing at the end
+ def close_failure(self):
+ pass
+
+ def close_success(self):
+ pass
+
+ def pre_cleanup(self):
+ pass
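
With this mixin in place, a test can run a full load and then inspect what would have been sent to storage. A sketch under those assumptions (note that this bare combination would still hit pypi unless a caching client is injected, which is exactly what test_loader.py below does):

    from swh.loader.pypi.loader import PyPiLoader

    class InMemoryPyPiLoader(LoaderNoStorage, PyPiLoader):
        pass

    loader = InMemoryPyPiLoader()
    loader.load('0805nexter', 'https://pypi.org/project/0805nexter/')
    print(len(loader.all_revisions), len(loader.all_snapshots))
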
diff --git a/swh/loader/pypi/tests/test_loader.py b/swh/loader/pypi/tests/test_loader.py
new file mode 100644
index 0000000..6c1de34
--- /dev/null
+++ b/swh/loader/pypi/tests/test_loader.py
@@ -0,0 +1,185 @@
+# Copyright (C) 2016-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import shutil
+import tempfile
+
+from nose.plugins.attrib import attr
+from nose.tools import istest
+from unittest import TestCase
+
+from swh.model import hashutil
+
+from swh.loader.pypi.model import PyPiProject
+from swh.loader.pypi.loader import PyPiLoader
+from .common import PyPiClientWithCache, RESOURCES_PATH, LoaderNoStorage
+
+
+class TestPyPiLoader(LoaderNoStorage, PyPiLoader):
+ """Real PyPiLoader for test purposes (storage and pypi interactions
+ inhibited)
+
+ """
+ def __init__(self, project_name):
+ project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project_name)
+ project_metadata_url = 'https://pypi.org/pypi/%s/json' % project_name
+ with open(project_metadata_file) as f:
+ data = json.load(f)
+
+ temp_dir = tempfile.mkdtemp(
+ dir='/tmp/', prefix='swh.loader.pypi.tests-')
+ # Will use the pypi with cache
+ client = PyPiClientWithCache(
+ temp_directory=temp_dir, cache_dir=RESOURCES_PATH)
+ super().__init__(client=client)
+ self.project = PyPiProject(
+ client=client,
+ project=project_name,
+ project_metadata_url=project_metadata_url,
+ data=data)
+
+ def prepare(self, project_name, origin_url,
+ origin_metadata_url=None):
+ self.project_name = project_name
+ self.origin_url = origin_url
+ self.origin_metadata_url = origin_metadata_url
+ self.visit = 1 # first visit
+
+
+class PyPiLoaderNoSnapshot(TestPyPiLoader):
+ """Same as TestPyPiLoader with no prior snapshot seen
+
+ """
+ def _last_snapshot(self):
+ return None
+
+
+@attr('fs')
+class BaseLoaderITest(TestCase):
+ """Loader Test Mixin to prepare the pypi to 'load' in a test context.
+
+ In this setup, the loader uses the cache to load data so no
+ network interaction (no storage, no pypi).
+
+ """
+ def setUp(self, project_name='0805nexter',
+ dummy_pypi_instance='https://dummy.org'):
+ self.tmp_root_path = tempfile.mkdtemp()
+ self.loader = PyPiLoaderNoSnapshot(project_name=project_name)
+ self._project = project_name
+ self._origin_url = '%s/pypi/%s/' % (dummy_pypi_instance, project_name)
+ self._project_metadata_url = '%s/pypi/%s/json' % (
+ dummy_pypi_instance, project_name)
+
+ def tearDown(self):
+ shutil.rmtree(self.tmp_root_path)
+
+ def assertContentsOk(self, expected_contents):
+ contents = self.loader.all_contents
+ self.assertEquals(len(contents), len(expected_contents))
+
+ for content in contents:
+ content_id = hashutil.hash_to_hex(content['sha1'])
+ self.assertIn(content_id, expected_contents)
+
+ def assertDirectoriesOk(self, expected_directories):
+ directories = self.loader.all_directories
+ self.assertEquals(len(directories), len(expected_directories))
+
+ for _dir in directories:
+ _dir_id = hashutil.hash_to_hex(_dir['id'])
+ self.assertIn(_dir_id, expected_directories)
+
+ def assertSnapshotOk(self, expected_snapshot, expected_revisions):
+ snapshots = self.loader.all_snapshots
+ self.assertEqual(len(snapshots), 1)
+
+ snap = snapshots[0]
+ snap_id = hashutil.hash_to_hex(snap['id'])
+ self.assertEqual(snap_id, expected_snapshot)
+
+ branches = snap['branches']
+ self.assertEqual(len(expected_revisions), len(branches))
+
+ for branch, target in branches.items():
+ rev_id = hashutil.hash_to_hex(target['target'])
+ self.assertIn(rev_id, expected_revisions)
+ self.assertEqual('revision', target['target_type'])
+
+ def assertRevisionsOk(self, expected_revisions): # noqa: N802
+ """Check the loader's revisions match the expected revisions.
+
+ Expects self.loader to be instantiated and ready to be
+ inspected (meaning the loading took place).
+
+ Args:
+ expected_revisions (dict): Dict with key revision id,
+ value the targeted directory id.
+
+ """
+ # The last revision being the one used later to start back from
+ for rev in self.loader.all_revisions:
+ rev_id = hashutil.hash_to_hex(rev['id'])
+ directory_id = hashutil.hash_to_hex(rev['directory'])
+
+ self.assertEquals(expected_revisions[rev_id], directory_id)
+
+
+# Define loaders with no storage
+# They'll just accumulate the data in place
+# Only for testing purposes.
+
+
+class LoaderITest(BaseLoaderITest):
+ @istest
+ def load(self):
+ """Load a pypi origin
+
+ """
+ # when
+ self.loader.load(
+ self._project, self._origin_url, self._project_metadata_url)
+
+ # then
+ self.assertEquals(len(self.loader.all_contents), 6,
+ '3 contents per release artifact file (2 artifacts)')
+ self.assertEquals(len(self.loader.all_directories), 4)
+ self.assertEquals(len(self.loader.all_revisions), 2,
+ '2 releases so 2 revisions should be created')
+ self.assertEquals(len(self.loader.all_releases), 0,
+ 'No release is created in the pypi loader')
+ self.assertEquals(len(self.loader.all_snapshots), 1,
+ 'Only 1 snapshot targeting all revisions')
+
+ expected_contents = [
+ 'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
+ '938c33483285fd8ad57f15497f538320df82aeb8',
+ 'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
+ '405859113963cb7a797642b45f171d6360425d16',
+ 'e5686aa568fdb1d19d7f1329267082fe40482d31',
+ '83ecf6ec1114fd260ca7a833a2d165e71258c338',
+ ]
+
+ self.assertContentsOk(expected_contents)
+
+ expected_directories = [
+ '05219ba38bc542d4345d5638af1ed56c7d43ca7d',
+ 'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
+ 'b178b66bd22383d5f16f4f5c923d39ca798861b4',
+ 'c3a58f8b57433a4b56caaa5033ae2e0931405338',
+ ]
+ self.assertDirectoriesOk(expected_directories)
+
+ # {revision hash: directory hash}
+ expected_revisions = {
+ '4c99891f93b81450385777235a37b5e966dd1571': '05219ba38bc542d4345d5638af1ed56c7d43ca7d', # noqa
+ 'e445da4da22b31bfebb6ffc4383dbf839a074d21': 'b178b66bd22383d5f16f4f5c923d39ca798861b4', # noqa
+ }
+ self.assertRevisionsOk(expected_revisions)
+
+ self.assertSnapshotOk('f456b03e8bf1920d64b00df234b1efedc25b6c93', expected_revisions)
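
The expected snapshot id asserted above is the intrinsic identifier computed over the branch mapping, using the same swh.model helpers the loader imports. An illustrative sketch (the branch name and revision hash are taken from the test data; the two steps mirror PyPiLoader.fetch_data):

    from swh.model.identifiers import (
        snapshot_identifier, identifier_to_bytes)

    snapshot = {
        'branches': {
            b'0805nexter-1.1.0.zip': {
                'target': identifier_to_bytes(
                    '4c99891f93b81450385777235a37b5e966dd1571'),
                'target_type': 'revision',
            },
        },
    }
    # Compute the intrinsic id and store it on the snapshot itself.
    snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))
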
diff --git a/swh/loader/pypi/tests/test_model.py b/swh/loader/pypi/tests/test_model.py
index 480b69b..972b74a 100644
--- a/swh/loader/pypi/tests/test_model.py
+++ b/swh/loader/pypi/tests/test_model.py
@@ -1,217 +1,200 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import json
import tempfile
import shutil
from unittest import TestCase
from nose.tools import istest
from swh.loader.pypi.model import PyPiProject, author
-from swh.loader.pypi.client import PyPiClient, _project_pkginfo
+from swh.loader.pypi.client import _project_pkginfo
-
-class PyPiClientWithCache(PyPiClient):
- """Force the use of the cache to bypass pypi calls
-
- """
- def __init__(self, temp_directory, cache_dir):
- super().__init__(temp_directory=temp_directory,
- cache=True, cache_dir=cache_dir)
-
-
-RESOURCES_PATH = './swh/loader/pypi/tests/resources'
+from .common import PyPiClientWithCache, RESOURCES_PATH
class ModelTest(TestCase):
def setUp(self):
project = '0805nexter'
project_metadata_file = '%s/%s.json' % (RESOURCES_PATH, project)
with open(project_metadata_file) as f:
data = json.load(f)
- release_files = []
- for f in os.listdir(RESOURCES_PATH):
- if f.endswith('.json'):
- continue
- file = os.path.join(RESOURCES_PATH, f)
- release_files.append(file)
-
temp_dir = tempfile.mkdtemp(
dir='/tmp/', prefix='swh.loader.pypi.tests-')
project_metadata_url = 'https://pypi.org/pypi/%s/json' % project
# Will use the pypi with cache
client = PyPiClientWithCache(
temp_directory=temp_dir, cache_dir=RESOURCES_PATH)
self.project = PyPiProject(
client=client,
project=project,
project_metadata_url=project_metadata_url,
data=data)
self.data = data
self.temp_dir = temp_dir
self.project_name = project
def tearDown(self):
if os.path.exists(self.temp_dir):
shutil.rmtree(self.temp_dir)
@istest
def info(self):
actual_info = self.project.info()
expected_info = {
'home_page': self.data['info']['home_page'],
'description': self.data['info']['description'],
'summary': self.data['info']['summary'],
'license': self.data['info']['license'],
'package_url': self.data['info']['package_url'],
'project_url': self.data['info']['project_url'],
'upstream': self.data['info']['project_urls']['Homepage'],
}
self.assertEqual(expected_info, actual_info)
@istest
def author(self):
info = self.data['info']
actual_author = author(info)
name = info['author'].encode('utf-8')
email = info['author_email'].encode('utf-8')
expected_author = {
'fullname': b'%s <%s>' % (name, email),
'name': name,
'email': email,
}
self.assertEqual(expected_author, actual_author)
@istest
def releases(self):
actual_releases = self.project.releases([])
expected_releases = {
'1.1.0': {
'archive_type': 'zip',
'blake2s256': 'df9413bde66e6133b10cadefad6fcf9cbbc369b47831089112c846d79f14985a', # noqa
'date': '2016-01-31T05:28:42',
'filename': '0805nexter-1.1.0.zip',
'message': '',
'name': '1.1.0',
'sha1': '127d8697db916ba1c67084052196a83319a25000',
'sha1_git': '4b8f1350e6d9fa00256e974ae24c09543d85b196',
'sha256': '52cd128ad3afe539478abc7440d4b043384295fbe6b0958a237cb6d926465035', # noqa
'size': 862,
'url': 'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa
},
'1.2.0': {
'archive_type': 'zip',
'blake2s256': '67010586b5b9a4aaa3b1c386f9dc8b4c99e6e40f37732a717a5f9b9b1185e588', # noqa
'date': '2016-01-31T05:51:25',
'filename': '0805nexter-1.2.0.zip',
'message': '',
'name': '1.2.0',
'sha1': 'd55238554b94da7c5bf4a349ece0fe3b2b19f79c',
'sha1_git': '8638d33a96cb25d8319af21417f00045ec6ee810',
'sha256': '49785c6ae39ea511b3c253d7621c0b1b6228be2f965aca8a491e6b84126d0709', # noqa
'size': 898,
'url': 'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa
}
}
dir_paths = []
for _release_info, _author, _release, _dir_path in actual_releases:
version = _release_info['version']
expected_pkginfo = _project_pkginfo(_dir_path)
self.assertEquals(_release_info, expected_pkginfo)
expected_author = author(expected_pkginfo)
self.assertEqual(_author, expected_author)
expected_release = expected_releases[version]
self.assertEqual(_release, expected_release)
self.assertTrue(version in _dir_path)
self.assertTrue(self.project_name in _dir_path)
# path still exists
self.assertTrue(os.path.exists(_dir_path))
dir_paths.append(_dir_path)
# Ensure uncompressed paths have been destroyed
for _dir_path in dir_paths:
# path no longer exists
self.assertFalse(os.path.exists(_dir_path))
class ParseAuthorTest(TestCase):
@istest
def author_basic(self):
data = {
'author': "i-am-groot",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot <iam@groot.org>',
'name': b'i-am-groot',
'email': b'iam@groot.org',
}
self.assertEquals(actual_author, expected_author)
@istest
def author_malformed(self):
data = {
'author': "['pierre', 'paul', 'jacques']",
'author_email': None,
}
actual_author = author(data)
expected_author = {
'fullname': b"['pierre', 'paul', 'jacques']",
'name': b"['pierre', 'paul', 'jacques']",
'email': None,
}
self.assertEquals(actual_author, expected_author)
@istest
def author_malformed_2(self):
data = {
'author': '[marie, jeanne]',
'author_email': '[marie@some, jeanne@thing]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
'name': b'[marie, jeanne]',
'email': b'[marie@some, jeanne@thing]',
}
self.assertEquals(actual_author, expected_author)
@istest
def author_malformed_3(self):
data = {
'author': '[marie, jeanne, pierre]',
'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa
'name': b'[marie, jeanne, pierre]',
'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
}
self.assertEquals(actual_author, expected_author)
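
For reference, the behaviour the ParseAuthorTest cases pin down can be summarized by a sketch like the following (an assumed reconstruction from the expected values; the real implementation lives in swh.loader.pypi.model and may differ):

    def author(info):
        # Normalize pypi 'author'/'author_email' metadata into the
        # fullname/name/email dict the tests above expect.
        name = info['author'].encode('utf-8')
        email = info['author_email']
        if email:
            email = email.encode('utf-8')
            return {'fullname': b'%s <%s>' % (name, email),
                    'name': name,
                    'email': email}
        return {'fullname': name, 'name': name, 'email': None}
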
