diff --git a/PKG-INFO b/PKG-INFO index 4d5e49a..79d0691 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.vault -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage vault Home-page: https://forge.softwareheritage.org/diffusion/DVAU/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 3f8b0ec..a388692 100644 --- a/debian/control +++ b/debian/control @@ -1,34 +1,34 @@ Source: swh-vault Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-click, python3-dateutil, python3-dulwich, python3-fastimport, python3-flask, python3-nose, python3-psycopg2, python3-setuptools, python3-swh.core (>= 0.0.28~), python3-swh.model (>= 0.0.18~), python3-swh.objstorage (>= 0.0.17~), python3-swh.scheduler (>= 0.0.11~), - python3-swh.storage (>= 0.0.92~), + python3-swh.storage (>= 0.0.100~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DVAU/ Package: python3-swh.vault Architecture: all Depends: python3-swh.core (>= 0.0.28~), python3-swh.model (>= 0.0.18~), python3-swh.objstorage (>= 0.0.17~), python3-swh.scheduler (>= 0.0.11~), - python3-swh.storage (>= 0.0.92~), + python3-swh.storage (>= 0.0.100~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Vault diff --git a/requirements-swh.txt b/requirements-swh.txt index c0fca85..b5800e5 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ -swh.core >= 0.0.28 +swh.core >= 0.0.40 swh.model >= 0.0.18 swh.objstorage >= 0.0.17 swh.scheduler >= 0.0.11 -swh.storage >= 0.0.92 +swh.storage >= 0.0.100 diff --git a/swh.vault.egg-info/PKG-INFO b/swh.vault.egg-info/PKG-INFO index 4d5e49a..79d0691 100644 --- a/swh.vault.egg-info/PKG-INFO +++ b/swh.vault.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.vault -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage vault Home-page: https://forge.softwareheritage.org/diffusion/DVAU/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.vault.egg-info/requires.txt b/swh.vault.egg-info/requires.txt index b5e5e9f..101a1ee 100644 --- a/swh.vault.egg-info/requires.txt +++ b/swh.vault.egg-info/requires.txt @@ -1,11 +1,11 @@ click fastimport flask psycopg2 python-dateutil -swh.core>=0.0.28 +swh.core>=0.0.40 swh.model>=0.0.18 swh.objstorage>=0.0.17 swh.scheduler>=0.0.11 -swh.storage>=0.0.92 +swh.storage>=0.0.100 vcversioner diff --git a/swh/vault/__init__.py b/swh/vault/__init__.py index e69de29..11c6215 100644 --- a/swh/vault/__init__.py +++ b/swh/vault/__init__.py @@ -0,0 +1,30 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def get_vault(cls, args): + """ + Get a vault object of class `vault_class` with arguments + `vault_args`. + + Args: + vault (dict): dictionary with keys: + - cls (str): vault's class, either 'remote' + - args (dict): dictionary with keys + + Returns: + an instance of swh.storage.Storage (either local or remote) + + Raises: + ValueError if passed an unknown storage class. + + """ + + if cls == 'remote': + from .api.client import RemoteVaultClient as Vault + else: + raise ValueError('Unknown storage class `%s`' % cls) + + return Vault(**args) diff --git a/swh/vault/api/client.py b/swh/vault/api/client.py index 138df1d..43cc0a3 100644 --- a/swh/vault/api/client.py +++ b/swh/vault/api/client.py @@ -1,68 +1,69 @@ -# Copyright (C) 2016-2017 The Software Heritage developers +# Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model import hashutil from swh.core.api import SWHRemoteAPI class VaultAPIError(Exception): """Vault API Error""" def __str__(self): return ('An unexpected error occurred in the Vault backend: {}' .format(self.args)) class RemoteVaultClient(SWHRemoteAPI): """Client to the Software Heritage vault cache.""" - def __init__(self, base_url): - super().__init__(api_exception=VaultAPIError, url=base_url) + def __init__(self, base_url, timeout=None): + super().__init__( + api_exception=VaultAPIError, url=base_url, timeout=timeout) # Web API endpoints def fetch(self, obj_type, obj_id): hex_id = hashutil.hash_to_hex(obj_id) return self.get('fetch/{}/{}'.format(obj_type, hex_id)) def cook(self, obj_type, obj_id, email=None): hex_id = hashutil.hash_to_hex(obj_id) return self.post('cook/{}/{}'.format(obj_type, hex_id), data={}, params=({'email': email} if email else None)) def progress(self, obj_type, obj_id): hex_id = hashutil.hash_to_hex(obj_id) return self.get('progress/{}/{}'.format(obj_type, hex_id)) # Cookers endpoints def set_progress(self, obj_type, obj_id, progress): hex_id = hashutil.hash_to_hex(obj_id) return self.post('set_progress/{}/{}'.format(obj_type, hex_id), data=progress) def set_status(self, obj_type, obj_id, status): hex_id = hashutil.hash_to_hex(obj_id) return self.post('set_status/{}/{}' .format(obj_type, hex_id), data=status) # TODO: handle streaming properly def put_bundle(self, obj_type, obj_id, bundle): hex_id = hashutil.hash_to_hex(obj_id) return self.post('put_bundle/{}/{}' .format(obj_type, hex_id), data=bundle) def send_notif(self, obj_type, obj_id): hex_id = hashutil.hash_to_hex(obj_id) return self.post('send_notif/{}/{}' .format(obj_type, hex_id), data=None) # Batch endpoints def batch_cook(self, batch): return self.post('batch_cook', data=batch) def batch_progress(self, batch_id): return self.get('batch_progress/{}'.format(batch_id)) diff --git a/swh/vault/backend.py b/swh/vault/backend.py index 33ec22e..90e3066 100644 --- a/swh/vault/backend.py +++ b/swh/vault/backend.py @@ -1,472 +1,471 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import smtplib import psycopg2 from psycopg2.extras import RealDictCursor from functools import wraps from email.mime.text import MIMEText from swh.model import hashutil from swh.scheduler.backend import SchedulerBackend from swh.scheduler.utils import create_oneshot_task_dict from swh.vault.cache import VaultCache from swh.vault.cookers import get_cooker from swh.vault.cooking_tasks import SWHCookingTask # noqa cooking_task_name = 'swh.vault.cooking_tasks.SWHCookingTask' NOTIF_EMAIL_FROM = ('"Software Heritage Vault" ' '') NOTIF_EMAIL_SUBJECT_SUCCESS = ("Bundle ready: {obj_type} {short_id}") NOTIF_EMAIL_SUBJECT_FAILURE = ("Bundle failed: {obj_type} {short_id}") NOTIF_EMAIL_BODY_SUCCESS = """ You have requested the following bundle from the Software Heritage Vault: Object Type: {obj_type} Object ID: {hex_id} This bundle is now available for download at the following address: {url} Please keep in mind that this link might expire at some point, in which case you will need to request the bundle again. --\x20 The Software Heritage Developers """ NOTIF_EMAIL_BODY_FAILURE = """ You have requested the following bundle from the Software Heritage Vault: Object Type: {obj_type} Object ID: {hex_id} This bundle could not be cooked for the following reason: {progress_msg} We apologize for the inconvenience. --\x20 The Software Heritage Developers """ class NotFoundExc(Exception): """Bundle was not found.""" pass def batch_to_bytes(batch): return [(obj_type, hashutil.hash_to_bytes(obj_id)) for obj_type, obj_id in batch] # TODO: Imported from swh.scheduler.backend. Factorization needed. def autocommit(fn): @wraps(fn) def wrapped(self, *args, **kwargs): autocommit = False # TODO: I don't like using None, it's confusing for the user. how about # a NEW_CURSOR object()? if 'cursor' not in kwargs or not kwargs['cursor']: autocommit = True kwargs['cursor'] = self.cursor() try: ret = fn(self, *args, **kwargs) except BaseException: if autocommit: self.rollback() raise if autocommit: self.commit() return ret return wrapped # TODO: This has to be factorized with other database base classes and helpers # (swh.scheduler.backend.SchedulerBackend, swh.storage.db.BaseDb, ...) # The three first methods are imported from swh.scheduler.backend. class VaultBackend: """ Backend for the Software Heritage vault. """ def __init__(self, config): self.config = config self.cache = VaultCache(self.config['cache']) self.db = None self.reconnect() self.smtp_server = smtplib.SMTP() if self.config['scheduling_db'] is not None: self.scheduler = SchedulerBackend( scheduling_db=self.config['scheduling_db']) def reconnect(self): """Reconnect to the database.""" if not self.db or self.db.closed: self.db = psycopg2.connect( dsn=self.config['db'], cursor_factory=RealDictCursor, ) def close(self): """Close the underlying database connection.""" self.db.close() def cursor(self): """Return a fresh cursor on the database, with auto-reconnection in case of failure""" cur = None # Get a fresh cursor and reconnect at most three times tries = 0 while True: tries += 1 try: cur = self.db.cursor() cur.execute('select 1') break except psycopg2.OperationalError: if tries < 3: self.reconnect() else: raise return cur def commit(self): """Commit a transaction""" self.db.commit() def rollback(self): """Rollback a transaction""" self.db.rollback() @autocommit def task_info(self, obj_type, obj_id, cursor=None): """Fetch information from a bundle""" obj_id = hashutil.hash_to_bytes(obj_id) cursor.execute(''' SELECT id, type, object_id, task_id, task_status, sticky, ts_created, ts_done, ts_last_access, progress_msg FROM vault_bundle WHERE type = %s AND object_id = %s''', (obj_type, obj_id)) res = cursor.fetchone() if res: res['object_id'] = bytes(res['object_id']) return res def _send_task(self, args): """Send a cooking task to the celery scheduler""" task = create_oneshot_task_dict('swh-vault-cooking', *args) added_tasks = self.scheduler.create_tasks([task]) return added_tasks[0]['id'] @autocommit def create_task(self, obj_type, obj_id, sticky=False, cursor=None): """Create and send a cooking task""" obj_id = hashutil.hash_to_bytes(obj_id) hex_id = hashutil.hash_to_hex(obj_id) args = [obj_type, hex_id] backend_storage_config = {'storage': self.config['storage']} cooker_class = get_cooker(obj_type) cooker = cooker_class(*args, override_cfg=backend_storage_config) if not cooker.check_exists(): raise NotFoundExc("Object {} was not found.".format(hex_id)) cursor.execute(''' INSERT INTO vault_bundle (type, object_id, sticky) VALUES (%s, %s, %s)''', (obj_type, obj_id, sticky)) self.commit() task_id = self._send_task(args) cursor.execute(''' UPDATE vault_bundle SET task_id = %s WHERE type = %s AND object_id = %s''', (task_id, obj_type, obj_id)) @autocommit def add_notif_email(self, obj_type, obj_id, email, cursor=None): """Add an e-mail address to notify when a given bundle is ready""" obj_id = hashutil.hash_to_bytes(obj_id) cursor.execute(''' INSERT INTO vault_notif_email (email, bundle_id) VALUES (%s, (SELECT id FROM vault_bundle WHERE type = %s AND object_id = %s))''', (email, obj_type, obj_id)) @autocommit def cook_request(self, obj_type, obj_id, *, sticky=False, email=None, cursor=None): """Main entry point for cooking requests. This starts a cooking task if needed, and add the given e-mail to the notify list""" obj_id = hashutil.hash_to_bytes(obj_id) info = self.task_info(obj_type, obj_id) # If there's a failed bundle entry, delete it first. if info is not None and info['task_status'] == 'failed': cursor.execute('''DELETE FROM vault_bundle WHERE type = %s AND object_id = %s''', (obj_type, obj_id)) self.commit() info = None # If there's no bundle entry, create the task. if info is None: self.create_task(obj_type, obj_id, sticky) if email is not None: # If the task is already done, send the email directly if info is not None and info['task_status'] == 'done': self.send_notification(None, email, obj_type, obj_id, info['task_status']) # Else, add it to the notification queue else: self.add_notif_email(obj_type, obj_id, email) info = self.task_info(obj_type, obj_id) return info @autocommit def batch_cook(self, batch, cursor=None): """Cook a batch of bundles and returns the cooking id.""" # Import execute_values at runtime only, because it requires # psycopg2 >= 2.7 (only available on postgresql servers) from psycopg2.extras import execute_values cursor.execute(''' INSERT INTO vault_batch (id) VALUES (DEFAULT) RETURNING id''') batch_id = cursor.fetchone()['id'] batch = batch_to_bytes(batch) # Delete all failed bundles from the batch cursor.execute(''' DELETE FROM vault_bundle WHERE task_status = 'failed' AND (type, object_id) IN %s''', (tuple(batch),)) # Insert all the bundles, return the new ones execute_values(cursor, ''' INSERT INTO vault_bundle (type, object_id) - VALUES %s ON CONFLICT DO NOTHING''', batch) + VALUES %s ON CONFLICT DO NOTHING + RETURNING type, object_id''', batch) + batch_new = [(bundle['type'], bytes(bundle['object_id'])) + for bundle in cursor.fetchall()] - # Get the bundle ids and task status + # Get the bundle ids cursor.execute(''' - SELECT id, type, object_id, task_id FROM vault_bundle + SELECT id FROM vault_bundle WHERE (type, object_id) IN %s''', (tuple(batch),)) - bundles = cursor.fetchall() + bundle_ids = cursor.fetchall() # Insert the batch-bundle entries - batch_id_bundle_ids = [(batch_id, row['id']) for row in bundles] + batch_id_bundle_ids = [(batch_id, row['id']) for row in bundle_ids] execute_values(cursor, ''' INSERT INTO vault_batch_bundle (batch_id, bundle_id) VALUES %s ON CONFLICT DO NOTHING''', batch_id_bundle_ids) self.commit() - # Get the tasks to fetch - batch_new = [(row['type'], bytes(row['object_id'])) - for row in bundles if row['task_id'] is None] - # Send the tasks args_batch = [(obj_type, hashutil.hash_to_hex(obj_id)) for obj_type, obj_id in batch_new] # TODO: change once the scheduler handles priority tasks tasks = [create_oneshot_task_dict('swh-vault-batch-cooking', *args) for args in args_batch] added_tasks = self.scheduler.create_tasks(tasks) tasks_ids_bundle_ids = zip([task['id'] for task in added_tasks], batch_new) tasks_ids_bundle_ids = [(task_id, obj_type, obj_id) for task_id, (obj_type, obj_id) in tasks_ids_bundle_ids] # Update the task ids execute_values(cursor, ''' UPDATE vault_bundle SET task_id = s_task_id FROM (VALUES %s) AS sub (s_task_id, s_type, s_object_id) WHERE type = s_type::cook_type AND object_id = s_object_id ''', tasks_ids_bundle_ids) return batch_id @autocommit def batch_info(self, batch_id, cursor=None): """Fetch information from a batch of bundles""" cursor.execute(''' SELECT vault_bundle.id as id, type, object_id, task_id, task_status, sticky, ts_created, ts_done, ts_last_access, progress_msg FROM vault_batch_bundle LEFT JOIN vault_bundle ON vault_bundle.id = bundle_id WHERE batch_id = %s''', (batch_id,)) res = cursor.fetchall() if res: for d in res: d['object_id'] = bytes(d['object_id']) return res @autocommit def is_available(self, obj_type, obj_id, cursor=None): """Check whether a bundle is available for retrieval""" info = self.task_info(obj_type, obj_id, cursor=cursor) return (info is not None and info['task_status'] == 'done' and self.cache.is_cached(obj_type, obj_id)) @autocommit def fetch(self, obj_type, obj_id, cursor=None): """Retrieve a bundle from the cache""" if not self.is_available(obj_type, obj_id, cursor=cursor): return None self.update_access_ts(obj_type, obj_id, cursor=cursor) return self.cache.get(obj_type, obj_id) @autocommit def update_access_ts(self, obj_type, obj_id, cursor=None): """Update the last access timestamp of a bundle""" obj_id = hashutil.hash_to_bytes(obj_id) cursor.execute(''' UPDATE vault_bundle SET ts_last_access = NOW() WHERE type = %s AND object_id = %s''', (obj_type, obj_id)) @autocommit def set_status(self, obj_type, obj_id, status, cursor=None): """Set the cooking status of a bundle""" obj_id = hashutil.hash_to_bytes(obj_id) req = (''' UPDATE vault_bundle SET task_status = %s ''' + (''', ts_done = NOW() ''' if status == 'done' else '') + '''WHERE type = %s AND object_id = %s''') cursor.execute(req, (status, obj_type, obj_id)) @autocommit def set_progress(self, obj_type, obj_id, progress, cursor=None): """Set the cooking progress of a bundle""" obj_id = hashutil.hash_to_bytes(obj_id) cursor.execute(''' UPDATE vault_bundle SET progress_msg = %s WHERE type = %s AND object_id = %s''', (progress, obj_type, obj_id)) @autocommit def send_all_notifications(self, obj_type, obj_id, cursor=None): """Send all the e-mails in the notification list of a bundle""" obj_id = hashutil.hash_to_bytes(obj_id) cursor.execute(''' SELECT vault_notif_email.id AS id, email, task_status, progress_msg FROM vault_notif_email INNER JOIN vault_bundle ON bundle_id = vault_bundle.id WHERE vault_bundle.type = %s AND vault_bundle.object_id = %s''', (obj_type, obj_id)) for d in cursor: self.send_notification(d['id'], d['email'], obj_type, obj_id, status=d['task_status'], progress_msg=d['progress_msg']) @autocommit def send_notification(self, n_id, email, obj_type, obj_id, status, progress_msg=None, cursor=None): """Send the notification of a bundle to a specific e-mail""" hex_id = hashutil.hash_to_hex(obj_id) short_id = hex_id[:7] # TODO: instead of hardcoding this, we should probably: # * add a "fetch_url" field in the vault_notif_email table # * generate the url with flask.url_for() on the web-ui side # * send this url as part of the cook request and store it in # the table # * use this url for the notification e-mail url = ('https://archive.softwareheritage.org/api/1/vault/{}/{}/' 'raw'.format(obj_type, hex_id)) if status == 'done': text = NOTIF_EMAIL_BODY_SUCCESS.strip() text = text.format(obj_type=obj_type, hex_id=hex_id, url=url) msg = MIMEText(text) msg['Subject'] = (NOTIF_EMAIL_SUBJECT_SUCCESS .format(obj_type=obj_type, short_id=short_id)) elif status == 'failed': text = NOTIF_EMAIL_BODY_FAILURE.strip() text = text.format(obj_type=obj_type, hex_id=hex_id, progress_msg=progress_msg) msg = MIMEText(text) msg['Subject'] = (NOTIF_EMAIL_SUBJECT_FAILURE .format(obj_type=obj_type, short_id=short_id)) else: raise RuntimeError("send_notification called on a '{}' bundle" .format(status)) msg['From'] = NOTIF_EMAIL_FROM msg['To'] = email self._smtp_send(msg) if n_id is not None: cursor.execute(''' DELETE FROM vault_notif_email WHERE id = %s''', (n_id,)) def _smtp_send(self, msg): # Reconnect if needed try: status = self.smtp_server.noop()[0] except smtplib.SMTPException: status = -1 if status != 250: self.smtp_server.connect('localhost', 25) # Send the message self.smtp_server.send_message(msg) @autocommit def _cache_expire(self, cond, *args, cursor=None): """Low-level expiration method, used by cache_expire_* methods""" # Embedded SELECT query to be able to use ORDER BY and LIMIT cursor.execute(''' DELETE FROM vault_bundle WHERE ctid IN ( SELECT ctid FROM vault_bundle WHERE sticky = false {} ) RETURNING type, object_id '''.format(cond), args) for d in cursor: self.cache.delete(d['type'], bytes(d['object_id'])) @autocommit def cache_expire_oldest(self, n=1, by='last_access', cursor=None): """Expire the `n` oldest bundles""" assert by in ('created', 'done', 'last_access') filter = '''ORDER BY ts_{} LIMIT {}'''.format(by, n) return self._cache_expire(filter) @autocommit def cache_expire_until(self, date, by='last_access', cursor=None): """Expire all the bundles until a certain date""" assert by in ('created', 'done', 'last_access') filter = '''AND ts_{} <= %s'''.format(by) return self._cache_expire(filter, date) diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py index a40333e..bc9f7aa 100644 --- a/swh/vault/tests/test_cookers.py +++ b/swh/vault/tests/test_cookers.py @@ -1,514 +1,518 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import datetime import gzip import io import os import pathlib import subprocess import tarfile import tempfile import unittest import unittest.mock import dulwich.fastexport import dulwich.index import dulwich.objects import dulwich.porcelain import dulwich.repo from swh.core.tests.db_testing import DbTestFixture from swh.loader.git.loader import GitLoader from swh.model import hashutil from swh.model.from_disk import Directory from swh.storage.tests.storage_testing import StorageTestFixture from swh.vault.cookers import DirectoryCooker, RevisionGitfastCooker from swh.vault.tests.vault_testing import VaultTestFixture, hash_content from swh.vault.to_disk import SKIPPED_MESSAGE, HIDDEN_MESSAGE class TestRepo: """A tiny context manager for a test git repository, with some utility functions to perform basic git stuff. """ def __enter__(self): self.tmp_dir = tempfile.TemporaryDirectory(prefix='tmp-vault-repo-') self.repo_dir = self.tmp_dir.__enter__() self.repo = dulwich.repo.Repo.init(self.repo_dir) self.author_name = b'Test Author' self.author_email = b'test@softwareheritage.org' self.author = b'%s <%s>' % (self.author_name, self.author_email) self.base_date = 258244200 self.counter = 0 return pathlib.Path(self.repo_dir) def __exit__(self, exc, value, tb): self.tmp_dir.__exit__(exc, value, tb) def checkout(self, rev_sha): rev = self.repo[rev_sha] dulwich.index.build_index_from_tree(self.repo_dir, self.repo.index_path(), self.repo.object_store, rev.tree) def git_shell(self, *cmd, stdout=subprocess.DEVNULL, **kwargs): name = self.author_name email = self.author_email date = '%d +0000' % (self.base_date + self.counter) env = { # Set git commit format 'GIT_AUTHOR_NAME': name, 'GIT_AUTHOR_EMAIL': email, 'GIT_AUTHOR_DATE': date, 'GIT_COMMITTER_NAME': name, 'GIT_COMMITTER_EMAIL': email, 'GIT_COMMITTER_DATE': date, # Ignore all the system-wide and user configurations 'GIT_CONFIG_NOSYSTEM': '1', 'HOME': str(self.tmp_dir), 'XDG_CONFIG_HOME': str(self.tmp_dir), } kwargs.setdefault('env', {}).update(env) subprocess.check_call(('git', '-C', self.repo_dir) + cmd, stdout=stdout, **kwargs) def commit(self, message='Commit test\n', ref=b'HEAD'): self.git_shell('add', '.') message = message.encode() + b'\n' ret = self.repo.do_commit( message=message, committer=self.author, commit_timestamp=self.base_date + self.counter, commit_timezone=0, ref=ref) self.counter += 1 return ret def merge(self, parent_sha_list, message='Merge branches.'): self.git_shell('merge', '--allow-unrelated-histories', '-m', message, *[p.decode() for p in parent_sha_list]) self.counter += 1 return self.repo.refs[b'HEAD'] def print_debug_graph(self, reflog=False): args = ['log', '--all', '--graph', '--decorate'] if reflog: args.append('--reflog') self.git_shell(*args, stdout=None) class BaseTestCookers(VaultTestFixture, StorageTestFixture, DbTestFixture): """Base class of cookers unit tests""" def setUp(self): super().setUp() self.loader = GitLoader() self.loader.storage = self.storage + def tearDown(self): + self.loader = None + super().tearDown() + def load(self, repo_path): """Load a repository in the test storage""" self.loader.load('fake_origin', repo_path, datetime.datetime.now()) @contextlib.contextmanager def cook_extract_directory(self, obj_id): """Context manager that cooks a directory and extract it.""" cooker = DirectoryCooker('directory', obj_id) cooker.storage = self.storage cooker.backend = unittest.mock.MagicMock() cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) with tempfile.TemporaryDirectory(prefix='tmp-vault-extract-') as td: with tarfile.open(fileobj=cooker.fileobj, mode='r') as tar: tar.extractall(td) yield pathlib.Path(td) / hashutil.hash_to_hex(obj_id) + cooker.storage = None @contextlib.contextmanager def cook_stream_revision_gitfast(self, obj_id): """Context manager that cooks a revision and stream its fastexport.""" cooker = RevisionGitfastCooker('revision_gitfast', obj_id) cooker.storage = self.storage cooker.backend = unittest.mock.MagicMock() cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) fastexport_stream = gzip.GzipFile(fileobj=cooker.fileobj) yield fastexport_stream + cooker.storage = None @contextlib.contextmanager def cook_extract_revision_gitfast(self, obj_id): """Context manager that cooks a revision and extract it.""" test_repo = TestRepo() with self.cook_stream_revision_gitfast(obj_id) as stream, \ test_repo as p: processor = dulwich.fastexport.GitImportProcessor(test_repo.repo) processor.import_stream(stream) yield test_repo, p TEST_CONTENT = (" test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces ") TEST_EXECUTABLE = b'\x42\x40\x00\x00\x05' class TestDirectoryCooker(BaseTestCookers, unittest.TestCase): def test_directory_simple(self): repo = TestRepo() with repo as rp: (rp / 'file').write_text(TEST_CONTENT) (rp / 'executable').write_bytes(TEST_EXECUTABLE) (rp / 'executable').chmod(0o755) (rp / 'link').symlink_to('file') (rp / 'dir1/dir2').mkdir(parents=True) (rp / 'dir1/dir2/file').write_text(TEST_CONTENT) c = repo.commit() self.load(str(rp)) obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) with self.cook_extract_directory(obj_id) as p: self.assertEqual((p / 'file').stat().st_mode, 0o100644) self.assertEqual((p / 'file').read_text(), TEST_CONTENT) self.assertEqual((p / 'executable').stat().st_mode, 0o100755) self.assertEqual((p / 'executable').read_bytes(), TEST_EXECUTABLE) self.assertTrue((p / 'link').is_symlink) self.assertEqual(os.readlink(str(p / 'link')), 'file') self.assertEqual((p / 'dir1/dir2/file').stat().st_mode, 0o100644) self.assertEqual((p / 'dir1/dir2/file').read_text(), TEST_CONTENT) directory = Directory.from_disk(path=bytes(p)) self.assertEqual(obj_id_hex, hashutil.hash_to_hex(directory.hash)) def test_directory_filtered_objects(self): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b'test1') file_2, id_2 = hash_content(b'test2') file_3, id_3 = hash_content(b'test3') (rp / 'file').write_bytes(file_1) (rp / 'hidden_file').write_bytes(file_2) (rp / 'absent_file').write_bytes(file_3) c = repo.commit() self.load(str(rp)) obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) # FIXME: storage.content_update() should be changed to allow things # like that - cur = self.storage.db._cursor(None) - cur.execute("""update content set status = 'visible' - where sha1 = %s""", (id_1,)) - cur.execute("""update content set status = 'hidden' - where sha1 = %s""", (id_2,)) - cur.execute("""update content set status = 'absent' - where sha1 = %s""", (id_3,)) - cur.close() + with self.storage.get_db().transaction() as cur: + cur.execute("""update content set status = 'visible' + where sha1 = %s""", (id_1,)) + cur.execute("""update content set status = 'hidden' + where sha1 = %s""", (id_2,)) + cur.execute("""update content set status = 'absent' + where sha1 = %s""", (id_3,)) with self.cook_extract_directory(obj_id) as p: self.assertEqual((p / 'file').read_bytes(), b'test1') self.assertEqual((p / 'hidden_file').read_bytes(), HIDDEN_MESSAGE) self.assertEqual((p / 'absent_file').read_bytes(), SKIPPED_MESSAGE) def test_directory_bogus_perms(self): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the directory # cooker. repo = TestRepo() with repo as rp: (rp / 'file').write_text(TEST_CONTENT) (rp / 'file').chmod(0o664) (rp / 'executable').write_bytes(TEST_EXECUTABLE) (rp / 'executable').chmod(0o775) (rp / 'wat').write_text(TEST_CONTENT) (rp / 'wat').chmod(0o604) c = repo.commit() self.load(str(rp)) obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) with self.cook_extract_directory(obj_id) as p: self.assertEqual((p / 'file').stat().st_mode, 0o100644) self.assertEqual((p / 'executable').stat().st_mode, 0o100755) self.assertEqual((p / 'wat').stat().st_mode, 0o100644) def test_directory_revision_data(self): target_rev = '0e8a3ad980ec179856012b7eecf4327e99cd44cd' d = hashutil.hash_to_bytes('17a3e48bce37be5226490e750202ad3a9a1a3fe9') dir = { 'id': d, 'entries': [ { 'name': b'submodule', 'type': 'rev', 'target': hashutil.hash_to_bytes(target_rev), 'perms': 0o100644, } ], } self.storage.directory_add([dir]) with self.cook_extract_directory(d) as p: self.assertTrue((p / 'submodule').is_symlink()) self.assertEqual(os.readlink(str(p / 'submodule')), target_rev) class TestRevisionGitfastCooker(BaseTestCookers, unittest.TestCase): def test_revision_simple(self): # # 1--2--3--4--5--6--7 # repo = TestRepo() with repo as rp: (rp / 'file1').write_text(TEST_CONTENT) repo.commit('add file1') (rp / 'file2').write_text(TEST_CONTENT) repo.commit('add file2') (rp / 'dir1/dir2').mkdir(parents=True) (rp / 'dir1/dir2/file').write_text(TEST_CONTENT) repo.commit('add dir1/dir2/file') (rp / 'bin1').write_bytes(TEST_EXECUTABLE) (rp / 'bin1').chmod(0o755) repo.commit('add bin1') (rp / 'link1').symlink_to('file1') repo.commit('link link1 to file1') (rp / 'file2').unlink() repo.commit('remove file2') (rp / 'bin1').rename(rp / 'bin') repo.commit('rename bin1 to bin') self.load(str(rp)) obj_id_hex = repo.repo.refs[b'HEAD'].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) with self.cook_extract_revision_gitfast(obj_id) as (ert, p): ert.checkout(b'HEAD') self.assertEqual((p / 'file1').stat().st_mode, 0o100644) self.assertEqual((p / 'file1').read_text(), TEST_CONTENT) self.assertTrue((p / 'link1').is_symlink) self.assertEqual(os.readlink(str(p / 'link1')), 'file1') self.assertEqual((p / 'bin').stat().st_mode, 0o100755) self.assertEqual((p / 'bin').read_bytes(), TEST_EXECUTABLE) self.assertEqual((p / 'dir1/dir2/file').read_text(), TEST_CONTENT) self.assertEqual((p / 'dir1/dir2/file').stat().st_mode, 0o100644) self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex) def test_revision_two_roots(self): # # 1----3---4 # / # 2---- # repo = TestRepo() with repo as rp: (rp / 'file1').write_text(TEST_CONTENT) c1 = repo.commit('Add file1') del repo.repo.refs[b'refs/heads/master'] # git update-ref -d HEAD (rp / 'file2').write_text(TEST_CONTENT) repo.commit('Add file2') repo.merge([c1]) (rp / 'file3').write_text(TEST_CONTENT) repo.commit('add file3') obj_id_hex = repo.repo.refs[b'HEAD'].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) self.load(str(rp)) with self.cook_extract_revision_gitfast(obj_id) as (ert, p): self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex) def test_revision_two_double_fork_merge(self): # # 2---4---6 # / / / # 1---3---5 # repo = TestRepo() with repo as rp: (rp / 'file1').write_text(TEST_CONTENT) c1 = repo.commit('Add file1') repo.repo.refs[b'refs/heads/c1'] = c1 (rp / 'file2').write_text(TEST_CONTENT) repo.commit('Add file2') (rp / 'file3').write_text(TEST_CONTENT) c3 = repo.commit('Add file3', ref=b'refs/heads/c1') repo.repo.refs[b'refs/heads/c3'] = c3 repo.merge([c3]) (rp / 'file5').write_text(TEST_CONTENT) c5 = repo.commit('Add file3', ref=b'refs/heads/c3') repo.merge([c5]) obj_id_hex = repo.repo.refs[b'HEAD'].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) self.load(str(rp)) with self.cook_extract_revision_gitfast(obj_id) as (ert, p): self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex) def test_revision_triple_merge(self): # # .---.---5 # / / / # 2 3 4 # / / / # 1---.---. # repo = TestRepo() with repo as rp: (rp / 'file1').write_text(TEST_CONTENT) c1 = repo.commit('Commit 1') repo.repo.refs[b'refs/heads/b1'] = c1 repo.repo.refs[b'refs/heads/b2'] = c1 repo.commit('Commit 2') c3 = repo.commit('Commit 3', ref=b'refs/heads/b1') c4 = repo.commit('Commit 4', ref=b'refs/heads/b2') repo.merge([c3, c4]) obj_id_hex = repo.repo.refs[b'HEAD'].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) self.load(str(rp)) with self.cook_extract_revision_gitfast(obj_id) as (ert, p): self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex) def test_revision_filtered_objects(self): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b'test1') file_2, id_2 = hash_content(b'test2') file_3, id_3 = hash_content(b'test3') (rp / 'file').write_bytes(file_1) (rp / 'hidden_file').write_bytes(file_2) (rp / 'absent_file').write_bytes(file_3) repo.commit() obj_id_hex = repo.repo.refs[b'HEAD'].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) self.load(str(rp)) # FIXME: storage.content_update() should be changed to allow things # like that - cur = self.storage.db._cursor(None) - cur.execute("""update content set status = 'visible' - where sha1 = %s""", (id_1,)) - cur.execute("""update content set status = 'hidden' - where sha1 = %s""", (id_2,)) - cur.execute("""update content set status = 'absent' - where sha1 = %s""", (id_3,)) - cur.close() + with self.storage.get_db().transaction() as cur: + cur.execute("""update content set status = 'visible' + where sha1 = %s""", (id_1,)) + cur.execute("""update content set status = 'hidden' + where sha1 = %s""", (id_2,)) + cur.execute("""update content set status = 'absent' + where sha1 = %s""", (id_3,)) with self.cook_extract_revision_gitfast(obj_id) as (ert, p): ert.checkout(b'HEAD') self.assertEqual((p / 'file').read_bytes(), b'test1') self.assertEqual((p / 'hidden_file').read_bytes(), HIDDEN_MESSAGE) self.assertEqual((p / 'absent_file').read_bytes(), SKIPPED_MESSAGE) def test_revision_bogus_perms(self): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the revision # cooker. repo = TestRepo() with repo as rp: (rp / 'file').write_text(TEST_CONTENT) (rp / 'file').chmod(0o664) (rp / 'executable').write_bytes(TEST_EXECUTABLE) (rp / 'executable').chmod(0o775) (rp / 'wat').write_text(TEST_CONTENT) (rp / 'wat').chmod(0o604) repo.commit('initial commit') self.load(str(rp)) obj_id_hex = repo.repo.refs[b'HEAD'].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) with self.cook_extract_revision_gitfast(obj_id) as (ert, p): ert.checkout(b'HEAD') self.assertEqual((p / 'file').stat().st_mode, 0o100644) self.assertEqual((p / 'executable').stat().st_mode, 0o100755) self.assertEqual((p / 'wat').stat().st_mode, 0o100644) def test_revision_null_fields(self): # Our schema doesn't enforce a lot of non-null revision fields. We need # to check these cases don't break the cooker. repo = TestRepo() with repo as rp: (rp / 'file').write_text(TEST_CONTENT) c = repo.commit('initial commit') self.load(str(rp)) repo.repo.refs[b'HEAD'].decode() dir_id_hex = repo.repo[c].tree.decode() dir_id = hashutil.hash_to_bytes(dir_id_hex) test_id = b'56789012345678901234' test_revision = { 'id': test_id, 'message': None, 'author': {'name': None, 'email': None, 'fullname': ''}, 'date': None, 'committer': {'name': None, 'email': None, 'fullname': ''}, 'committer_date': None, 'parents': [], 'type': 'git', 'directory': dir_id, 'metadata': {}, 'synthetic': True } self.storage.revision_add([test_revision]) with self.cook_extract_revision_gitfast(test_id) as (ert, p): ert.checkout(b'HEAD') self.assertEqual((p / 'file').stat().st_mode, 0o100644) def test_revision_revision_data(self): target_rev = '0e8a3ad980ec179856012b7eecf4327e99cd44cd' d = hashutil.hash_to_bytes('17a3e48bce37be5226490e750202ad3a9a1a3fe9') r = hashutil.hash_to_bytes('1ecc9270c4fc61cfddbc65a774e91ef5c425a6f0') dir = { 'id': d, 'entries': [ { 'name': b'submodule', 'type': 'rev', 'target': hashutil.hash_to_bytes(target_rev), 'perms': 0o100644, } ], } self.storage.directory_add([dir]) rev = { 'id': r, 'message': None, 'author': {'name': None, 'email': None, 'fullname': ''}, 'date': None, 'committer': {'name': None, 'email': None, 'fullname': ''}, 'committer_date': None, 'parents': [], 'type': 'git', 'directory': d, 'metadata': {}, 'synthetic': True } self.storage.revision_add([rev]) with self.cook_stream_revision_gitfast(r) as stream: pattern = 'M 160000 {} submodule'.format(target_rev).encode() self.assertIn(pattern, stream.read()) diff --git a/version.txt b/version.txt index 8bd003c..665c1e7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.19-0-ga492ff4 \ No newline at end of file +v0.0.20-0-g74a4f98 \ No newline at end of file