Page MenuHomeSoftware Heritage

D2084.id7018.diff
No OneTemporary

D2084.id7018.diff

diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -1,6 +1,76 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
from hypothesis import settings
+from typing import Dict
+
# define tests profile. Full documentation is at:
# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
settings.register_profile("fast", max_examples=5, deadline=5000)
settings.register_profile("slow", max_examples=20, deadline=5000)
+
+
+@pytest.fixture
+def sample_data() -> Dict:
+ """Pre-defined sample storage object data to manipulate
+
+ Returns:
+ Dict of data (keys: content, directory, revision, person)
+
+ """
+ sample_content = {
+ 'blake2s256': b'\xbf?\x05\xed\xc1U\xd2\xc5\x168Xm\x93\xde}f(HO@\xd0\xacn\x04\x1e\x9a\xb9\xfa\xbf\xcc\x08\xc7', # noqa
+ 'sha1': b'g\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
+ 'sha1_git': b'\xf2\xae\xfa\xba\xfa\xa6B\x9b^\xf9Z\xf5\x14\x0cna\xb0\xef\x8b', # noqa
+ 'sha256': b"\x87\x022\xedZN\x84\xe8za\xf8'(oA\xc9k\xb1\x80c\x80\xe7J\x06\xea\xd2\xd5\xbeB\x19\xb8\xce", # noqa
+ 'length': 48,
+ 'data': b'temp file for testing content storage conversion',
+ 'status': 'visible',
+ }
+
+ sample_content2 = {
+ 'blake2s256': b'\xbf?\x05\xed\xc1U\xd2\xc5\x168Xm\x93\xde}f(HO@\xd0\xacn\x04\x1e\x9a\xb9\xfa\xbf\xcc\x08\xc7', # noqa
+ 'sha1': b'f\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
+ 'sha1_git': b'\xc2\xae\xfa\xba\xfa\xa6B\x9b^\xf9Z\xf5\x14\x0cna\xb0\xef\x8b', # noqa
+ 'sha256': b"\x77\x022\xedZN\x84\xe8za\xf8'(oA\xc9k\xb1\x80c\x80\xe7J\x06\xea\xd2\xd5\xbeB\x19\xb8\xce", # noqa
+ 'length': 50,
+ 'data': b'temp file for testing content storage conversion 2',
+ 'status': 'visible',
+ }
+
+ sample_directory = {
+ 'id': b'f\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
+ 'entries': []
+ }
+
+ sample_person = {
+ 'name': b'John Doe',
+ 'email': b'john.doe@institute.org',
+ 'fullname': b'John Doe <john.doe@institute.org>'
+ }
+
+ sample_revision = {
+ 'id': b'f\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
+ 'message': b'something',
+ 'author': sample_person,
+ 'committer': sample_person,
+ 'date': 1567591673,
+ 'committer_date': 1567591673,
+ 'type': 'tar',
+ 'directory': b'\xc2\xae\xfa\xba\xfa\xa6B\x9b^\xf9Z\xf5\x14\x0cna\xb0\xef\x8b', # noqa
+ 'synthetic': False,
+ 'metadata': {},
+ 'parents': [],
+ }
+
+ return {
+ 'content': [sample_content, sample_content2],
+ 'person': [sample_person],
+ 'directory': [sample_directory],
+ 'revision': [sample_revision],
+ }
diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py
--- a/swh/storage/__init__.py
+++ b/swh/storage/__init__.py
@@ -13,7 +13,7 @@
pass
-STORAGE_IMPLEMENTATION = {'local', 'remote', 'memory'}
+STORAGE_IMPLEMENTATION = {'local', 'remote', 'memory', 'filter'}
def get_storage(cls, args):
@@ -22,7 +22,7 @@
Args:
storage (dict): dictionary with keys:
- - cls (str): storage's class, either local, remote, memory
+ - cls (str): storage's class, either local, remote, memory, filter
- args (dict): dictionary with keys
Returns:
@@ -42,5 +42,7 @@
from .storage import Storage
elif cls == 'memory':
from .in_memory import Storage
+ elif cls == 'filter':
+ from .filter import FilteringProxyStorage as Storage
return Storage(**args)
diff --git a/swh/storage/filter.py b/swh/storage/filter.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/filter.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Dict, Generator, Sequence, Set
+
+from swh.storage import get_storage
+
+
+class FilteringProxyStorage:
+ """Filtering Storage implementation. This is in charge of transparently
+ filtering out known objects prior to adding them to storage.
+
+ Sample configuration use case for filtering storage:
+
+ .. code-block: yaml
+
+ storage:
+ cls: filter
+ args:
+ storage:
+ cls: remote
+ args: http://storage.internal.staging.swh.network:5002/
+
+ """
+ def __init__(self, storage):
+ self.storage = get_storage(**storage)
+ self.objects_seen = {
+ 'content': set(), # set of content hashes (sha256) seen
+ 'directory': set(),
+ 'revision': set(),
+ }
+
+ def __getattr__(self, key):
+ return getattr(self.storage, key)
+
+ def content_add(self, content: Sequence[Dict]) -> Dict:
+ contents = list(content)
+ contents_to_add = self._filter_missing_contents(contents)
+ return self.storage.content_add(
+ x for x in contents if x['sha256'] in contents_to_add
+ )
+
+ def directory_add(self, directories: Sequence[Dict]) -> Dict:
+ directories = list(directories)
+ missing_ids = self._filter_missing_ids(
+ 'directory',
+ (d['id'] for d in directories)
+ )
+ return self.storage.directory_add(
+ d for d in directories if d['id'] in missing_ids
+ )
+
+ def revision_add(self, revisions):
+ revisions = list(revisions)
+ missing_ids = self._filter_missing_ids(
+ 'revision',
+ (d['id'] for d in revisions)
+ )
+ return self.storage.revision_add(
+ r for r in revisions if r['id'] in missing_ids
+ )
+
+ def _filter_missing_contents(
+ self, content_hashes: Sequence[Dict]) -> Set[bytes]:
+ """Return only the content keys missing from swh
+
+ Args:
+ content_hashes: List of sha256 to check for existence in swh
+ storage
+
+ """
+ objects_seen = self.objects_seen['content']
+ missing_hashes = []
+ for hashes in content_hashes:
+ if hashes['sha256'] in objects_seen:
+ continue
+ objects_seen.add(hashes['sha256'])
+ missing_hashes.append(hashes)
+
+ return set(self.storage.content_missing(
+ missing_hashes,
+ key_hash='sha256',
+ ))
+
+ def _filter_missing_ids(
+ self,
+ object_type: str,
+ ids: Generator[bytes, None, None]) -> Set[bytes]:
+ """Filter missing ids from the storage for a given object type.
+
+ Args:
+ object_type: object type to use {revision, directory}
+ ids: Sequence of object_type ids
+
+ Returns:
+ Missing ids from the storage for object_type
+
+ """
+ objects_seen = self.objects_seen[object_type]
+ missing_ids = []
+ for id in ids:
+ if id in objects_seen:
+ continue
+ objects_seen.add(id)
+ missing_ids.append(id)
+
+ fn_by_object_type = {
+ 'revision': self.storage.revision_missing,
+ 'directory': self.storage.directory_missing,
+ }
+
+ fn = fn_by_object_type[object_type]
+ return set(fn(missing_ids))
diff --git a/swh/storage/tests/test_filter.py b/swh/storage/tests/test_filter.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_filter.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from swh.storage.filter import FilteringProxyStorage
+
+
+def test_filtering_proxy_storage_content(sample_data):
+ sample_content = sample_data['content'][0]
+ storage = FilteringProxyStorage(storage={'cls': 'memory', 'args': {}})
+
+ content = next(storage.content_get([sample_content['sha1']]))
+ assert not content
+
+ s = storage.content_add([sample_content])
+ assert s == {
+ 'content:add': 1,
+ 'content:add:bytes': 48,
+ 'skipped_content:add': 0
+ }
+
+ content = next(storage.content_get([sample_content['sha1']]))
+ assert content is not None
+
+ s = storage.content_add([sample_content])
+ assert s == {
+ 'content:add': 0,
+ 'content:add:bytes': 0,
+ 'skipped_content:add': 0
+ }
+
+
+def test_filtering_proxy_storage_revision(sample_data):
+ sample_revision = sample_data['revision'][0]
+ storage = FilteringProxyStorage(storage={'cls': 'memory', 'args': {}})
+
+ revision = next(storage.revision_get([sample_revision['id']]))
+ assert not revision
+
+ s = storage.revision_add([sample_revision])
+ assert s == {
+ 'revision:add': 1,
+ }
+
+ revision = next(storage.revision_get([sample_revision['id']]))
+ assert revision is not None
+
+ s = storage.revision_add([sample_revision])
+ assert s == {
+ 'revision:add': 0,
+ }
+
+
+def test_filtering_proxy_storage_directory(sample_data):
+ sample_directory = sample_data['directory'][0]
+ storage = FilteringProxyStorage(storage={'cls': 'memory', 'args': {}})
+
+ directory = next(storage.directory_missing([sample_directory['id']]))
+ assert directory
+
+ s = storage.directory_add([sample_directory])
+ assert s == {
+ 'directory:add': 1,
+ }
+
+ directory = list(storage.directory_missing([sample_directory['id']]))
+ assert not directory
+
+ s = storage.directory_add([sample_directory])
+ assert s == {
+ 'directory:add': 0,
+ }
diff --git a/swh/storage/tests/test_init.py b/swh/storage/tests/test_init.py
--- a/swh/storage/tests/test_init.py
+++ b/swh/storage/tests/test_init.py
@@ -12,6 +12,7 @@
from swh.storage.api.client import RemoteStorage
from swh.storage.storage import Storage as DbStorage
from swh.storage.in_memory import Storage as MemoryStorage
+from swh.storage.filter import FilteringProxyStorage
@patch('swh.storage.storage.psycopg2.pool')
@@ -28,6 +29,9 @@
'cls': 'memory', 'args': {},
},
}),
+ ('filter', FilteringProxyStorage, {'storage': {
+ 'cls': 'memory', 'args': {}}
+ })
]:
actual_storage = get_storage(cls, args=dummy_args)
assert actual_storage is not None

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 3:39 PM (11 w, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217603

Event Timeline