Page MenuHomeSoftware Heritage

D2662.diff
No OneTemporary

D2662.diff

diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -17,6 +17,7 @@
Revision, Release, Directory, DirectoryEntry, Content, SkippedContent,
OriginVisit, Snapshot, Origin
)
+from swh.model.hashutil import DEFAULT_ALGORITHMS
from swh.storage.objstorage import ObjStorage
from swh.storage.writer import JournalWriter
@@ -257,7 +258,7 @@
def skipped_content_missing(self, contents):
for content in contents:
if not self._cql_runner.skipped_content_get_from_pk(content):
- yield content
+ yield {algo: content[algo] for algo in DEFAULT_ALGORITHMS}
def directory_add(self, directories: Iterable[Directory]) -> Dict:
directories = list(directories)
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -41,15 +41,15 @@
class InMemoryStorage:
def __init__(self, journal_writer=None):
- self._contents = {}
- self._content_indexes = defaultdict(lambda: defaultdict(set))
- self._skipped_contents = {}
- self._skipped_content_indexes = defaultdict(lambda: defaultdict(set))
self.reset()
self.journal_writer = JournalWriter(journal_writer)
def reset(self):
+ self._contents = {}
+ self._content_indexes = defaultdict(lambda: defaultdict(set))
+ self._skipped_contents = {}
+ self._skipped_content_indexes = defaultdict(lambda: defaultdict(set))
self._directories = {}
self._revisions = {}
self._releases = {}
@@ -251,45 +251,49 @@
def content_get_random(self):
return random.choice(list(self._content_indexes['sha1_git']))
- def _skipped_content_add(self, contents: Iterable[SkippedContent]) -> Dict:
+ def _skipped_content_add(self, contents: List[SkippedContent]) -> Dict:
self.journal_writer.skipped_content_add(contents)
summary = {
'skipped_content:add': 0
}
- skipped_content_missing = self.skipped_content_missing(
+ missing_contents = self.skipped_content_missing(
[c.to_dict() for c in contents])
- for content in skipped_content_missing:
- key = self._content_key(content, allow_missing=True)
+ missing = {self._content_key(c) for c in missing_contents}
+ contents = [c for c in contents
+ if self._content_key(c) in missing]
+ for content in contents:
+ key = self._content_key(content)
for algo in DEFAULT_ALGORITHMS:
- if content.get(algo):
+ if content.get_hash(algo):
self._skipped_content_indexes[algo][
- content.get(algo)].add(key)
+ content.get_hash(algo)].add(key)
self._skipped_contents[key] = content
summary['skipped_content:add'] += 1
return summary
- def skipped_content_missing(self, contents):
- for content in contents:
- for (key, algorithm) in self._content_key_algorithm(content):
- if algorithm == 'blake2s256':
- continue
- if key not in self._skipped_content_indexes[algorithm]:
- # index must contain hashes of algos except blake2s256
- # else the content is considered skipped
- yield {algo: content[algo]
- for algo in DEFAULT_ALGORITHMS
- if content[algo] is not None}
- break
-
def skipped_content_add(self, content: Iterable[SkippedContent]) -> Dict:
- content = list(content)
now = datetime.datetime.now(tz=datetime.timezone.utc)
content = [attr.evolve(c, ctime=now) for c in content]
return self._skipped_content_add(content)
+ def skipped_content_missing(self, contents):
+ for content in contents:
+ matches = list(self._skipped_contents.values())
+ for (algorithm, key) in self._content_key(content):
+ if algorithm == 'blake2s256':
+ continue
+ # Filter out skipped contents with the same hash
+ matches = [
+ match for match in matches
+ if match.get_hash(algorithm) == key]
+ # if none of the contents match
+ if not matches:
+ yield {algo: content[algo]
+ for algo in DEFAULT_ALGORITHMS}
+
def directory_add(self, directories: Iterable[Directory]) -> Dict:
directories = [dir_ for dir_ in directories
if dir_.id not in self._directories]
@@ -971,17 +975,11 @@
return person
@staticmethod
- def _content_key(content, allow_missing=False):
- """A stable key for a content"""
- return tuple(getattr(content, key, None)
- for key in sorted(DEFAULT_ALGORITHMS))
-
- @staticmethod
- def _content_key_algorithm(content):
+ def _content_key(content):
""" A stable key and the algorithm for a content"""
if isinstance(content, BaseContent):
content = content.to_dict()
- return tuple((content.get(key), key)
+ return tuple((key, content.get(key))
for key in sorted(DEFAULT_ALGORITHMS))
@staticmethod
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -339,6 +339,37 @@
missing = list(swh_storage.skipped_content_missing([cont, cont2]))
+ assert missing == [
+ {
+ 'sha1': cont['sha1'],
+ 'sha1_git': cont['sha1_git'],
+ 'blake2s256': cont['blake2s256'],
+ 'sha256': cont['sha256']
+ },
+ {
+ 'sha1': cont2['sha1'],
+ 'sha1_git': cont2['sha1_git'],
+ 'blake2s256': cont2['blake2s256'],
+ 'sha256': cont2['sha256']
+ },
+ ]
+
+ actual_result = swh_storage.skipped_content_add([cont, cont, cont2])
+
+ assert 2 <= actual_result.pop('skipped_content:add') <= 3
+ assert actual_result == {}
+
+ missing = list(swh_storage.skipped_content_missing([cont, cont2]))
+
+ assert missing == []
+
+ def test_skipped_content_add_missing_hashes(self, swh_storage):
+ cont = data.skipped_cont
+ cont2 = data.skipped_cont2
+ cont['sha1_git'] = cont2['sha1_git'] = None
+
+ missing = list(swh_storage.skipped_content_missing([cont, cont2]))
+
assert len(missing) == 2
actual_result = swh_storage.skipped_content_add([cont, cont, cont2])
@@ -350,6 +381,29 @@
assert missing == []
+ def test_skipped_content_missing_partial_hash(self, swh_storage):
+ cont = data.skipped_cont
+ cont2 = cont.copy()
+ cont2['sha1_git'] = None
+
+ missing = list(swh_storage.skipped_content_missing([cont, cont2]))
+
+ assert len(missing) == 2
+
+ actual_result = swh_storage.skipped_content_add([cont])
+
+ assert actual_result.pop('skipped_content:add') == 1
+ assert actual_result == {}
+
+ missing = list(swh_storage.skipped_content_missing([cont, cont2]))
+
+ assert missing == [{
+ 'sha1': cont2['sha1'],
+ 'sha1_git': cont2['sha1_git'],
+ 'blake2s256': cont2['blake2s256'],
+ 'sha256': cont2['sha256']
+ }]
+
@pytest.mark.property_based
@settings(deadline=None) # this test is very slow
@given(strategies.sets(

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 9:59 AM (19 h, 42 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221865

Event Timeline