Page MenuHomeSoftware Heritage

D2626.diff
No OneTemporary

D2626.diff

diff --git a/swh/storage/buffer.py b/swh/storage/buffer.py
--- a/swh/storage/buffer.py
+++ b/swh/storage/buffer.py
@@ -28,6 +28,7 @@
min_batch_size:
content: 10000
content_bytes: 100000000
+ skipped_content: 10000
directory: 5000
revision: 1000
release: 10000
@@ -43,16 +44,18 @@
'content': min_batch_size.get('content', 10000),
'content_bytes': min_batch_size.get('content_bytes',
100*1024*1024),
+ 'skipped_content': min_batch_size.get('skipped_content', 10000),
'directory': min_batch_size.get('directory', 25000),
'revision': min_batch_size.get('revision', 100000),
'release': min_batch_size.get('release', 100000),
}
- self.object_types = ['content', 'directory', 'revision', 'release']
+ self.object_types = [
+ 'content', 'skipped_content', 'directory', 'revision', 'release']
self._objects = {k: deque() for k in self.object_types}
def __getattr__(self, key):
if key.endswith('_add'):
- object_type = key.split('_')[0]
+ object_type = key.rsplit('_', 1)[0]
if object_type in self.object_types:
return partial(
self.object_add, object_type=object_type
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -19,6 +19,7 @@
from swh.model.model import (
Sha1Git, TimestampWithTimezone, Timestamp, Person, Content,
+ SkippedContent,
)
from .common import Row, TOKEN_BEGIN, TOKEN_END, hash_url
@@ -167,6 +168,10 @@
'sha1', 'sha1_git', 'sha256', 'blake2s256', 'length',
'ctime', 'status']
+ @_prepared_insert_statement('content', _content_keys)
+ def content_add_one(self, content, *, statement) -> None:
+ self._add_one(statement, 'content', content, self._content_keys)
+
@_prepared_statement('SELECT * FROM content WHERE ' +
' AND '.join(map('%s = ?'.__mod__, HASH_ALGORITHMS)))
def content_get_from_pk(
@@ -203,10 +208,6 @@
self, ids: List[bytes], *, statement) -> List[bytes]:
return self._missing(statement, ids)
- @_prepared_insert_statement('content', _content_keys)
- def content_add_one(self, content, *, statement) -> None:
- self._add_one(statement, 'content', content, self._content_keys)
-
def content_index_add_one(self, main_algo: str, content: Content) -> None:
query = 'INSERT INTO content_by_{algo} ({cols}) VALUES ({values})' \
.format(algo=main_algo, cols=', '.join(self._content_pk),
@@ -221,6 +222,60 @@
algo=algo)
return list(self._execute_with_retries(query, [hash_]))
+ ##########################
+ # 'skipped_content' table
+ ##########################
+
+ _skipped_content_pk = ['sha1', 'sha1_git', 'sha256', 'blake2s256']
+ _skipped_content_keys = [
+ 'sha1', 'sha1_git', 'sha256', 'blake2s256', 'length',
+ 'ctime', 'status', 'reason', 'origin']
+ _magic_null_pk = b''
+ """
+ NULLs are not allowed in primary keys; instead use an empty
+ value
+ """
+
+ @_prepared_insert_statement('skipped_content', _skipped_content_keys)
+ def skipped_content_add_one(self, content, *, statement) -> None:
+ content = content.to_dict()
+ for key in self._skipped_content_pk:
+ if content[key] is None:
+ content[key] = self._magic_null_pk
+ content = SkippedContent.from_dict(content)
+ self._add_one(statement, 'skipped_content', content,
+ self._skipped_content_keys)
+
+ @_prepared_statement('SELECT * FROM skipped_content WHERE ' +
+ ' AND '.join(map('%s = ?'.__mod__, HASH_ALGORITHMS)))
+ def skipped_content_get_from_pk(
+ self, content_hashes: Dict[str, bytes], *, statement
+ ) -> Optional[Row]:
+ rows = list(self._execute_with_retries(
+ statement, [content_hashes[algo] or self._magic_null_pk
+ for algo in HASH_ALGORITHMS]))
+ assert len(rows) <= 1
+ if rows:
+ # TODO: convert _magic_null_pk back to None?
+ return rows[0]
+ else:
+ return None
+
+ ##########################
+ # 'skipped_content_by_*' tables
+ ##########################
+
+ def skipped_content_index_add_one(
+ self, main_algo: str, content: Content) -> None:
+ assert content.get_hash(main_algo) is not None
+ query = ('INSERT INTO skipped_content_by_{algo} ({cols}) '
+ 'VALUES ({values})').format(
+ algo=main_algo, cols=', '.join(self._content_pk),
+ values=', '.join('%s' for _ in self._content_pk))
+ self._execute_with_retries(
+ query, [content.get_hash(algo) or self._magic_null_pk
+ for algo in self._content_pk])
+
##########################
# 'revision' table
##########################
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -65,6 +65,20 @@
PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256))
);
+CREATE TABLE IF NOT EXISTS skipped_content (
+ sha1 blob,
+ sha1_git blob,
+ sha256 blob,
+ blake2s256 blob,
+ length bigint,
+ ctime timestamp,
+ -- creation time, i.e. time of (first) injection into the storage
+ status ascii,
+ reason text,
+ origin text,
+ PRIMARY KEY ((sha1, sha1_git, sha256, blake2s256))
+);
+
CREATE TABLE IF NOT EXISTS revision (
id blob PRIMARY KEY,
date microtimestamp_with_timezone,
@@ -184,19 +198,29 @@
sha256 blob,
blake2s256 blob,
PRIMARY KEY (({main_algo}), {other_algos})
-);'''
+);
+
+CREATE TABLE IF NOT EXISTS skipped_content_by_{main_algo} (
+ sha1 blob,
+ sha1_git blob,
+ sha256 blob,
+ blake2s256 blob,
+ PRIMARY KEY (({main_algo}), {other_algos})
+);
+'''
-TABLES = ('content revision revision_parent release directory '
- 'directory_entry snapshot snapshot_branch origin_visit '
- 'origin tool_by_uuid tool object_count').split()
+TABLES = ('skipped_content content revision revision_parent release '
+ 'directory directory_entry snapshot snapshot_branch '
+ 'origin_visit origin tool_by_uuid tool object_count').split()
HASH_ALGORITHMS = ['sha1', 'sha1_git', 'sha256', 'blake2s256']
for main_algo in HASH_ALGORITHMS:
- CREATE_TABLES_QUERIES.append(CONTENT_INDEX_TEMPLATE.format(
+ CREATE_TABLES_QUERIES.extend(CONTENT_INDEX_TEMPLATE.format(
main_algo=main_algo,
other_algos=', '.join(
[algo for algo in HASH_ALGORITHMS if algo != main_algo])
- ))
+ ).split('\n\n'))
TABLES.append('content_by_%s' % main_algo)
+ TABLES.append('skipped_content_by_%s' % main_algo)
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -14,7 +14,8 @@
import dateutil
from swh.model.model import (
- Revision, Release, Directory, DirectoryEntry, Content, OriginVisit,
+ Revision, Release, Directory, DirectoryEntry, Content, SkippedContent,
+ OriginVisit,
)
from swh.objstorage import get_objstorage
from swh.objstorage.exc import ObjNotFoundError
@@ -116,7 +117,6 @@
summary = {
'content:add': count_content_added,
- 'skipped_content:add': count_contents - count_content_added,
}
if with_data:
@@ -208,10 +208,6 @@
result[content_metadata['sha1']].append(content_metadata)
return result
- def skipped_content_missing(self, contents):
- # TODO
- raise NotImplementedError('not yet supported for Cassandra')
-
def content_find(self, content):
# Find an algorithm that is common to all the requested contents.
# It will be used to do an initial filtering efficiently.
@@ -263,6 +259,46 @@
def content_get_random(self):
return self._cql_runner.content_get_random().sha1_git
+ def _skipped_content_add(self, contents):
+ contents = [SkippedContent.from_dict(c) for c in contents]
+
+ # Filter-out content already in the database.
+ contents = [
+ c for c in contents
+ if not self._cql_runner.skipped_content_get_from_pk(c.to_dict())]
+
+ if self.journal_writer:
+ for content in contents:
+ content = content.to_dict()
+ if 'data' in content:
+ del content['data']
+ self.journal_writer.write_addition('content', content)
+
+ for content in contents:
+ # Add to index tables
+ for algo in HASH_ALGORITHMS:
+ if content.get_hash(algo) is not None:
+ self._cql_runner.skipped_content_index_add_one(
+ algo, content)
+
+ # Then to the main table
+ self._cql_runner.skipped_content_add_one(content)
+
+ return {
+ 'skipped_content:add': len(contents)
+ }
+
+ def skipped_content_add(self, content):
+ content = [c.copy() for c in content] # semi-shallow copy
+ for item in content:
+ item['ctime'] = now()
+ return self._skipped_content_add(content)
+
+ def skipped_content_missing(self, contents):
+ for content in contents:
+ if not self._cql_runner.skipped_content_get_from_pk(content):
+ yield content
+
def directory_add(self, directories):
directories = list(directories)
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -143,9 +143,9 @@
query = """SELECT * FROM (VALUES %s) AS t (%s)
WHERE not exists
(SELECT 1 FROM skipped_content s WHERE
- s.sha1 is not distinct from t.sha1 and
- s.sha1_git is not distinct from t.sha1_git and
- s.sha256 is not distinct from t.sha256);""" % \
+ s.sha1 is not distinct from t.sha1::sha1 and
+ s.sha1_git is not distinct from t.sha1_git::sha1 and
+ s.sha256 is not distinct from t.sha256::bytea);""" % \
((', '.join('%s' for _ in contents)),
', '.join(self.content_hash_keys))
cur.execute(query,
diff --git a/swh/storage/filter.py b/swh/storage/filter.py
--- a/swh/storage/filter.py
+++ b/swh/storage/filter.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
-from typing import Dict, Generator, Sequence, Set
+from typing import Dict, Generator, Iterable, Set
from swh.storage import get_storage
@@ -27,22 +27,30 @@
def __init__(self, storage):
self.storage = get_storage(**storage)
self.objects_seen = {
- 'content': set(), # set of content hashes (sha256) seen
- 'directory': set(),
- 'revision': set(),
+ 'content': set(), # sha256
+ 'skipped_content': set(), # sha1_git
+ 'directory': set(), # sha1_git
+ 'revision': set(), # sha1_git
}
def __getattr__(self, key):
return getattr(self.storage, key)
- def content_add(self, content: Sequence[Dict]) -> Dict:
+ def content_add(self, content: Iterable[Dict]) -> Dict:
contents = list(content)
contents_to_add = self._filter_missing_contents(contents)
return self.storage.content_add(
x for x in contents if x['sha256'] in contents_to_add
)
- def directory_add(self, directories: Sequence[Dict]) -> Dict:
+ def skipped_content_add(self, content: Iterable[Dict]) -> Dict:
+ contents = list(content)
+ contents_to_add = self._filter_missing_skipped_contents(contents)
+ return self.storage.skipped_content_add(
+ x for x in contents if x['sha1_git'] in contents_to_add
+ )
+
+ def directory_add(self, directories: Iterable[Dict]) -> Dict:
directories = list(directories)
missing_ids = self._filter_missing_ids(
'directory',
@@ -63,7 +71,7 @@
)
def _filter_missing_contents(
- self, content_hashes: Sequence[Dict]) -> Set[bytes]:
+ self, content_hashes: Iterable[Dict]) -> Set[bytes]:
"""Return only the content keys missing from swh
Args:
@@ -84,6 +92,26 @@
key_hash='sha256',
))
+ def _filter_missing_skipped_contents(
+ self, content_hashes: Iterable[Dict]) -> Set[bytes]:
+ """Return only the content keys missing from swh
+
+ Args:
+ content_hashes: List of sha1_git to check for existence in swh
+ storage
+
+ """
+ objects_seen = self.objects_seen['skipped_content']
+ missing_hashes = []
+ for hashes in content_hashes:
+ if hashes['sha1_git'] in objects_seen:
+ continue
+ objects_seen.add(hashes['sha1_git'])
+ missing_hashes.append(hashes)
+
+ return {c['sha1_git']
+ for c in self.storage.skipped_content_missing(missing_hashes)}
+
def _filter_missing_ids(
self,
object_type: str,
@@ -92,7 +120,7 @@
Args:
object_type: object type to use {revision, directory}
- ids: Sequence of object_type ids
+ ids: Iterable of object_type ids
Returns:
Missing ids from the storage for object_type
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -19,8 +19,8 @@
import attr
from swh.model.model import (
- Content, Directory, Revision, Release, Snapshot, OriginVisit, Origin,
- SHA1_SIZE)
+ BaseContent, Content, SkippedContent, Directory, Revision, Release,
+ Snapshot, OriginVisit, Origin, SHA1_SIZE)
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
from swh.objstorage import get_objstorage
from swh.objstorage.exc import ObjNotFoundError
@@ -75,47 +75,26 @@
return True
def _content_add(self, contents, with_data):
- content_with_data = []
- content_without_data = []
for content in contents:
if content.status is None:
content.status = 'visible'
+ if content.status == 'absent':
+ raise ValueError('content with status=absent')
if content.length is None:
- content.length = -1
- if content.status != 'absent':
- if self._content_key(content) not in self._contents:
- content_with_data.append(content)
- else:
- if self._content_key(content) not in self._skipped_contents:
- content_without_data.append(content)
+ raise ValueError('content with length=None')
if self.journal_writer:
- for content in content_with_data:
+ for content in contents:
content = attr.evolve(content, data=None)
self.journal_writer.write_addition('content', content)
- for content in content_without_data:
- self.journal_writer.write_addition('content', content)
-
- count_content_added, count_content_bytes_added = \
- self._content_add_present(content_with_data, with_data)
-
- count_skipped_content_added = self._content_add_absent(
- content_without_data
- )
summary = {
- 'content:add': count_content_added,
- 'skipped_content:add': count_skipped_content_added,
+ 'content:add': 0,
}
if with_data:
- summary['content:add:bytes'] = count_content_bytes_added
+ summary['content:add:bytes'] = 0
- return summary
-
- def _content_add_present(self, contents, with_data):
- count_content_added = 0
- count_content_bytes_added = 0
for content in contents:
key = self._content_key(content)
if key in self._contents:
@@ -133,40 +112,21 @@
('content', content.sha1))
self._contents[key] = content
bisect.insort(self._sorted_sha1s, content.sha1)
- count_content_added += 1
+ summary['content:add'] += 1
if with_data:
content_data = self._contents[key].data
self._contents[key] = attr.evolve(
self._contents[key],
data=None)
- count_content_bytes_added += len(content_data)
+ summary['content:add:bytes'] += len(content_data)
self.objstorage.add(content_data, content.sha1)
- return (count_content_added, count_content_bytes_added)
-
- def _content_add_absent(self, contents):
- count = 0
- skipped_content_missing = self.skipped_content_missing(contents)
- for content in skipped_content_missing:
- key = self._content_key(content)
- for algo in DEFAULT_ALGORITHMS:
- self._skipped_content_indexes[algo][content.get_hash(algo)] \
- .add(key)
- self._skipped_contents[key] = content
- count += 1
-
- return count
-
- def _content_to_model(self, contents):
- for content in contents:
- content = content.copy()
- content.pop('origin', None)
- yield Content.from_dict(content)
+ return summary
def content_add(self, content):
now = datetime.datetime.now(tz=datetime.timezone.utc)
- content = [attr.evolve(c, ctime=now)
- for c in self._content_to_model(content)]
+ content = [attr.evolve(Content.from_dict(c), ctime=now)
+ for c in content]
return self._content_add(content, with_data=True)
def content_update(self, content, keys=[]):
@@ -194,7 +154,7 @@
self._content_indexes[algorithm][hash_].add(new_key)
def content_add_metadata(self, content):
- content = list(self._content_to_model(content))
+ content = [Content.from_dict(c) for c in content]
return self._content_add(content, with_data=False)
def content_get(self, content):
@@ -308,6 +268,39 @@
if content not in self._content_indexes['sha1_git']:
yield content
+ def content_get_random(self):
+ return random.choice(list(self._content_indexes['sha1_git']))
+
+ def _skipped_content_add(self, contents):
+ for content in contents:
+ if content.status is None:
+ content = attr.evolve(content, status='absent')
+ if content.length is None:
+ content = attr.evolve(content, length=-1)
+ if content.status != 'absent':
+ raise ValueError(f'Content with status={content.status}')
+
+ if self.journal_writer:
+ for content in contents:
+ self.journal_writer.write_addition('content', content)
+
+ summary = {
+ 'skipped_content:add': 0
+ }
+
+ skipped_content_missing = self.skipped_content_missing(
+ [c.to_dict() for c in contents])
+ for content in skipped_content_missing:
+ key = self._content_key(content, allow_missing=True)
+ for algo in DEFAULT_ALGORITHMS:
+ if algo in content:
+ self._skipped_content_indexes[algo][content[algo]] \
+ .add(key)
+ self._skipped_contents[key] = content
+ summary['skipped_content:add'] += 1
+
+ return summary
+
def skipped_content_missing(self, contents):
for content in contents:
for (key, algorithm) in self._content_key_algorithm(content):
@@ -316,11 +309,17 @@
if key not in self._skipped_content_indexes[algorithm]:
# index must contain hashes of algos except blake2s256
# else the content is considered skipped
- yield content
+ yield {algo: content[algo]
+ for algo in DEFAULT_ALGORITHMS
+ if content[algo] is not None}
break
- def content_get_random(self):
- return random.choice(list(self._content_indexes['sha1_git']))
+ def skipped_content_add(self, content):
+ content = list(content)
+ now = datetime.datetime.now(tz=datetime.timezone.utc)
+ content = [attr.evolve(SkippedContent.from_dict(c), ctime=now)
+ for c in content]
+ return self._skipped_content_add(content)
def directory_add(self, directories):
directories = list(directories)
@@ -1021,15 +1020,15 @@
return person
@staticmethod
- def _content_key(content):
+ def _content_key(content, allow_missing=False):
"""A stable key for a content"""
- return tuple(getattr(content, key)
+ return tuple(getattr(content, key, None)
for key in sorted(DEFAULT_ALGORITHMS))
@staticmethod
def _content_key_algorithm(content):
""" A stable key and the algorithm for a content"""
- if isinstance(content, Content):
+ if isinstance(content, BaseContent):
content = content.to_dict()
return tuple((content.get(key), key)
for key in sorted(DEFAULT_ALGORITHMS))
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -29,14 +29,11 @@
following keys:
- data (bytes): the actual content
- - length (int): content length (default: -1)
+ - length (int): content length
- one key for each checksum algorithm in
:data:`swh.model.hashutil.ALGORITHMS`, mapped to the
corresponding checksum
- - status (str): one of visible, hidden, absent
- - reason (str): if status = absent, the reason why
- - origin (int): if status = absent, the origin we saw the
- content in
+ - status (str): one of visible, hidden
Raises:
@@ -50,11 +47,10 @@
Since additions to both idempotent, that should not be a problem.
Returns:
- Summary dict with the following key and associated values:
+ Summary dict with the following keys and associated values:
content:add: New contents added
content:add:bytes: Sum of the contents' length data
- skipped_content:add: New skipped contents (no data) added
"""
...
@@ -253,20 +249,6 @@
"""
...
- @remote_api_endpoint('content/skipped/missing')
- def skipped_content_missing(self, contents):
- """List skipped_content missing from storage
-
- Args:
- content: iterable of dictionaries containing the data for each
- checksum algorithm.
-
- Returns:
- iterable: missing signatures
-
- """
- ...
-
@remote_api_endpoint('content/present')
def content_find(self, content):
"""Find a content hash in db.
@@ -296,6 +278,57 @@
"""
...
+ @remote_api_endpoint('content/skipped/add')
+ def skipped_content_add(self, content):
+ """Add contents to the skipped_content list, which contains
+ (partial) information about content missing from the archive.
+
+ Args:
+ contents (iterable): iterable of dictionaries representing
+ individual pieces of content to add. Each dictionary has the
+ following keys:
+
+ - length (Optional[int]): content length (default: -1)
+ - one key for each checksum algorithm in
+ :data:`swh.model.hashutil.ALGORITHMS`, mapped to the
+ corresponding checksum; each is optional
+ - status (str): must be "absent"
+ - reason (str): the reason why the content is absent
+ - origin (int): if status = absent, the origin we saw the
+ content in
+
+ Raises:
+
+ The following exceptions can occur:
+
+ - HashCollision in case of collision
+ - Any other exceptions raised by the backend
+
+ In case of errors, some content may have been stored in
+ the DB and in the objstorage.
+ Since additions to both are idempotent, that should not be a problem.
+
+ Returns:
+ Summary dict with the following key and associated values:
+
+ skipped_content:add: New skipped contents (no data) added
+ """
+ ...
+
+ @remote_api_endpoint('content/skipped/missing')
+ def skipped_content_missing(self, contents):
+ """List skipped_content missing from storage
+
+ Args:
+ contents: iterable of dictionaries containing the data for each
+ checksum algorithm.
+
+ Returns:
+ iterable: missing signatures
+
+ """
+ ...
+
@remote_api_endpoint('directory/add')
def directory_add(self, directories):
"""Add directories to the storage
diff --git a/swh/storage/retry.py b/swh/storage/retry.py
--- a/swh/storage/retry.py
+++ b/swh/storage/retry.py
@@ -63,15 +63,20 @@
return getattr(self.storage, key)
@swh_retry
- def content_add(self, content: List[Dict]) -> Dict:
+ def content_add(self, content: Iterable[Dict]) -> Dict:
contents = list(content)
return self.storage.content_add(contents)
@swh_retry
- def content_add_metadata(self, content: List[Dict]) -> Dict:
+ def content_add_metadata(self, content: Iterable[Dict]) -> Dict:
contents = list(content)
return self.storage.content_add_metadata(contents)
+ @swh_retry
+ def skipped_content_add(self, content: Iterable[Dict]) -> Dict:
+ contents = list(content)
+ return self.storage.skipped_content_add(contents)
+
@swh_retry
def origin_add_one(self, origin: Dict) -> str:
return self.storage.origin_add_one(origin)
@@ -91,7 +96,7 @@
metadata=metadata, snapshot=snapshot)
@swh_retry
- def tool_add(self, tools: List[Dict]) -> List[Dict]:
+ def tool_add(self, tools: Iterable[Dict]) -> List[Dict]:
tools = list(tools)
return self.storage.tool_add(tools)
@@ -110,22 +115,22 @@
origin_url, ts, provider_id, tool_id, metadata)
@swh_retry
- def directory_add(self, directories: List[Dict]) -> Dict:
+ def directory_add(self, directories: Iterable[Dict]) -> Dict:
directories = list(directories)
return self.storage.directory_add(directories)
@swh_retry
- def revision_add(self, revisions: List[Dict]) -> Dict:
+ def revision_add(self, revisions: Iterable[Dict]) -> Dict:
revisions = list(revisions)
return self.storage.revision_add(revisions)
@swh_retry
- def release_add(self, releases: List[Dict]) -> Dict:
+ def release_add(self, releases: Iterable[Dict]) -> Dict:
releases = list(releases)
return self.storage.release_add(releases)
@swh_retry
- def snapshot_add(self, snapshot: List[Dict]) -> Dict:
+ def snapshot_add(self, snapshot: Iterable[Dict]) -> Dict:
snapshots = list(snapshot)
return self.storage.snapshot_add(snapshots)
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -128,106 +128,54 @@
return tuple([hash[k] for k in keys])
@staticmethod
- def _normalize_content(d):
+ def _content_normalize(d):
d = d.copy()
if 'status' not in d:
d['status'] = 'visible'
- if 'length' not in d:
- d['length'] = -1
-
return d
@staticmethod
- def _validate_content(d):
+ def _content_validate(d):
"""Sanity checks on status / reason / length, that postgresql
doesn't enforce."""
- if d['status'] not in ('visible', 'absent', 'hidden'):
+ if d['status'] not in ('visible', 'hidden'):
raise ValueError('Invalid content status: {}'.format(d['status']))
- if d['status'] != 'absent' and d.get('reason') is not None:
+ if d.get('reason') is not None:
raise ValueError(
- 'Must not provide a reason if content is not absent.')
+ 'Must not provide a reason if content is present.')
- if d['length'] < -1:
- raise ValueError('Content length must be positive or -1.')
+ if d['length'] is None or d['length'] < 0:
+ raise ValueError('Content length must be positive.')
- def _filter_new_content(self, content, db=None, cur=None):
- """Sort contents into buckets 'with data' and 'without data',
- and filter out those already in the database."""
- content_by_status = defaultdict(list)
- for d in content:
- content_by_status[d['status']].append(d)
-
- content_with_data = content_by_status['visible'] \
- + content_by_status['hidden']
- content_without_data = content_by_status['absent']
-
- missing_content = set(self.content_missing(content_with_data,
- db=db, cur=cur))
- missing_skipped = set(self._content_unique_key(hashes, db)
- for hashes in self.skipped_content_missing(
- content_without_data, db=db, cur=cur))
-
- content_with_data = [
- cont for cont in content_with_data
- if cont['sha1'] in missing_content]
- content_without_data = [
- cont for cont in content_without_data
- if self._content_unique_key(cont, db) in missing_skipped]
-
- summary = {
- 'content:add': len(missing_content),
- 'skipped_content:add': len(missing_skipped),
- }
-
- return (content_with_data, content_without_data, summary)
-
- def _content_add_metadata(self, db, cur,
- content_with_data, content_without_data):
+ def _content_add_metadata(self, db, cur, content):
"""Add content to the postgresql database but not the object storage.
"""
- if content_with_data:
- # create temporary table for metadata injection
- db.mktemp('content', cur)
+ # create temporary table for metadata injection
+ db.mktemp('content', cur)
- db.copy_to(content_with_data, 'tmp_content',
- db.content_add_keys, cur)
+ db.copy_to(content, 'tmp_content',
+ db.content_add_keys, cur)
- # move metadata in place
- try:
- db.content_add_from_temp(cur)
- except psycopg2.IntegrityError as e:
- from . import HashCollision
- if e.diag.sqlstate == '23505' and \
- e.diag.table_name == 'content':
- constraint_to_hash_name = {
- 'content_pkey': 'sha1',
- 'content_sha1_git_idx': 'sha1_git',
- 'content_sha256_idx': 'sha256',
- }
- colliding_hash_name = constraint_to_hash_name \
- .get(e.diag.constraint_name)
- raise HashCollision(colliding_hash_name) from None
- else:
- raise
-
- if content_without_data:
- content_without_data = \
- [cont.copy() for cont in content_without_data]
- origin_ids = db.origin_id_get_by_url(
- [cont.get('origin') for cont in content_without_data],
- cur=cur)
- for (cont, origin_id) in zip(content_without_data, origin_ids):
- if 'origin' in cont:
- cont['origin'] = origin_id
- db.mktemp('skipped_content', cur)
- db.copy_to(content_without_data, 'tmp_skipped_content',
- db.skipped_content_keys, cur)
-
- # move metadata in place
- db.skipped_content_add_from_temp(cur)
+ # move metadata in place
+ try:
+ db.content_add_from_temp(cur)
+ except psycopg2.IntegrityError as e:
+ from . import HashCollision
+ if e.diag.sqlstate == '23505' and \
+ e.diag.table_name == 'content':
+ constraint_to_hash_name = {
+ 'content_pkey': 'sha1',
+ 'content_sha1_git_idx': 'sha1_git',
+ 'content_sha256_idx': 'sha256',
+ }
+ colliding_hash_name = constraint_to_hash_name \
+ .get(e.diag.constraint_name)
+ raise HashCollision(colliding_hash_name) from None
+ else:
+ raise
@timed
@process_metrics
@@ -238,21 +186,19 @@
for item in content:
item['ctime'] = now
- content = [self._normalize_content(c) for c in content]
+ content = [self._content_normalize(c) for c in content]
for c in content:
- self._validate_content(c)
+ self._content_validate(c)
- (content_with_data, content_without_data, summary) = \
- self._filter_new_content(content, db, cur)
+ missing = list(self.content_missing(content, key_hash='sha1_git'))
+ content = [c for c in content if c['sha1_git'] in missing]
if self.journal_writer:
- for item in content_with_data:
+ for item in content:
if 'data' in item:
item = item.copy()
del item['data']
self.journal_writer.write_addition('content', item)
- for item in content_without_data:
- self.journal_writer.write_addition('content', item)
def add_to_objstorage():
"""Add to objstorage the new missing_content
@@ -264,7 +210,7 @@
"""
content_bytes_added = 0
data = {}
- for cont in content_with_data:
+ for cont in content:
if cont['sha1'] not in data:
data[cont['sha1']] = cont['data']
content_bytes_added += max(0, cont['length'])
@@ -278,15 +224,16 @@
with ThreadPoolExecutor(max_workers=1) as executor:
added_to_objstorage = executor.submit(add_to_objstorage)
- self._content_add_metadata(
- db, cur, content_with_data, content_without_data)
+ self._content_add_metadata(db, cur, content)
# Wait for objstorage addition before returning from the
# transaction, bubbling up any exception
content_bytes_added = added_to_objstorage.result()
- summary['content:add:bytes'] = content_bytes_added
- return summary
+ return {
+ 'content:add': len(content),
+ 'content:add:bytes': content_bytes_added,
+ }
@timed
@db_transaction()
@@ -308,23 +255,23 @@
@process_metrics
@db_transaction()
def content_add_metadata(self, content, db=None, cur=None):
- content = [self._normalize_content(c) for c in content]
+ content = [self._content_normalize(c) for c in content]
for c in content:
- self._validate_content(c)
+ self._content_validate(c)
- (content_with_data, content_without_data, summary) = \
- self._filter_new_content(content, db, cur)
+ missing = self.content_missing(content, key_hash='sha1_git')
+ content = [c for c in content if c['sha1_git'] in missing]
if self.journal_writer:
- for item in itertools.chain(content_with_data,
- content_without_data):
+ for item in itertools.chain(content):
assert 'data' not in content
self.journal_writer.write_addition('content', item)
- self._content_add_metadata(
- db, cur, content_with_data, content_without_data)
+ self._content_add_metadata(db, cur, content)
- return summary
+ return {
+ 'content:add': len(content),
+ }
@timed
def content_get(self, content):
@@ -423,12 +370,6 @@
for obj in db.content_missing_per_sha1_git(contents, cur):
yield obj[0]
- @timed
- @db_transaction_generator()
- def skipped_content_missing(self, contents, db=None, cur=None):
- for content in db.skipped_content_missing(contents, cur):
- yield dict(zip(db.content_hash_keys, content))
-
@timed
@db_transaction()
def content_find(self, content, db=None, cur=None):
@@ -449,6 +390,83 @@
def content_get_random(self, db=None, cur=None):
return db.content_get_random(cur)
+ @staticmethod
+ def _skipped_content_normalize(d):
+ d = d.copy()
+
+ if d.get('status') is None:
+ d['status'] = 'absent'
+
+ if d.get('length') is None:
+ d['length'] = -1
+
+ return d
+
+ @staticmethod
+ def _skipped_content_validate(d):
+ """Sanity checks on status / reason / length, that postgresql
+ doesn't enforce."""
+ if d['status'] != 'absent':
+ raise ValueError('Invalid content status: {}'.format(d['status']))
+
+ if d.get('reason') is None:
+ raise ValueError(
+ 'Must provide a reason if content is absent.')
+
+ if d['length'] < -1:
+ raise ValueError('Content length must be positive or -1.')
+
+ def _skipped_content_add_metadata(self, db, cur, content):
+ content = \
+ [cont.copy() for cont in content]
+ origin_ids = db.origin_id_get_by_url(
+ [cont.get('origin') for cont in content],
+ cur=cur)
+ for (cont, origin_id) in zip(content, origin_ids):
+ if 'origin' in cont:
+ cont['origin'] = origin_id
+ db.mktemp('skipped_content', cur)
+ db.copy_to(content, 'tmp_skipped_content',
+ db.skipped_content_keys, cur)
+
+ # move metadata in place
+ db.skipped_content_add_from_temp(cur)
+
+ @timed
+ @process_metrics
+ @db_transaction()
+ def skipped_content_add(self, content, db=None, cur=None):
+ content = [dict(c.items()) for c in content] # semi-shallow copy
+ now = datetime.datetime.now(tz=datetime.timezone.utc)
+ for item in content:
+ item['ctime'] = now
+
+ content = [self._skipped_content_normalize(c) for c in content]
+ for c in content:
+ self._skipped_content_validate(c)
+
+ missing_contents = self.skipped_content_missing(content)
+ content = [c for c in content
+ if any(all(c.get(algo) == missing_content.get(algo)
+ for algo in ALGORITHMS)
+ for missing_content in missing_contents)]
+
+ if self.journal_writer:
+ for item in content:
+ self.journal_writer.write_addition('content', item)
+
+ self._skipped_content_add_metadata(db, cur, content)
+
+ return {
+ 'skipped_content:add': len(content),
+ }
+
+ @timed
+ @db_transaction_generator()
+ def skipped_content_missing(self, contents, db=None, cur=None):
+ for content in db.skipped_content_missing(contents, cur):
+ yield dict(zip(db.content_hash_keys, content))
+
@timed
@process_metrics
@db_transaction()
diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/tests/conftest.py
@@ -58,7 +58,10 @@
@pytest.fixture
def swh_contents(swh_storage):
contents = gen_contents(n=20)
- swh_storage.content_add(contents)
+ swh_storage.content_add(
+ [c for c in contents if c['status'] != 'absent'])
+ swh_storage.skipped_content_add(
+ [c for c in contents if c['status'] == 'absent'])
return contents
@@ -218,6 +221,7 @@
return {
'content': [data.cont, data.cont2],
'content_metadata': [data.cont3],
+ 'skipped_content': [data.skipped_cont, data.skipped_cont2],
'person': [data.person],
'directory': [data.dir2, data.dir],
'revision': [data.revision],
diff --git a/swh/storage/tests/test_buffer.py b/swh/storage/tests/test_buffer.py
--- a/swh/storage/tests/test_buffer.py
+++ b/swh/storage/tests/test_buffer.py
@@ -27,7 +27,6 @@
assert s == {
'content:add': 1 + 1,
'content:add:bytes': contents[0]['length'] + contents[1]['length'],
- 'skipped_content:add': 0
}
missing_contents = storage.content_missing(
@@ -48,7 +47,6 @@
assert s == {
'content:add': 1,
'content:add:bytes': contents[0]['length'],
- 'skipped_content:add': 0
}
missing_contents = storage.content_missing([contents[0]])
@@ -75,7 +73,6 @@
assert s == {
'content:add': 1,
'content:add:bytes': contents[0]['length'],
- 'skipped_content:add': 0
}
missing_contents = storage.content_missing([contents[0]])
@@ -85,6 +82,55 @@
assert s == {}
+def test_buffering_proxy_storage_skipped_content_threshold_not_hit(
+ sample_data):
+ contents = sample_data['skipped_content']
+ storage = BufferingProxyStorage(
+ storage={'cls': 'memory'},
+ min_batch_size={
+ 'skipped_content': 10,
+ }
+ )
+ s = storage.skipped_content_add([contents[0], contents[1]])
+ assert s == {}
+
+ # contents have not been written to storage
+ missing_contents = storage.skipped_content_missing(
+ [contents[0], contents[1]])
+ assert {c['sha1'] for c in missing_contents} \
+ == {c['sha1'] for c in contents}
+
+ s = storage.flush()
+ assert s == {
+ 'skipped_content:add': 1 + 1
+ }
+
+ missing_contents = storage.skipped_content_missing(
+ [contents[0], contents[1]])
+ assert list(missing_contents) == []
+
+
+def test_buffering_proxy_storage_skipped_content_threshold_nb_hit(sample_data):
+ contents = sample_data['skipped_content']
+ storage = BufferingProxyStorage(
+ storage={'cls': 'memory'},
+ min_batch_size={
+ 'skipped_content': 1,
+ }
+ )
+
+ s = storage.skipped_content_add([contents[0]])
+ assert s == {
+ 'skipped_content:add': 1
+ }
+
+ missing_contents = storage.skipped_content_missing([contents[0]])
+ assert list(missing_contents) == []
+
+ s = storage.flush()
+ assert s == {}
+
+
def test_buffering_proxy_storage_directory_threshold_not_hit(sample_data):
directories = sample_data['directory']
storage = BufferingProxyStorage(
diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py
--- a/swh/storage/tests/test_cassandra.py
+++ b/swh/storage/tests/test_cassandra.py
@@ -177,11 +177,6 @@
def test_content_update(self):
pass
- @pytest.mark.skip(
- 'not implemented, see https://forge.softwareheritage.org/T1633')
- def test_skipped_content_add(self):
- pass
-
@pytest.mark.skip(
'The "person" table of the pgsql is a legacy thing, and not '
'supported by the cassandra backend.')
diff --git a/swh/storage/tests/test_filter.py b/swh/storage/tests/test_filter.py
--- a/swh/storage/tests/test_filter.py
+++ b/swh/storage/tests/test_filter.py
@@ -18,7 +18,6 @@
assert s == {
'content:add': 1,
'content:add:bytes': sample_content['length'],
- 'skipped_content:add': 0
}
content = next(storage.content_get([sample_content['sha1']]))
@@ -28,7 +27,27 @@
assert s == {
'content:add': 0,
'content:add:bytes': 0,
- 'skipped_content:add': 0
+ }
+
+
+def test_filtering_proxy_storage_skipped_content(sample_data):
+ sample_content = sample_data['skipped_content'][0]
+ storage = FilteringProxyStorage(storage={'cls': 'memory'})
+
+ content = next(storage.skipped_content_missing([sample_content]))
+ assert content['sha1'] == sample_content['sha1']
+
+ s = storage.skipped_content_add([sample_content])
+ assert s == {
+ 'skipped_content:add': 1,
+ }
+
+ content = list(storage.skipped_content_missing([sample_content]))
+ assert content == []
+
+ s = storage.skipped_content_add([sample_content])
+ assert s == {
+ 'skipped_content:add': 0,
}
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -32,7 +32,6 @@
assert s == {
'content:add': 1,
'content:add:bytes': sample_content['length'],
- 'skipped_content:add': 0
}
content = next(swh_storage.content_get([sample_content['sha1']]))
@@ -103,7 +102,6 @@
s = swh_storage.content_add_metadata([sample_content])
assert s == {
'content:add': 1,
- 'skipped_content:add': 0
}
content_metadata = swh_storage.content_get_metadata([pk])
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -141,7 +141,6 @@
assert actual_result == {
'content:add': 1,
'content:add:bytes': cont['length'],
- 'skipped_content:add': 0
}
assert list(swh_storage.content_get([cont['sha1']])) == \
@@ -168,7 +167,6 @@
assert actual_result == {
'content:add': 1,
'content:add:bytes': data.cont['length'],
- 'skipped_content:add': 0
}
swh_storage.refresh_stat_counters()
@@ -177,25 +175,35 @@
def test_content_add_validation(self, swh_storage):
cont = data.cont
+ with pytest.raises(ValueError, match='status'):
+ swh_storage.content_add([{**cont, 'status': 'absent'}])
+
with pytest.raises(ValueError, match='status'):
swh_storage.content_add([{**cont, 'status': 'foobar'}])
with pytest.raises(ValueError, match="(?i)length"):
swh_storage.content_add([{**cont, 'length': -2}])
+ with pytest.raises(
+ (ValueError, TypeError),
+ match="reason"):
+ swh_storage.content_add([{**cont, 'reason': 'foobar'}])
+
+ def test_skipped_content_add_validation(self, swh_storage):
+ cont = data.cont.copy()
+ del cont['data']
+
+ with pytest.raises(ValueError, match='status'):
+ swh_storage.skipped_content_add([{**cont, 'status': 'visible'}])
+
with pytest.raises((ValueError, psycopg2.IntegrityError),
match='reason') as cm:
- swh_storage.content_add([{**cont, 'status': 'absent'}])
+ swh_storage.skipped_content_add([{**cont, 'status': 'absent'}])
if type(cm.value) == psycopg2.IntegrityError:
assert cm.exception.pgcode == \
psycopg2.errorcodes.NOT_NULL_VIOLATION
- with pytest.raises(
- ValueError,
- match="^Must not provide a reason if content is not absent.$"):
- swh_storage.content_add([{**cont, 'reason': 'foobar'}])
-
def test_content_get_missing(self, swh_storage):
cont = data.cont
@@ -225,7 +233,6 @@
assert actual_result == {
'content:add': 2,
'content:add:bytes': cont['length'] + cont2['length'],
- 'skipped_content:add': 0
}
def test_content_add_twice(self, swh_storage):
@@ -233,7 +240,6 @@
assert actual_result == {
'content:add': 1,
'content:add:bytes': data.cont['length'],
- 'skipped_content:add': 0
}
assert len(swh_storage.journal_writer.objects) == 1
@@ -241,9 +247,8 @@
assert actual_result == {
'content:add': 1,
'content:add:bytes': data.cont2['length'],
- 'skipped_content:add': 0
}
- assert len(swh_storage.journal_writer.objects) == 2
+ assert 2 <= len(swh_storage.journal_writer.objects) <= 3
assert len(swh_storage.content_find(data.cont)) == 1
assert len(swh_storage.content_find(data.cont2)) == 1
@@ -286,7 +291,6 @@
actual_result = swh_storage.content_add_metadata([cont])
assert actual_result == {
'content:add': 1,
- 'skipped_content:add': 0
}
expected_cont = cont.copy()
@@ -308,7 +312,6 @@
actual_result = swh_storage.content_add_metadata([cont, cont2])
assert actual_result == {
'content:add': 2,
- 'skipped_content:add': 0
}
def test_content_add_metadata_collision(self, swh_storage):
@@ -336,13 +339,10 @@
assert len(missing) == 2
- actual_result = swh_storage.content_add([cont, cont, cont2])
+ actual_result = swh_storage.skipped_content_add([cont, cont, cont2])
- assert actual_result == {
- 'content:add': 0,
- 'content:add:bytes': 0,
- 'skipped_content:add': 2,
- }
+ assert 2 <= actual_result.pop('skipped_content:add') <= 3
+ assert actual_result == {}
missing = list(swh_storage.skipped_content_missing([cont, cont2]))
@@ -3618,6 +3618,8 @@
swh_storage.origin_visit_add(
origin, obj['date'], obj['type'])
else:
+ if obj_type == 'content' and obj['status'] == 'absent':
+ obj_type = 'skipped_content'
method = getattr(swh_storage, obj_type + '_add')
try:
method([obj])
@@ -3727,7 +3729,6 @@
assert actual_result == {
'content:add': 1,
'content:add:bytes': cont['length'],
- 'skipped_content:add': 0
}
if hasattr(swh_storage, 'objstorage'):
@@ -3758,7 +3759,6 @@
assert actual_result == {
'content:add': 1,
- 'skipped_content:add': 0
}
if hasattr(swh_storage, 'objstorage'):
@@ -3778,13 +3778,10 @@
cont2 = data.skipped_cont2
cont2['blake2s256'] = None
- actual_result = swh_storage.content_add([cont, cont, cont2])
+ actual_result = swh_storage.skipped_content_add([cont, cont, cont2])
- assert actual_result == {
- 'content:add': 0,
- 'content:add:bytes': 0,
- 'skipped_content:add': 2,
- }
+ assert 2 <= actual_result.pop('skipped_content:add') <= 3
+ assert actual_result == {}
with db_transaction(swh_storage) as (_, cur):
cur.execute('SELECT sha1, sha1_git, sha256, blake2s256, '

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 3:07 AM (2 d, 3 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219023

Event Timeline