Changeset View
Standalone View
swh/journal/replay.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import copy | import copy | ||||
import logging | import logging | ||||
from time import time | from time import time | ||||
from typing import Callable, Dict, List, Optional | from typing import Callable, Dict, List, Optional | ||||
▲ Show 20 Lines • Show All 217 Lines • ▼ Show 20 Lines | if object_type == 'revision': | ||||
objects = _fix_revisions(objects) | objects = _fix_revisions(objects) | ||||
elif object_type == 'origin_visit': | elif object_type == 'origin_visit': | ||||
objects = _fix_origin_visits(objects) | objects = _fix_origin_visits(objects) | ||||
return objects | return objects | ||||
def _insert_objects(object_type, objects, storage): | def _insert_objects(object_type, objects, storage): | ||||
objects = fix_objects(object_type, objects) | objects = fix_objects(object_type, objects) | ||||
if object_type == 'content': | if object_type == 'content': | ||||
# TODO: insert 'content' in batches | |||||
for object_ in objects: | |||||
try: | try: | ||||
douardda: are these vars necessary?
Why not simply use (with proper exception handling stuff):
```… | |||||
Done Inline Actions It depends on whether we want to act upon my question below or not, I guess ;) ardumont: It depends on whether we want to act upon my question below or not, I guess ;)
| |||||
Not Done Inline Actions Well, one thing at a time. Let's deal with this as is, then discuss how to improve cases of ingestion error... douardda: Well, one thing at a time. Let's deal with this as is, then discuss how to improve cases of… | |||||
if object_.get('status') == 'absent': | storage.skipped_content_add( | ||||
storage.skipped_content_add([object_]) | (obj for obj in objects if obj.get('status') == 'absent')) | ||||
else: | |||||
storage.content_add_metadata([object_]) | |||||
except HashCollision as e: | except HashCollision as e: | ||||
logger.error('Hash collision: %s', e.args) | logger.error('(SkippedContent) Hash collision: %s', e.args) | ||||
try: | |||||
storage.content_add_metadata( | |||||
(obj for obj in objects if obj.get('status') != 'absent')) | |||||
except HashCollision as e: | |||||
logger.error('(Content) Hash collision: %s', e.args) | |||||
elif object_type in ('directory', 'revision', 'release', | elif object_type in ('directory', 'revision', 'release', | ||||
'snapshot', 'origin'): | 'snapshot', 'origin'): | ||||
Done Inline Actions Do we add a fallback behavior to ensure other objects from the failing transaction are written nonetheless? Sounds sensible to do so (in the mirror context). Maybe we should evolve the hash collision exception to reference the content in error. ardumont: Do we add a fallback behavior to ensure other objects from the failing transaction are written… | |||||
Done Inline Actions
Right now the HashCollision exception only has the 'type' of hash collision as its message (e.g. sha1)... [1] [1] https://sentry.softwareheritage.org/share/issue/5a6f8c09fea1468997bb88ca3e19fc2d/ ardumont: > Maybe we should evolve the hash collision exception to reference the content in error. | |||||
# TODO: split batches that are too large for the storage | # TODO: split batches that are too large for the storage | ||||
# to handle? | # to handle? | ||||
method = getattr(storage, object_type + '_add') | method = getattr(storage, object_type + '_add') | ||||
method(objects) | method(objects) | ||||
elif object_type == 'origin_visit': | elif object_type == 'origin_visit': | ||||
for visit in objects: | for visit in objects: | ||||
storage.origin_add_one({'url': visit['origin']}) | storage.origin_add_one({'url': visit['origin']}) | ||||
if 'metadata' not in visit: | if 'metadata' not in visit: | ||||
▲ Show 20 Lines • Show All 181 Lines • Show Last 20 Lines |
are these vars necessary?
Why not simply use (with proper exception handling stuff):