diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ swh.core >= 0.0.75 -swh.model >= 0.0.18 +swh.model >= 0.0.54 swh.scheduler swh.storage >= 0.0.163 diff --git a/swh/loader/core/converters.py b/swh/loader/core/converters.py --- a/swh/loader/core/converters.py +++ b/swh/loader/core/converters.py @@ -1,15 +1,46 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert objects to dictionaries suitable for swh.storage""" +import logging + +from typing import Dict, Iterable, List, Optional, Tuple + from swh.model.hashutil import hash_to_hex +from swh.model.model import BaseContent, Content, SkippedContent + + +logger = logging.getLogger(__name__) + + +def prepare_contents( + contents: Iterable[Dict], max_content_size: Optional[int] = None, + origin_url: Optional[str] = None) -> Tuple[ + List[Dict], List[Dict]]: + """Prepare contents for storage from a list of contents + Returns + tuple of content iterable, skipped content iterable -def content_for_storage(content, log=None, max_content_size=None, - origin_url=None): + """ + present_contents: List[Dict] = [] + skipped_contents: List[Dict] = [] + for _content in contents: + content = content_for_storage( + _content, max_content_size=max_content_size, origin_url=origin_url) + if isinstance(content, SkippedContent): + skipped_contents.append(content.to_dict()) + else: + present_contents.append(content.to_dict()) + return present_contents, skipped_contents + + +def content_for_storage( + content: Dict, max_content_size: Optional[int] = None, + origin_url: Optional[str] = None) -> BaseContent: """Prepare content to be ready for storage Note: @@ -20,27 +51,26 @@ """ ret = content.copy() + ret.pop('perms', None) if max_content_size and ret['length'] > max_content_size: - if log: - log.info('Skipping content %s, too large (%s > %s)' % - (hash_to_hex(content['sha1_git']), - ret['length'], - max_content_size)) + logger.info('Skipping content %s, too large (%s > %s)' % + (hash_to_hex(content['sha1_git']), + ret['length'], + max_content_size)) ret.pop('data', None) ret.update({'status': 'absent', 'reason': 'Content too large', 'origin': origin_url}) - return ret + return SkippedContent.from_dict(ret) if 'data' not in ret: ret['data'] = open(ret['path'], 'rb').read() # Extra keys added by swh.model.from_disk, that are not accepted # by swh-storage - ret.pop('perms', None) ret.pop('path', None) ret['status'] = 'visible' - return ret + return Content.from_dict(ret) diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -13,7 +13,7 @@ from swh.core import config from swh.storage import get_storage -from swh.loader.core.converters import content_for_storage +from swh.loader.core.converters import prepare_contents class BaseLoader(config.SWHConfig, metaclass=ABCMeta): @@ -379,12 +379,11 @@ self.save_data() if self.has_contents(): - self.storage.content_add([ - content_for_storage( - c, max_content_size=self.max_content_size, - origin_url=self.origin['url']) - for c in self.get_contents() - ]) + contents, skipped_contents = prepare_contents( + self.get_contents(), max_content_size=self.max_content_size, + origin_url=self.origin['url']) + self.storage.skipped_content_add(skipped_contents) + self.storage.content_add(contents) if self.has_directories(): self.storage.directory_add(self.get_directories()) if self.has_revisions(): diff --git a/swh/loader/core/tests/test_converters.py b/swh/loader/core/tests/test_converters.py --- a/swh/loader/core/tests/test_converters.py +++ b/swh/loader/core/tests/test_converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,10 +6,10 @@ import os import tempfile import unittest -from unittest.mock import Mock from swh.loader.core import converters -from swh.model.from_disk import Content +from swh.model import from_disk +from swh.model.model import Content, SkippedContent def tmpfile_with_content(fromdir, contentfile): @@ -44,14 +44,15 @@ data = b'temp file for testing content storage conversion' tmpfile = tmpfile_with_content(self.tmpdir.name, data) - obj = Content.from_file(path=os.fsdecode(tmpfile), - save_path=True).get_data() + obj = from_disk.Content.from_file(path=os.fsdecode(tmpfile), + save_path=True).get_data() expected_content = obj.copy() expected_content['data'] = data expected_content['status'] = 'visible' del expected_content['path'] del expected_content['perms'] + expected_content = Content.from_dict(expected_content) # when content = converters.content_for_storage(obj) @@ -63,11 +64,12 @@ # given data = b'temp file for testing content storage conversion' - obj = Content.from_bytes(data=data, mode=0o100644).get_data() + obj = from_disk.Content.from_bytes(data=data, mode=0o100644).get_data() expected_content = obj.copy() expected_content['status'] = 'visible' del expected_content['perms'] + expected_content = Content.from_dict(expected_content) # when content = converters.content_for_storage(obj) @@ -79,24 +81,21 @@ # given data = b'temp file for testing content storage conversion' - obj = Content.from_bytes(data=data, mode=0o100644).get_data() - - log = Mock() + obj = from_disk.Content.from_bytes(data=data, mode=0o100644).get_data() + del obj['perms'] expected_content = obj.copy() expected_content.pop('data') expected_content['status'] = 'absent' expected_content['origin'] = 'http://example.org/' expected_content['reason'] = 'Content too large' + expected_content = SkippedContent.from_dict(expected_content) # when content = converters.content_for_storage( - obj, log, max_content_size=len(data) - 1, - origin_url=expected_content['origin'], + obj, max_content_size=len(data) - 1, + origin_url=expected_content.origin, ) # then self.assertEqual(content, expected_content) - self.assertTrue(log.info.called) - self.assertIn('Skipping content', log.info.call_args[0][0]) - self.assertIn('too large', log.info.call_args[0][0]) diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -21,7 +21,7 @@ ) from swh.storage import get_storage from swh.storage.algos.snapshot import snapshot_get_all_branches -from swh.loader.core.converters import content_for_storage +from swh.loader.core.converters import prepare_contents from swh.loader.package.utils import download @@ -57,6 +57,7 @@ self.storage = get_storage(**self.config['storage']) self.url = url self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) + self.max_content_size = self.config['max_content_size'] def _check_configuration(self): """Checks the minimal configuration required is set for the loader. @@ -313,12 +314,16 @@ # memory objects = directory.collect() - contents = list( - objects.get('content', {}).values()) + contents, skipped_contents = prepare_contents( + objects.get('content', {}).values(), + max_content_size=self.max_content_size, + origin_url=origin['url']) + self.storage.skipped_content_add(skipped_contents) + logger.debug('Number of skipped contents: %s', + len(skipped_contents)) + self.storage.content_add(contents) logger.debug('Number of contents: %s', len(contents)) - self.storage.content_add( - [content_for_storage(x) for x in contents]) status_load = 'eventful'