loader: encode the origin URL before hashing it in get_save_data_path

Summary of the hunks below:
  * swh/loader/core/loader.py: the origin URL is encoded to UTF-8 bytes
    before being passed to hashlib.sha1(), and the save path is built with
    the '%s/sha1:%s/%s/%s' format string instead of os.path.join().
  * swh/loader/core/tests/test_loader.py: DummyBufferedLoader gains an
    __init__ accepting a save_data_path keyword, and a new filesystem test
    (test_loader_save_data_path) checks the computed path.

NOTE(review): inside DummyBufferedLoader, `self.__save_data_path` is
name-mangled by Python to `_DummyBufferedLoader__save_data_path`, so it is
not the attribute looked up by the string-based check
`hasattr(self, '__save_data_path')` shown as context in loader.py.
Confirm this is intended.

NOTE(review): indentation and blank context lines were restored from a
whitespace-collapsed copy of this patch; they match the hunk line counts
but should be confirmed against the target tree.

diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -210,11 +210,12 @@
         if not hasattr(self, '__save_data_path'):
             year = str(self.visit_date.year)
 
-            origin_url_hash = hashlib.sha1(self.origin['url']).hexdigest()
+            url = self.origin['url'].encode('utf-8')
+            origin_url_hash = hashlib.sha1(url).hexdigest()
 
-            path = os.path.join(
+            path = '%s/sha1:%s/%s/%s' % (
                 self.config['save_data_path'],
-                'sha1:' + origin_url_hash[0:2],
+                origin_url_hash[0:2],
                 origin_url_hash,
                 year,
             )
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -1,10 +1,12 @@
-# Copyright (C) 2018 The Software Heritage developers
+# Copyright (C) 2018-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
+import hashlib
 import logging
+import pytest
 
 from swh.model.hashutil import hash_to_bytes
 
@@ -63,7 +65,9 @@
 
 
 class DummyBufferedLoader(DummyLoader, BufferedLoader):
-    pass
+    def __init__(self, *args, save_data_path=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.__save_data_path = save_data_path
 
 
 class DummyBaseLoaderTest(BaseLoaderTest):
@@ -344,3 +348,24 @@
     assert isinstance(loader.log, logging.Logger)
     assert loader.log.name == \
         'some.logger.name'
+
+
+@pytest.mark.fs
+def test_loader_save_data_path(tmp_path):
+    loader = DummyBufferedLoader('some.logger.name.1')
+    url = 'http://bitbucket.org/something'
+    loader.origin = {
+        'url': url,
+    }
+    loader.visit_date = datetime.datetime(year=2019, month=10, day=1)
+    loader.config = {
+        'save_data_path': tmp_path,
+    }
+
+    hash_url = hashlib.sha1(url.encode('utf-8')).hexdigest()
+    expected_save_path = '%s/sha1:%s/%s/2019' % (
+        str(tmp_path), hash_url[0:2], hash_url
+    )
+
+    save_path = loader.get_save_data_path()
+    assert save_path == expected_save_path