diff --git a/.gitignore b/.gitignore index 8f530ea..22f237f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ *.pyc *.sw? *~ .coverage +.eggs/ __pycache__ dist swh.core.egg-info version.txt diff --git a/PKG-INFO b/PKG-INFO index f9437f1..7b0f805 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.4 +Version: 0.0.5 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 88ba390..77ee085 100644 --- a/debian/control +++ b/debian/control @@ -1,19 +1,20 @@ Source: swh-core Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-celery, python3-dateutil, + python3-msgpack, python3-nose, python3-setuptools, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DCORE/ Package: python3-swh.core Architecture: all Depends: ${misc:Depends}, ${python3:Depends} Description: Software Heritage core utilities diff --git a/requirements.txt b/requirements.txt index dbaf71c..ae46c28 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ celery +msgpack-python python-dateutil vcversioner diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index f9437f1..7b0f805 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.4 +Version: 0.0.5 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index 60d2fe1..12e5195 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,27 +1,27 @@ .gitignore MANIFEST.in Makefile requirements.txt setup.py version.txt bin/swh-hashdir bin/swh-hashfile debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/config.py swh/core/hashutil.py -swh/core/json.py swh/core/scheduling.py +swh/core/serializers.py swh/core/tests/test_config.py swh/core/tests/test_hashutil.py -swh/core/tests/test_json.py -swh/core/tests/test_scheduling.py \ No newline at end of file +swh/core/tests/test_scheduling.py +swh/core/tests/test_serializers.py \ No newline at end of file diff --git a/swh.core.egg-info/requires.txt b/swh.core.egg-info/requires.txt index dbaf71c..ae46c28 100644 --- a/swh.core.egg-info/requires.txt +++ b/swh.core.egg-info/requires.txt @@ -1,3 +1,4 @@ celery +msgpack-python python-dateutil vcversioner diff --git a/swh/core/config.py b/swh/core/config.py index 82ca282..7634184 100644 --- a/swh/core/config.py +++ b/swh/core/config.py @@ -1,162 +1,184 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import configparser import os SWH_CONFIG_DIRECTORIES = [ - '~/.config/softwareheritage', + '~/.config/swh', '~/.swh', '/etc/softwareheritage', ] +SWH_GLOBAL_CONFIG = 'global.ini' + +SWH_DEFAULT_GLOBAL_CONFIG = { + 'content_size_limit': ('int', 100 * 1024 * 1024), +} # conversion per type _map_convert_fn = { 'int': int, 'bool': lambda x: x.lower() == 'true', 'list[str]': lambda x: [value.strip() for value in x.split(',')], 'list[int]': lambda x: [int(value.strip()) for value in x.split(',')], } def read(conf_file=None, default_conf=None): """Read the user's configuration file. Fill in the gap using `default_conf`. `default_conf` is similar to this: DEFAULT_CONF = { 'a': ('string', '/tmp/swh-loader-git/log'), 'b': ('string', 'dbname=swhloadergit') 'c': ('bool', true) 'e': ('bool', None) 'd': ('int', 10) } If conf_file is None, return the default config. """ conf = {} if conf_file: config_path = os.path.expanduser(conf_file) if os.path.exists(config_path): config = configparser.ConfigParser(defaults=default_conf) config.read(os.path.expanduser(conf_file)) if 'main' in config._sections: conf = config._sections['main'] if not default_conf: default_conf = {} # remaining missing default configuration key are set # also type conversion is enforced for underneath layer for key in default_conf: nature_type, default_value = default_conf[key] val = conf.get(key, None) if not val: # fallback to default value conf[key] = default_value else: # value present but in string format, force type conversion conf[key] = _map_convert_fn.get(nature_type, lambda x: x)(val) return conf def priority_read(conf_filenames, default_conf=None): """Try reading the configuration files from conf_filenames, in order, and return the configuration from the first one that exists. default_conf has the same specification as it does in read. """ # Try all the files in order for filename in conf_filenames: full_filename = os.path.expanduser(filename) if os.path.exists(full_filename): return read(full_filename, default_conf) # Else, return the default configuration return read(None, default_conf) def merge_default_configs(base_config, *other_configs): """Merge several default config dictionaries, from left to right""" full_config = base_config.copy() for config in other_configs: full_config.update(config) return full_config def swh_config_paths(base_filename): """Return the Software Heritage specific configuration paths for the given filename.""" return [os.path.join(dirname, base_filename) for dirname in SWH_CONFIG_DIRECTORIES] def prepare_folders(conf, *keys): """Prepare the folder mentioned in config under keys. """ def makedir(folder): if not os.path.exists(folder): os.makedirs(folder) for key in keys: makedir(conf[key]) +def load_global_config(): + """Load the global Software Heritage config""" + + return priority_read( + swh_config_paths(SWH_GLOBAL_CONFIG), + SWH_DEFAULT_GLOBAL_CONFIG, + ) + + class SWHConfig: """Mixin to add configuration parsing abilities to classes The class should override the class attributes: - DEFAULT_CONFIG (default configuration to be parsed) - CONFIG_FILENAME (the filename of the configuration to be used) This class defines one classmethod, parse_config_file, which parses a configuration file using the default config as set in the class attribute. """ DEFAULT_CONFIG = {} CONFIG_BASE_FILENAME = '' @classmethod def parse_config_file(cls, base_filename=None, config_filename=None, - additional_configs=None): + additional_configs=None, global_config=True): """Parse the configuration file associated to the current class. By default, parse_config_file will load the configuration cls.CONFIG_BASE_FILENAME from one of the Software Heritage configuration directories, in order, unless it is overridden by base_filename or config_filename (which shortcuts the file lookup completely). Args: - base_filename (str) overrides the default cls.CONFIG_BASE_FILENAME - config_filename (str) sets the file to parse instead of the defaults set from cls.CONFIG_BASE_FILENAME - additional_configs (list of default configuration dicts) allows to override or extend the configuration set in cls.DEFAULT_CONFIG. + - global_config (bool): Load the global configuration (default: + True) """ if config_filename: config_filenames = [config_filename] else: if not base_filename: base_filename = cls.CONFIG_BASE_FILENAME config_filenames = swh_config_paths(base_filename) if not additional_configs: additional_configs = [] full_default_config = merge_default_configs(cls.DEFAULT_CONFIG, *additional_configs) - return priority_read(config_filenames, full_default_config) + config = {} + if global_config: + config = load_global_config() + + config.update(priority_read(config_filenames, full_default_config)) + + return config diff --git a/swh/core/json.py b/swh/core/serializers.py similarity index 81% rename from swh/core/json.py rename to swh/core/serializers.py index 140656a..b84b3eb 100644 --- a/swh/core/json.py +++ b/swh/core/serializers.py @@ -1,90 +1,114 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import datetime from json import JSONDecoder, JSONEncoder +import types import dateutil.parser +import msgpack class SWHJSONEncoder(JSONEncoder): """JSON encoder for data structures generated by Software Heritage. This JSON encoder extends the default Python JSON encoder and adds awareness for the following specific types: - bytes (get encoded as a Base85 string); - datetime.datetime (get encoded as an ISO8601 string). Non-standard types get encoded as a a dictionary with two keys: - swhtype with value 'bytes' or 'datetime'; - d containing the encoded value. SWHJSONEncoder also encodes arbitrary iterables as a list (allowing serialization of generators). Caveats: Limitations in the JSONEncoder extension mechanism prevent us from "escaping" dictionaries that only contain the swhtype and d keys, and therefore arbitrary data structures can't be round-tripped through SWHJSONEncoder and SWHJSONDecoder. """ def default(self, o): if isinstance(o, bytes): return { 'swhtype': 'bytes', 'd': base64.b85encode(o).decode('ascii'), } elif isinstance(o, datetime.datetime): return { 'swhtype': 'datetime', 'd': o.isoformat(), } try: return super().default(o) except TypeError as e: try: iterable = iter(o) except TypeError: raise e from None else: return list(iterable) class SWHJSONDecoder(JSONDecoder): """JSON decoder for data structures encoded with SWHJSONEncoder. This JSON decoder extends the default Python JSON decoder, allowing the decoding of: - bytes (encoded as a Base85 string); - datetime.datetime (encoded as an ISO8601 string). Non-standard types must be encoded as a a dictionary with exactly two keys: - swhtype with value 'bytes' or 'datetime'; - d containing the encoded value. To limit the impact our encoding, if the swhtype key doesn't contain a known value, the dictionary is decoded as-is. """ def decode_data(self, o): if isinstance(o, dict): if set(o.keys()) == {'d', 'swhtype'}: datatype = o['swhtype'] if datatype == 'bytes': return base64.b85decode(o['d']) elif datatype == 'datetime': return dateutil.parser.parse(o['d']) return {key: self.decode_data(value) for key, value in o.items()} if isinstance(o, list): return [self.decode_data(value) for value in o] else: return o def raw_decode(self, s, idx=0): data, index = super().raw_decode(s, idx) return self.decode_data(data), index + + +def msgpack_dumps(data): + """Write data as a msgpack stream""" + def encode_types(obj): + if isinstance(obj, datetime.datetime): + return {b'__datetime__': True, b's': obj.isoformat()} + if isinstance(obj, types.GeneratorType): + return list(obj) + return obj + + return msgpack.packb(data, use_bin_type=True, default=encode_types) + + +def msgpack_loads(data): + """Read data as a msgpack stream""" + def decode_types(obj): + if b'__datetime__' in obj and obj[b'__datetime__']: + return dateutil.parser.parse(obj[b's']) + return obj + + return msgpack.unpackb(data, encoding='utf-8', object_hook=decode_types) diff --git a/swh/core/tests/test_config.py b/swh/core/tests/test_config.py index a4f661f..1a98d70 100644 --- a/swh/core/tests/test_config.py +++ b/swh/core/tests/test_config.py @@ -1,186 +1,186 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import tempfile import unittest import os import shutil from nose.tools import istest from swh.core import config class ConfReaderTest(unittest.TestCase): @classmethod def setUpClass(cls): # create a temporary folder cls.tmpdir = tempfile.mkdtemp(prefix='test-swh-core.') cls.conffile = os.path.join(cls.tmpdir, 'config.ini') with open(cls.conffile, 'w') as conf: conf.write("""[main] a = 1 b = this is a string c = true ls = list, of, strings li = 1, 2, 3, 4 """) cls.non_existing_conffile = os.path.join(cls.tmpdir, 'config-nonexisting.ini') cls.empty_conffile = os.path.join(cls.tmpdir, 'empty.ini') open(cls.empty_conffile, 'w').close() cls.default_conf = { 'a': ('int', 2), 'b': ('string', 'default-string'), 'c': ('bool', True), 'd': ('int', 10), 'e': ('int', None), 'f': ('bool', None), 'g': ('string', None), 'ls': ('list[str]', ['a', 'b', 'c']), 'li': ('list[int]', [42, 43]), } cls.other_default_conf = { 'a': ('int', 3), } cls.full_default_conf = cls.default_conf.copy() cls.full_default_conf['a'] = cls.other_default_conf['a'] cls.parsed_default_conf = { key: value for key, (type, value) in cls.default_conf.items() } cls.parsed_conffile = { 'a': 1, 'b': 'this is a string', 'c': True, 'd': 10, 'e': None, 'f': None, 'g': None, 'ls': ['list', 'of', 'strings'], 'li': [1, 2, 3, 4], } @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdir) @istest def read(self): # when res = config.read(self.conffile, self.default_conf) # then self.assertEquals(res, self.parsed_conffile) @istest def read_empty_file(self): # when res = config.read(None, self.default_conf) # then self.assertEquals(res, self.parsed_default_conf) @istest def support_non_existing_conffile(self): # when res = config.read(self.non_existing_conffile, self.default_conf) # then self.assertEquals(res, self.parsed_default_conf) @istest def support_empty_conffile(self): # when res = config.read(self.empty_conffile, self.default_conf) # then self.assertEquals(res, self.parsed_default_conf) @istest def merge_default_configs(self): # when res = config.merge_default_configs(self.default_conf, self.other_default_conf) # then self.assertEquals(res, self.full_default_conf) @istest def priority_read(self): # when res = config.priority_read([self.non_existing_conffile, self.conffile], self.default_conf) # then self.assertEquals(res, self.parsed_conffile) # when res = config.priority_read([ self.conffile, self.non_existing_conffile, self.empty_conffile, ], self.default_conf) # then self.assertEquals(res, self.parsed_conffile) # when res = config.priority_read([ self.empty_conffile, self.conffile, self.non_existing_conffile, ], self.default_conf) # then self.assertEquals(res, self.parsed_default_conf) @istest def swh_config_paths(self): res = config.swh_config_paths('foo/bar.ini') self.assertEqual(res, [ - '~/.config/softwareheritage/foo/bar.ini', + '~/.config/swh/foo/bar.ini', '~/.swh/foo/bar.ini', '/etc/softwareheritage/foo/bar.ini', ]) @istest def prepare_folder(self): # given conf = {'path1': os.path.join(self.tmpdir, 'path1'), 'path2': os.path.join(self.tmpdir, 'path2', 'depth1')} # the folders does not exists self.assertFalse(os.path.exists(conf['path1']), "path1 should not exist.") self.assertFalse(os.path.exists(conf['path2']), "path2 should not exist.") # when config.prepare_folders(conf, 'path1') # path1 exists but not path2 self.assertTrue(os.path.exists(conf['path1']), "path1 should now exist!") self.assertFalse(os.path.exists(conf['path2']), "path2 should not exist.") # path1 already exists, skips it but creates path2 config.prepare_folders(conf, 'path1', 'path2') self.assertTrue(os.path.exists(conf['path1']), "path1 should still exist!") self.assertTrue(os.path.exists(conf['path2']), "path2 should now exist.") diff --git a/swh/core/tests/test_json.py b/swh/core/tests/test_serializers.py similarity index 70% rename from swh/core/tests/test_json.py rename to swh/core/tests/test_serializers.py index 66d56da..3bd980c 100644 --- a/swh/core/tests/test_json.py +++ b/swh/core/tests/test_serializers.py @@ -1,52 +1,71 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import unittest from nose.tools import istest -from swh.core.json import SWHJSONDecoder, SWHJSONEncoder +from swh.core.serializers import SWHJSONDecoder, SWHJSONEncoder +from swh.core.serializers import msgpack_dumps, msgpack_loads -class JSON(unittest.TestCase): +class Serializers(unittest.TestCase): def setUp(self): self.tz = datetime.timezone(datetime.timedelta(minutes=118)) self.data = { "bytes": b"123456789\x99\xaf\xff\x00\x12", "datetime_naive": datetime.datetime(2015, 1, 1, 12, 4, 42, 231455), "datetime_tz": datetime.datetime(2015, 3, 4, 18, 25, 13, 1234, tzinfo=self.tz), "datetime_utc": datetime.datetime(2015, 3, 4, 18, 25, 13, 1234, tzinfo=datetime.timezone.utc), "swhtype": "fake", "swh_dict": {"swhtype": 42, "d": "test"}, "random_dict": {"swhtype": 43}, } self.encoded_data = { "bytes": {"swhtype": "bytes", "d": "F)}kWH8wXmIhn8j01^"}, "datetime_naive": {"swhtype": "datetime", "d": "2015-01-01T12:04:42.231455"}, "datetime_tz": {"swhtype": "datetime", "d": "2015-03-04T18:25:13.001234+01:58"}, "datetime_utc": {"swhtype": "datetime", "d": "2015-03-04T18:25:13.001234+00:00"}, "swhtype": "fake", "swh_dict": {"swhtype": 42, "d": "test"}, "random_dict": {"swhtype": 43}, } + self.generator = (i for i in range(5)) + self.gen_lst = list(range(5)) + @istest - def round_trip(self): + def round_trip_json(self): data = json.dumps(self.data, cls=SWHJSONEncoder) self.assertEqual(self.data, json.loads(data, cls=SWHJSONDecoder)) @istest - def encode(self): + def encode_swh_json(self): data = json.dumps(self.data, cls=SWHJSONEncoder) self.assertEqual(self.encoded_data, json.loads(data)) + + @istest + def round_trip_msgpack(self): + data = msgpack_dumps(self.data) + self.assertEqual(self.data, msgpack_loads(data)) + + @istest + def generator_json(self): + data = json.dumps(self.generator, cls=SWHJSONEncoder) + self.assertEqual(self.gen_lst, json.loads(data, cls=SWHJSONDecoder)) + + @istest + def generator_msgpack(self): + data = msgpack_dumps(self.generator) + self.assertEqual(self.gen_lst, msgpack_loads(data)) diff --git a/version.txt b/version.txt index 7b98edd..f65fcd3 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.4-0-g6c25f90 \ No newline at end of file +v0.0.5-0-g4beac75 \ No newline at end of file