diff --git a/MANIFEST.in b/MANIFEST.in
index 08ebc95..e7c46fc 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 include Makefile
 include requirements.txt
+include requirements-swh.txt
 include version.txt
diff --git a/PKG-INFO b/PKG-INFO
index 6d30943..8d4a992 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,10 +1,10 @@
 Metadata-Version: 1.0
 Name: swh.core
-Version: 0.0.29
+Version: 0.0.30
 Summary: Software Heritage core utilities
 Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Description: UNKNOWN
 Platform: UNKNOWN
diff --git a/requirements-swh.txt b/requirements-swh.txt
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index e5c3fc3..3ef936b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
 msgpack-python
 psycopg2
 python-dateutil
 vcversioner
 PyYAML
 requests
 Flask
+systemd-python
diff --git a/setup.py b/setup.py
index 3f2f7e9..4925503 100644
--- a/setup.py
+++ b/setup.py
@@ -1,30 +1,30 @@
 #!/usr/bin/env python3
 
 from setuptools import setup
 
 
 def parse_requirements():
     requirements = []
-    with open('requirements.txt') as f:
-        for line in f.readlines():
-            line = line.strip()
-            if not line or line.startswith('#'):
-                continue
-            requirements.append(line)
-
+    for reqf in ('requirements.txt', 'requirements-swh.txt'):
+        with open(reqf) as f:
+            for line in f.readlines():
+                line = line.strip()
+                if not line or line.startswith('#'):
+                    continue
+                requirements.append(line)
     return requirements
 
 
 setup(
     name='swh.core',
     description='Software Heritage core utilities',
     author='Software Heritage developers',
     author_email='swh-devel@inria.fr',
     url='https://forge.softwareheritage.org/diffusion/DCORE/',
     packages=['swh.core', 'swh.core.tests'],
     scripts=['bin/swh-hashdir', 'bin/swh-hashfile'],
     install_requires=parse_requirements(),
     setup_requires=['vcversioner'],
     vcversioner={},
     include_package_data=True,
 )
diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO
index 6d30943..8d4a992 100644
--- a/swh.core.egg-info/PKG-INFO
+++ b/swh.core.egg-info/PKG-INFO
@@ -1,10 +1,10 @@
 Metadata-Version: 1.0
 Name: swh.core
-Version: 0.0.29
+Version: 0.0.30
 Summary: Software Heritage core utilities
 Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Description: UNKNOWN
 Platform: UNKNOWN
diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt
index 7ef18ec..8b969ec 100644
--- a/swh.core.egg-info/SOURCES.txt
+++ b/swh.core.egg-info/SOURCES.txt
@@ -1,25 +1,24 @@
 MANIFEST.in
 Makefile
+requirements-swh.txt
 requirements.txt
 setup.py
 version.txt
 bin/swh-hashdir
 bin/swh-hashfile
 swh.core.egg-info/PKG-INFO
 swh.core.egg-info/SOURCES.txt
 swh.core.egg-info/dependency_links.txt
 swh.core.egg-info/requires.txt
 swh.core.egg-info/top_level.txt
 swh/core/__init__.py
 swh/core/api.py
 swh/core/config.py
-swh/core/hashutil.py
 swh/core/logger.py
 swh/core/serializers.py
 swh/core/utils.py
 swh/core/tests/db_testing.py
 swh/core/tests/test_config.py
-swh/core/tests/test_hashutil.py
 swh/core/tests/test_logger.py
 swh/core/tests/test_serializers.py
 swh/core/tests/test_utils.py
\ No newline at end of file
diff --git a/swh.core.egg-info/requires.txt b/swh.core.egg-info/requires.txt
index 11dff34..dce54a2 100644
--- a/swh.core.egg-info/requires.txt
+++ b/swh.core.egg-info/requires.txt
@@ -1,7 +1,8 @@
 Flask
 PyYAML
 msgpack-python
 psycopg2
 python-dateutil
 requests
+systemd-python
 vcversioner
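Note on the packaging changes above: parse_requirements() now opens both files unconditionally, which is why the new (for now empty) requirements-swh.txt also has to be listed in MANIFEST.in; leaving it out of the sdist would abort installation. An illustrative session (the failure case assumes a tarball built without the MANIFEST.in change):

    >>> parse_requirements()   # both files shipped, requirements-swh.txt empty
    ['msgpack-python', 'psycopg2', 'python-dateutil', 'vcversioner',
     'PyYAML', 'requests', 'Flask', 'systemd-python']
    >>> parse_requirements()   # requirements-swh.txt missing from the sdist
    Traceback (most recent call last):
      ...
    FileNotFoundError: [Errno 2] No such file or directory: 'requirements-swh.txt'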
diff --git a/swh/core/api.py b/swh/core/api.py
index 8c508eb..7491045 100644
--- a/swh/core/api.py
+++ b/swh/core/api.py
@@ -1,106 +1,108 @@
 # Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
+import logging
 import pickle
 
 import requests
 
 from flask import Flask, Request, Response
 
 from .serializers import (decode_response,
                           encode_data_client as encode_data,
                           msgpack_dumps,
                           msgpack_loads,
                           SWHJSONDecoder)
 
 
 class SWHRemoteAPI:
     """Proxy to an internal SWH API
 
     """
     def __init__(self, api_exception, url):
         super().__init__()
         self.api_exception = api_exception
         base_url = url if url.endswith('/') else url + '/'
         self.url = base_url
         self.session = requests.Session()
 
     def _url(self, endpoint):
         return '%s%s' % (self.url, endpoint)
 
     def post(self, endpoint, data):
         try:
             response = self.session.post(
                 self._url(endpoint),
                 data=encode_data(data),
                 headers={'content-type': 'application/x-msgpack'},
             )
         except requests.exceptions.ConnectionError as e:
             raise self.api_exception(e)
 
         # XXX: this breaks language-independence and should be
         # replaced by proper unserialization
         if response.status_code == 400:
             raise pickle.loads(decode_response(response))
 
         return decode_response(response)
 
     def get(self, endpoint, data=None):
         try:
             response = self.session.get(
                 self._url(endpoint),
                 params=data,
             )
         except requests.exceptions.ConnectionError as e:
             raise self.api_exception(e)
 
         if response.status_code == 404:
             return None
 
         # XXX: this breaks language-independence and should be
         # replaced by proper unserialization
         if response.status_code == 400:
             raise pickle.loads(decode_response(response))
         else:
             return decode_response(response)
 
 
 class BytesRequest(Request):
     """Request with proper escaping of arbitrary byte sequences."""
     encoding = 'utf-8'
     encoding_errors = 'surrogateescape'
 
 
 def encode_data_server(data):
     return Response(
         msgpack_dumps(data),
         mimetype='application/x-msgpack',
     )
 
 
 def decode_request(request):
     content_type = request.mimetype
     data = request.get_data()
 
     if content_type == 'application/x-msgpack':
         r = msgpack_loads(data)
     elif content_type == 'application/json':
         r = json.loads(data, cls=SWHJSONDecoder)
     else:
         raise ValueError('Wrong content type `%s` for API request'
                          % content_type)
 
     return r
 
 
 def error_handler(exception, encoder):
     # XXX: this breaks language-independence and should be
     # replaced by proper serialization of errors
+    logging.exception(exception)
     response = encoder(pickle.dumps(exception))
     response.status_code = 400
     return response
 
 
 class SWHServerAPIApp(Flask):
     request_class = BytesRequest
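A hedged usage sketch for the client class above. The StorageAPIError exception, the RemoteStorage subclass, the 'content/missing' endpoint and the URL are illustrative stand-ins, not defined by this diff:

    from swh.core.api import SWHRemoteAPI


    class StorageAPIError(Exception):
        """Illustrative client-side exception type."""


    class RemoteStorage(SWHRemoteAPI):
        def __init__(self, url):
            # every transport failure is re-raised as the pinned exception
            super().__init__(api_exception=StorageAPIError, url=url)

        def content_missing(self, contents):
            # POSTs msgpack-encoded data; on HTTP 400 the pickled
            # server-side exception is re-raised here (and, with this
            # diff, also logged server-side via logging.exception()
            # in error_handler)
            return self.post('content/missing', {'contents': contents})


    storage = RemoteStorage('http://localhost:5002')
    # get() returns None on HTTP 404 rather than raising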
diff --git a/swh/core/hashutil.py b/swh/core/hashutil.py
deleted file mode 100644
index 43bd331..0000000
--- a/swh/core/hashutil.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Copyright (C) 2015 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import binascii
-import functools
-import hashlib
-import os
-
-from io import BytesIO
-
-# supported hashing algorithms
-ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
-
-# should be a multiple of 64 (sha1/sha256's block size)
-# FWIW coreutils' sha1sum uses 32768
-HASH_BLOCK_SIZE = 32768
-
-
-def _new_git_hash(base_algo, git_type, length):
-    """Initialize a digest object (as returned by python's hashlib) for the
-    requested algorithm, and feed it with the header for a git object of the
-    given type and length.
-
-    The header for hashing a git object consists of:
-     - The type of the object (encoded in ASCII)
-     - One ASCII space (\x20)
-     - The length of the object (decimal encoded in ASCII)
-     - One NUL byte
-
-    Args:
-        base_algo: a hashlib-supported algorithm
-        git_type: the type of the git object (supposedly one of 'blob',
-                  'commit', 'tag', 'tree')
-        length: the length of the git object you're encoding
-
-    Returns:
-        a hashutil.hash object
-    """
-
-    h = hashlib.new(base_algo)
-    git_header = '%s %d\0' % (git_type, length)
-    h.update(git_header.encode('ascii'))
-
-    return h
-
-
-def _new_hash(algo, length=None):
-    """Initialize a digest object (as returned by python's hashlib) for the
-    requested algorithm. See the constant ALGORITHMS for the list of supported
-    algorithms. If a git-specific hashing algorithm is requested (e.g.,
-    "sha1_git"), the hashing object will be pre-fed with the needed header; for
-    this to work, length must be given.
-
-    """
-    if algo not in ALGORITHMS:
-        raise ValueError('unknown hashing algorithm ' + algo)
-
-    h = None
-    if algo.endswith('_git'):
-        if length is None:
-            raise ValueError('missing length for git hashing algorithm')
-        base_algo = algo[:-4]
-        h = _new_git_hash(base_algo, 'blob', length)
-    else:
-        h = hashlib.new(algo)
-
-    return h
-
-
-def _hash_file_obj(f, length, algorithms=ALGORITHMS, chunk_cb=None):
-    """hash the content of a file-like object
-
-    If chunk_cb is given, call it on each data chunk after updating the hash
-
-    """
-    hashers = {algo: _new_hash(algo, length)
-               for algo in algorithms}
-    while True:
-        chunk = f.read(HASH_BLOCK_SIZE)
-        if not chunk:
-            break
-        for h in hashers.values():
-            h.update(chunk)
-        if chunk_cb:
-            chunk_cb(chunk)
-
-    return {algo: hashers[algo].digest() for algo in hashers}
-
-
-def _hash_fname(fname, algorithms=ALGORITHMS):
-    """hash the content of a file specified by file name
-
-    """
-    length = os.path.getsize(fname)
-    with open(fname, 'rb') as f:
-        return _hash_file_obj(f, length)
-
-
-def hashfile(f, length=None, algorithms=ALGORITHMS):
-    """Hash the content of a given file, given either as a file-like object or a
-    file name. All specified hash algorithms will be computed, reading the file
-    only once. Returns a dictionary mapping algorithm names to hex-encoded
-    checksums.
-
-    When passing a file-like object, content length must be given; when passing
-    a file name, content length is ignored.
-
-    """
-    if isinstance(f, (str, bytes)):
-        return _hash_fname(f, algorithms)
-    else:
-        return _hash_file_obj(f, length, algorithms)
-
-
-def hashdata(data, algorithms=ALGORITHMS):
-    """Like hashfile, but hashes content passed as a string (of bytes)
-
-    """
-    buf = BytesIO(data)
-    return _hash_file_obj(buf, len(data), algorithms)
-
-
-def hash_git_object(git_object, git_type, hash_algo='sha1'):
-    """Hash a git_object of git_type using hash_algo.
-
-    Args:
-        git_object: a bytestring containing a git object
-        git_type: one of ('blob', 'commit', 'tag', 'tree')
-        hash_algo: one of BASE_ALGORITHMS
-    Returns:
-        The resulting hashutil.hash object, fed with all the needed data.
-    """
-
-    git_types = ('blob', 'commit', 'tag', 'tree')
-    if git_type not in git_types:
-        raise ValueError('Unexpected git object type %s. Expected one of %s' %
-                         (git_type, ', '.join(git_types)))
-
-    length = len(git_object)
-
-    h = _new_git_hash(hash_algo, git_type, length)
-    h.update(git_object)
-
-    return h
-
-
-@functools.lru_cache()
-def hash_to_hex(hash):
-    """Converts a hash to its hexadecimal string representation"""
-    return hash_to_bytehex(hash).decode('ascii')
-
-
-@functools.lru_cache()
-def hash_to_bytehex(hash):
-    """Converts a hash to its hexadecimal bytes representation"""
-    return binascii.hexlify(hash)
-
-
-@functools.lru_cache()
-def hex_to_hash(hex):
-    """Converts a hexadecimal string representation of a hash to that hash"""
-    return bytes.fromhex(hex)
-
-
-@functools.lru_cache()
-def bytehex_to_hash(hex):
-    """Converts a hexadecimal bytes representation of a hash to that hash"""
-    return hex_to_hash(hex.decode())
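hashutil leaves swh.core in this release; its functionality lives on elsewhere in the Software Heritage code base, presumably under swh.model. For reference, a self-contained sketch of the core 'sha1_git' computation the deleted module implemented, checked against the b'42\n' vector from the deleted test suite below:

    import hashlib

    def sha1_git(data, git_type='blob'):
        # git-style hash: the header "<type> <length>\0" is fed before
        # the payload, exactly as _new_git_hash() above did
        h = hashlib.new('sha1')
        h.update(('%s %d\0' % (git_type, len(data))).encode('ascii'))
        h.update(data)
        return h.hexdigest()

    assert sha1_git(b'42\n') == 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'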
diff --git a/swh/core/logger.py b/swh/core/logger.py
index 41e15ef..ffe7b1c 100644
--- a/swh/core/logger.py
+++ b/swh/core/logger.py
@@ -1,122 +1,188 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import datetime
 import logging
 import os
-import psycopg2
 import socket
 
+import psycopg2
+from psycopg2.extras import Json
+from systemd.journal import JournalHandler as _JournalHandler, send
+
 try:
     from celery import current_task
 except ImportError:
     current_task = None
 
-from psycopg2.extras import Json
-
 EXTRA_LOGDATA_PREFIX = 'swh_'
 
 
 def db_level_of_py_level(lvl):
     """convert a log level of the logging module to a log level suitable for
     the logging Postgres DB
 
     """
     return logging.getLevelName(lvl).lower()
 
 
+def get_extra_data(record):
+    """Get the extra data to insert to the database from the logging record"""
+    log_data = record.__dict__
+
+    extra_data = {k[len(EXTRA_LOGDATA_PREFIX):]: v
+                  for k, v in log_data.items()
+                  if k.startswith(EXTRA_LOGDATA_PREFIX)}
+
+    args = log_data.get('args')
+    if args:
+        extra_data['logging_args'] = args
+
+    # Retrieve Celery task info
+    if current_task and current_task.request:
+        extra_data['task'] = {
+            'id': current_task.request.id,
+            'name': current_task.name,
+            'kwargs': current_task.request.kwargs,
+            'args': current_task.request.args,
+        }
+
+    return extra_data
+
+
+def flatten(data, separator='_'):
+    """Flatten the data dictionary into a flat structure"""
+    def inner_flatten(data, prefix):
+        if isinstance(data, dict):
+            for key, value in data.items():
+                yield from inner_flatten(value, prefix + [key])
+        elif isinstance(data, (list, tuple)):
+            for key, value in enumerate(data):
+                yield from inner_flatten(value, prefix + [str(key)])
+        else:
+            yield prefix, data
+
+    for path, value in inner_flatten(data, []):
+        yield separator.join(path), value
+
+
+def stringify(value):
+    """Convert value to string"""
+    if isinstance(value, datetime.datetime):
+        return value.isoformat()
+
+    return str(value)
+
+
 class PostgresHandler(logging.Handler):
     """log handler that store messages in a Postgres DB
 
     See swh-core/sql/log-schema.sql for the DB schema.
 
     All logging methods can be used as usual. Additionally, arbitrary
     metadata can be passed to logging methods, requesting that they will be
     stored in the DB as a single JSONB value. To do so, pass a dictionary to
     the 'extra' kwarg of any logging method; all keys in that dictionary that
     start with EXTRA_LOGDATA_PREFIX (currently: 'swh_') will be extracted to
     form the JSONB dictionary. The prefix will be stripped and not included in
     the DB.
 
     Note: the logger name will be used to fill the 'module' DB column.
 
     Sample usage:
 
         logging.basicConfig(level=logging.INFO)
        h = PostgresHandler('dbname=softwareheritage-log')
         logging.getLogger().addHandler(h)
 
         logger.info('not so important notice',
                     extra={'swh_type': 'swh_logging_test',
                            'swh_meditation': 'guru'})
         logger.warn('something weird just happened, did you see that?')
 
     """
 
    def __init__(self, connstring):
         """
         Create a Postgres log handler.
 
         Args:
             config: configuration dictionary, with a key "log_db" containing
               a libpq connection string to the log DB
 
         """
         super().__init__()
 
         self.connstring = connstring
 
         self.fqdn = socket.getfqdn()  # cache FQDN value
 
     def _connect(self):
         return psycopg2.connect(self.connstring)
 
     def emit(self, record):
-        log_data = record.__dict__
-
         msg = self.format(record)
+        extra_data = get_extra_data(record)
 
-        extra_data = {k[len(EXTRA_LOGDATA_PREFIX):]: v
-                      for k, v in log_data.items()
-                      if k.startswith(EXTRA_LOGDATA_PREFIX)}
-
-        # Retrieve Celery task info
-        if current_task and current_task.request:
-            extra_data['task'] = {
-                'id': current_task.request.id,
-                'name': current_task.name,
-            }
-
+        if 'task' in extra_data:
             task_args = {
-                'kwargs': current_task.request.kwargs,
-                'args': current_task.request.args,
+                'args': extra_data['task']['args'],
+                'kwargs': extra_data['task']['kwargs'],
             }
 
             try:
                 json_args = Json(task_args).getquoted()
             except TypeError:
                 task_args = {
                     'args': [''],
                     'kwargs': {},
                 }
             else:
                 json_args_length = len(json_args)
                 if json_args_length >= 1000:
                     task_args = {
                         'args': [''],
                         'kwargs': {},
                     }
 
             extra_data['task'].update(task_args)
 
-        log_entry = (db_level_of_py_level(log_data['levelno']), msg,
-                     Json(extra_data), log_data['name'], self.fqdn,
+        log_entry = (db_level_of_py_level(record.levelno), msg,
+                     Json(extra_data), record.name, self.fqdn,
                      os.getpid())
 
         db = self._connect()
         with db.cursor() as cur:
             cur.execute('INSERT INTO log '
                         '(level, message, data, src_module, src_host, src_pid)'
                         'VALUES (%s, %s, %s, %s, %s, %s)',
                         log_entry)
             db.commit()
         db.close()
+
+
+class JournalHandler(_JournalHandler):
+    def emit(self, record):
+        """Write `record` as a journal event.
+
+        MESSAGE is taken from the message provided by the user, and PRIORITY,
+        LOGGER, THREAD_NAME, CODE_{FILE,LINE,FUNC} fields are appended
+        automatically. In addition, record.MESSAGE_ID will be used if present.
+        """
+        try:
+            extra_data = {
+                (EXTRA_LOGDATA_PREFIX + key).upper(): stringify(value)
+                for key, value in flatten(get_extra_data(record))
+            }
+            msg = self.format(record)
+            pri = self.mapPriority(record.levelno)
+            send(msg,
+                 PRIORITY=format(pri),
+                 LOGGER=record.name,
+                 THREAD_NAME=record.threadName,
+                 CODE_FILE=record.pathname,
+                 CODE_LINE=record.lineno,
+                 CODE_FUNC=record.funcName,
+                 **extra_data)
+        except Exception:
+            self.handleError(record)
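A hedged sketch of how the new logger pieces compose (the dictionary values and the logger name are illustrative): get_extra_data() collects swh_-prefixed extras plus Celery task info, flatten() turns the nested dict into underscore-joined key paths, and stringify() makes each value journal-safe before JournalHandler.emit() sends them as uppercased SWH_* fields:

    import datetime
    import logging

    from swh.core.logger import JournalHandler, flatten, stringify

    extra = {
        'type': 'ingestion',
        'task': {'id': '1234', 'args': ['origin-url']},
        'when': datetime.datetime(2017, 1, 1),
    }
    print(dict(flatten(extra)))
    # {'type': 'ingestion', 'task_id': '1234',
    #  'task_args_0': 'origin-url', 'when': datetime.datetime(2017, 1, 1, 0, 0)}
    print(stringify(extra['when']))   # '2017-01-01T00:00:00'

    # Wiring the handler: swh_-prefixed extras are stripped, flattened,
    # stringified, then re-prefixed uppercase, reaching the journal as
    # e.g. SWH_TASK_ID alongside MESSAGE, PRIORITY, CODE_FILE, ...
    logger = logging.getLogger('swh.example')   # illustrative logger name
    logger.addHandler(JournalHandler())
    logger.info('loading %s', 'some-origin',
                extra={'swh_task': {'id': '1234'}})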
diff --git a/swh/core/tests/test_hashutil.py b/swh/core/tests/test_hashutil.py
deleted file mode 100644
index e797437..0000000
--- a/swh/core/tests/test_hashutil.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Copyright (C) 2015 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import tempfile
-import unittest
-
-from nose.tools import istest
-
-from swh.core import hashutil
-
-
-class Hashlib(unittest.TestCase):
-
-    def setUp(self):
-        self.data = b'42\n'
-        self.hex_checksums = {
-            'sha1': '34973274ccef6ab4dfaaf86599792fa9c3fe4689',
-            'sha1_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd',
-            'sha256': '084c799cd551dd1d8d5c5f9a5d593b2e931f5e36'
-                      '122ee5c793c1d08a19839cc0',
-        }
-        self.checksums = {
-            'sha1': bytes.fromhex('34973274ccef6ab4dfaaf865997'
-                                  '92fa9c3fe4689'),
-            'sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a845'
-                                      '3e1e07157b6cd'),
-            'sha256': bytes.fromhex('084c799cd551dd1d8d5c5f9a5d5'
-                                    '93b2e931f5e36122ee5c793c1d0'
-                                    '8a19839cc0'),
-        }
-
-    @istest
-    def hashdata(self):
-        checksums = hashutil.hashdata(self.data)
-        self.assertEqual(checksums, self.checksums)
-
-    @istest
-    def unknown_algo(self):
-        with self.assertRaises(ValueError):
-            hashutil.hashdata(self.data, algorithms=['does-not-exist'])
-
-    @istest
-    def algo_selection(self):
-        checksums = hashutil.hashdata(self.data, algorithms=['sha1', 'sha256'])
-        self.assertIn('sha1', checksums)
-        self.assertIn('sha256', checksums)
-        self.assertNotIn('sha1_git', checksums)
-
-    @istest
-    def hashfile_by_name(self):
-        with tempfile.NamedTemporaryFile() as f:
-            f.write(self.data)
-            f.flush()
-            checksums = hashutil.hashfile(f.name)
-            self.assertEqual(checksums, self.checksums)
-
-    @istest
-    def hashfile_by_name_as_bytes(self):
-        with tempfile.NamedTemporaryFile() as f:
-            f.write(self.data)
-            f.flush()
-            checksums = hashutil.hashfile(f.name.encode('utf-8'))
-            self.assertEqual(checksums, self.checksums)
-
-    @istest
-    def hashfile_by_obj(self):
-        with tempfile.TemporaryFile() as f:
-            f.write(self.data)
-            f.seek(0)
-            checksums = hashutil.hashfile(f, len(self.data))
-            self.assertEqual(checksums, self.checksums)
-
-    @istest
-    def hex_to_hash(self):
-        for algo in self.checksums:
-            self.assertEqual(self.checksums[algo],
-                             hashutil.hex_to_hash(self.hex_checksums[algo]))
-
-    @istest
-    def hash_to_hex(self):
-        for algo in self.checksums:
-            self.assertEqual(self.hex_checksums[algo],
-                             hashutil.hash_to_hex(self.checksums[algo]))
-
-    @istest
-    def hash_to_bytehex(self):
-        for algo in self.checksums:
-            self.assertEqual(self.hex_checksums[algo].encode('ascii'),
-                             hashutil.hash_to_bytehex(self.checksums[algo]))
-
-    @istest
-    def bytehex_to_hash(self):
-        for algo in self.checksums:
-            self.assertEqual(self.checksums[algo],
-                             hashutil.bytehex_to_hash(
-                                 self.hex_checksums[algo].encode()))
-
-
-class HashlibGit(unittest.TestCase):
-
-    def setUp(self):
-        self.blob_data = b'42\n'
-
-        self.tree_data = b''.join([b'40000 barfoo\0',
-                                   bytes.fromhex('c3020f6bf135a38c6df'
-                                                 '3afeb5fb38232c5e07087'),
-                                   b'100644 blah\0',
-                                   bytes.fromhex('63756ef0df5e4f10b6efa'
-                                                 '33cfe5c758749615f20'),
-                                   b'100644 hello\0',
-                                   bytes.fromhex('907b308167f0880fb2a'
-                                                 '5c0e1614bb0c7620f9dc3')])
-
-        self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
-author Antoine R. Dumont (@ardumont) 1444054085 +0200
-committer Antoine R. Dumont (@ardumont) 1444054085 +0200
-
-initial
-""".encode('utf-8')  # NOQA
-        self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
-type commit
-tag 0.0.1
-tagger Antoine R. Dumont (@ardumont) 1444225145 +0200
-
-blah
-""".encode('utf-8')  # NOQA
-
-        self.checksums = {
-            'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1'
-                                           'e07157b6cd'),
-            'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
-                                           '121dacdb1c'),
-            'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
-                                             'd629189653'),
-            'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
-                                          'e9e959f120'),
-        }
-
-    @istest
-    def unknown_header_type(self):
-        with self.assertRaises(ValueError) as cm:
-            hashutil.hash_git_object(b'any-data', 'some-unknown-type')
-
-        self.assertIn('Unexpected git object type', cm.exception.args[0])
-
-    @istest
-    def hashdata_content(self):
-        # when
-        hashobj = hashutil.hash_git_object(self.blob_data, 'blob')
-
-        # then
-        self.assertEqual(hashobj.digest(),
-                         self.checksums['blob_sha1_git'])
-
-    @istest
-    def hashdata_tree(self):
-        # when
-        hashobj = hashutil.hash_git_object(self.tree_data, 'tree')
-
-        # then
-        self.assertEqual(hashobj.digest(),
-                         self.checksums['tree_sha1_git'])
-
-    @istest
-    def hashdata_revision(self):
-        # when
-        hashobj = hashutil.hash_git_object(self.commit_data, 'commit')
-
-        # then
-        self.assertEqual(hashobj.digest(),
-                         self.checksums['commit_sha1_git'])
-
-    @istest
-    def hashdata_tag(self):
-        # when
-        hashobj = hashutil.hash_git_object(self.tag_data, 'tag')
-
-        # then
-        self.assertEqual(hashobj.digest(),
-                         self.checksums['tag_sha1_git'])
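The deleted HashlibGit fixtures above double as documentation of the raw git object layouts: a tree is the concatenation, per entry, of '<mode> <name>\0' followed by the entry's 20-byte binary sha1. A self-contained sketch rebuilding the deleted tree vector and re-checking its sha1_git with plain hashlib:

    import hashlib

    # Raw git tree layout, as documented by the deleted fixture:
    # per entry, b"<mode> <name>\0" + 20-byte binary sha1.
    entries = [
        (b'40000',  b'barfoo', 'c3020f6bf135a38c6df3afeb5fb38232c5e07087'),
        (b'100644', b'blah',   '63756ef0df5e4f10b6efa33cfe5c758749615f20'),
        (b'100644', b'hello',  '907b308167f0880fb2a5c0e1614bb0c7620f9dc3'),
    ]
    tree = b''.join(mode + b' ' + name + b'\0' + bytes.fromhex(sha)
                    for mode, name, sha in entries)

    h = hashlib.sha1()
    h.update(b'tree %d\0' % len(tree))   # git object header
    h.update(tree)
    assert h.hexdigest() == 'ac212302c45eada382b27bfda795db121dacdb1c'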
+ """ + try: + extra_data = { + (EXTRA_LOGDATA_PREFIX + key).upper(): stringify(value) + for key, value in flatten(get_extra_data(record)) + } + msg = self.format(record) + pri = self.mapPriority(record.levelno) + send(msg, + PRIORITY=format(pri), + LOGGER=record.name, + THREAD_NAME=record.threadName, + CODE_FILE=record.pathname, + CODE_LINE=record.lineno, + CODE_FUNC=record.funcName, + **extra_data) + except Exception: + self.handleError(record) diff --git a/swh/core/tests/test_hashutil.py b/swh/core/tests/test_hashutil.py deleted file mode 100644 index e797437..0000000 --- a/swh/core/tests/test_hashutil.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import tempfile -import unittest - -from nose.tools import istest - -from swh.core import hashutil - - -class Hashlib(unittest.TestCase): - - def setUp(self): - self.data = b'42\n' - self.hex_checksums = { - 'sha1': '34973274ccef6ab4dfaaf86599792fa9c3fe4689', - 'sha1_git': 'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd', - 'sha256': '084c799cd551dd1d8d5c5f9a5d593b2e931f5e36' - '122ee5c793c1d08a19839cc0', - } - self.checksums = { - 'sha1': bytes.fromhex('34973274ccef6ab4dfaaf865997' - '92fa9c3fe4689'), - 'sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a845' - '3e1e07157b6cd'), - 'sha256': bytes.fromhex('084c799cd551dd1d8d5c5f9a5d5' - '93b2e931f5e36122ee5c793c1d0' - '8a19839cc0'), - } - - @istest - def hashdata(self): - checksums = hashutil.hashdata(self.data) - self.assertEqual(checksums, self.checksums) - - @istest - def unknown_algo(self): - with self.assertRaises(ValueError): - hashutil.hashdata(self.data, algorithms=['does-not-exist']) - - @istest - def algo_selection(self): - checksums = hashutil.hashdata(self.data, algorithms=['sha1', 'sha256']) - self.assertIn('sha1', checksums) - self.assertIn('sha256', checksums) - self.assertNotIn('sha1_git', checksums) - - @istest - def hashfile_by_name(self): - with tempfile.NamedTemporaryFile() as f: - f.write(self.data) - f.flush() - checksums = hashutil.hashfile(f.name) - self.assertEqual(checksums, self.checksums) - - @istest - def hashfile_by_name_as_bytes(self): - with tempfile.NamedTemporaryFile() as f: - f.write(self.data) - f.flush() - checksums = hashutil.hashfile(f.name.encode('utf-8')) - self.assertEqual(checksums, self.checksums) - - @istest - def hashfile_by_obj(self): - with tempfile.TemporaryFile() as f: - f.write(self.data) - f.seek(0) - checksums = hashutil.hashfile(f, len(self.data)) - self.assertEqual(checksums, self.checksums) - - @istest - def hex_to_hash(self): - for algo in self.checksums: - self.assertEqual(self.checksums[algo], - hashutil.hex_to_hash(self.hex_checksums[algo])) - - @istest - def hash_to_hex(self): - for algo in self.checksums: - self.assertEqual(self.hex_checksums[algo], - hashutil.hash_to_hex(self.checksums[algo])) - - @istest - def hash_to_bytehex(self): - for algo in self.checksums: - self.assertEqual(self.hex_checksums[algo].encode('ascii'), - hashutil.hash_to_bytehex(self.checksums[algo])) - - @istest - def bytehex_to_hash(self): - for algo in self.checksums: - self.assertEqual(self.checksums[algo], - hashutil.bytehex_to_hash( - self.hex_checksums[algo].encode())) - - -class HashlibGit(unittest.TestCase): - - def setUp(self): - self.blob_data = b'42\n' - - self.tree_data = b''.join([b'40000 barfoo\0', - 