diff --git a/swh/loader/dir/converters.py b/swh/loader/dir/converters.py
index 48ae672..6341b5d 100644
--- a/swh/loader/dir/converters.py
+++ b/swh/loader/dir/converters.py
@@ -1,136 +1,147 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 """Convert dir objects to dictionaries suitable for swh.storage"""
 
 from datetime import datetime
 
 from swh.loader.dir.git.git import GitType
+from swh.loader.dir.git import utils
 
 
 def format_to_minutes(offset_str):
     """Convert a git string timezone format string (e.g +0200, -0310) to
     minutes.
 
     Args:
         offset_str: a string representing an offset.
 
     Returns:
         A positive or negative number of minutes of such input
 
     """
     sign = offset_str[0]
     hours = int(offset_str[1:3])
     minutes = int(offset_str[3:]) + (hours * 60)
     return minutes if sign == '+' else -1 * minutes
 
 
-def blob_to_content(obj, log=None,
+def blob_to_content(obj, log=None, max_content_size=None,
+                    origin_id=None):
+    if 'data' not in obj:
+        filepath = obj['path']
+        content_raw, length = utils._read_raw(filepath)
+        obj.update({'data': content_raw,
+                    'length': length})
+    return _blob_to_content(obj, log, max_content_size, origin_id)
+
+
+def _blob_to_content(obj, log=None,
                     max_content_size=None,
                     origin_id=None):
     """Convert to a compliant swh content.
 
     """
     size = obj['length']
     ret = {
         'sha1': obj['sha1'],
         'sha256': obj['sha256'],
         'sha1_git': obj['sha1_git'],
         'data': obj['data'],
         'length': size,
         'perms': obj['perms'].value,
         'type': obj['type'].value
     }
 
     if max_content_size and size > max_content_size:
         if log:
             log.info('Skipping content %s, too large (%s > %s)' %
                      (obj['sha1_git'], size, max_content_size))
         ret.update({'status': 'absent',
                     'reason': 'Content too large',
                     'origin': origin_id})
         return ret
 
     ret.update({
         'status': 'visible'
     })
 
     return ret
 
 
 # Map of type to swh types
 _entry_type_map = {
     GitType.TREE: 'dir',
     GitType.BLOB: 'file',
     GitType.COMM: 'rev',
 }
 
 
 def tree_to_directory(tree, objects, log=None):
     """Format a tree as a directory
 
     """
     entries = []
     for entry in objects[tree['path']]:
         entries.append({
             'type': _entry_type_map[entry['type']],
             'perms': int(entry['perms'].value),
             'name': entry['name'],
             'target': entry['sha1_git']
         })
 
     return {
         'id': tree['sha1_git'],
         'entries': entries
     }
 
 
 def commit_to_revision(commit, objects, log=None):
     """Format a commit as a revision.
 
     """
     upper_directory = objects[''][0]
 
     return {
         'id': commit['sha1_git'],
         'date': datetime.fromtimestamp(commit['revision_author_date']),
         'date_offset': format_to_minutes(commit['revision_author_offset']),
         'committer_date':
             datetime.fromtimestamp(commit['revision_committer_date']),
         'committer_date_offset':
             format_to_minutes(commit['revision_committer_offset']),
         'type': commit['revision_type'],
         'directory': upper_directory['sha1_git'],
         'message': commit['revision_message'],
         'author_name': commit['revision_author_name'],
         'author_email': commit['revision_author_email'],
         'committer_name': commit['revision_committer_name'],
         'committer_email': commit['revision_committer_email'],
         'parents': [],
     }
 
 
 def annotated_tag_to_release(release, log=None):
     """Format a swh release.
 
     """
     return {
         'id': release['sha1_git'],
         'revision': release['revision_sha1_git'],
         'name': release['release_name'],
         'comment': release['release_comment'],
         'date': datetime.fromtimestamp(release['release_date']),
         'date_offset': format_to_minutes(release['release_offset']),
         'author_name': release['release_author_name'],
         'author_email': release['release_author_email'],
     }
 
 
 def origin_url_to_origin(origin_url):
     """Format a pygit2.Repository as an origin suitable for swh.storage"""
     return {
         'type': 'dir',
         'url': origin_url,
     }
diff --git a/swh/loader/dir/git/utils.py b/swh/loader/dir/git/utils.py
index 31017f1..661dc9d 100644
--- a/swh/loader/dir/git/utils.py
+++ b/swh/loader/dir/git/utils.py
@@ -1,135 +1,121 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import hashlib
 import os
 
 from io import BytesIO
 
 from swh.core import hashutil
 
 
+hashfile = hashutil.hashfile
 hash_to_hex = hashutil.hash_to_hex
 hex_to_hash = hashutil.hex_to_hash
 
 
 def _new_hash(header_type, length):
     """Initialize a digest object (as returned by python's hashlib) for the
     git sha1 algorithm.
     This is in charge of pre-computing the needed header for git.
 
     Args:
         header_type: a git sha1 type ('blob', 'tree', 'commit', 'tag')
         length: Length of content to hash. Could be None if when hashing
         with sha1 and sha256
 
     Returns:
         A digest object
 
     Raises:
         ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag'
 
     """
     h = hashlib.new('sha1')
     if header_type not in ('blob', 'commit', 'tree', 'tag'):
         raise ValueError(
             'Only supported types are blob, commit, tree, tag')
 
     h.update(('%s %d\0' % (header_type, length)).encode('ascii'))
 
     return h
 
 
 def _hash_file_obj(f, header_type, length):
     """hash (git sha1) the content of a file-like object f with header_type
     and length.
 
     Returns:
         A dictionary with 'sha1_git' as key and value the computed sha1_git.
 
     Raises:
         ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag'
 
     """
     h = _new_hash(header_type, length)
 
     while True:
         chunk = f.read(hashutil.HASH_BLOCK_SIZE)
         if not chunk:
             break
         h.update(chunk)
 
     return {'sha1_git': h.digest()}
 
 
 def hashdata(data, header_type):
     """Hash data as git sha1 with header_type.
 
     Returns:
         A dictionary with 'sha1_git' as key and value the computed sha1_git.
 
     Raises:
         ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag'
 
     """
     buf = BytesIO(data)
     return _hash_file_obj(buf, header_type, len(data))
 
 
 def _read_raw(filepath):
     """Read filepath's raw content and returns it.
 
+    Args:
+        filepath: absolute path to an existing file.
+
+    Returns:
+        raw content in bytes + its length
+
     """
     content_raw = b''
     length = 0
     with open(filepath, 'rb') as f:
         while True:
             chunk = f.read(hashutil.HASH_BLOCK_SIZE)
             if not chunk:
                 break
             content_raw += chunk
             length += len(chunk)
 
     return content_raw, length
 
 
-def hashfile(filepath):
-    """Compute the hashes of filepath (sha1, sha1_git, sha256).
-
-    Args:
-        filepath: the absolute path name to the file to hash.
-
-    Returns:
-        A dictionary of values:
-          - sha1
-          - sha256
-          - sha1_git
-          - content: the raw content of the filepath
-
-    """
-    hashes = hashutil.hashfile(filepath)
-    content_raw, length = _read_raw(filepath)
-    hashes.update({'data': content_raw,
-                   'length': length})
-    return hashes
-
-
 def hashlink(linkpath):
     """Compute hashes for a link.
 
     Args:
         linkpath: the absolute path name to a symbolic link.
 
     Returns:
         dictionary with sha1_git as key and the actual binary sha1 as value.
 
     """
     raw_data = os.readlink(linkpath).encode('utf-8')
     hashes = hashutil.hashdata(raw_data)
     hashes.update({
         'data': raw_data,
         'length': len(raw_data)
     })
     return hashes