diff --git a/swh/loader/dir/__init__.py b/swh/loader/dir/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/loader/dir/converters.py b/swh/loader/dir/converters.py
index 2623ccd..5c15e61 100644
--- a/swh/loader/dir/converters.py
+++ b/swh/loader/dir/converters.py
@@ -1,156 +1,168 @@
+
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 """Convert dir objects to dictionaries suitable for swh.storage"""

 import datetime
 import os

 from swh.loader.dir.git.git import GitType
 from swh.loader.dir.git import git, utils


 def to_datetime(ts):
     """Convert a timestamp to utc datetime.

     """
     return datetime.datetime.utcfromtimestamp(ts).replace(
         tzinfo=datetime.timezone.utc)


 def format_to_minutes(offset_str):
     """Convert a git timezone offset string (e.g. +0200, -0310) to minutes.

     Args:
         offset_str: a string representing an offset.

     Returns:
         A positive or negative number of minutes for such an input.

     """
     sign = offset_str[0]
     hours = int(offset_str[1:3])
     minutes = int(offset_str[3:]) + (hours * 60)
     return minutes if sign == '+' else -1 * minutes


 def blob_to_content(obj, log=None, max_content_size=None, origin_id=None):
     """Convert obj to a swh storage content.

     Note:
     - If obj represents a link, the length and data are already
       provided so we use them directly.
     - 'data' is returned only if max_content_size is not reached.

     Returns:
         obj converted to a content dictionary.

     """
     filepath = obj['path']
     if 'length' in obj:  # link already has it
         size = obj['length']
     else:
         size = os.lstat(filepath).st_size

     ret = {
         'sha1': obj['sha1'],
         'sha256': obj['sha256'],
         'sha1_git': obj['sha1_git'],
         'length': size,
         'perms': obj['perms'].value,
         'type': obj['type'].value,
     }

     if max_content_size and size > max_content_size:
         if log:
             log.info('Skipping content %s, too large (%s > %s)' %
                      (utils.hash_to_hex(obj['sha1_git']),
                       size,
                       max_content_size))
         ret.update({'status': 'absent',
                     'reason': 'Content too large',
                     'origin': origin_id})
         return ret

     if 'data' in obj:  # link already has it
         data = obj['data']
     else:
         data = open(filepath, 'rb').read()

     ret.update({
         'data': data,
         'status': 'visible'
     })

     return ret


 # Map of type to swh types
 _entry_type_map = {
     GitType.TREE: 'dir',
     GitType.BLOB: 'file',
     GitType.COMM: 'rev',
 }


 def tree_to_directory(tree, objects, log=None):
     """Format a tree as a directory.

     """
     entries = []
     for entry in objects[tree['path']]:
         entries.append({
             'type': _entry_type_map[entry['type']],
             'perms': int(entry['perms'].value),
             'name': entry['name'],
             'target': entry['sha1_git']
         })

     return {
         'id': tree['sha1_git'],
         'entries': entries
     }


 def commit_to_revision(commit, objects, log=None):
     """Format a commit as a revision.

     """
     upper_directory = objects[git.ROOT_TREE_KEY][0]
     return {
-        'id': commit['sha1_git'],
-        'date': to_datetime(commit['author_date']),
-        'date_offset': format_to_minutes(commit['author_offset']),
-        'committer_date': to_datetime(commit['committer_date']),
-        'committer_date_offset': format_to_minutes(commit['committer_offset']),
+        'date': {
+            'timestamp': commit['author_date'],
+            'offset': format_to_minutes(commit['author_offset']),
+        },
+        'committer_date': {
+            'timestamp': commit['committer_date'],
+            'offset': format_to_minutes(commit['committer_offset']),
+        },
         'type': commit['type'],
         'directory': upper_directory['sha1_git'],
-        'message': commit['message'],
-        'author_name': commit['author_name'],
-        'author_email': commit['author_email'],
-        'committer_name': commit['committer_name'],
-        'committer_email': commit['committer_email'],
+        'message': commit['message'].encode('utf-8'),
+        'author': {
+            'name': commit['author_name'].encode('utf-8'),
+            'email': commit['author_email'].encode('utf-8'),
+        },
+        'committer': {
+            'name': commit['committer_name'].encode('utf-8'),
+            'email': commit['committer_email'].encode('utf-8'),
+        },
         'synthetic': True,
         'metadata': commit['metadata'],
         'parents': [],
     }


 def annotated_tag_to_release(release, log=None):
     """Format a swh release.

     """
     return {
-        'id': release['sha1_git'],
-        'revision': release['revision'],
+        'target': release['target'],
+        'target_type': release['target_type'],
         'name': release['name'],
-        'comment': release['comment'],
-        'date': to_datetime(release['date']),
-        'date_offset': format_to_minutes(release['offset']),
-        'author_name': release['author_name'],
-        'author_email': release['author_email'],
+        'message': release['comment'].encode('utf-8'),
+        'date': {
+            'timestamp': release['date'],
+            'offset': format_to_minutes(release['offset']),
+        },
+        'author': {
+            'name': release['author_name'].encode('utf-8'),
+            'email': release['author_email'].encode('utf-8'),
+        },
        'synthetic': True,
    }
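Reviewer note: the converters above now emit the nested date/author structure instead of precomputed datetimes. A standalone sketch of the offset conversion they rely on; the function body mirrors `format_to_minutes` and the sample offsets are made up:

```python
# Mirrors converters.format_to_minutes; offsets are illustrative.
def format_to_minutes(offset_str):
    sign = offset_str[0]
    hours = int(offset_str[1:3])
    minutes = int(offset_str[3:]) + (hours * 60)
    return minutes if sign == '+' else -1 * minutes

assert format_to_minutes('+0200') == 120   # 2 hours east of UTC
assert format_to_minutes('-0310') == -190  # 3 hours 10 minutes west
assert format_to_minutes('-0000') == 0     # negative zero collapses to 0
```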
""" upper_directory = objects[git.ROOT_TREE_KEY][0] return { - 'id': commit['sha1_git'], - 'date': to_datetime(commit['author_date']), - 'date_offset': format_to_minutes(commit['author_offset']), - 'committer_date': to_datetime(commit['committer_date']), - 'committer_date_offset': format_to_minutes(commit['committer_offset']), + 'date': { + 'timestamp': commit['author_date'], + 'offset': format_to_minutes(commit['author_offset']), + }, + 'committer_date': { + 'timestamp': commit['committer_date'], + 'offset': format_to_minutes(commit['committer_offset']), + }, 'type': commit['type'], 'directory': upper_directory['sha1_git'], - 'message': commit['message'], - 'author_name': commit['author_name'], - 'author_email': commit['author_email'], - 'committer_name': commit['committer_name'], - 'committer_email': commit['committer_email'], + 'message': commit['message'].encode('utf-8'), + 'author': { + 'name': commit['author_name'].encode('utf-8'), + 'email': commit['author_email'].encode('utf-8'), + }, + 'committer': { + 'name': commit['committer_name'].encode('utf-8'), + 'email': commit['committer_email'].encode('utf-8'), + }, 'synthetic': True, 'metadata': commit['metadata'], 'parents': [], } def annotated_tag_to_release(release, log=None): """Format a swh release. """ return { - 'id': release['sha1_git'], - 'revision': release['revision'], + 'target': release['target'], + 'target_type': release['target_type'], 'name': release['name'], - 'comment': release['comment'], - 'date': to_datetime(release['date']), - 'date_offset': format_to_minutes(release['offset']), - 'author_name': release['author_name'], - 'author_email': release['author_email'], + 'message': release['comment'].encode('utf-8'), + 'date': { + 'timestamp': release['date'], + 'offset': format_to_minutes(release['offset']), + }, + 'author': { + 'name': release['author_name'].encode('utf-8'), + 'email': release['author_email'].encode('utf-8'), + }, 'synthetic': True, } diff --git a/swh/loader/dir/git/git.py b/swh/loader/dir/git/git.py index fe3c84c..e638f1f 100644 --- a/swh/loader/dir/git/git.py +++ b/swh/loader/dir/git/git.py @@ -1,300 +1,262 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from enum import Enum from swh.loader.dir.git import utils +from swh.model.identifiers import release_identifier, revision_identifier -ROOT_TREE_KEY = '' + +ROOT_TREE_KEY = b'' class GitType(Enum): BLOB = b'blob' TREE = b'tree' EXEC = b'exec' LINK = b'link' COMM = b'commit' RELE = b'release' REFS = b'ref' class GitPerm(Enum): BLOB = b'100644' TREE = b'40000' EXEC = b'100755' LINK = b'120000' def compute_directory_git_sha1(dirpath, hashes): """Compute a directory git sha1 for a dirpath. Args: dirpath: the directory's absolute path hashes: list of tree entries with keys: - sha1_git: the tree entry's sha1 - name: file or subdir's name - perms: the tree entry's sha1 permissions Returns: dictionary with sha1_git as key and the actual binary sha1 as value. Assumes: Every path exists in hashes. """ def sorted_key_fn(entry): """Beware the sorted algorithm in git add a / for tree entries. 
""" name = entry['name'] return name + b'/' if entry['type'] is GitType.TREE else name def sort_by_entry_name(hashes): return sorted(hashes, key=sorted_key_fn) def row_entry_tree_format(hashes): return map(lambda entry: b''.join([entry['perms'].value, b' ', entry['name'], b'\0', entry['sha1_git']]), hashes) rows = row_entry_tree_format(sort_by_entry_name(hashes[dirpath])) return utils.hashdata(b''.join(rows), 'tree') def compute_revision_sha1_git(revision): """Compute a revision sha1 git from its dict representation. Args: revision: Additional dictionary information needed to compute a synthetic revision. Following keys are expected: - - author_name - - author_email - - author_date - - author_offset - - committer_name - - committer_email + - author + - date + - committer - committer_date - - committer_offset - message - type - directory: binary form of the tree hash Returns: revision sha1 in bytes # FIXME: beware, bytes output from storage api """ - revision_bytes = ("""tree %s -author %s <%s> %s %s -committer %s <%s> %s %s - -%s -""" % (utils.hash_to_hex(revision['directory']), - revision['author_name'], - revision['author_email'], - revision['author_date'], - revision['author_offset'], - revision['committer_name'], - revision['committer_email'], - revision['committer_date'], - revision['committer_offset'], - revision['message'])).encode('utf-8') - - hashes = utils.hashdata(revision_bytes, 'commit') - return hashes['sha1_git'] + return bytes.fromhex(revision_identifier(revision)) def compute_release_sha1_git(release): """Compute a release sha1 git from its dict representation. Args: release: Additional dictionary information needed to compute a synthetic release. Following keys are expected: - name - - comment + - message - date - - offset - - author_name - - author_email + - author - revision: binary form of the sha1_git revision targeted by this Returns: release sha1 in bytes - # FIXME: beware, bytes output from storage api - """ - release_bytes = ("""object %s -type commit -tag %s -tagger %s <%s> %s %s - -%s -""" % (utils.hash_to_hex(release['revision']), - release['name'], - release['author_name'], - release['author_email'], - release['date'], - release['offset'], - release['comment'])).encode('utf-8') - - hashes = utils.hashdata(release_bytes, 'tag') - return hashes['sha1_git'] + return bytes.fromhex(release_identifier(release)) def compute_link_metadata(linkpath): """Given a linkpath, compute the git metadata. Args: linkpath: absolute pathname of the link Returns: Dictionary of values: - name: basename of the link - perms: git permission for link - type: git type for link """ m_hashes = utils.hashlink(linkpath) m_hashes.update({ 'name': os.path.basename(linkpath), 'perms': GitPerm.LINK, 'type': GitType.BLOB, 'path': linkpath }) return m_hashes def compute_blob_metadata(filepath): """Given a filepath, compute the git metadata. Args: filepath: absolute pathname of the file. Returns: Dictionary of values: - name: basename of the file - perms: git permission for file - type: git type for file """ m_hashes = utils.hashfile(filepath) perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB m_hashes.update({ 'name': os.path.basename(filepath), 'perms': perms, 'type': GitType.BLOB, 'path': filepath }) return m_hashes def compute_tree_metadata(dirname, ls_hashes): """Given a dirname, compute the git metadata. Args: dirname: absolute pathname of the directory. 
 def compute_link_metadata(linkpath):
     """Given a linkpath, compute the git metadata.

     Args:
         linkpath: absolute pathname of the link

     Returns:
         Dictionary of values:
             - name: basename of the link
             - perms: git permission for link
             - type: git type for link

     """
     m_hashes = utils.hashlink(linkpath)
     m_hashes.update({
         'name': os.path.basename(linkpath),
         'perms': GitPerm.LINK,
         'type': GitType.BLOB,
         'path': linkpath
     })
     return m_hashes


 def compute_blob_metadata(filepath):
     """Given a filepath, compute the git metadata.

     Args:
         filepath: absolute pathname of the file.

     Returns:
         Dictionary of values:
             - name: basename of the file
             - perms: git permission for file
             - type: git type for file

     """
     m_hashes = utils.hashfile(filepath)
     perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB
     m_hashes.update({
         'name': os.path.basename(filepath),
         'perms': perms,
         'type': GitType.BLOB,
         'path': filepath
     })
     return m_hashes


 def compute_tree_metadata(dirname, ls_hashes):
     """Given a dirname, compute the git metadata.

     Args:
         dirname: absolute pathname of the directory.

     Returns:
         Dictionary of values:
             - name: basename of the directory
             - perms: git permission for directory
             - type: git type for directory

     """
     tree_hash = compute_directory_git_sha1(dirname, ls_hashes)
     tree_hash.update({
         'name': os.path.basename(dirname),
         'perms': GitPerm.TREE,
         'type': GitType.TREE,
         'path': dirname
     })
     return tree_hash


 def walk_and_compute_sha1_from_directory(rootdir):
     """Compute git sha1 from directory rootdir.

     Returns:
         Dictionary of entries with keys <path-name> and as values a list of
         directory entries.
         Those are list of dictionary with keys:
           - 'perms'
           - 'type'
           - 'name'
           - 'sha1_git'
           - and specifically content: 'sha1', 'sha256', ...

     Note:
         One special key is ROOT_TREE_KEY to indicate the upper root of the
         directory (this is the revision's directory).

     Raises:
         Nothing
         If something is raised, this is a programmatic error.

     """
     ls_hashes = {}
     all_links = set()

     for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False):
         hashes = []

         links = [os.path.join(dirpath, file)
                  for file in (filenames+dirnames)
                  if os.path.islink(os.path.join(dirpath, file))]

         for linkpath in links:
             all_links.add(linkpath)
             m_hashes = compute_link_metadata(linkpath)
             hashes.append(m_hashes)

         only_files = [os.path.join(dirpath, file)
                       for file in filenames
                       if os.path.join(dirpath, file) not in all_links]
         for filepath in only_files:
             m_hashes = compute_blob_metadata(filepath)
             hashes.append(m_hashes)

         ls_hashes.update({
             dirpath: hashes
         })

         dir_hashes = []
         subdirs = [os.path.join(dirpath, dir)
                    for dir in dirnames
                    if os.path.join(dirpath, dir)
                    not in all_links]
         for fulldirname in subdirs:
             tree_hash = compute_tree_metadata(fulldirname, ls_hashes)
             dir_hashes.append(tree_hash)

         ls_hashes.update({
             dirpath: ls_hashes.get(dirpath, []) + dir_hashes
         })

     # compute the current directory hashes
     root_hash = compute_directory_git_sha1(rootdir, ls_hashes)
     root_hash.update({
         'path': rootdir,
         'name': os.path.basename(rootdir),
         'perms': GitPerm.TREE,
         'type': GitType.TREE
     })
     ls_hashes.update({
         ROOT_TREE_KEY: [root_hash]
     })

     return ls_hashes
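Reviewer note: a hypothetical driver for the walker above, to make the returned mapping concrete (the path is made up; `ROOT_TREE_KEY` maps to the single root-tree entry later used as the revision's directory):

```python
from swh.loader.dir.git import utils
from swh.loader.dir.git.git import (ROOT_TREE_KEY,
                                    walk_and_compute_sha1_from_directory)

objects_per_path = walk_and_compute_sha1_from_directory(b'/tmp/sample-folder')
root = objects_per_path[ROOT_TREE_KEY][0]
print(utils.hash_to_hex(root['sha1_git']))  # the revision's root tree id
for entry in objects_per_path[b'/tmp/sample-folder']:
    print(entry['type'], entry['name'])
```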
diff --git a/swh/loader/dir/git/utils.py b/swh/loader/dir/git/utils.py
index b1f8037..d260de4 100644
--- a/swh/loader/dir/git/utils.py
+++ b/swh/loader/dir/git/utils.py
@@ -1,98 +1,49 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import hashlib
 import os

-from io import BytesIO
-
 from swh.core import hashutil

 hashfile = hashutil.hashfile
 hash_to_hex = hashutil.hash_to_hex
 hex_to_hash = hashutil.hex_to_hash


-def _new_hash(header_type, length):
-    """Initialize a digest object (as returned by python's hashlib) for the
-    git sha1 algorithm.
-    This is in charge of pre-computing the needed header for git.
-
-    Args:
-        header_type: a git sha1 type ('blob', 'tree', 'commit', 'tag')
-        length: Length of content to hash. Could be None if when hashing
-        with sha1 and sha256
-
-    Returns:
-        A digest object
-
-    Raises:
-        ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag'
-
-    """
-    h = hashlib.new('sha1')
-    if header_type not in ('blob', 'commit', 'tree', 'tag'):
-        raise ValueError(
-            'Only supported types are blob, commit, tree, tag')
-
-    h.update(('%s %d\0' % (header_type, length)).encode('ascii'))
-
-    return h
-
-
-def _hash_file_obj(f, header_type, length):
-    """hash (git sha1) the content of a file-like object f with header_type
-    and length.
-
-    Returns:
-        A dictionary with 'sha1_git' as key and value the computed sha1_git.
-
-    Raises:
-        ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag'
-
-    """
-    h = _new_hash(header_type, length)
-    while True:
-        chunk = f.read(hashutil.HASH_BLOCK_SIZE)
-        if not chunk:
-            break
-        h.update(chunk)
-
-    return {'sha1_git': h.digest()}
-
-
 def hashdata(data, header_type):
     """Hash data as git sha1 with header_type.

     Returns:
         A dictionary with 'sha1_git' as key and value the computed sha1_git.

     Raises:
         ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag'

     """
-    buf = BytesIO(data)
-    return _hash_file_obj(buf, header_type, len(data))
+    hashobj = hashutil.hash_git_object(data, header_type)
+    return {
+        'sha1_git': hashobj.digest(),
+    }


 def hashlink(linkpath):
     """Compute hashes for a link.

     Args:
         linkpath: the absolute path name to a symbolic link.

     Returns:
         dictionary with sha1_git as key and the actual binary sha1 as value.

     """
     raw_data = os.readlink(linkpath)
     hashes = hashutil.hashdata(raw_data)
     hashes.update({
         'data': raw_data,
         'length': len(raw_data)
     })
     return hashes
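Reviewer note: `hashdata` now delegates the git object framing to `swh.core.hashutil.hash_git_object`, which this patch uses for the first time. A quick sanity check against the well-known id of the empty blob (it also shows up in the `__init__.py` index line at the top of this patch):

```python
from swh.loader.dir.git import utils

checksums = utils.hashdata(b'', 'blob')
assert (utils.hash_to_hex(checksums['sha1_git']) ==
        'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391')
```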
""" DEFAULT_CONFIG = { 'storage_class': ('str', 'remote_storage'), 'storage_args': ('list[str]', ['http://localhost:5000/']), 'send_contents': ('bool', True), 'send_directories': ('bool', True), 'send_revisions': ('bool', True), 'send_releases': ('bool', True), 'send_occurrences': ('bool', True), 'content_packet_size': ('int', 10000), 'content_packet_size_bytes': ('int', 1024 * 1024 * 1024), 'directory_packet_size': ('int', 25000), 'revision_packet_size': ('int', 100000), 'release_packet_size': ('int', 100000), 'occurrence_packet_size': ('int', 100000), } def __init__(self, config): self.config = config if self.config['storage_class'] == 'remote_storage': from swh.storage.api.client import RemoteStorage as Storage else: from swh.storage import Storage self.storage = Storage(*self.config['storage_args']) self.log = logging.getLogger('swh.loader.dir.DirLoader') def open_fetch_history(self, origin_id): return self.storage.fetch_history_start(origin_id) def close_fetch_history(self, fetch_history_id, res): result = None if 'objects' in res: result = { 'contents': len(res['objects'].get(GitType.BLOB, [])), 'directories': len(res['objects'].get(GitType.TREE, [])), 'revisions': len(res['objects'].get(GitType.COMM, [])), 'releases': len(res['objects'].get(GitType.RELE, [])), 'occurrences': len(res['objects'].get(GitType.REFS, [])), } data = { 'status': res['status'], 'result': result, 'stderr': res.get('stderr') } return self.storage.fetch_history_end(fetch_history_id, data) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_contents(self, content_list): """Actually send properly formatted contents to the database""" num_contents = len(content_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) self.storage.content_add(content_list) self.log.debug("Done sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_directories(self, directory_list): """Actually send properly formatted directories to the database""" num_directories = len(directory_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) self.storage.directory_add(directory_list) self.log.debug("Done sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_revisions(self, revision_list): """Actually send properly formatted revisions to the database""" num_revisions = len(revision_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) self.storage.revision_add(revision_list) self.log.debug("Done sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_releases(self, release_list): """Actually send properly formatted releases to 
the database""" num_releases = len(release_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) self.storage.release_add(release_list) self.log.debug("Done sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_occurrences(self, occurrence_list): """Actually send properly formatted occurrences to the database""" num_occurrences = len(occurrence_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) self.storage.occurrence_add(occurrence_list) self.log.debug("Done sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) def bulk_send_blobs(self, objects, blobs, origin_id): """Format blobs as swh contents and send them to the database""" packet_size = self.config['content_packet_size'] packet_size_bytes = self.config['content_packet_size_bytes'] max_content_size = self.config['content_size_limit'] send_in_packets(blobs, converters.blob_to_content, self.send_contents, packet_size, packet_size_bytes=packet_size_bytes, log=self.log, max_content_size=max_content_size, origin_id=origin_id) def bulk_send_trees(self, objects, trees): """Format trees as swh directories and send them to the database""" packet_size = self.config['directory_packet_size'] send_in_packets(trees, converters.tree_to_directory, self.send_directories, packet_size, objects=objects, log=self.log) def bulk_send_commits(self, objects, commits): """Format commits as swh revisions and send them to the database""" packet_size = self.config['revision_packet_size'] - send_in_packets(commits, converters.commit_to_revision, + send_in_packets(commits, (lambda x, objects={}, log=None: x), self.send_revisions, packet_size, objects=objects, log=self.log) def bulk_send_annotated_tags(self, objects, tags): """Format annotated tags (pygit2.Tag objects) as swh releases and send them to the database """ packet_size = self.config['release_packet_size'] - send_in_packets(tags, converters.annotated_tag_to_release, + send_in_packets(tags, (lambda x, objects={}, log=None: x), self.send_releases, packet_size, log=self.log) def bulk_send_refs(self, objects, refs): """Format git references as swh occurrences and send them to the database """ packet_size = self.config['occurrence_packet_size'] - send_in_packets(refs, lambda ref: ref, + send_in_packets(refs, (lambda ref, objects={}, log=None: ref), self.send_occurrences, packet_size) def list_repo_objs(self, dir_path, revision, release): """List all objects from dir_path. 
     def list_repo_objs(self, dir_path, revision, release):
         """List all objects from dir_path.

         Args:
             - dir_path (path): the directory to list
             - revision: revision dictionary representation
             - release: release dictionary representation

         Returns:
             a tuple (objects, objects_per_path):
             - objects: dict of object lists keyed by object type
               (GitType.BLOB, GitType.TREE, GitType.COMM, GitType.RELE)
             - objects_per_path: the mapping computed by
               walk_and_compute_sha1_from_directory

         """
         def get_objects_per_object_type(objects_per_path):
             m = {
                 GitType.BLOB: [],
                 GitType.TREE: [],
                 GitType.COMM: [],
                 GitType.RELE: []
             }
             for tree_path in objects_per_path:
                 objs = objects_per_path[tree_path]
                 for obj in objs:
                     m[obj['type']].append(obj)
             return m

-        def _revision_from(tree_hash, revision):
+        def _revision_from(tree_hash, revision, objects):
             full_rev = dict(revision)
             full_rev['directory'] = tree_hash
-            full_rev['sha1_git'] = git.compute_revision_sha1_git(full_rev)
+            full_rev = converters.commit_to_revision(full_rev, objects)
+            full_rev['id'] = git.compute_revision_sha1_git(full_rev)
             return full_rev

         def _release_from(revision_hash, release):
             full_rel = dict(release)
-            full_rel['revision'] = revision_hash
-            full_rel['sha1_git'] = git.compute_release_sha1_git(full_rel)
+            full_rel['target'] = revision_hash
+            full_rel['target_type'] = 'revision'
+            full_rel = converters.annotated_tag_to_release(full_rel)
+            full_rel['id'] = git.compute_release_sha1_git(full_rel)
             return full_rel

         log_id = str(uuid.uuid4())

         sdir_path = dir_path.decode('utf-8')

         self.log.info("Started listing %s" % dir_path, extra={
             'swh_type': 'dir_list_objs_start',
             'swh_repo': sdir_path,
             'swh_id': log_id,
         })

         objects_per_path = git.walk_and_compute_sha1_from_directory(dir_path)

         objects = get_objects_per_object_type(objects_per_path)

         tree_hash = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']

-        full_rev = _revision_from(tree_hash, revision)
+        full_rev = _revision_from(tree_hash, revision, objects_per_path)

         objects[GitType.COMM] = [full_rev]

         if release and 'name' in release:
-            full_rel = _release_from(full_rev['sha1_git'], release)
+            full_rel = _release_from(full_rev['id'], release)
             objects[GitType.RELE] = [full_rel]

         self.log.info("Done listing the objects in %s: %d contents, "
                       "%d directories, %d revisions, %d releases" % (
                           sdir_path,
                           len(objects[GitType.BLOB]),
                           len(objects[GitType.TREE]),
                           len(objects[GitType.COMM]),
                           len(objects[GitType.RELE])
                       ), extra={
                           'swh_type': 'dir_list_objs_end',
                           'swh_repo': sdir_path,
                           'swh_num_blobs': len(objects[GitType.BLOB]),
                           'swh_num_trees': len(objects[GitType.TREE]),
                           'swh_num_commits': len(objects[GitType.COMM]),
                           'swh_num_releases': len(objects[GitType.RELE]),
                           'swh_id': log_id,
                       })

         return objects, objects_per_path
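Reviewer note: a hedged sketch of driving `list_repo_objs` directly, reusing `loader` from the snippet above; the synthetic revision is shaped like the one in test_loader.py below and the directory path is made up:

```python
from swh.loader.dir.git.git import GitType

revision = {
    'author_name': 'swh author', 'author_email': 'swh@inria.fr',
    'author_date': '1444054085', 'author_offset': '+0200',
    'committer_name': 'swh committer', 'committer_email': 'swh@inria.fr',
    'committer_date': '1444054085', 'committer_offset': '+0200',
    'type': 'tar', 'message': 'synthetic revision', 'metadata': {},
}
objects, objects_per_path = loader.list_repo_objs(
    b'/tmp/sample-folder', revision, release=None)
assert len(objects[GitType.COMM]) == 1   # exactly one synthetic revision
assert 'id' in objects[GitType.COMM][0]  # sha1_git computed on conversion
```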
     def load_dir(self, dir_path, objects, objects_per_path, refs, origin_id):
         if self.config['send_contents']:
             self.bulk_send_blobs(objects_per_path, objects[GitType.BLOB],
                                  origin_id)
         else:
             self.log.info('Not sending contents')

         if self.config['send_directories']:
             self.bulk_send_trees(objects_per_path, objects[GitType.TREE])
         else:
             self.log.info('Not sending directories')

         if self.config['send_revisions']:
             self.bulk_send_commits(objects_per_path, objects[GitType.COMM])
         else:
             self.log.info('Not sending revisions')

         if self.config['send_releases']:
             self.bulk_send_annotated_tags(objects_per_path,
                                           objects[GitType.RELE])
         else:
             self.log.info('Not sending releases')

         if self.config['send_occurrences']:
             self.bulk_send_refs(objects_per_path, refs)
         else:
             self.log.info('Not sending occurrences')

     def process(self, dir_path, origin, revision, release, occurrences):
         """Load a directory in backend.

         Args:
             - dir_path: source of the directory to import
             - origin: Dictionary origin
               - id: origin's id
               - url: url origin we fetched
               - type: type of the origin
             - revision: Dictionary of information needed, keys are:
               - author_name: revision's author name
               - author_email: revision's author email
               - author_date: timestamp (e.g. 1444054085)
               - author_offset: date offset e.g. -0220, +0100
               - committer_name: revision's committer name
               - committer_email: revision's committer email
               - committer_date: timestamp
               - committer_offset: date offset e.g. -0220, +0100
               - type: type of revision dir, tar
               - message: synthetic message for the revision
             - release: Dictionary of information needed, keys are:
               - name: release name
               - date: release timestamp (e.g. 1444054085)
               - offset: release date offset e.g. -0220, +0100
               - author_name: release author's name
               - author_email: release author's email
               - comment: release's comment message
             - occurrences: List of occurrence dictionaries.
               Information needed, keys are:
               - branch: occurrence's branch name
               - authority_id: authority id (e.g. 1 for swh)
               - validity: validity date (e.g. 2015-01-01 00:00:00+00)

         Returns:
             Dictionary with the following keys:
             - status: mandatory, the status result as a boolean
             - stderr: optional when status is True, mandatory otherwise
             - objects: the actual objects sent to swh storage

         """
         def _occurrence_from(origin_id, revision_hash, occurrence):
             occ = dict(occurrence)
             occ.update({
-                'revision': full_rev['sha1_git'],
-                'origin': origin['id'],
+                'revision': revision_hash,
+                'origin': origin_id,
             })
             return occ

         def _occurrences_from(origin_id, revision_hash, occurrences):
             full_occs = []
             for occurrence in occurrences:
                 full_occ = _occurrence_from(origin_id,
                                             revision_hash,
                                             occurrence)
                 full_occs.append(full_occ)
             return full_occs

         if not os.path.exists(dir_path):
             warn_msg = 'Skipping nonexistent directory %s' % dir_path
             self.log.warn(warn_msg, extra={
                 'swh_type': 'dir_repo_list_refs',
                 'swh_repo': dir_path,
                 'swh_num_refs': 0,
             })
             return {'status': False, 'stderr': warn_msg}

         if isinstance(dir_path, str):
             dir_path = dir_path.encode(sys.getfilesystemencoding())

         # to load the repository, walk all objects, compute their hash
         objects, objects_per_path = self.list_repo_objs(dir_path, revision,
                                                         release)

         full_rev = objects[GitType.COMM][0]  # only 1 revision

         full_occs = _occurrences_from(origin['id'],
-                                      full_rev['sha1_git'],
+                                      full_rev['id'],
                                       occurrences)

         self.load_dir(dir_path, objects, objects_per_path, full_occs,
                       origin['id'])

         objects[GitType.REFS] = full_occs

         return {'status': True, 'objects': objects}
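Reviewer note: continuing the sketch above, a hypothetical end-to-end call matching this docstring; ids and dates are illustrative, and since the send_* flags default to true this only runs against a reachable storage instance:

```python
result = loader.process(
    b'/tmp/sample-folder',
    origin={'id': 1, 'url': 'file:///dev/null', 'type': 'dir'},
    revision=revision,
    release=None,
    occurrences=[{'branch': 'master', 'authority_id': 1,
                  'validity': '2015-01-01 00:00:00+00'}])
assert result['status'] is True
```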
 class DirLoaderWithHistory(DirLoader):
     """A bulk loader for a directory.

     This will:
     - create the origin if it does not exist
     - open an entry in fetch_history
     - load the content of the directory
     - close the entry in fetch_history

     """
     def __init__(self, config):
         super().__init__(config)
         self.log = logging.getLogger('swh.loader.dir.DirLoaderWithHistory')

     def process(self, dir_path, origin, revision, release, occurrences):
         """Load a directory in backend.

         Args:
             - dir_path: source of the directory to import
             - origin: Dictionary origin
               - url: url origin we fetched
               - type: type of the origin
             - revision: Dictionary of information needed, keys are:
               - author_name: revision's author name
               - author_email: revision's author email
               - author_date: timestamp (e.g. 1444054085)
               - author_offset: date offset e.g. -0220, +0100
               - committer_name: revision's committer name
               - committer_email: revision's committer email
               - committer_date: timestamp
               - committer_offset: date offset e.g. -0220, +0100
               - type: type of revision dir, tar
               - message: synthetic message for the revision
             - release: Dictionary of information needed, keys are:
               - name: release name
               - date: release timestamp (e.g. 1444054085)
               - offset: release date offset e.g. -0220, +0100
               - author_name: release author's name
               - author_email: release author's email
               - comment: release's comment message
             - occurrences: List of occurrence dictionaries.
               Information needed, keys are:
               - branch: occurrence's branch name
               - authority_id: authority id (e.g. 1 for swh)
               - validity: validity date (e.g. 2015-01-01 00:00:00+00)

         """
         origin['id'] = self.storage.origin_add_one(origin)

         fetch_history_id = self.open_fetch_history(origin['id'])

         result = super().process(dir_path, origin, revision, release,
                                  occurrences)

         self.close_fetch_history(fetch_history_id, result)
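Reviewer note: `DirLoaderWithHistory` wraps the same process flow with origin creation and fetch_history bookkeeping; a sketch with the same hypothetical arguments (again requires a live storage, and note this variant does not return the result dict):

```python
from swh.loader.dir.loader import DirLoaderWithHistory

loader = DirLoaderWithHistory({'storage_class': 'remote_storage',
                               'storage_args': ['http://localhost:5000/']})
loader.process(b'/tmp/sample-folder',
               {'url': 'file:///dev/null', 'type': 'dir'},  # 'id' added inside
               revision, None, [])
```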
diff --git a/swh/loader/dir/tests/test_converters.py b/swh/loader/dir/tests/test_converters.py
index fc13af8..9de172b 100644
--- a/swh/loader/dir/tests/test_converters.py
+++ b/swh/loader/dir/tests/test_converters.py
@@ -1,304 +1,311 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import datetime
 import os
 import shutil
 import tempfile
 import unittest

 from nose.tools import istest

 from swh.loader.dir import converters
 from swh.loader.dir.git import git
 from swh.loader.dir.git.git import GitType, GitPerm


 def tmpfile_with_content(fromdir, contentfile):
     """Create a temporary file with content contentfile in directory fromdir.

     """
     tmpfilepath = tempfile.mktemp(
         suffix='.swh',
         prefix='tmp-file-for-test',
         dir=fromdir)

     with open(tmpfilepath, 'wb') as f:
         f.write(contentfile)

     return tmpfilepath


 class TestConverters(unittest.TestCase):
     @classmethod
-    def setupClass(cls):
+    def setUpClass(cls):
+        super().setUpClass()
         cls.tmpdir = tempfile.mkdtemp(prefix='test-swh-loader-dir.')

     @classmethod
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdir)
+        super().tearDownClass()

     @istest
     def format_to_minutes(self):
         self.assertEquals(converters.format_to_minutes('+0100'), 60)
         self.assertEquals(converters.format_to_minutes('-0200'), -120)
         self.assertEquals(converters.format_to_minutes('+1250'), 12*60+50)
         self.assertEquals(converters.format_to_minutes('+0000'), 0)
         self.assertEquals(converters.format_to_minutes('-0000'), 0)

     @istest
     def annotated_tag_to_release(self):
         # given
         release = {
-            'sha1_git': '123',
-            'revision': '456',
+            'id': '123',
+            'target': '456',
+            'target_type': 'revision',
             'name': 'some-release',
             'comment': 'some-comment-on-release',
             'date': 1444054085,
             'offset': '-0300',
             'author_name': 'someone',
-            'author_email': 'someone@whatelse.eu'
+            'author_email': 'someone@whatelse.eu',
         }

         expected_release = {
-            'id': '123',
-            'revision': '456',
+            'target': '456',
+            'target_type': 'revision',
             'name': 'some-release',
-            'comment': 'some-comment-on-release',
-            'date': datetime.datetime.fromtimestamp(
-                1444054085,
-                tz=datetime.timezone.utc),
-            'date_offset': -180,
-            'author_name': 'someone',
-            'author_email': 'someone@whatelse.eu',
+            'message': b'some-comment-on-release',
+            'date': {
+                'timestamp': 1444054085,
+                'offset': -180
+            },
+            'author': {
+                'name': b'someone',
+                'email': b'someone@whatelse.eu',
+            },
             'synthetic': True,
         }

         # when
         actual_release = converters.annotated_tag_to_release(release)

         # then
         self.assertDictEqual(actual_release, expected_release)

     @istest
     def blob_to_content_visible_data(self):
         # given
         contentfile = b'temp file for testing blob to content conversion'
         tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile)

         obj = {
             'path': tmpfilepath,
             'perms': GitPerm.BLOB,
             'type': GitType.BLOB,
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
         }

         expected_blob = {
             'data': contentfile,
             'length': len(contentfile),
             'status': 'visible',
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
             'perms': GitPerm.BLOB.value,
             'type': GitType.BLOB.value,
         }

         # when
         actual_blob = converters.blob_to_content(obj)

         # then
         self.assertEqual(actual_blob, expected_blob)

     @istest
     def blob_to_content_link(self):
         # given
         contentfile = b'temp file for testing blob to content conversion'
         tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile)
         tmplinkpath = tempfile.mktemp(dir=self.tmpdir)
         os.symlink(tmpfilepath, tmplinkpath)

         obj = {
             'path': tmplinkpath,
             'perms': GitPerm.BLOB,
             'type': GitType.BLOB,
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
         }

         expected_blob = {
             'data': contentfile,
             'length': len(tmpfilepath),
             'status': 'visible',
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
             'perms': GitPerm.BLOB.value,
             'type': GitType.BLOB.value,
         }

         # when
         actual_blob = converters.blob_to_content(obj)

         # then
         self.assertEqual(actual_blob, expected_blob)

     @istest
     def blob_to_content_link_with_data_length_populated(self):
         # given
         tmplinkpath = tempfile.mktemp(dir=self.tmpdir)

         obj = {
             'length': 10,  # wrong for test purposes
             'data': 'something wrong',  # again for test purposes
             'path': tmplinkpath,
             'perms': GitPerm.BLOB,
             'type': GitType.BLOB,
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
         }

         expected_blob = {
             'length': 10,
             'data': 'something wrong',
             'status': 'visible',
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
             'perms': GitPerm.BLOB.value,
             'type': GitType.BLOB.value,
         }

         # when
         actual_blob = converters.blob_to_content(obj)

         # then
         self.assertEqual(actual_blob, expected_blob)

     @istest
     def blob_to_content2_absent_data(self):
         # given
         contentfile = b'temp file for testing blob to content conversion'
         tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile)

         obj = {
             'path': tmpfilepath,
             'perms': GitPerm.BLOB,
             'type': GitType.BLOB,
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
         }

         expected_blob = {
             'length': len(contentfile),
             'status': 'absent',
             'sha1': 'some-sha1',
             'sha256': 'some-sha256',
             'sha1_git': 'some-sha1git',
             'perms': GitPerm.BLOB.value,
             'type': GitType.BLOB.value,
             'reason': 'Content too large',
             'origin': 190
         }

         # when
         actual_blob = converters.blob_to_content(obj, None,
                                                  max_content_size=10,
                                                  origin_id=190)

         # then
         self.assertEqual(actual_blob, expected_blob)

     @istest
     def tree_to_directory_no_entries(self):
         # given
         tree = {
             'path': 'foo',
             'sha1_git': b'tree_sha1_git'
         }
         objects = {
             'foo': [{'type': GitType.TREE,
                      'perms': GitPerm.TREE,
                      'name': 'bar',
                      'sha1_git': b'sha1-target'},
                     {'type': GitType.BLOB,
                      'perms': GitPerm.BLOB,
                      'name': 'file-foo',
                      'sha1_git': b'file-foo-sha1-target'}]
         }

         expected_directory = {
             'id': b'tree_sha1_git',
             'entries': [{'type': 'dir',
                          'perms': int(GitPerm.TREE.value),
                          'name': 'bar',
                          'target': b'sha1-target'},
                         {'type': 'file',
                          'perms': int(GitPerm.BLOB.value),
                          'name': 'file-foo',
                          'target': b'file-foo-sha1-target'}]
         }

         # when
         actual_directory = converters.tree_to_directory(tree, objects)

         # then
         self.assertEqual(actual_directory, expected_directory)

     @istest
     def commit_to_revision(self):
         # given
         commit = {
             'sha1_git': 'commit-git-sha1',
             'author_date': 1444054085,
             'author_offset': '+0000',
             'committer_date': 1444054085,
             'committer_offset': '-0000',
             'type': 'tar',
             'message': 'synthetic-message-input',
             'author_name': 'author-name',
             'author_email': 'author-email',
             'committer_name': 'committer-name',
             'committer_email': 'committer-email',
             'metadata': {'checksums': {'sha1': b'sha1-as-bytes'}},
             'directory': 'targeted-tree-sha1',
         }

         objects = {
             git.ROOT_TREE_KEY: [{'sha1_git': 'targeted-tree-sha1'}]
         }

         expected_revision = {
-            'id': 'commit-git-sha1',
-            'date': datetime.datetime.fromtimestamp(
-                1444054085,
-                tz=datetime.timezone.utc),
-            'date_offset': 0,
-            'committer_date': datetime.datetime.fromtimestamp(
-                1444054085,
-                tz=datetime.timezone.utc),
-            'committer_date_offset': 0,
+            'date': {
+                'timestamp': 1444054085,
+                'offset': 0,
+            },
+            'committer_date': {
+                'timestamp': 1444054085,
+                'offset': 0,
+            },
             'type': 'tar',
             'directory': 'targeted-tree-sha1',
-            'message': 'synthetic-message-input',
-            'author_name': 'author-name',
-            'author_email': 'author-email',
-            'committer_name': 'committer-name',
-            'committer_email': 'committer-email',
+            'message': b'synthetic-message-input',
+            'author': {
+                'name': b'author-name',
+                'email': b'author-email',
+            },
+            'committer': {
+                'name': b'committer-name',
+                'email': b'committer-email',
+            },
             'synthetic': True,
             'metadata': {'checksums': {'sha1': b'sha1-as-bytes'}},
             'parents': [],
         }

         # when
         actual_revision = converters.commit_to_revision(commit, objects)

         # then
         self.assertEquals(actual_revision, expected_revision)
diff --git a/swh/loader/dir/tests/test_git.py b/swh/loader/dir/tests/test_git.py
index 5091f65..02b435e 100644
--- a/swh/loader/dir/tests/test_git.py
+++ b/swh/loader/dir/tests/test_git.py
@@ -1,121 +1,135 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest

 from nose.tools import istest

 from swh.loader.dir.git import git
 from swh.loader.dir.git.git import GitPerm, GitType


 class GitHashlib(unittest.TestCase):
     def setUp(self):
         self.tree_data = b''.join([b'40000 barfoo\0',
                                    bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087'),
                                    b'100644 blah\0',
                                    bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20'),
                                    b'100644 hello\0',
                                    bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')])

         self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
author Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200
committer Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200

initial
""".encode('utf-8')  # NOQA
         self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
type commit
tag 0.0.1
tagger Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444225145 +0200

blah
""".encode('utf-8')  # NOQA

         self.checksums = {
             'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
                                            '121dacdb1c'),
             'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
                                              'd629189653'),
             'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
                                           'e9e959f120'),
         }

     @istest
     def compute_directory_git_sha1(self):
         # given
         dirpath = 'some-dir-path'
         hashes = {
             dirpath: [{'perms': GitPerm.TREE,
                        'type': GitType.TREE,
                        'name': b'barfoo',
                        'sha1_git': bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087')},
                       {'perms': GitPerm.BLOB,
                        'type': GitType.BLOB,
                        'name': b'hello',
                        'sha1_git': bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')},
                       {'perms': GitPerm.BLOB,
                        'type': GitType.BLOB,
                        'name': b'blah',
                        'sha1_git': bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20')}]
         }

         # when
         checksums = git.compute_directory_git_sha1(dirpath, hashes)

         # then
         self.assertEqual(checksums['sha1_git'],
                          self.checksums['tree_sha1_git'])

     @istest
     def compute_revision_sha1_git(self):
         # given
         tree_hash = bytes.fromhex('1c61f7259dcb770f46b194d941df4f08ff0a3970')
         revision = {
-            'author_name': 'Antoine R. Dumont (@ardumont)',
-            'author_email': 'antoine.romain.dumont@gmail.com',
-            'author_date': '1444054085',
-            'author_offset': '+0200',
-            'committer_name': 'Antoine R. Dumont (@ardumont)',
-            'committer_email': 'antoine.romain.dumont@gmail.com',
-            'committer_date': '1444054085',
-            'committer_offset': '+0200',
-            'message': 'initial',
+            'author': {
+                'name': b'Antoine R. Dumont (@ardumont)',
+                'email': b'antoine.romain.dumont@gmail.com',
+            },
+            'date': {
+                'timestamp': 1444054085,
+                'offset': 120,
+            },
+            'committer': {
+                'name': b'Antoine R. Dumont (@ardumont)',
+                'email': b'antoine.romain.dumont@gmail.com',
+            },
+            'committer_date': {
+                'timestamp': 1444054085,
+                'offset': 120,
+            },
+            'message': b'initial\n',
             'type': 'tar',
             'directory': tree_hash,
+            'parents': [],
         }

         # when
         checksum = git.compute_revision_sha1_git(revision)

         # then
         self.assertEqual(checksum,
                          self.checksums['commit_sha1_git'])

     @istest
     def compute_release_sha1_git(self):
         # given
         revision_hash = bytes.fromhex('24d012aaec0bc5a4d2f62c56399053'
                                       'd6cc72a241')
         release = {
             'name': '0.0.1',
-            'author_name': 'Antoine R. Dumont (@ardumont)',
-            'author_email': 'antoine.romain.dumont@gmail.com',
-            'date': '1444225145',
-            'offset': '+0200',
-            'comment': 'blah',
-            'revision': revision_hash,
+            'author': {
+                'name': b'Antoine R. Dumont (@ardumont)',
+                'email': b'antoine.romain.dumont@gmail.com',
+            },
+            'date': {
+                'timestamp': 1444225145,
+                'offset': 120,
+            },
+            'message': b'blah\n',
+            'target_type': 'revision',
+            'target': revision_hash,
         }

         # when
         checksum = git.compute_release_sha1_git(release)

         # then
         self.assertEqual(checksum,
                          self.checksums['tag_sha1_git'])
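Reviewer note: a plain-hashlib cross-check of the commit fixture above, assuming the fixture bytes are exactly as shown (including the author/committer emails); git frames a commit object as `b'commit <length>\0'` plus the payload:

```python
import hashlib

commit_data = b"""tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
author Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200
committer Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200

initial
"""
raw = ('commit %d\0' % len(commit_data)).encode('ascii') + commit_data
print(hashlib.sha1(raw).hexdigest())
# expected, per the fixture's checksums:
# e960570b2e6e2798fa4cfb9af2c399d629189653
```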
diff --git a/swh/loader/dir/tests/test_git_utils.py b/swh/loader/dir/tests/test_git_utils.py
index 1cbe733..678ebd0 100644
--- a/swh/loader/dir/tests/test_git_utils.py
+++ b/swh/loader/dir/tests/test_git_utils.py
@@ -1,94 +1,94 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest

 from nose.tools import istest

 from swh.loader.dir.git import utils


 class GitUtilsHashlib(unittest.TestCase):
     def setUp(self):
         self.blob_data = b'42\n'

         self.tree_data = b''.join([b'40000 barfoo\0',
                                    bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087'),
                                    b'100644 blah\0',
                                    bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20'),
                                    b'100644 hello\0',
                                    bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')])

         self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
author Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200
committer Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200

initial
""".encode('utf-8')  # NOQA
         self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
type commit
tag 0.0.1
tagger Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444225145 +0200

blah
""".encode('utf-8')  # NOQA

         self.checksums = {
             'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1'
                                            'e07157b6cd'),
             'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
                                            '121dacdb1c'),
             'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
                                              'd629189653'),
             'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
                                           'e9e959f120'),
         }

     @istest
     def unknown_header_type(self):
         with self.assertRaises(ValueError) as cm:
             utils.hashdata(b'any-data', 'some-unknown-type')

-        self.assertIn('Only supported types', cm.exception.args[0])
+        self.assertIn('Unexpected git object type', cm.exception.args[0])

     @istest
     def hashdata_content(self):
         # when
         checksums = utils.hashdata(self.blob_data, 'blob')

         # then
         self.assertEqual(checksums['sha1_git'],
                          self.checksums['blob_sha1_git'])

     @istest
     def hashdata_tree(self):
         # when
         checksums = utils.hashdata(self.tree_data, 'tree')

         # then
         self.assertEqual(checksums['sha1_git'],
                          self.checksums['tree_sha1_git'])

     @istest
     def hashdata_revision(self):
         # when
         checksums = utils.hashdata(self.commit_data, 'commit')

         # then
         self.assertEqual(checksums['sha1_git'],
                          self.checksums['commit_sha1_git'])

     @istest
     def hashdata_tag(self):
         # when
         checksums = utils.hashdata(self.tag_data, 'tag')

         # then
         self.assertEqual(checksums['sha1_git'],
                          self.checksums['tag_sha1_git'])
diff --git a/swh/loader/dir/tests/test_loader.py b/swh/loader/dir/tests/test_loader.py
index 0df08d9..585b3a3 100644
--- a/swh/loader/dir/tests/test_loader.py
+++ b/swh/loader/dir/tests/test_loader.py
@@ -1,111 +1,112 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import shutil
 import subprocess
 import tempfile
 import unittest

 from nose.tools import istest

 from swh.loader.dir.loader import DirLoader
 from swh.loader.dir.git.git import GitType


 class TestLoader(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
         cls.tmp_root_path = tempfile.mkdtemp().encode('utf-8')

         start_path = os.path.dirname(__file__).encode('utf-8')
         sample_folder_archive = os.path.join(start_path,
                                              b'../../../../..',
                                              b'swh-storage-testdata',
                                              b'dir-folders',
                                              b'sample-folder.tgz')

         cls.root_path = os.path.join(cls.tmp_root_path, b'sample-folder')

         # uncompress the sample folder
         subprocess.check_output(
             ['tar', 'xvf', sample_folder_archive, '-C', cls.tmp_root_path],
         )

     @classmethod
     def tearDownClass(cls):
         super().tearDownClass()
         shutil.rmtree(cls.tmp_root_path)
         print(cls.tmp_root_path)

     def setUp(self):
         super().setUp()

         self.info = {
             'storage_class': 'remote_storage',
             'storage_args': ['http://localhost:5000/'],
         }

         self.origin = {
             'url': 'file:///dev/null',
             'type': 'dir',
         }

         self.occurrence = {
             'branch': 'master',
             'authority_id': 1,
             'validity': '2015-01-01 00:00:00+00',
         }

         self.revision = {
             'author_name': 'swh author',
             'author_email': 'swh@inria.fr',
             'author_date': '1444054085',
             'author_offset': '+0200',
             'committer_name': 'swh committer',
             'committer_email': 'swh@inria.fr',
             'committer_date': '1444054085',
             'committer_offset': '+0200',
             'type': 'tar',
             'message': 'synthetic revision',
+            'metadata': {'foo': 'bar'},
         }

         self.release = {
             'name': 'v0.0.1',
             'date': '1444054085',
             'offset': '+0200',
             'author_name': 'swh author',
             'author_email': 'swh@inria.fr',
             'comment': 'synthetic release',
         }

         self.dirloader = DirLoader(self.info)
     @istest
     def load_without_storage(self):
         # when
         objects, objects_per_path = self.dirloader.list_repo_objs(
             self.root_path,
             self.revision,
             self.release)

         # then
         self.assertEquals(len(objects), 4,
                           "4 object types: blob, tree, revision, release")
         self.assertEquals(len(objects[GitType.BLOB]), 8,
                           "8 contents: 3 files + 5 links")
         self.assertEquals(len(objects[GitType.TREE]), 5,
                           "5 directories: 4 subdirs (1 empty) + 1 root dir")
         self.assertEquals(len(objects[GitType.COMM]), 1,
                           "synthetic revision")
         self.assertEquals(len(objects[GitType.RELE]), 1,
                           "synthetic release")
         self.assertEquals(len(objects_per_path), 6,
                           "6 keys: 5 folders + the root tree key")

         # print('objects: %s\n objects-per-path: %s\n' %
         #       (objects.keys(),
         #        objects_per_path.keys()))