diff --git a/swh/loader/dir/converters.py b/swh/loader/dir/converters.py
index ec57fa5..808310e 100644
--- a/swh/loader/dir/converters.py
+++ b/swh/loader/dir/converters.py
@@ -1,155 +1,143 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 """Convert dir objects to dictionaries suitable for swh.storage"""

 import datetime

 from swh.loader.dir.git.git import GitType
 from swh.loader.dir.git import git, utils


 def to_datetime(ts):
     """Convert a timestamp to utc datetime.

     """
     return datetime.datetime.utcfromtimestamp(ts).replace(
         tzinfo=datetime.timezone.utc)


 def format_to_minutes(offset_str):
     """Convert a git string timezone format string (e.g +0200, -0310) to
        minutes.

     Args:
         offset_str: a string representing an offset.

     Returns:
         A positive or negative number of minutes of such input

     """
     sign = offset_str[0]
     hours = int(offset_str[1:3])
     minutes = int(offset_str[3:]) + (hours * 60)
     return minutes if sign == '+' else -1 * minutes


 def blob_to_content(obj, log=None, max_content_size=None, origin_id=None):
     if 'data' not in obj:
         filepath = obj['path']
         content_raw, length = utils._read_raw(filepath)
         obj.update({'data': content_raw, 'length': length})
     return _blob_to_content(obj, log, max_content_size, origin_id)


 def _blob_to_content(obj, log=None, max_content_size=None, origin_id=None):
     """Convert to a compliant swh content.

     """
     size = obj['length']
     ret = {
         'sha1': obj['sha1'],
         'sha256': obj['sha256'],
         'sha1_git': obj['sha1_git'],
         'data': obj['data'],
         'length': size,
         'perms': obj['perms'].value,
         'type': obj['type'].value
     }

     if max_content_size and size > max_content_size:
         if log:
             log.info('Skipping content %s, too large (%s > %s)' %
                      (obj['sha1_git'], size, max_content_size))
         ret.update({'status': 'absent',
                     'reason': 'Content too large',
                     'origin': origin_id})
         return ret

     ret.update({
         'status': 'visible'
     })

     return ret


 # Map of type to swh types
 _entry_type_map = {
     GitType.TREE: 'dir',
     GitType.BLOB: 'file',
     GitType.COMM: 'rev',
 }


 def tree_to_directory(tree, objects, log=None):
     """Format a tree as a directory

     """
     entries = []
     for entry in objects[tree['path']]:
         entries.append({
             'type': _entry_type_map[entry['type']],
             'perms': int(entry['perms'].value),
             'name': entry['name'],
             'target': entry['sha1_git']
         })

     return {
         'id': tree['sha1_git'],
         'entries': entries
     }


 def commit_to_revision(commit, objects, log=None):
     """Format a commit as a revision.

     """
     upper_directory = objects[git.ROOT_TREE_KEY][0]
     return {
         'id': commit['sha1_git'],
-        'date':
-        to_datetime(commit['revision_author_date']),
-        'date_offset':
-        format_to_minutes(commit['revision_author_offset']),
-        'committer_date':
-        to_datetime(commit['revision_committer_date']),
-        'committer_date_offset':
-        format_to_minutes(commit['revision_committer_offset']),
-        'type': commit['revision_type'],
+        'date': to_datetime(commit['author_date']),
+        'date_offset': format_to_minutes(commit['author_offset']),
+        'committer_date': to_datetime(commit['committer_date']),
+        'committer_date_offset': format_to_minutes(commit['committer_offset']),
+        'type': commit['type'],
         'directory': upper_directory['sha1_git'],
-        'message': commit['revision_message'],
-        'author_name': commit['revision_author_name'],
-        'author_email': commit['revision_author_email'],
-        'committer_name': commit['revision_committer_name'],
-        'committer_email': commit['revision_committer_email'],
+        'message': commit['message'],
+        'author_name': commit['author_name'],
+        'author_email': commit['author_email'],
+        'committer_name': commit['committer_name'],
+        'committer_email': commit['committer_email'],
         'parents': [],
     }


 def annotated_tag_to_release(release, log=None):
     """Format a swh release.

     """
     return {
         'id': release['sha1_git'],
-        'revision': release['revision_sha1_git'],
-        'name': release['release_name'],
-        'comment': release['release_comment'],
-        'date': to_datetime(release['release_date']),
-        'date_offset': format_to_minutes(release['release_offset']),
-        'author_name': release['release_author_name'],
-        'author_email': release['release_author_email'],
-    }
-
-
-def origin_url_to_origin(origin_url):
-    """Format a pygit2.Repository as an origin suitable for swh.storage"""
-    return {
-        'type': 'dir',
-        'url': origin_url,
+        'revision': release['revision'],
+        'name': release['name'],
+        'comment': release['comment'],
+        'date': to_datetime(release['date']),
+        'date_offset': format_to_minutes(release['offset']),
+        'author_name': release['author_name'],
+        'author_email': release['author_email'],
     }
""" name = entry['name'] return name + b'/' if entry['type'] is GitType.TREE else name def sort_by_entry_name(hashes): return sorted(hashes, key=sorted_key_fn) def row_entry_tree_format(hashes): return map(lambda entry: b''.join([entry['perms'].value, b' ', entry['name'], b'\0', entry['sha1_git']]), hashes) rows = row_entry_tree_format(sort_by_entry_name(hashes[dirpath])) return utils.hashdata(b''.join(rows), 'tree') -def compute_revision_git_sha1(tree_hash, info): - """Compute a revision representation targeting the tree_hash. - +def compute_revision_sha1_git(revision): + """Compute a revision sha1 git from its dict representation. Args: - tree_hash: binary form of the tree hash - info: Additional dictionary information needed to compute a synthetic + revision: Additional dictionary information needed to compute a + synthetic revision. Following keys are expected: - - revision_author_name - - revision_author_email - - revision_author_date - - revision_author_offset - - revision_committer_name - - revision_committer_email - - revision_committer_date - - revision_committer_offset - - revision_message - - revision_type + - author_name + - author_email + - author_date + - author_offset + - committer_name + - committer_email + - committer_date + - committer_offset + - message + - type + - directory: binary form of the tree hash + + Returns: + revision sha1 in bytes + + # FIXME: beware, bytes output from storage api """ - revision_author_name = info['revision_author_name'] - revision_author_email = info['revision_author_email'] - revision_author_date = info['revision_author_date'] - revision_author_offset = info['revision_author_offset'] - revision_committer_name = info['revision_committer_name'] - revision_committer_email = info['revision_committer_email'] - revision_committer_date = info['revision_committer_date'] - revision_committer_offset = info['revision_committer_offset'] - revision_message = info['revision_message'] - - revision_content = ("""tree %s + revision_bytes = ("""tree %s author %s <%s> %s %s committer %s <%s> %s %s %s -""" % (utils.hash_to_hex(tree_hash), - revision_author_name, - revision_author_email, - revision_author_date, - revision_author_offset, - revision_committer_name, - revision_committer_email, - revision_committer_date, - revision_committer_offset, - revision_message)).encode('utf-8') - hashes = utils.hashdata(revision_content, 'commit') - - # and update other information - hashes.update({ - 'revision_author_name': revision_author_name, - 'revision_author_email': revision_author_email, - 'revision_author_date': revision_author_date, - 'revision_author_offset': revision_author_offset, - 'revision_committer_name': revision_committer_name, - 'revision_committer_email': revision_committer_email, - 'revision_committer_date': revision_committer_date, - 'revision_committer_offset': revision_committer_offset, - 'revision_message': revision_message, - 'revision_type': info['revision_type'] - }) - return hashes +""" % (utils.hash_to_hex(revision['directory']), + revision['author_name'], + revision['author_email'], + revision['author_date'], + revision['author_offset'], + revision['committer_name'], + revision['committer_email'], + revision['committer_date'], + revision['committer_offset'], + revision['message'])).encode('utf-8') + hashes = utils.hashdata(revision_bytes, 'commit') + return hashes['sha1_git'] -def compute_release(revision_hash, info): - """Compute a release representation. - This release representation will contain the computed sha1_git for such - release. 
- This release will point to the revision_hash. - The additional informations are present in the dictionary info. +def compute_release_sha1_git(release): + """Compute a release sha1 git from its dict representation. Args: - revision_hash: binary form of the sha1_git revision targeted by this - release - info: Additional dictionary information needed to compute a synthetic - release. Following keys are expected: - - release_name - - release_comment - - release_date - - release_offset - - release_author_name - - release_author_email + release: Additional dictionary information needed to compute a + synthetic release. Following keys are expected: + - name + - comment + - date + - offset + - author_name + - author_email + - revision: binary form of the sha1_git revision targeted by this + + Returns: + release sha1 in bytes + + # FIXME: beware, bytes output from storage api """ - release_name = info['release_name'] - release_author_name = info['release_author_name'] - release_author_email = info['release_author_email'] - release_date = info['release_date'] - release_offset = info['release_offset'] - release_comment = info['release_comment'] - - release_content_to_hash = ("""object %s + release_bytes = ("""object %s type commit tag %s tagger %s <%s> %s %s %s -""" % (utils.hash_to_hex(revision_hash), - release_name, - release_author_name, - release_author_email, - release_date, - release_offset, - release_comment)).encode('utf-8') - - hashes = utils.hashdata(release_content_to_hash, 'tag') - hashes.update({ - 'revision_sha1_git': revision_hash, - 'release_name': release_name, - 'release_comment': release_comment, - 'release_date': release_date, - 'release_offset': release_offset, - 'release_author_name': release_author_name, - 'release_author_email': release_author_email, - }) - return hashes +""" % (utils.hash_to_hex(release['revision']), + release['name'], + release['author_name'], + release['author_email'], + release['date'], + release['offset'], + release['comment'])).encode('utf-8') + + hashes = utils.hashdata(release_bytes, 'tag') + return hashes['sha1_git'] def compute_link_metadata(linkpath): """Given a linkpath, compute the git metadata. Args: linkpath: absolute pathname of the link Returns: Dictionary of values: - name: basename of the link - perms: git permission for link - type: git type for link """ m_hashes = utils.hashlink(linkpath) m_hashes.update({ 'name': bytes(os.path.basename(linkpath), 'utf-8'), 'perms': GitPerm.LINK, 'type': GitType.BLOB, 'path': linkpath }) return m_hashes def compute_blob_metadata(filepath): """Given a filepath, compute the git metadata. Args: filepath: absolute pathname of the file. Returns: Dictionary of values: - name: basename of the file - perms: git permission for file - type: git type for file """ m_hashes = utils.hashfile(filepath) perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB m_hashes.update({ 'name': bytes(os.path.basename(filepath), 'utf-8'), 'perms': perms, 'type': GitType.BLOB, 'path': filepath }) return m_hashes def compute_tree_metadata(dirname, ls_hashes): """Given a dirname, compute the git metadata. Args: dirname: absolute pathname of the directory. 
Returns: Dictionary of values: - name: basename of the directory - perms: git permission for directory - type: git type for directory """ tree_hash = compute_directory_git_sha1(dirname, ls_hashes) tree_hash.update({ 'name': bytes(os.path.basename(dirname), 'utf-8'), 'perms': GitPerm.TREE, 'type': GitType.TREE, 'path': dirname }) return tree_hash def walk_and_compute_sha1_from_directory(rootdir): """Compute git sha1 from directory rootdir. Returns: Dictionary of entries with keys and as values a list of directory entries. Those are list of dictionary with keys: - 'perms' - 'type' - 'name' - 'sha1_git' - and specifically content: 'sha1', 'sha256', ... Note: One special key is ROOT_TREE_KEY to indicate the upper root of the directory (this is the revision's directory). Raises: Nothing If something is raised, this is a programmatic error. """ ls_hashes = {} all_links = set() for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False): hashes = [] links = [os.path.join(dirpath, file) for file in (filenames+dirnames) if os.path.islink(os.path.join(dirpath, file))] for linkpath in links: all_links.add(linkpath) m_hashes = compute_link_metadata(linkpath) hashes.append(m_hashes) only_files = [os.path.join(dirpath, file) for file in filenames if os.path.join(dirpath, file) not in all_links] for filepath in only_files: m_hashes = compute_blob_metadata(filepath) hashes.append(m_hashes) ls_hashes.update({ dirpath: hashes }) dir_hashes = [] subdirs = [os.path.join(dirpath, dir) for dir in dirnames if os.path.join(dirpath, dir) not in all_links] for fulldirname in subdirs: tree_hash = compute_tree_metadata(fulldirname, ls_hashes) dir_hashes.append(tree_hash) ls_hashes.update({ dirpath: ls_hashes.get(dirpath, []) + dir_hashes }) # compute the current directory hashes root_hash = compute_directory_git_sha1(rootdir, ls_hashes) root_hash.update({ 'path': rootdir, 'name': bytes(os.path.basename(rootdir), 'utf-8'), 'perms': GitPerm.TREE, 'type': GitType.TREE }) ls_hashes.update({ ROOT_TREE_KEY: [root_hash] }) return ls_hashes diff --git a/swh/loader/dir/loader.py b/swh/loader/dir/loader.py index 1701925..4c41ab0 100644 --- a/swh/loader/dir/loader.py +++ b/swh/loader/dir/loader.py @@ -1,498 +1,424 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import uuid import traceback import os import psycopg2 from retrying import retry from swh.core import config from swh.loader.dir import converters from swh.loader.dir.git import git from swh.loader.dir.git.git import GitType def send_in_packets(source_list, formatter, sender, packet_size, packet_size_bytes=None, *args, **kwargs): """Send objects from `source_list`, passed through `formatter` (with extra args *args, **kwargs), using the `sender`, in packets of `packet_size` objects (and of max `packet_size_bytes`). 
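
Note: compute_directory_git_sha1 reproduces git's tree serialization, including
the ordering subtlety handled by sorted_key_fn: a tree entry sorts as if its
name ended in '/'. The sketch below illustrates both the ordering and the
object-id computation; it assumes utils.hashdata applies the standard git
object header (b'<type> <size>\0' + payload, SHA1), which is how git itself
derives object ids:

    import hashlib

    def git_object_sha1(obj_type, payload):
        # git object id: SHA1 over b'<type> <size>\0' + payload
        header = ('%s %d' % (obj_type, len(payload))).encode('ascii') + b'\0'
        return hashlib.sha1(header + payload).digest()

    # the '/' rule flips the order here: b'foo.txt' (0x2e) < b'foo/' (0x2f)
    entries = [
        (b'40000', b'foo', b'\x11' * 20),       # subdirectory, dummy sha1
        (b'100644', b'foo.txt', b'\x00' * 20),  # regular file, dummy sha1
    ]
    entries.sort(key=lambda e: e[1] + b'/' if e[0] == b'40000' else e[1])
    payload = b''.join(perms + b' ' + name + b'\0' + sha1
                       for perms, name, sha1 in entries)
    tree_id = git_object_sha1('tree', payload)
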
""" formatted_objects = [] count = 0 if not packet_size_bytes: packet_size_bytes = 0 for obj in source_list: formatted_object = formatter(obj, *args, **kwargs) if formatted_object: formatted_objects.append(formatted_object) else: continue if packet_size_bytes: count += formatted_object['length'] if len(formatted_objects) >= packet_size or count > packet_size_bytes: sender(formatted_objects) formatted_objects = [] count = 0 if formatted_objects: sender(formatted_objects) def retry_loading(error): """Retry policy when the database raises an integrity error""" if not isinstance(error, psycopg2.IntegrityError): return False logger = logging.getLogger('swh.loader.git.DirLoader') error_name = error.__module__ + '.' + error.__class__.__name__ logger.warning('Retry loading a batch', exc_info=False, extra={ 'swh_type': 'storage_retry', 'swh_exception_type': error_name, 'swh_exception': traceback.format_exception( error.__class__, error, error.__traceback__, ), }) return True class DirLoader(config.SWHConfig): """A bulk loader for a directory""" DEFAULT_CONFIG = { 'storage_class': ('str', 'remote_storage'), 'storage_args': ('list[str]', ['http://localhost:5000/']), 'send_contents': ('bool', True), 'send_directories': ('bool', True), 'send_revisions': ('bool', True), 'send_releases': ('bool', True), 'send_occurrences': ('bool', True), 'content_packet_size': ('int', 10000), 'content_packet_size_bytes': ('int', 1024 * 1024 * 1024), 'directory_packet_size': ('int', 25000), 'revision_packet_size': ('int', 100000), 'release_packet_size': ('int', 100000), 'occurrence_packet_size': ('int', 100000), } def __init__(self, config): self.config = config if self.config['storage_class'] == 'remote_storage': from swh.storage.api.client import RemoteStorage as Storage else: from swh.storage import Storage self.storage = Storage(*self.config['storage_args']) self.log = logging.getLogger('swh.loader.dir.DirLoader') @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_contents(self, content_list): """Actually send properly formatted contents to the database""" num_contents = len(content_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) self.storage.content_add(content_list) self.log.debug("Done sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_directories(self, directory_list): """Actually send properly formatted directories to the database""" num_directories = len(directory_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) self.storage.directory_add(directory_list) self.log.debug("Done sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_revisions(self, revision_list): """Actually send properly formatted revisions to the database""" num_revisions = len(revision_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 
'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) self.storage.revision_add(revision_list) self.log.debug("Done sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_releases(self, release_list): """Actually send properly formatted releases to the database""" num_releases = len(release_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) self.storage.release_add(release_list) self.log.debug("Done sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_occurrences(self, occurrence_list): """Actually send properly formatted occurrences to the database""" num_occurrences = len(occurrence_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) self.storage.occurrence_add(occurrence_list) self.log.debug("Done sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) - def get_or_create_origin(self, origin_url): - origin = converters.origin_url_to_origin(origin_url) - - origin['id'] = self.storage.origin_add_one(origin) - - return origin - - def dir_origin(self, root_dir, origin_url): - log_id = str(uuid.uuid4()) - self.log.debug('Creating origin for %s' % origin_url, - extra={ - 'swh_type': 'storage_send_start', - 'swh_content_type': 'origin', - 'swh_num': 1, - 'swh_id': log_id - }) - origin = self.get_or_create_origin(origin_url) - self.log.debug('Done creating origin for %s' % origin_url, - extra={ - 'swh_type': 'storage_send_end', - 'swh_content_type': 'origin', - 'swh_num': 1, - 'swh_id': log_id - }) - - return origin - def dir_revision(self, - root_dir, + dir_path, origin_url, revision_date, revision_offset, revision_committer_date, revision_committer_offset, revision_type, revision_message, revision_author, revision_committer): """Create a revision. 
""" log_id = str(uuid.uuid4()) self.log.debug('Creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) self.get_or_create_origin(origin_url) self.log.debug('Done creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) def bulk_send_blobs(self, objects, blobs, origin_id): """Format blobs as swh contents and send them to the database""" packet_size = self.config['content_packet_size'] packet_size_bytes = self.config['content_packet_size_bytes'] max_content_size = self.config['content_size_limit'] send_in_packets(blobs, converters.blob_to_content, self.send_contents, packet_size, packet_size_bytes=packet_size_bytes, log=self.log, max_content_size=max_content_size, origin_id=origin_id) def bulk_send_trees(self, objects, trees): """Format trees as swh directories and send them to the database""" packet_size = self.config['directory_packet_size'] send_in_packets(trees, converters.tree_to_directory, self.send_directories, packet_size, objects=objects, log=self.log) def bulk_send_commits(self, objects, commits): """Format commits as swh revisions and send them to the database""" packet_size = self.config['revision_packet_size'] send_in_packets(commits, converters.commit_to_revision, self.send_revisions, packet_size, objects=objects, log=self.log) def bulk_send_annotated_tags(self, objects, tags): """Format annotated tags (pygit2.Tag objects) as swh releases and send them to the database """ packet_size = self.config['release_packet_size'] send_in_packets(tags, converters.annotated_tag_to_release, self.send_releases, packet_size, log=self.log) def bulk_send_refs(self, objects, refs): """Format git references as swh occurrences and send them to the database """ packet_size = self.config['occurrence_packet_size'] send_in_packets(refs, lambda ref: ref, self.send_occurrences, packet_size) - def compute_dir_ref(self, root_dir, - branch, - revision_hash, - origin_id, - authority_id, - validity): - """List all the refs from the given root directory root_dir. + def list_repo_objs(self, dir_path, revision, release): + """List all objects from dir_path. Args: - - root_dir: the root directory - - branch: occurrence's branch name - - revision_hash: the revision hash - - origin_id (int): the id of the origin from which the root_dir is - taken - - validity (datetime.datetime): the validity date for the - repository's refs - - authority_id (int): the id of the authority on `validity`. - - Returns: - One dictionary with keys: - - branch (str): name of the ref - - revision (sha1_git): revision pointed at by the ref - - origin (int) - - validity (datetime.DateTime) - - authority (int) - Compatible with occurrence_add. - """ - log_id = str(uuid.uuid4()) - - self.log.debug("Computing occurrence %s representation at %s" % ( - branch, revision_hash), extra={ - 'swh_type': 'computing_occurrence_dir', - 'swh_name': branch, - 'swh_target': str(revision_hash), - 'swh_id': log_id, - }) - - return { - 'branch': branch, - 'revision': revision_hash, - 'origin': origin_id, - 'validity': validity, - 'authority': authority_id, - } - - def list_repo_objs(self, root_dir, info): - """List all objects from root_dir. 
- - Args: - - root_dir (path): the directory to list + - dir_path (path): the directory to list + - revision: revision dictionary representation + - release: release dictionary representation Returns: a dict containing lists of `Oid`s with keys for each object type: - CONTENT - DIRECTORY """ def get_objects_per_object_type(objects_per_path): m = { GitType.BLOB: [], GitType.TREE: [], GitType.COMM: [], GitType.RELE: [] } for tree_path in objects_per_path: objs = objects_per_path[tree_path] # print('tree_path: %s, objs: %s' % (tree_path, objs)) for obj in objs: m[obj['type']].append(obj) return m log_id = str(uuid.uuid4()) - self.log.info("Started listing %s" % root_dir, extra={ + self.log.info("Started listing %s" % dir_path, extra={ 'swh_type': 'dir_list_objs_start', - 'swh_repo': root_dir, + 'swh_repo': dir_path, 'swh_id': log_id, }) - objects_per_path = git.walk_and_compute_sha1_from_directory(root_dir) + objects_per_path = git.walk_and_compute_sha1_from_directory(dir_path) objects = get_objects_per_object_type(objects_per_path) tree_hash = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git'] + revision['directory'] = tree_hash - revision = git.compute_revision_git_sha1(tree_hash, info) - objects.update({ - GitType.COMM: [revision] - }) + revision['sha1_git'] = git.compute_revision_sha1_git(revision) - revision_hash = revision['sha1_git'] - release = git.compute_release(revision_hash, info) - objects.update({ - GitType.RELE: [release] - }) + objects[GitType.COMM] = [revision] + + release['revision'] = revision['sha1_git'] + release['sha1_git'] = git.compute_release_sha1_git(release) + + objects[GitType.RELE] = [release] self.log.info("Done listing the objects in %s: %d contents, " "%d directories, %d revisions, %d releases" % ( - root_dir, + dir_path, len(objects[GitType.BLOB]), len(objects[GitType.TREE]), len(objects[GitType.COMM]), len(objects[GitType.RELE]) ), extra={ 'swh_type': 'dir_list_objs_end', - 'swh_repo': root_dir, + 'swh_repo': dir_path, 'swh_num_blobs': len(objects[GitType.BLOB]), 'swh_num_trees': len(objects[GitType.TREE]), 'swh_num_commits': len(objects[GitType.COMM]), 'swh_num_releases': len(objects[GitType.RELE]), 'swh_id': log_id, }) return objects, objects_per_path - def load_dir(self, root_dir, objects, objects_per_path, refs, origin_id): + def load_dir(self, dir_path, objects, objects_per_path, refs, origin_id): if self.config['send_contents']: self.bulk_send_blobs(objects_per_path, objects[GitType.BLOB], origin_id) else: self.log.info('Not sending contents') if self.config['send_directories']: self.bulk_send_trees(objects_per_path, objects[GitType.TREE]) else: self.log.info('Not sending directories') if self.config['send_revisions']: self.bulk_send_commits(objects_per_path, objects[GitType.COMM]) else: self.log.info('Not sending revisions') if self.config['send_releases']: self.bulk_send_annotated_tags(objects_per_path, objects[GitType.RELE]) else: self.log.info('Not sending releases') if self.config['send_occurrences']: self.bulk_send_refs(objects_per_path, refs) else: self.log.info('Not sending occurrences') - def process(self, info): - root_dir = info['dir_path'] - if not os.path.exists(root_dir): - self.log.info('Skipping inexistant directory %s' % root_dir, + def process(self, dir_path, origin, revision, release, occurrence): + if not os.path.exists(dir_path): + self.log.info('Skipping inexistant directory %s' % dir_path, extra={ 'swh_type': 'dir_repo_list_refs', - 'swh_repo': root_dir, + 'swh_repo': dir_path, 'swh_num_refs': 0, }) return - files = 
os.listdir(root_dir) + files = os.listdir(dir_path) if not(files): - self.log.info('Skipping empty directory %s' % root_dir, + self.log.info('Skipping empty directory %s' % dir_path, extra={ 'swh_type': 'dir_repo_list_refs', - 'swh_repo': root_dir, + 'swh_repo': dir_path, 'swh_num_refs': 0, }) return - # Add origin to storage if needed, use the one from config if not - origin = self.dir_origin(root_dir, info['origin_url']) + origin['id'] = self.storage.origin_add_one(origin) # We want to load the repository, walk all the objects - objects, objects_per_path = self.list_repo_objs(root_dir, info) + objects, objects_per_path = self.list_repo_objs(dir_path, revision, + release) # Compute revision information (mixed from outside input + dir content) revision = objects[GitType.COMM][0] - # Parse all the refs from our root_dir - ref = self.compute_dir_ref(root_dir, - info['branch'], - revision['sha1_git'], - origin['id'], - info['authority_id'], - info['validity']) + occurrence.update({ + 'revision': revision['sha1_git'], + 'origin': origin['id'], + }) # Finally, load the repository - self.load_dir(root_dir, objects, objects_per_path, [ref], origin['id']) + self.load_dir(dir_path, objects, objects_per_path, [occurrence], + origin['id']) diff --git a/swh/loader/dir/tasks.py b/swh/loader/dir/tasks.py index 25a6965..9b1a899 100644 --- a/swh/loader/dir/tasks.py +++ b/swh/loader/dir/tasks.py @@ -1,153 +1,144 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import shutil import tempfile from swh.core.scheduling import Task from swh.loader.dir.loader import DirLoader class LoadDirRepository(Task): """Import a directory to Software Heritage """ task_queue = 'swh_loader_dir' CONFIG_BASE_FILENAME = 'loader/dir.ini' ADDITIONAL_CONFIG = {} def __init__(self): self.config = DirLoader.parse_config_file( base_filename=self.CONFIG_BASE_FILENAME, additional_configs=[self.ADDITIONAL_CONFIG], ) - def run(self, info): + def run(self, dir_path, origin, revision, release, occurrence): """Import a directory. Args: - info: Dictionary of information needed, keys are: - - dir_path: directory to import - - origin_url: origin url (e.g file:///dev/null) + - dir_path: source of the directory to import + - origin: Dictionary origin + - url: url origin we fetched + - type: type of the origin + - revision: Dictionary of information needed, keys are: + - author_name: revision's author name + - author_email: revision's author email + - author_date: timestamp (e.g. 1444054085) + - author_offset: date offset e.g. -0220, +0100 + - committer_name: revision's committer name + - committer_email: revision's committer email + - committer_date: timestamp + - committer_offset: date offset e.g. -0220, +0100 + - type: type of revision dir, tar + - message: synthetic message for the revision + - release: Dictionary of information needed, keys are: + - name: release name + - date: release timestamp (e.g. 1444054085) + - offset: release date offset e.g. -0220, +0100 + - author_name: release author's name + - author_email: release author's email + - comment: release's comment message + - occurrence: Dictionary of information needed, keys are: - branch: occurrence's branch name - authority_id: authority id (e.g. 1 for swh) - validity: validity date (e.g. 
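
Note: every bulk_send_* helper above funnels through send_in_packets, which
buffers formatted objects and flushes them to `sender` once `packet_size`
objects accumulate (or, when `packet_size_bytes` is set, once the summed
'length' fields exceed it), with a final flush for any remainder. A small usage
sketch, assuming the patched module is importable:

    from swh.loader.dir.loader import send_in_packets

    batches = []

    def formatter(obj):
        return {'length': obj}      # stand-in for converters.blob_to_content

    def sender(objs):
        batches.append(list(objs))  # stand-in for DirLoader.send_contents

    # with packet_size=2, five objects flush as batches of 2, 2, then 1
    send_in_packets([1, 2, 3, 4, 5], formatter, sender, packet_size=2)
    assert [len(b) for b in batches] == [2, 2, 1]
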
diff --git a/swh/loader/dir/tasks.py b/swh/loader/dir/tasks.py
index 25a6965..9b1a899 100644
--- a/swh/loader/dir/tasks.py
+++ b/swh/loader/dir/tasks.py
@@ -1,153 +1,144 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import subprocess
 import shutil
 import tempfile

 from swh.core.scheduling import Task

 from swh.loader.dir.loader import DirLoader


 class LoadDirRepository(Task):
     """Import a directory to Software Heritage

     """
     task_queue = 'swh_loader_dir'

     CONFIG_BASE_FILENAME = 'loader/dir.ini'
     ADDITIONAL_CONFIG = {}

     def __init__(self):
         self.config = DirLoader.parse_config_file(
             base_filename=self.CONFIG_BASE_FILENAME,
             additional_configs=[self.ADDITIONAL_CONFIG],
         )

-    def run(self, info):
+    def run(self, dir_path, origin, revision, release, occurrence):
         """Import a directory.

         Args:
-            info: Dictionary of information needed, keys are:
-              - dir_path: directory to import
-              - origin_url: origin url (e.g file:///dev/null)
+            - dir_path: source of the directory to import
+            - origin: origin dictionary, keys are:
+              - url: url of the origin we fetched
+              - type: type of the origin
+            - revision: Dictionary of information needed, keys are:
+              - author_name: revision's author name
+              - author_email: revision's author email
+              - author_date: timestamp (e.g. 1444054085)
+              - author_offset: date offset e.g. -0220, +0100
+              - committer_name: revision's committer name
+              - committer_email: revision's committer email
+              - committer_date: timestamp
+              - committer_offset: date offset e.g. -0220, +0100
+              - type: type of revision dir, tar
+              - message: synthetic message for the revision
+            - release: Dictionary of information needed, keys are:
+              - name: release name
+              - date: release timestamp (e.g. 1444054085)
+              - offset: release date offset e.g. -0220, +0100
+              - author_name: release author's name
+              - author_email: release author's email
+              - comment: release's comment message
+            - occurrence: Dictionary of information needed, keys are:
               - branch: occurrence's branch name
               - authority_id: authority id (e.g. 1 for swh)
               - validity: validity date (e.g. 2015-01-01 00:00:00+00)
-              - revision_author_name: revision's author name
-              - revision_author_email: revision's author email
-              - revision_author_date: timestamp (e.g. 1444054085)
-              - revision_author_offset: date offset e.g. -0220, +0100
-              - revision_committer_name: revision's committer name
-              - revision_committer_email: revision's committer email
-              - revision_committer_date: timestamp
-              - revision_committer_offset: date offset e.g. -0220, +0100
-              - revision_type: type of revision dir, tar
-              - revision_message: synthetic message for the revision
-              - release_name: release name
-              - release_date: release timestamp (e.g. 1444054085)
-              - release_offset: release date offset e.g. -0220, +0100
-              - release_author_name: release author's name
-              - release_author_email: release author's email
-              - release_comment: release's comment message

         """
         loader = DirLoader(self.config)
         loader.log = self.log
-        loader.process(info)
+        loader.process(dir_path, origin, revision, release, occurrence)


 def untar(tar_path, dir_path):
     """Decompress an archive tar_path to dir_path.

     At the end of this call, dir_path contains the tarball's
     uncompressed content.

     Args:
         tar_path: the path to access the tarball
         dir_path: The path where to extract the tarball's content.

     """
     untar_cmd = ['tar', 'xavf', tar_path,
                  '--preserve-permissions',
                  '-C', dir_path]
     subprocess.check_call(untar_cmd, stderr=subprocess.STDOUT)


 class LoadTarRepository(LoadDirRepository):
     """Import a tarball to Software Heritage

     """
     task_queue = 'swh_loader_tar'

     CONFIG_BASE_FILENAME = 'loader/tar.ini'
     ADDITIONAL_CONFIG = {
-        'dir_path': ('str', '/tmp/swh.loader.tar/'),
-
-        # occurrence information
-        'branch': ('str', 'master'),
-        'authority_id': ('int', 1),
-        'validity': ('str', '2015-01-01 00:00:00+00'),
-
-        # revision information
-        'revision_author_name': ('str', 'swh author'),
-        'revision_author_email': ('str', 'swh@inria.fr'),
-        'revision_author_date': ('int', 1444054085),
-        'revision_author_offset': ('str', '+0200'),
-        'revision_committer_name': ('str', 'swh committer'),
-        'revision_committer_email': ('str', 'swh@inria.fr'),
-        'revision_committer_date': ('int', 1444054085),
-        'revision_committer_offset': ('str', '+0200'),
-        'revision_type': ('str', 'tar'),
-        'revision_message': ('str', 'synthetic revision'),
-
-        # release information
-        'release_name': ('str', 'v0.0.1'),
-        'release_date': ('int', 1444054085),
-        'release_offset': ('str', '+0200'),
-        'release_author_name': ('str', 'swh author'),
-        'release_author_email': ('str', 'swh@inria.fr'),
-        'release_comment': ('str', 'synthetic release'),
+        'extraction_dir': ('str', '/tmp/swh.loader.tar/'),
+
+        # # occurrence information
+        # 'branch': ('str', 'master'),
+        # 'authority_id': ('int', 1),
+        # 'validity': ('str', '2015-01-01 00:00:00+00'),
+
+        # # revision information
+        # 'revision_type': ('str', 'tar'),
+
+        # 'revision_author_name': ('str', 'swh author'),
+        # 'revision_author_email': ('str', 'swh@inria.fr'),
+        # 'revision_author_date': ('int', 1444054085),
+        # 'revision_author_offset': ('str', '+0200'),
+        # 'revision_committer_name': ('str', 'swh committer'),
+        # 'revision_committer_email': ('str', 'swh@inria.fr'),
+        # 'revision_committer_date': ('int', 1444054085),
+        # 'revision_committer_offset': ('str', '+0200'),
+        # 'revision_message': ('str', 'synthetic revision'),
+
+        # # release information
+        # 'release_name': ('str', 'v0.0.1'),
+        # 'release_date': ('int', 1444054085),
+        # 'release_offset': ('str', '+0200'),
+        # 'release_author_name': ('str', 'swh author'),
+        # 'release_author_email': ('str', 'swh@inria.fr'),
+        # 'release_comment': ('str', 'synthetic release'),
     }

-    def run(self, tar_path):
+    def run(self, tar_path, origin_url, revision, release, occurrence):
         """Import a tarball tar_path.

+        Args:
+            - tar_path: path to the tarball to import
+            - origin_url: url where we fetched the tarball
+            - revision, release, occurrence: see LoadDirRepository.run
+
         """
-        info = {}
-        for key in ['dir_path',
-                    # origin
-                    'branch', 'authority_id', 'validity',
-                    # revision
-                    'revision_author_name', 'revision_author_email',
-                    'revision_author_date', 'revision_author_offset',
-                    'revision_committer_name', 'revision_committer_email',
-                    'revision_committer_date', 'revision_committer_offset',
-                    'revision_type', 'revision_message',
-                    # release
-                    'release_name', 'release_date', 'release_offset',
-                    'release_author_name', 'release_author_email',
-                    'release_comment']:
-            info.update({key: self.config[key]})
-
-        init_dir_path = self.config['dir_path']
-        dir_path = tempfile.mkdtemp(prefix='swh.loader.tar',
-                                    dir=init_dir_path)
+        extraction_dir = self.config['extraction_dir']
+        dir_path = tempfile.mkdtemp(prefix='swh.loader.tar',
+                                    dir=extraction_dir)

         # unarchive in dir_path
         untar(tar_path, dir_path)

-        # Update the origin's url
-        # and the dir_path to load
-        origin_url = 'file://' + tar_path
-        info.update({
-            'origin_url': origin_url,
-            'dir_path': dir_path
-        })
+        origin = {
+            'url': origin_url,
+            'type': 'tar'
+        }

-        # Load the directory result
         try:
-            super().run(info)
+            super().run(dir_path, origin, revision, release, occurrence)
         finally:
             # always clean up
             shutil.rmtree(dir_path)
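
Note: with the flattened signatures, a caller now passes four small
dictionaries instead of one catch-all `info` dict. A sketch of a
LoadTarRepository payload matching the docstrings above; the concrete values
are illustrative only, borrowed from the commented-out defaults, and the paths
are hypothetical:

    task_args = dict(
        tar_path='/path/to/archive.tgz',           # hypothetical path
        origin_url='file:///path/to/archive.tgz',  # hypothetical url
        revision={
            'author_name': 'swh author',
            'author_email': 'swh@inria.fr',
            'author_date': 1444054085,
            'author_offset': '+0200',
            'committer_name': 'swh committer',
            'committer_email': 'swh@inria.fr',
            'committer_date': 1444054085,
            'committer_offset': '+0200',
            'type': 'tar',
            'message': 'synthetic revision',
        },
        release={
            'name': 'v0.0.1',
            'date': 1444054085,
            'offset': '+0200',
            'author_name': 'swh author',
            'author_email': 'swh@inria.fr',
            'comment': 'synthetic release',
        },
        occurrence={
            'branch': 'master',
            'authority_id': 1,
            'validity': '2015-01-01 00:00:00+00',
        },
    )
    # LoadTarRepository().run(**task_args)
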
diff --git a/swh/loader/dir/tests/test_converters.py b/swh/loader/dir/tests/test_converters.py
index 91f1423..8c4c5bb 100644
--- a/swh/loader/dir/tests/test_converters.py
+++ b/swh/loader/dir/tests/test_converters.py
@@ -1,213 +1,203 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest
 import datetime

 from nose.tools import istest

 from swh.loader.dir import converters
 from swh.loader.dir.git import git
 from swh.loader.dir.git.git import GitType, GitPerm


 class TestConverters(unittest.TestCase):
     @istest
     def format_to_minutes(self):
         self.assertEquals(converters.format_to_minutes('+0100'), 60)
         self.assertEquals(converters.format_to_minutes('-0200'), -120)
         self.assertEquals(converters.format_to_minutes('+1250'), 12*60+50)
         self.assertEquals(converters.format_to_minutes('+0000'), 0)
         self.assertEquals(converters.format_to_minutes('-0000'), 0)

-    @istest
-    def origin_url_to_origin(self):
-        # given
-        origin_url = 'foobar'
-
-        # when
-        self.assertDictEqual({
-            'type': 'dir',
-            'url': origin_url,
-        }, converters.origin_url_to_origin(origin_url))
-
     @istest
     def annotated_tag_to_release(self):
         # given
         release = {
             'sha1_git': '123',
-            'revision_sha1_git': '456',
-            'release_name': 'some-release',
-            'release_comment': 'some-comment-on-release',
-            'release_date': 1444054085,
-            'release_offset': '-0300',
-            'release_author_name': 'someone',
-            'release_author_email': 'someone@whatelse.eu'
+            'revision': '456',
+            'name': 'some-release',
+            'comment': 'some-comment-on-release',
+            'date': 1444054085,
+            'offset': '-0300',
+            'author_name': 'someone',
+            'author_email': 'someone@whatelse.eu'
         }

         expected_release = {
             'id': '123',
             'revision': '456',
             'name': 'some-release',
             'comment': 'some-comment-on-release',
             'date': datetime.datetime.fromtimestamp(
                 1444054085,
                 tz=datetime.timezone.utc),
             'date_offset': -180,
             'author_name': 'someone',
             'author_email': 'someone@whatelse.eu',
         }

         # when
         actual_release = converters.annotated_tag_to_release(release)

         # then
         self.assertDictEqual(
             expected_release,
             actual_release)

     @istest
     def _blob_to_content_visible(self):
         obj = {
             'length': 9,
             'data': b'some-data',
             'sha1': b'sha1',
             'sha1_git': b'sha1-git',
             'sha256': b'sha256',
             'perms': GitPerm.BLOB,
             'type': GitType.BLOB
         }

         expected_content = {
             'length': 9,
             'data': b'some-data',
             'sha1': b'sha1',
             'sha1_git': b'sha1-git',
             'sha256': b'sha256',
             'perms': GitPerm.BLOB.value,
             'type': GitType.BLOB.value,
             'status': 'visible'
         }

         # when
         actual_content = converters._blob_to_content(obj)

         # then
         self.assertEqual(expected_content, actual_content)

     @istest
     def _blob_to_content_absent(self):
         obj = {
             'length': 9,
             'data': b'some-data',
             'sha1': b'sha1',
             'sha1_git': b'sha1-git',
             'sha256': b'sha256',
             'perms': GitPerm.BLOB,
             'type': GitType.BLOB
         }

         expected_content = {
             'length': 9,
             'data': b'some-data',
             'sha1': b'sha1',
             'sha1_git': b'sha1-git',
             'sha256': b'sha256',
             'perms': GitPerm.BLOB.value,
             'type': GitType.BLOB.value,
             'status': 'absent',
             'reason': 'Content too large',
             'origin': 3}

         # when
         actual_content = converters._blob_to_content(obj,
                                                      max_content_size=5,
                                                      origin_id=3)

         # then
         self.assertEqual(expected_content, actual_content)

     @istest
     def tree_to_directory_no_entries(self):
         # given
         tree = {
             'path': 'foo',
             'sha1_git': b'tree_sha1_git'
         }
         objects = {
             'foo': [{'type': GitType.TREE,
                      'perms': GitPerm.TREE,
                      'name': 'bar',
                      'sha1_git': b'sha1-target'},
                     {'type': GitType.BLOB,
                      'perms': GitPerm.BLOB,
                      'name': 'file-foo',
                      'sha1_git': b'file-foo-sha1-target'}]
         }

         expected_directory = {
             'id': b'tree_sha1_git',
             'entries': [{'type': 'dir',
                          'perms': int(GitPerm.TREE.value),
                          'name': 'bar',
                          'target': b'sha1-target'},
                         {'type': 'file',
                          'perms': int(GitPerm.BLOB.value),
                          'name': 'file-foo',
                          'target': b'file-foo-sha1-target'}]
         }

         # when
         actual_directory = converters.tree_to_directory(tree, objects)

         # then
         self.assertEqual(actual_directory, expected_directory)

     @istest
     def commit_to_revision(self):
         # given
         commit = {
             'sha1_git': 'commit-git-sha1',
-            'revision_author_date': 1444054085,
-            'revision_author_offset': '+0000',
-            'revision_committer_date': 1444054085,
-            'revision_committer_offset': '-0000',
-            'revision_type': 'tar',
-            'revision_message': 'synthetic-message-input',
-            'revision_author_name': 'author-name',
-            'revision_author_email': 'author-email',
-            'revision_committer_name': 'committer-name',
-            'revision_committer_email': 'committer-email',
+            'author_date': 1444054085,
+            'author_offset': '+0000',
+            'committer_date': 1444054085,
+            'committer_offset': '-0000',
+            'type': 'tar',
+            'message': 'synthetic-message-input',
+            'author_name': 'author-name',
+            'author_email': 'author-email',
+            'committer_name': 'committer-name',
+            'committer_email': 'committer-email',
+            'directory': 'targeted-tree-sha1',
         }

         objects = {
             git.ROOT_TREE_KEY: [{'sha1_git': 'targeted-tree-sha1'}]
         }

         expected_revision = {
             'id': 'commit-git-sha1',
             'date': datetime.datetime.fromtimestamp(
                 1444054085,
                 tz=datetime.timezone.utc),
             'date_offset': 0,
             'committer_date': datetime.datetime.fromtimestamp(
                 1444054085,
                 tz=datetime.timezone.utc),
             'committer_date_offset': 0,
             'type': 'tar',
             'directory': 'targeted-tree-sha1',
             'message': 'synthetic-message-input',
             'author_name': 'author-name',
             'author_email': 'author-email',
             'committer_name': 'committer-name',
             'committer_email': 'committer-email',
             'parents': [],
         }

         # when
         actual_revision = converters.commit_to_revision(commit, objects)

         # then
         self.assertEquals(actual_revision, expected_revision)
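
Note: the expected dates in these tests are built with
datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc), while
converters.to_datetime spells it utcfromtimestamp(ts).replace(tzinfo=utc); the
two forms yield equal aware datetimes, which is why the assertions hold:

    import datetime

    ts = 1444054085
    a = datetime.datetime.utcfromtimestamp(ts).replace(
        tzinfo=datetime.timezone.utc)
    b = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
    assert a == b
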
diff --git a/swh/loader/dir/tests/test_git.py b/swh/loader/dir/tests/test_git.py
index f4a4653..5091f65 100644
--- a/swh/loader/dir/tests/test_git.py
+++ b/swh/loader/dir/tests/test_git.py
@@ -1,123 +1,121 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest

 from nose.tools import istest

 from swh.loader.dir.git import git
 from swh.loader.dir.git.git import GitPerm, GitType


 class GitHashlib(unittest.TestCase):
     def setUp(self):
         self.tree_data = b''.join([b'40000 barfoo\0',
                                    bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087'),
                                    b'100644 blah\0',
                                    bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20'),
                                    b'100644 hello\0',
                                    bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')])

         self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
 author Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200
 committer Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200

 initial
 """.encode('utf-8')  # NOQA
         self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
 type commit
 tag 0.0.1
 tagger Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444225145 +0200

 blah
 """.encode('utf-8')  # NOQA

         self.checksums = {
             'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
                                            '121dacdb1c'),
             'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
                                              'd629189653'),
             'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
                                           'e9e959f120'),
         }

     @istest
     def compute_directory_git_sha1(self):
         # given
         dirpath = 'some-dir-path'
         hashes = {
             dirpath: [{'perms': GitPerm.TREE,
                        'type': GitType.TREE,
                        'name': b'barfoo',
                        'sha1_git': bytes.fromhex('c3020f6bf135a38c6df'
                                                  '3afeb5fb38232c5e07087')},
                       {'perms': GitPerm.BLOB,
                        'type': GitType.BLOB,
                        'name': b'hello',
                        'sha1_git': bytes.fromhex('907b308167f0880fb2a'
                                                  '5c0e1614bb0c7620f9dc3')},
                       {'perms': GitPerm.BLOB,
                        'type': GitType.BLOB,
                        'name': b'blah',
                        'sha1_git': bytes.fromhex('63756ef0df5e4f10b6efa'
                                                  '33cfe5c758749615f20')}]
         }

         # when
         checksums = git.compute_directory_git_sha1(dirpath, hashes)

         # then
         self.assertEqual(checksums['sha1_git'],
                          self.checksums['tree_sha1_git'])

     @istest
-    def compute_revision_git_sha1(self):
+    def compute_revision_sha1_git(self):
         # given
         tree_hash = bytes.fromhex('1c61f7259dcb770f46b194d941df4f08ff0a3970')
         revision = {
-            'revision_author_name': 'Antoine R. Dumont (@ardumont)',
-            'revision_author_email': 'antoine.romain.dumont@gmail.com',
-            'revision_author_date': '1444054085',
-            'revision_author_offset': '+0200',
-            'revision_committer_name': 'Antoine R. Dumont (@ardumont)',
-            'revision_committer_email': 'antoine.romain.dumont@gmail.com',
-            'revision_committer_date': '1444054085',
-            'revision_committer_offset': '+0200',
-            'revision_message': 'initial',
-            'revision_type': 'tar'
+            'author_name': 'Antoine R. Dumont (@ardumont)',
+            'author_email': 'antoine.romain.dumont@gmail.com',
+            'author_date': '1444054085',
+            'author_offset': '+0200',
+            'committer_name': 'Antoine R. Dumont (@ardumont)',
+            'committer_email': 'antoine.romain.dumont@gmail.com',
+            'committer_date': '1444054085',
+            'committer_offset': '+0200',
+            'message': 'initial',
+            'type': 'tar',
+            'directory': tree_hash,
         }

         # when
-        checksums = git.compute_revision_git_sha1(tree_hash, revision)
+        checksum = git.compute_revision_sha1_git(revision)

         # then
-        self.assertEqual(checksums['sha1_git'],
-                         self.checksums['commit_sha1_git'])
-        self.assertDictContainsSubset(revision, checksums)
+        self.assertEqual(checksum, self.checksums['commit_sha1_git'])

     @istest
-    def compute_release(self):
+    def compute_release_sha1_git(self):
         # given
         revision_hash = bytes.fromhex('24d012aaec0bc5a4d2f62c56399053'
                                       'd6cc72a241')
         release = {
-            'release_name': '0.0.1',
-            'release_author_name': 'Antoine R. Dumont (@ardumont)',
-            'release_author_email': 'antoine.romain.dumont@gmail.com',
-            'release_date': '1444225145',
-            'release_offset': '+0200',
-            'release_comment': 'blah',
+            'name': '0.0.1',
+            'author_name': 'Antoine R. Dumont (@ardumont)',
+            'author_email': 'antoine.romain.dumont@gmail.com',
+            'date': '1444225145',
+            'offset': '+0200',
+            'comment': 'blah',
+            'revision': revision_hash,
         }

         # when
-        checksums = git.compute_release(revision_hash, release)
+        checksum = git.compute_release_sha1_git(release)

         # then
-        self.assertEqual(checksums['sha1_git'],
-                         self.checksums['tag_sha1_git'])
-        self.assertDictContainsSubset(release, checksums)
+        self.assertEqual(checksum, self.checksums['tag_sha1_git'])
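
Note: the fixture checksums in this test are plain git object ids, so they can
be cross-checked without the loader at all, assuming utils.hashdata follows
git's standard scheme (SHA1 over b'<type> <size>\0' + content). A sketch using
the commit fixture:

    import hashlib

    def git_sha1_hex(obj_type, content):
        header = ('%s %d' % (obj_type, len(content))).encode('ascii') + b'\0'
        return hashlib.sha1(header + content).hexdigest()

    commit_data = b'''tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
author Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200
committer Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200

initial
'''
    # should print the 'commit_sha1_git' fixture value:
    # e960570b2e6e2798fa4cfb9af2c399d629189653
    print(git_sha1_hex('commit', commit_data))
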
diff --git a/swh/loader/dir/tests/test_loader.py b/swh/loader/dir/tests/test_loader.py
index 7ba782f..cb0df64 100644
--- a/swh/loader/dir/tests/test_loader.py
+++ b/swh/loader/dir/tests/test_loader.py
@@ -1,103 +1,109 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import shutil
 import subprocess
 import tempfile
 import unittest

 from nose.tools import istest

 from swh.loader.dir.loader import DirLoader
 from swh.loader.dir.git.git import GitType


 class TestLoader(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()

         cls.tmp_root_path = tempfile.mkdtemp()

         sample_folder_archive = os.path.join(os.path.dirname(__file__),
                                              '../../../../..',
                                              'swh-storage-testdata',
                                              'dir-folders',
                                              'sample-folder.tgz')

         cls.root_path = os.path.join(cls.tmp_root_path, 'sample-folder')

         # uncompress the sample folder
         subprocess.check_output(
             ['tar', 'xvf', sample_folder_archive, '-C', cls.tmp_root_path],
         )

     @classmethod
     def tearDownClass(cls):
         super().tearDownClass()
         shutil.rmtree(cls.tmp_root_path)
         print(cls.tmp_root_path)

     def setUp(self):
         super().setUp()

         self.info = {
             'storage_class': 'remote_storage',
             'storage_args': ['http://localhost:5000/'],
+        }

-            # origin information
-            'origin_url': 'file:///dev/null',
+        self.origin = {
+            'url': 'file:///dev/null',
+            'type': 'dir',
+        }

-            # occurrence information
+        self.occurrence = {
             'branch': 'master',
             'authority_id': 1,
             'validity': '2015-01-01 00:00:00+00',
+        }
+
+        self.revision = {
+            'author_name': 'swh author',
+            'author_email': 'swh@inria.fr',
+            'author_date': '1444054085',
+            'author_offset': '+0200',
+            'committer_name': 'swh committer',
+            'committer_email': 'swh@inria.fr',
+            'committer_date': '1444054085',
+            'committer_offset': '+0200',
+            'type': 'tar',
+            'message': 'synthetic revision',
+        }

-            # revision information
-            'revision_author_name': 'swh author',
-            'revision_author_email': 'swh@inria.fr',
-            'revision_author_date': '1444054085',
-            'revision_author_offset': '+0200',
-            'revision_committer_name': 'swh committer',
-            'revision_committer_email': 'swh@inria.fr',
-            'revision_committer_date': '1444054085',
-            'revision_committer_offset': '+0200',
-            'revision_type': 'tar',
-            'revision_message': 'synthetic revision',
-
-            # release information
-            'release_name': 'v0.0.1',
-            'release_date': '1444054085',
-            'release_offset': '+0200',
-            'release_author_name': 'swh author',
-            'release_author_email': 'swh@inria.fr',
-            'release_comment': 'synthetic release',
+        self.release = {
+            'name': 'v0.0.1',
+            'date': '1444054085',
+            'offset': '+0200',
+            'author_name': 'swh author',
+            'author_email': 'swh@inria.fr',
+            'comment': 'synthetic release',
         }

         self.dirloader = DirLoader(self.info)

     @istest
     def load_without_storage(self):
         # when
         objects, objects_per_path = self.dirloader.list_repo_objs(
             self.root_path,
-            self.info)
+            self.revision,
+            self.release)

         # then
         self.assertEquals(len(objects), 4,
                           "4 objects types, blob, tree, revision, release")
         self.assertEquals(len(objects[GitType.BLOB]), 8,
                           "8 contents: 3 files + 5 links")
         self.assertEquals(len(objects[GitType.TREE]), 5,
                           "5 directories: 4 subdirs + 1 empty + 1 main dir")
         self.assertEquals(len(objects[GitType.COMM]), 1, "synthetic revision")
         self.assertEquals(len(objects[GitType.RELE]), 1, "synthetic release")

         self.assertEquals(len(objects_per_path), 6, "5 folders + ")

         # print('objects: %s\n objects-per-path: %s\n' %
         #       (objects.keys(),
         #        objects_per_path.keys()))