diff --git a/scratch/walking.py b/scratch/walking.py index b485168..41b9434 100755 --- a/scratch/walking.py +++ b/scratch/walking.py @@ -1,99 +1,99 @@ #!/usr/bin/env python3 # Tryouts scratch buffer # Not for production # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import tempfile from swh.loader.dir.git import git, utils def write_file(root, file, content): """Write some content in a file. """ filename = os.path.join(root, file) with open(filename, 'w') as f: f.write(content) def mkdir(root, name): """Create a directory path on disk. """ full_foldername = os.path.join(root, name) os.makedirs(full_foldername, exist_ok=True) return full_foldername def git_ls_tree_rec(hashes, info): """Display the computed result for debug purposes. """ for entry in hashes.keys(): entry_properties = hashes[entry] print("entry name: %s" % entry) for file in entry_properties: sha1 = utils.hash_to_hex(file['sha1_git']) print("%s %s %s\t%s" % (file['perms'].value.decode('utf-8'), file['type'].value.decode('utf-8'), sha1, file['name'].decode('utf-8'))) print() revision = git.compute_revision_git_sha1(hashes, info) print('revision %s -> directory %s' % ( utils.hash_to_hex(revision['sha1_git']), - utils.hash_to_hex(hashes[''][0]['sha1_git']) + utils.hash_to_hex(hashes[git.ROOT_TREE_KEY][0]['sha1_git']) )) ### setup - prepare some arborescence with dirs and files to walk it tempfilename = tempfile.mktemp(prefix='swh.loader.dir', suffix='.tmp', dir='/tmp') # want the same name for idempotency scratch_folder_root = mkdir(tempfilename, 'tmp') mkdir(scratch_folder_root, 'empty-folder') scratch_folder_foo = mkdir(scratch_folder_root, 'foo') scratch_folder_bar = mkdir(scratch_folder_root, 'bar/barfoo') write_file(scratch_folder_foo, 'quotes.md', 'Shoot for the moon. Even if you miss, you\'ll land among ' 'the stars.') write_file(scratch_folder_bar, 'another-quote.org', 'A Victory without danger is a triumph without glory.\n' '-- Pierre Corneille') ADDITIONAL_INFO = { 'revision_author_name': 'swh author', 'revision_author_email': 'swh@inria.fr', 'revision_author_date': '1444054085', 'revision_author_offset': '+0200', 'revision_committer_name': 'swh committer', 'revision_committer_email': 'swh@inria.fr', 'revision_committer_date': '1444054085', 'revision_committer_offset': '+0200', 'revision_type': 'dir', 'revision_message': 'synthetic revision message' } # when hashes = git.walk_and_compute_sha1_from_directory(scratch_folder_root) # then git_ls_tree_rec(hashes, ADDITIONAL_INFO) ### teardown shutil.rmtree(tempfilename, ignore_errors = True) diff --git a/swh/loader/dir/converters.py b/swh/loader/dir/converters.py index 6341b5d..24ce561 100644 --- a/swh/loader/dir/converters.py +++ b/swh/loader/dir/converters.py @@ -1,147 +1,147 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dir objects to dictionaries suitable for swh.storage""" from datetime import datetime from swh.loader.dir.git.git import GitType -from swh.loader.dir.git import utils +from swh.loader.dir.git import git, utils def format_to_minutes(offset_str): """Convert a git string timezone format string (e.g +0200, -0310) to minutes. Args: offset_str: a string representing an offset. Returns: A positive or negative number of minutes of such input """ sign = offset_str[0] hours = int(offset_str[1:3]) minutes = int(offset_str[3:]) + (hours * 60) return minutes if sign == '+' else -1 * minutes def blob_to_content(obj, log=None, max_content_size=None, origin_id=None): if 'data' not in obj: filepath = obj['path'] content_raw, length = utils._read_raw(filepath) obj.update({'data': content_raw, 'length': length}) return _blob_to_content(obj, log, max_content_size, origin_id) def _blob_to_content(obj, log=None, max_content_size=None, origin_id=None): """Convert to a compliant swh content. """ size = obj['length'] ret = { 'sha1': obj['sha1'], 'sha256': obj['sha256'], 'sha1_git': obj['sha1_git'], 'data': obj['data'], 'length': size, 'perms': obj['perms'].value, 'type': obj['type'].value } if max_content_size and size > max_content_size: if log: log.info('Skipping content %s, too large (%s > %s)' % (obj['sha1_git'], size, max_content_size)) ret.update({'status': 'absent', 'reason': 'Content too large', 'origin': origin_id}) return ret ret.update({ 'status': 'visible' }) return ret # Map of type to swh types _entry_type_map = { GitType.TREE: 'dir', GitType.BLOB: 'file', GitType.COMM: 'rev', } def tree_to_directory(tree, objects, log=None): """Format a tree as a directory """ entries = [] for entry in objects[tree['path']]: entries.append({ 'type': _entry_type_map[entry['type']], 'perms': int(entry['perms'].value), 'name': entry['name'], 'target': entry['sha1_git'] }) return { 'id': tree['sha1_git'], 'entries': entries } def commit_to_revision(commit, objects, log=None): """Format a commit as a revision. """ - upper_directory = objects[''][0] + upper_directory = objects[git.ROOT_TREE_KEY][0] return { 'id': commit['sha1_git'], 'date': datetime.fromtimestamp(commit['revision_author_date']), 'date_offset': format_to_minutes(commit['revision_author_offset']), 'committer_date': datetime.fromtimestamp(commit['revision_committer_date']), 'committer_date_offset': format_to_minutes(commit['revision_committer_offset']), 'type': commit['revision_type'], 'directory': upper_directory['sha1_git'], 'message': commit['revision_message'], 'author_name': commit['revision_author_name'], 'author_email': commit['revision_author_email'], 'committer_name': commit['revision_committer_name'], 'committer_email': commit['revision_committer_email'], 'parents': [], } def annotated_tag_to_release(release, log=None): """Format a swh release. """ return { 'id': release['sha1_git'], 'revision': release['revision_sha1_git'], 'name': release['release_name'], 'comment': release['release_comment'], 'date': datetime.fromtimestamp(release['release_date']), 'date_offset': format_to_minutes(release['release_offset']), 'author_name': release['release_author_name'], 'author_email': release['release_author_email'], } def origin_url_to_origin(origin_url): """Format a pygit2.Repository as an origin suitable for swh.storage""" return { 'type': 'dir', 'url': origin_url, } diff --git a/swh/loader/dir/git/git.py b/swh/loader/dir/git/git.py index 7b50994..33089be 100644 --- a/swh/loader/dir/git/git.py +++ b/swh/loader/dir/git/git.py @@ -1,331 +1,333 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from enum import Enum from swh.loader.dir.git import utils +ROOT_TREE_KEY = '' + class GitType(Enum): BLOB = b'blob' TREE = b'tree' EXEC = b'exec' LINK = b'link' COMM = b'commit' RELE = b'release' class GitPerm(Enum): BLOB = b'100644' TREE = b'40000' EXEC = b'100755' LINK = b'120000' def compute_directory_git_sha1(dirpath, hashes): """Compute a directory git sha1 for a dirpath. Args: dirpath: the directory's absolute path hashes: list of tree entries with keys: - sha1_git: the tree entry's sha1 - name: file or subdir's name - perms: the tree entry's sha1 permissions Returns: dictionary with sha1_git as key and the actual binary sha1 as value. Assumes: Every path exists in hashes. """ def sorted_key_fn(entry): """Beware the sorted algorithm in git add a / for tree entries. """ name = entry['name'] return name + b'/' if entry['type'] is GitType.TREE else name def sort_by_entry_name(hashes): return sorted(hashes, key=sorted_key_fn) def row_entry_tree_format(hashes): return map(lambda entry: b''.join([entry['perms'].value, b' ', entry['name'], b'\0', entry['sha1_git']]), hashes) rows = row_entry_tree_format(sort_by_entry_name(hashes[dirpath])) return utils.hashdata(b''.join(rows), 'tree') def compute_revision_git_sha1(tree_hash, info): """Compute a revision representation targeting the tree_hash. Args: tree_hash: binary form of the tree hash info: Additional dictionary information needed to compute a synthetic revision. Following keys are expected: - revision_author_name - revision_author_email - revision_author_date - revision_author_offset - revision_committer_name - revision_committer_email - revision_committer_date - revision_committer_offset - revision_message - revision_type """ revision_author_name = info['revision_author_name'] revision_author_email = info['revision_author_email'] revision_author_date = info['revision_author_date'] revision_author_offset = info['revision_author_offset'] revision_committer_name = info['revision_committer_name'] revision_committer_email = info['revision_committer_email'] revision_committer_date = info['revision_committer_date'] revision_committer_offset = info['revision_committer_offset'] revision_message = info['revision_message'] revision_content = ("""tree %s author %s <%s> %s %s committer %s <%s> %s %s %s """ % (utils.hash_to_hex(tree_hash), revision_author_name, revision_author_email, revision_author_date, revision_author_offset, revision_committer_name, revision_committer_email, revision_committer_date, revision_committer_offset, revision_message)).encode('utf-8') hashes = utils.hashdata(revision_content, 'commit') # and update other information hashes.update({ 'revision_author_name': revision_author_name, 'revision_author_email': revision_author_email, 'revision_author_date': revision_author_date, 'revision_author_offset': revision_author_offset, 'revision_committer_name': revision_committer_name, 'revision_committer_email': revision_committer_email, 'revision_committer_date': revision_committer_date, 'revision_committer_offset': revision_committer_offset, 'revision_message': revision_message, 'revision_type': info['revision_type'] }) return hashes def compute_release(revision_hash, info): """Compute a release representation. This release representation will contain the computed sha1_git for such release. This release will point to the revision_hash. The additional informations are present in the dictionary info. Args: revision_hash: binary form of the sha1_git revision targeted by this release info: Additional dictionary information needed to compute a synthetic release. Following keys are expected: - release_name - release_comment - release_date - release_offset - release_author_name - release_author_email """ release_name = info['release_name'] release_author_name = info['release_author_name'] release_author_email = info['release_author_email'] release_date = info['release_date'] release_offset = info['release_offset'] release_comment = info['release_comment'] release_content_to_hash = ("""object %s type commit tag %s tagger %s <%s> %s %s %s """ % (utils.hash_to_hex(revision_hash), release_name, release_author_name, release_author_email, release_date, release_offset, release_comment)).encode('utf-8') hashes = utils.hashdata(release_content_to_hash, 'tag') hashes.update({ 'revision_sha1_git': revision_hash, 'release_name': release_name, 'release_comment': release_comment, 'release_date': release_date, 'release_offset': release_offset, 'release_author_name': release_author_name, 'release_author_email': release_author_email, }) return hashes def compute_link_metadata(linkpath): """Given a linkpath, compute the git metadata. Args: linkpath: absolute pathname of the link Returns: Dictionary of values: - name: basename of the link - perms: git permission for link - type: git type for link """ m_hashes = utils.hashlink(linkpath) m_hashes.update({ 'name': bytes(os.path.basename(linkpath), 'utf-8'), 'perms': GitPerm.LINK, 'type': GitType.BLOB, 'path': linkpath }) return m_hashes def compute_blob_metadata(filepath): """Given a filepath, compute the git metadata. Args: filepath: absolute pathname of the file. Returns: Dictionary of values: - name: basename of the file - perms: git permission for file - type: git type for file """ m_hashes = utils.hashfile(filepath) perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB m_hashes.update({ 'name': bytes(os.path.basename(filepath), 'utf-8'), 'perms': perms, 'type': GitType.BLOB, 'path': filepath }) return m_hashes def compute_tree_metadata(dirname, ls_hashes): """Given a dirname, compute the git metadata. Args: dirname: absolute pathname of the directory. Returns: Dictionary of values: - name: basename of the directory - perms: git permission for directory - type: git type for directory """ tree_hash = compute_directory_git_sha1(dirname, ls_hashes) tree_hash.update({ 'name': bytes(os.path.basename(dirname), 'utf-8'), 'perms': GitPerm.TREE, 'type': GitType.TREE, 'path': dirname }) return tree_hash def walk_and_compute_sha1_from_directory(rootdir): """Compute git sha1 from directory rootdir. Returns: Dictionary of entries with keys and as values a list of directory entries. Those are list of dictionary with keys: - 'perms' - 'type' - 'name' - 'sha1_git' - and specifically content: 'sha1', 'sha256', ... Note: - One special key is '' to indicate the upper root of the + One special key is ROOT_TREE_KEY to indicate the upper root of the directory (this is the revision's directory). Raises: Nothing If something is raised, this is a programmatic error. """ ls_hashes = {} all_links = set() for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False): hashes = [] links = [os.path.join(dirpath, file) for file in (filenames+dirnames) if os.path.islink(os.path.join(dirpath, file))] for linkpath in links: all_links.add(linkpath) m_hashes = compute_link_metadata(linkpath) hashes.append(m_hashes) only_files = [os.path.join(dirpath, file) for file in filenames if os.path.join(dirpath, file) not in all_links] for filepath in only_files: m_hashes = compute_blob_metadata(filepath) hashes.append(m_hashes) ls_hashes.update({ dirpath: hashes }) dir_hashes = [] subdirs = [os.path.join(dirpath, dir) for dir in dirnames if os.path.join(dirpath, dir) not in all_links] for fulldirname in subdirs: tree_hash = compute_tree_metadata(fulldirname, ls_hashes) dir_hashes.append(tree_hash) ls_hashes.update({ dirpath: ls_hashes.get(dirpath, []) + dir_hashes }) # compute the current directory hashes root_hash = compute_directory_git_sha1(rootdir, ls_hashes) root_hash.update({ 'path': rootdir, 'name': bytes(os.path.basename(rootdir), 'utf-8'), 'perms': GitPerm.TREE, 'type': GitType.TREE }) ls_hashes.update({ - '': [root_hash] + ROOT_TREE_KEY: [root_hash] }) return ls_hashes diff --git a/swh/loader/dir/loader.py b/swh/loader/dir/loader.py index 2051a5f..b502b03 100644 --- a/swh/loader/dir/loader.py +++ b/swh/loader/dir/loader.py @@ -1,526 +1,526 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import uuid import traceback import os import psycopg2 from retrying import retry from swh.core import config from swh.loader.dir import converters from swh.loader.dir.git import git from swh.loader.dir.git.git import GitType def send_in_packets(source_list, formatter, sender, packet_size, packet_size_bytes=None, *args, **kwargs): """Send objects from `source_list`, passed through `formatter` (with extra args *args, **kwargs), using the `sender`, in packets of `packet_size` objects (and of max `packet_size_bytes`). """ formatted_objects = [] count = 0 if not packet_size_bytes: packet_size_bytes = 0 for obj in source_list: formatted_object = formatter(obj, *args, **kwargs) if formatted_object: formatted_objects.append(formatted_object) else: continue if packet_size_bytes: count += formatted_object['length'] if len(formatted_objects) >= packet_size or count > packet_size_bytes: sender(formatted_objects) formatted_objects = [] count = 0 if formatted_objects: sender(formatted_objects) def retry_loading(error): """Retry policy when the database raises an integrity error""" if not isinstance(error, psycopg2.IntegrityError): return False logger = logging.getLogger('swh.loader.git.DirLoader') error_name = error.__module__ + '.' + error.__class__.__name__ logger.warning('Retry loading a batch', exc_info=False, extra={ 'swh_type': 'storage_retry', 'swh_exception_type': error_name, 'swh_exception': traceback.format_exception( error.__class__, error, error.__traceback__, ), }) return True class DirLoader(config.SWHConfig): """A bulk loader for a directory""" DEFAULT_CONFIG = { 'storage_class': ('str', 'remote_storage'), 'storage_args': ('list[str]', ['http://localhost:5000/']), 'send_contents': ('bool', True), 'send_directories': ('bool', True), 'send_revisions': ('bool', True), 'send_releases': ('bool', True), 'send_occurrences': ('bool', True), 'content_packet_size': ('int', 10000), 'content_packet_size_bytes': ('int', 1024 * 1024 * 1024), 'directory_packet_size': ('int', 25000), 'revision_packet_size': ('int', 100000), 'release_packet_size': ('int', 100000), 'occurrence_packet_size': ('int', 100000), # origin information 'origin_url': ('str', 'file:///dev/null'), # occurrence information 'branch': ('str', 'master'), 'authority_id': ('int', 1), 'validity': ('str', '2015-01-01 00:00:00+00'), # revision information 'revision_author_name': ('str', 'swh author'), 'revision_author_email': ('str', 'swh@inria.fr'), 'revision_author_date': ('int', '1444054085'), 'revision_author_offset': ('str', '+0200'), 'revision_committer_name': ('str', 'swh committer'), 'revision_committer_email': ('str', 'swh@inria.fr'), 'revision_committer_date': ('int', '1444054085'), 'revision_committer_offset': ('str', '+0200'), 'revision_type': ('str', 'tar'), 'revision_message': ('str', 'synthetic revision'), # release information 'release_name': ('str', 'v0.0.1'), 'release_date': ('int', '1444054085'), 'release_offset': ('str', '+0200'), 'release_author_name': ('str', 'swh author'), 'release_author_email': ('str', 'swh@inria.fr'), 'release_comment': ('str', 'synthetic release'), } def __init__(self, config): self.config = config if self.config['storage_class'] == 'remote_storage': from swh.storage.api.client import RemoteStorage as Storage else: from swh.storage import Storage self.storage = Storage(*self.config['storage_args']) self.log = logging.getLogger('swh.loader.dir.DirLoader') @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_contents(self, content_list): """Actually send properly formatted contents to the database""" num_contents = len(content_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) self.storage.content_add(content_list) self.log.debug("Done sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_directories(self, directory_list): """Actually send properly formatted directories to the database""" num_directories = len(directory_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) self.storage.directory_add(directory_list) self.log.debug("Done sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_revisions(self, revision_list): """Actually send properly formatted revisions to the database""" num_revisions = len(revision_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) self.storage.revision_add(revision_list) self.log.debug("Done sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_releases(self, release_list): """Actually send properly formatted releases to the database""" num_releases = len(release_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) self.storage.release_add(release_list) self.log.debug("Done sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_occurrences(self, occurrence_list): """Actually send properly formatted occurrences to the database""" num_occurrences = len(occurrence_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) self.storage.occurrence_add(occurrence_list) self.log.debug("Done sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) def get_or_create_origin(self, origin_url): origin = converters.origin_url_to_origin(origin_url) origin['id'] = self.storage.origin_add_one(origin) return origin def dir_origin(self, root_dir, origin_url): log_id = str(uuid.uuid4()) self.log.debug('Creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) origin = self.get_or_create_origin(origin_url) self.log.debug('Done creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) return origin def dir_revision(self, root_dir, origin_url, revision_date, revision_offset, revision_committer_date, revision_committer_offset, revision_type, revision_message, revision_author, revision_committer): """Create a revision. """ log_id = str(uuid.uuid4()) self.log.debug('Creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) self.get_or_create_origin(origin_url) self.log.debug('Done creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) def bulk_send_blobs(self, objects, blobs, origin_id): """Format blobs as swh contents and send them to the database""" packet_size = self.config['content_packet_size'] packet_size_bytes = self.config['content_packet_size_bytes'] max_content_size = self.config['content_size_limit'] send_in_packets(blobs, converters.blob_to_content, self.send_contents, packet_size, packet_size_bytes=packet_size_bytes, log=self.log, max_content_size=max_content_size, origin_id=origin_id) def bulk_send_trees(self, objects, trees): """Format trees as swh directories and send them to the database""" packet_size = self.config['directory_packet_size'] send_in_packets(trees, converters.tree_to_directory, self.send_directories, packet_size, objects=objects, log=self.log) def bulk_send_commits(self, objects, commits): """Format commits as swh revisions and send them to the database""" packet_size = self.config['revision_packet_size'] send_in_packets(commits, converters.commit_to_revision, self.send_revisions, packet_size, objects=objects, log=self.log) def bulk_send_annotated_tags(self, objects, tags): """Format annotated tags (pygit2.Tag objects) as swh releases and send them to the database """ packet_size = self.config['release_packet_size'] send_in_packets(tags, converters.annotated_tag_to_release, self.send_releases, packet_size, log=self.log) def bulk_send_refs(self, objects, refs): """Format git references as swh occurrences and send them to the database """ packet_size = self.config['occurrence_packet_size'] send_in_packets(refs, lambda ref: ref, self.send_occurrences, packet_size) def compute_dir_ref(self, root_dir, branch, revision_hash, origin_id, authority_id, validity): """List all the refs from the given root directory root_dir. Args: - root_dir: the root directory - branch: occurrence's branch name - revision_hash: the revision hash - origin_id (int): the id of the origin from which the root_dir is taken - validity (datetime.datetime): the validity date for the repository's refs - authority_id (int): the id of the authority on `validity`. Returns: One dictionary with keys: - branch (str): name of the ref - revision (sha1_git): revision pointed at by the ref - origin (int) - validity (datetime.DateTime) - authority (int) Compatible with occurrence_add. """ log_id = str(uuid.uuid4()) self.log.debug("Computing occurrence %s representation at %s" % ( branch, revision_hash), extra={ 'swh_type': 'computing_occurrence_dir', 'swh_name': branch, 'swh_target': str(revision_hash), 'swh_id': log_id, }) return { 'branch': branch, 'revision': revision_hash, 'origin': origin_id, 'validity': validity, 'authority': authority_id, } def list_repo_objs(self, root_dir, info): """List all objects from root_dir. Args: - root_dir (path): the directory to list Returns: a dict containing lists of `Oid`s with keys for each object type: - CONTENT - DIRECTORY """ def get_objects_per_object_type(objects_per_path): m = { GitType.BLOB: [], GitType.TREE: [], GitType.COMM: [], GitType.RELE: [] } for tree_path in objects_per_path: objs = objects_per_path[tree_path] # print('tree_path: %s, objs: %s' % (tree_path, objs)) for obj in objs: m[obj['type']].append(obj) return m log_id = str(uuid.uuid4()) self.log.info("Started listing %s" % root_dir, extra={ 'swh_type': 'dir_list_objs_start', 'swh_repo': root_dir, 'swh_id': log_id, }) objects_per_path = git.walk_and_compute_sha1_from_directory(root_dir) objects = get_objects_per_object_type(objects_per_path) - tree_hash = objects_per_path[''][0]['sha1_git'] + tree_hash = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git'] revision = git.compute_revision_git_sha1(tree_hash, info) objects.update({ GitType.COMM: [revision] }) revision_hash = revision['sha1_git'] release = git.compute_release(revision_hash, info) objects.update({ GitType.RELE: [release] }) self.log.info("Done listing the objects in %s: %d contents, " "%d directories, %d revisions, %d releases" % ( root_dir, len(objects[GitType.BLOB]), len(objects[GitType.TREE]), len(objects[GitType.COMM]), len(objects[GitType.RELE]) ), extra={ 'swh_type': 'dir_list_objs_end', 'swh_repo': root_dir, 'swh_num_blobs': len(objects[GitType.BLOB]), 'swh_num_trees': len(objects[GitType.TREE]), 'swh_num_commits': len(objects[GitType.COMM]), 'swh_num_releases': len(objects[GitType.RELE]), 'swh_id': log_id, }) return objects, objects_per_path def load_dir(self, root_dir, objects, objects_per_path, refs, origin_id): if self.config['send_contents']: self.bulk_send_blobs(objects_per_path, objects[GitType.BLOB], origin_id) else: self.log.info('Not sending contents') if self.config['send_directories']: self.bulk_send_trees(objects_per_path, objects[GitType.TREE]) else: self.log.info('Not sending directories') if self.config['send_revisions']: self.bulk_send_commits(objects_per_path, objects[GitType.COMM]) else: self.log.info('Not sending revisions') if self.config['send_releases']: self.bulk_send_annotated_tags(objects_per_path, objects[GitType.RELE]) else: self.log.info('Not sending releases') if self.config['send_occurrences']: self.bulk_send_refs(objects_per_path, refs) else: self.log.info('Not sending occurrences') def process(self, root_dir): if not os.path.exists(root_dir): self.log.info('Skipping inexistant directory %s' % root_dir, extra={ 'swh_type': 'dir_repo_list_refs', 'swh_repo': root_dir, 'swh_num_refs': 0, }) return files = os.listdir(root_dir) if not(files): self.log.info('Skipping empty directory %s' % root_dir, extra={ 'swh_type': 'dir_repo_list_refs', 'swh_repo': root_dir, 'swh_num_refs': 0, }) return # Add origin to storage if needed, use the one from config if not origin = self.dir_origin(root_dir, self.config['origin_url']) # We want to load the repository, walk all the objects objects, objects_per_path = self.list_repo_objs(root_dir, self.config) # Compute revision information (mixed from outside input + dir content) revision = objects[GitType.COMM][0] # Parse all the refs from our root_dir ref = self.compute_dir_ref(root_dir, self.config['branch'], revision['sha1_git'], origin['id'], self.config['authority_id'], self.config['validity']) # Finally, load the repository self.load_dir(root_dir, objects, objects_per_path, [ref], origin['id']) diff --git a/swh/loader/dir/tests/test_converters.py b/swh/loader/dir/tests/test_converters.py index c71dead..960d7bb 100644 --- a/swh/loader/dir/tests/test_converters.py +++ b/swh/loader/dir/tests/test_converters.py @@ -1,206 +1,207 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from datetime import datetime from swh.loader.dir import converters +from swh.loader.dir.git import git from swh.loader.dir.git.git import GitType, GitPerm class TestConverters(unittest.TestCase): @istest def format_to_minutes(self): self.assertEquals(converters.format_to_minutes('+0100'), 60) self.assertEquals(converters.format_to_minutes('-0200'), -120) self.assertEquals(converters.format_to_minutes('+1250'), 12*60+50) self.assertEquals(converters.format_to_minutes('+0000'), 0) self.assertEquals(converters.format_to_minutes('-0000'), 0) @istest def origin_url_to_origin(self): # given origin_url = 'foobar' # when self.assertDictEqual({ 'type': 'dir', 'url': origin_url, }, converters.origin_url_to_origin(origin_url)) @istest def annotated_tag_to_release(self): # given release = { 'sha1_git': '123', 'revision_sha1_git': '456', 'release_name': 'some-release', 'release_comment': 'some-comment-on-release', 'release_date': 1444054085, 'release_offset': '-0300', 'release_author_name': 'someone', 'release_author_email': 'someone@whatelse.eu' } expected_release = { 'id': '123', 'revision': '456', 'name': 'some-release', 'comment': 'some-comment-on-release', 'date': datetime.fromtimestamp(1444054085), 'date_offset': -180, 'author_name': 'someone', 'author_email': 'someone@whatelse.eu', } # when actual_release = converters.annotated_tag_to_release(release) # then self.assertDictEqual( expected_release, actual_release) @istest def _blob_to_content_visible(self): obj = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB, 'type': GitType.BLOB } expected_content = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, 'status': 'visible' } # when actual_content = converters._blob_to_content(obj) # then self.assertEqual(expected_content, actual_content) @istest def _blob_to_content_absent(self): obj = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB, 'type': GitType.BLOB } expected_content = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, 'status': 'absent', 'reason': 'Content too large', 'origin': 3} # when actual_content = converters._blob_to_content(obj, max_content_size=5, origin_id=3) # then self.assertEqual(expected_content, actual_content) @istest def tree_to_directory_no_entries(self): # given tree = { 'path': 'foo', 'sha1_git': b'tree_sha1_git' } objects = { 'foo': [{'type': GitType.TREE, 'perms': GitPerm.TREE, 'name': 'bar', 'sha1_git': b'sha1-target'}, {'type': GitType.BLOB, 'perms': GitPerm.BLOB, 'name': 'file-foo', 'sha1_git': b'file-foo-sha1-target'}] } expected_directory = { 'id': b'tree_sha1_git', 'entries': [{'type': 'dir', 'perms': int(GitPerm.TREE.value), 'name': 'bar', 'target': b'sha1-target'}, {'type': 'file', 'perms': int(GitPerm.BLOB.value), 'name': 'file-foo', 'target': b'file-foo-sha1-target'}] } # when actual_directory = converters.tree_to_directory(tree, objects) # then self.assertEqual(actual_directory, expected_directory) @istest def commit_to_revision(self): # given commit = { 'sha1_git': 'commit-git-sha1', 'revision_author_date': 1444054085, 'revision_author_offset': '+0000', 'revision_committer_date': 1444054085, 'revision_committer_offset': '-0000', 'revision_type': 'tar', 'revision_message': 'synthetic-message-input', 'revision_author_name': 'author-name', 'revision_author_email': 'author-email', 'revision_committer_name': 'committer-name', 'revision_committer_email': 'committer-email', } objects = { - '': [{'sha1_git': 'targeted-tree-sha1'}] + git.ROOT_TREE_KEY: [{'sha1_git': 'targeted-tree-sha1'}] } expected_revision = { 'id': 'commit-git-sha1', 'date': datetime.fromtimestamp(1444054085), 'date_offset': 0, 'committer_date': datetime.fromtimestamp(1444054085), 'committer_date_offset': 0, 'type': 'tar', 'directory': 'targeted-tree-sha1', 'message': 'synthetic-message-input', 'author_name': 'author-name', 'author_email': 'author-email', 'committer_name': 'committer-name', 'committer_email': 'committer-email', 'parents': [], } # when actual_revision = converters.commit_to_revision(commit, objects) # then self.assertEquals(actual_revision, expected_revision)