diff --git a/PKG-INFO b/PKG-INFO
index 92a71d5..2d3bab5 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,10 +1,10 @@
 Metadata-Version: 1.0
 Name: swh.loader.git
-Version: 0.0.7
+Version: 0.0.8
 Summary: Software Heritage git loader
 Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Description: UNKNOWN
 Platform: UNKNOWN
diff --git a/README b/README
index 6eff9bb..617f8ba 100644
--- a/README
+++ b/README
@@ -1,106 +1,106 @@
 The Software Heritage Git Loader is a tool and a library to walk a local
 Git repository and inject into the SWH dataset all contained files that
 weren't known before.

 License
 =======

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 See top-level LICENSE file for the full text of the GNU General Public
 License along with this program.

 Dependencies
 ============

 Runtime
 -------

 - python3
 - python3-pygit2
 - python3-swh.core
 - python3-swh.storage

 Test
 ----

 - python3-nose

 Requirements
 ============

 - implementation language, Python3
 - coding guidelines: conform to PEP8
 - Git access: via libgit2/pygit2

 Configuration
 =============

 bin/swh-loader-git takes one argument: a configuration file in .ini format.
 The configuration file contains the following directives:

 ```
 [main]

 # the storage class used. one of remote_storage, local_storage
 storage_class = remote_storage

 # arguments passed to the storage class
 # for remote_storage: URI of the storage server
 storage_args = http://localhost:5000/
 # for local_storage: database connection string and root of the
 # storage, comma separated
 # storage_args = dbname=softwareheritage-dev, /tmp/swh/storage

 # The path to the repository to load
 repo_path = /tmp/git_repo

 # The URL of the origin for the repo
 origin_url = https://github.com/hylang/hy

-# The ID of the authority that dated the validity of the repo
-authority = 1
+# The UUID of the authority that dated the validity of the repo
+authority = 5f4d4c51-498a-4e28-88b3-b3e4e8396cba

 # The validity date of the refs in the given repo, in Postgres
 # timestamptz format
 validity = 2015-01-01 00:00:00+00

 # Whether to send the given types of objects
 send_contents = True
 send_directories = True
 send_revisions = True
 send_releases = True
 send_occurrences = True

 # The size of the packets sent to storage for each kind of object
 content_packet_size = 100000
 content_packet_size_bytes = 1073741824
 directory_packet_size = 25000
 revision_packet_size = 100000
 release_packet_size = 100000
 occurrence_packet_size = 100000
 ```

 bin/swh-loader-git-multi takes the same arguments, and adds:

 ```
 [main]

 # database connection string to the lister-github database
 lister_db = dbname=lister-github
 # base path of the github repositories
 repo_basepath = /srv/storage/space/data/github
 # Whether to run the mass loading or just list the repos
 dry_run = False
 ```
diff --git a/bin/swh-loader-git b/bin/swh-loader-git
index 2938319..6711b23 100755
--- a/bin/swh-loader-git
+++ b/bin/swh-loader-git
@@ -1,36 +1,36 @@
 #!/usr/bin/env python3

 import logging
 import sys

 from swh.core.logger import PostgresHandler
 from swh.loader.git import BulkLoader

 ADDITIONAL_CONFIG = {
     'repo_path': ('str', None),
     'origin_url': ('str', 'file:///dev/null'),
-    'authority': ('int', 1),
+    'authority': ('str', '5f4d4c51-498a-4e28-88b3-b3e4e8396cba'),
     'validity': ('str', '2015-01-01 00:00:00+00'),
 }

 my_config = BulkLoader.parse_config_file(
     config_filename=sys.argv[1],
     additional_configs=[ADDITIONAL_CONFIG])

 logging.basicConfig(
     level=logging.DEBUG,
     format='%(asctime)s %(name)s %(levelname)s %(message)s',
     handlers=[
         logging.StreamHandler(),
         PostgresHandler(my_config['log_db']),
     ],
 )

 requests_log = logging.getLogger("requests")
 requests_log.setLevel(logging.CRITICAL)

 loader = BulkLoader(my_config)

 loader.process(my_config['repo_path'], my_config['origin_url'],
                my_config['authority'], my_config['validity'])
diff --git a/bin/swh-loader-git-multi b/bin/swh-loader-git-multi
index 18fb58b..715828b 100755
--- a/bin/swh-loader-git-multi
+++ b/bin/swh-loader-git-multi
@@ -1,117 +1,117 @@
 #!/usr/bin/env python3

 import datetime
 import logging
 import os
 import sys

 import psycopg2
 import psycopg2.extras

 from swh.core import config
 from swh.loader.git import BulkLoader

 DEFAULT_CONFIG = {
     'lister_db': ('str', 'dbname=lister-github'),
     'repo_basepath': ('str', '/srv/storage/space/data/github'),
     'dry_run': ('bool', True),

     'db': ('str', 'dbname=softwareheritage-dev'),
     'storage_base': ('str', '/tmp/swh-loader-git/test'),

     'repo_path': ('str', None),
     'origin_url': ('str', 'file:///dev/null'),
-    'authority': ('int', 1),
+    'authority': ('str', '5f4d4c51-498a-4e28-88b3-b3e4e8396cba'),
     'validity': ('str', '2015-01-01 00:00:00+00'),

     'send_contents': ('bool', True),
     'send_directories': ('bool', True),
     'send_revisions': ('bool', True),
     'send_releases': ('bool', True),
     'send_occurrences': ('bool', True),

     'content_packet_size': ('int', 100000),
     'directory_packet_size': ('int', 25000),
     'revision_packet_size': ('int', 100000),
     'release_packet_size': ('int', 100000),
     'occurrence_packet_size': ('int', 100000),
 }

 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s %(name)s %(levelname)s %(message)s')

 logger = logging.getLogger('test-bulk-loader-all')

 base_config = config.read(sys.argv[1], DEFAULT_CONFIG)


 def process_one_repository(base_config, repo_name):
     my_config = base_config.copy()

     basepath = my_config['repo_basepath']
     my_path = os.path.join(basepath, repo_name[0], repo_name)
     my_config['repo_path'] = my_path

     if not os.path.exists(my_path):
         logger.error('Repository %s does not exist at %s' % (repo_name,
                                                              my_path))
         return

     witness_file = os.path.join(my_path, 'witness')
     if not os.path.exists(witness_file):
         logger.warn('No witness file for repository %s, using default value '
                     '%s' % (repo_name, my_config['validity']))
     else:
         validity_timestamp = os.stat(witness_file).st_mtime
         my_config['validity'] = "%s+00" % datetime.datetime.utcfromtimestamp(
             validity_timestamp)

     logger.info('Processing repository %s fetched on %s' % (
         repo_name, my_config['validity']))

     if my_config['dry_run']:
         return

     loader = BulkLoader(my_config)
     origin = loader.get_origin()
     if origin['id']:
         logger.info('Repository %s already loaded (origin id=%s), skipping' % (
             repo_name, origin['id']))
         return

     loader.process()


 def list_random_repos(config):
     db = psycopg2.connect(config['lister_db'],
                           cursor_factory=psycopg2.extras.NamedTupleCursor)

     query = '''select full_name from repos_random_sample(0.001) r
                inner join crawl_history c on r.id = c.repo where status=true'''

     cur = db.cursor()
     cur.execute(query)
     ret = cur.fetchall()
     cur.close()
     db.close()

     return ret


 processed_repos = set()

 print('Needs updating for new BulkLoader')
 sys.exit(1)

 while True:
     logger.info('listing 0.001% random repos')
     random_repos = list_random_repos(base_config)
     logger.info('done')
     for repo in random_repos:
         repo_name = repo.full_name
         if repo_name not in processed_repos:
             try:
                 process_one_repository(base_config, repo_name)
             except Exception:
                 logger.exception('Failed processing repository %s' %
                                  repo_name)
             finally:
                 processed_repos.add(repo_name)
diff --git a/debian/control b/debian/control
index c49506a..97483d2 100644
--- a/debian/control
+++ b/debian/control
@@ -1,24 +1,24 @@
 Source: swh-loader-git
 Maintainer: Software Heritage developers
 Section: python
 Priority: optional
 Build-Depends: debhelper (>= 9),
                dh-python,
                python3-all,
                python3-nose,
                python3-pygit2,
                python3-retrying,
                python3-setuptools,
                python3-swh.core (>= 0.0.7~),
-               python3-swh.storage (>= 0.0.8~),
+               python3-swh.storage (>= 0.0.15~),
                python3-vcversioner
 Standards-Version: 3.9.6
 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/

 Package: python3-swh.loader.git
 Architecture: all
 Depends: python3-swh.core (>= 0.0.7~),
-         python3-swh.storage (>= 0.0.8~),
+         python3-swh.storage (>= 0.0.15~),
          ${misc:Depends},
          ${python3:Depends}
 Description: Software Heritage Git loader
diff --git a/requirements.txt b/requirements.txt
index 1198a53..a29d1a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 pygit2
 retrying
 vcversioner
 swh.core >= 0.0.7
-swh.storage >= 0.0.8
+swh.storage >= 0.0.15
diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO
index 92a71d5..2d3bab5 100644
--- a/swh.loader.git.egg-info/PKG-INFO
+++ b/swh.loader.git.egg-info/PKG-INFO
@@ -1,10 +1,10 @@
 Metadata-Version: 1.0
 Name: swh.loader.git
-Version: 0.0.7
+Version: 0.0.8
 Summary: Software Heritage git loader
 Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Description: UNKNOWN
 Platform: UNKNOWN
diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt
index 541c26c..e119675 100644
--- a/swh.loader.git.egg-info/requires.txt
+++ b/swh.loader.git.egg-info/requires.txt
@@ -1,5 +1,5 @@
 pygit2
 retrying
 swh.core>=0.0.7
-swh.storage>=0.0.8
+swh.storage>=0.0.15
 vcversioner
diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py
index 05a3fac..e656591 100644
--- a/swh/loader/git/loader.py
+++ b/swh/loader/git/loader.py
@@ -1,461 +1,485 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import logging
 import traceback
 import uuid

 import psycopg2
 import pygit2
 from pygit2 import Oid, GIT_OBJ_BLOB, GIT_OBJ_TREE, GIT_OBJ_COMMIT, GIT_OBJ_TAG
 import requests
 from retrying import retry

 from swh.core import config

 from . import converters
 from .utils import get_objects_per_object_type


 def send_in_packets(source_list, formatter, sender, packet_size,
                     packet_size_bytes=None, *args, **kwargs):
     """Send objects from `source_list`, passed through `formatter` (with
     extra args *args, **kwargs), using the `sender`, in packets of
     `packet_size` objects (and of max `packet_size_bytes`).
""" formatted_objects = [] count = 0 if not packet_size_bytes: packet_size_bytes = 0 for obj in source_list: formatted_object = formatter(obj, *args, **kwargs) if formatted_object: formatted_objects.append(formatted_object) else: continue if packet_size_bytes: count += formatted_object['length'] if len(formatted_objects) >= packet_size or count > packet_size_bytes: sender(formatted_objects) formatted_objects = [] count = 0 if formatted_objects: sender(formatted_objects) def retry_loading(error): """Retry policy when we catch a recoverable error""" exception_classes = [ # raised when two parallel insertions insert the same data. psycopg2.IntegrityError, # raised when uWSGI restarts and hungs up on the worker. requests.exceptions.ConnectionError, ] if not any(isinstance(error, exc) for exc in exception_classes): return False logger = logging.getLogger('swh.loader.git.BulkLoader') error_name = error.__module__ + '.' + error.__class__.__name__ logger.warning('Retry loading a batch', exc_info=False, extra={ 'swh_type': 'storage_retry', 'swh_exception_type': error_name, 'swh_exception': traceback.format_exception( error.__class__, error, error.__traceback__, ), }) return True class BulkLoader(config.SWHConfig): """A bulk loader for a git repository""" DEFAULT_CONFIG = { 'storage_class': ('str', 'remote_storage'), 'storage_args': ('list[str]', ['http://localhost:5000/']), 'send_contents': ('bool', True), 'send_directories': ('bool', True), 'send_revisions': ('bool', True), 'send_releases': ('bool', True), 'send_occurrences': ('bool', True), 'content_packet_size': ('int', 10000), 'content_packet_size_bytes': ('int', 1024 * 1024 * 1024), 'directory_packet_size': ('int', 25000), 'revision_packet_size': ('int', 100000), 'release_packet_size': ('int', 100000), 'occurrence_packet_size': ('int', 100000), } def __init__(self, config): self.config = config if self.config['storage_class'] == 'remote_storage': from swh.storage.api.client import RemoteStorage as Storage else: from swh.storage import Storage self.storage = Storage(*self.config['storage_args']) self.log = logging.getLogger('swh.loader.git.BulkLoader') @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_contents(self, content_list): """Actually send properly formatted contents to the database""" num_contents = len(content_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) self.storage.content_add(content_list) self.log.debug("Done sending %d contents" % num_contents, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'content', 'swh_num': num_contents, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_directories(self, directory_list): """Actually send properly formatted directories to the database""" num_directories = len(directory_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) self.storage.directory_add(directory_list) self.log.debug("Done sending %d directories" % num_directories, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'directory', 'swh_num': num_directories, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_revisions(self, revision_list): """Actually send properly formatted 
revisions to the database""" num_revisions = len(revision_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) self.storage.revision_add(revision_list) self.log.debug("Done sending %d revisions" % num_revisions, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'revision', 'swh_num': num_revisions, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_releases(self, release_list): """Actually send properly formatted releases to the database""" num_releases = len(release_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) self.storage.release_add(release_list) self.log.debug("Done sending %d releases" % num_releases, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'release', 'swh_num': num_releases, 'swh_id': log_id, }) @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3) def send_occurrences(self, occurrence_list): """Actually send properly formatted occurrences to the database""" num_occurrences = len(occurrence_list) log_id = str(uuid.uuid4()) self.log.debug("Sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) self.storage.occurrence_add(occurrence_list) self.log.debug("Done sending %d occurrences" % num_occurrences, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'occurrence', 'swh_num': num_occurrences, 'swh_id': log_id, }) def get_or_create_origin(self, origin_url): origin = converters.origin_url_to_origin(origin_url) origin['id'] = self.storage.origin_add_one(origin) return origin def repo_origin(self, repo, origin_url): log_id = str(uuid.uuid4()) self.log.debug('Creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_start', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) origin = self.get_or_create_origin(origin_url) self.log.debug('Done creating origin for %s' % origin_url, extra={ 'swh_type': 'storage_send_end', 'swh_content_type': 'origin', 'swh_num': 1, 'swh_id': log_id }) return origin def bulk_send_blobs(self, repo, blobs, origin_id): """Format blobs as swh contents and send them to the database""" packet_size = self.config['content_packet_size'] packet_size_bytes = self.config['content_packet_size_bytes'] max_content_size = self.config['content_size_limit'] send_in_packets(blobs, converters.blob_to_content, self.send_contents, packet_size, repo=repo, packet_size_bytes=packet_size_bytes, log=self.log, max_content_size=max_content_size, origin_id=origin_id) def bulk_send_trees(self, repo, trees): """Format trees as swh directories and send them to the database""" packet_size = self.config['directory_packet_size'] send_in_packets(trees, converters.tree_to_directory, self.send_directories, packet_size, repo=repo, log=self.log) def bulk_send_commits(self, repo, commits): """Format commits as swh revisions and send them to the database""" packet_size = self.config['revision_packet_size'] send_in_packets(commits, converters.commit_to_revision, self.send_revisions, packet_size, repo=repo, log=self.log) def bulk_send_annotated_tags(self, repo, tags): """Format annotated tags (pygit2.Tag objects) as swh releases and send them to the database """ 
         packet_size = self.config['release_packet_size']

         send_in_packets(tags, converters.annotated_tag_to_release,
                         self.send_releases, packet_size,
                         repo=repo, log=self.log)

     def bulk_send_refs(self, repo, refs):
         """Format git references as swh occurrences and send them to the
         database
         """
         packet_size = self.config['occurrence_packet_size']

         send_in_packets(refs, converters.ref_to_occurrence,
                         self.send_occurrences, packet_size)

     def list_repo_refs(self, repo, origin_id, authority_id, validity):
         """List all the refs from the given repository.

         Args:
             - repo (pygit2.Repository): the repository to list
             - origin_id (int): the id of the origin from which the repo is
               taken
             - validity (datetime.datetime): the validity date for the
               repository's refs
-            - authority_id (int): the id of the authority on `validity`.
+            - authority_id (str): the uuid of the authority on `validity`.

         Returns:
             A list of dicts with keys:
                 - branch (str): name of the ref
                 - revision (sha1_git): revision pointed at by the ref
                 - origin (int)
                 - validity (datetime.DateTime)
-                - authority (int)
+                - authority (str)

             Compatible with occurrence_add.
         """
         log_id = str(uuid.uuid4())

         refs = []
         ref_names = repo.listall_references()
         for ref_name in ref_names:
             ref = repo.lookup_reference(ref_name)
             target = ref.target

             if not isinstance(target, Oid):
                 self.log.debug("Peeling symbolic ref %s pointing at %s" % (
                     ref_name, ref.target), extra={
                         'swh_type': 'git_sym_ref_peel',
                         'swh_name': ref_name,
                         'swh_target': str(ref.target),
                         'swh_id': log_id,
                     })
                 target_obj = ref.peel()
             else:
                 target_obj = repo[target]

             if target_obj.type == GIT_OBJ_TAG:
                 self.log.debug("Peeling ref %s pointing at tag %s" % (
                     ref_name, target_obj.name), extra={
                         'swh_type': 'git_ref_peel',
                         'swh_name': ref_name,
                         'swh_target': str(target_obj.name),
                         'swh_id': log_id,
                     })
                 target_obj = ref.peel()

             if not target_obj.type == GIT_OBJ_COMMIT:
                 self.log.info("Skipping ref %s pointing to %s %s" % (
                     ref_name, target_obj.__class__.__name__,
                     target_obj.id.hex), extra={
                         'swh_type': 'git_ref_skip',
                         'swh_name': ref_name,
                         'swh_target': str(target_obj),
                         'swh_id': log_id,
                     })

             refs.append({
                 'branch': ref_name,
                 'revision': target_obj.id.raw,
                 'origin': origin_id,
                 'validity': validity,
                 'authority': authority_id,
             })

         return refs

     def list_repo_objs(self, repo):
         """List all the objects from repo.

         Args:
             - repo (pygit2.Repository): the repository to list

         Returns:
             a dict containing lists of `Oid`s with keys for each object type:
             - GIT_OBJ_BLOB
             - GIT_OBJ_TREE
             - GIT_OBJ_COMMIT
             - GIT_OBJ_TAG
         """
         log_id = str(uuid.uuid4())
         self.log.info("Started listing %s" % repo.path, extra={
             'swh_type': 'git_list_objs_start',
             'swh_repo': repo.path,
             'swh_id': log_id,
         })
         objects = get_objects_per_object_type(repo)
         self.log.info("Done listing the objects in %s: %d contents, "
                       "%d directories, %d revisions, %d releases" % (
                           repo.path,
                           len(objects[GIT_OBJ_BLOB]),
                           len(objects[GIT_OBJ_TREE]),
                           len(objects[GIT_OBJ_COMMIT]),
                           len(objects[GIT_OBJ_TAG]),
                       ), extra={
                           'swh_type': 'git_list_objs_end',
                           'swh_repo': repo.path,
                           'swh_num_blobs': len(objects[GIT_OBJ_BLOB]),
                           'swh_num_trees': len(objects[GIT_OBJ_TREE]),
                           'swh_num_commits': len(objects[GIT_OBJ_COMMIT]),
                           'swh_num_tags': len(objects[GIT_OBJ_TAG]),
                           'swh_id': log_id,
                       })

         return objects

     def open_repo(self, repo_path):
         return pygit2.Repository(repo_path)

+    def open_fetch_history(self, origin_id):
+        return self.storage.fetch_history_start(origin_id)
+
+    def close_fetch_history(self, fetch_history_id, objects, refs):
+        data = {
+            'status': True,
+            'result': {
+                'contents': len(objects.get(GIT_OBJ_BLOB, [])),
+                'directories': len(objects.get(GIT_OBJ_TREE, [])),
+                'revisions': len(objects.get(GIT_OBJ_COMMIT, [])),
+                'releases': len(objects.get(GIT_OBJ_TAG, [])),
+                'occurrences': len(refs),
+            },
+        }
+        return self.storage.fetch_history_end(fetch_history_id, data)
+
     def load_repo(self, repo, objects, refs, origin_id):
         if self.config['send_contents']:
             self.bulk_send_blobs(repo, objects[GIT_OBJ_BLOB], origin_id)
         else:
             self.log.info('Not sending contents')

         if self.config['send_directories']:
             self.bulk_send_trees(repo, objects[GIT_OBJ_TREE])
         else:
             self.log.info('Not sending directories')

         if self.config['send_revisions']:
             self.bulk_send_commits(repo, objects[GIT_OBJ_COMMIT])
         else:
             self.log.info('Not sending revisions')

         if self.config['send_releases']:
             self.bulk_send_annotated_tags(repo, objects[GIT_OBJ_TAG])
         else:
             self.log.info('Not sending releases')

         if self.config['send_occurrences']:
             self.bulk_send_refs(repo, refs)
         else:
             self.log.info('Not sending occurrences')

     def process(self, repo_path, origin_url, authority_id, validity):
         # Open repository
         repo = self.open_repo(repo_path)

         # Add origin to storage if needed, use the one from config if not
         origin = self.repo_origin(repo, origin_url)

+        # Create fetch_history
+        fetch_history = self.open_fetch_history(origin['id'])
+
         # Parse all the refs from our repo
         refs = self.list_repo_refs(repo, origin['id'], authority_id, validity)

         if not refs:
             self.log.info('Skipping empty repository %s' % repo_path, extra={
                 'swh_type': 'git_repo_list_refs',
                 'swh_repo': repo_path,
                 'swh_num_refs': 0,
             })
+            # End fetch_history
+            self.close_fetch_history(fetch_history, {}, refs)
             return
         else:
             self.log.info('Listed %d refs for repo %s' % (
                 len(refs), repo_path), extra={
                     'swh_type': 'git_repo_list_refs',
                     'swh_repo': repo_path,
                     'swh_num_refs': len(refs),
                 })

         # We want to load the repository, walk all the objects
         objects = self.list_repo_objs(repo)

         # Finally, load the repository
         self.load_repo(repo, objects, refs, origin['id'])
+
+        # End fetch_history
+        self.close_fetch_history(fetch_history, objects, refs)
diff --git a/swh/loader/git/tasks.py b/swh/loader/git/tasks.py
index 07e854d..8325333 100644
--- a/swh/loader/git/tasks.py
+++ b/swh/loader/git/tasks.py
@@ -1,97 +1,97 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
 import os

 from swh.core.scheduling import Task

 from .loader import BulkLoader


 class LoadGitRepository(Task):
     """Import a git repository to Software Heritage"""

     task_queue = 'swh_loader_git'

     CONFIG_BASE_FILENAME = 'loader/git.ini'
     ADDITIONAL_CONFIG = {}

     def __init__(self):
         self.config = BulkLoader.parse_config_file(
             base_filename=self.CONFIG_BASE_FILENAME,
             additional_configs=[self.ADDITIONAL_CONFIG],
         )

     def run(self, repo_path, origin_url, authority_id, validity):
         """Import a git repository"""
         loader = BulkLoader(self.config)
         loader.log = self.log

         loader.process(repo_path, origin_url, authority_id, validity)


 class LoadGitHubRepository(LoadGitRepository):
     """Import a github repository to Software Heritage"""

     task_queue = 'swh_loader_git'

     CONFIG_BASE_FILENAME = 'loader/github.ini'
     ADDITIONAL_CONFIG = {
         'github_basepath': ('str', '/srv/storage/space/data/github'),
-        'authority_id': ('int', 1),
+        'authority_id': ('str', '5f4d4c51-498a-4e28-88b3-b3e4e8396cba'),
         'default_validity': ('str', '1970-01-01 00:00:00+00'),
     }

     def run(self, repo_fullname):
         authority_id = self.config['authority_id']
         validity = self.config['default_validity']

         repo_path = os.path.join(self.config['github_basepath'],
                                  repo_fullname[0], repo_fullname)

         witness_file = os.path.join(repo_path, 'witness')
         if os.path.exists(witness_file):
             validity_timestamp = os.stat(witness_file).st_mtime
             validity = '%s+00' % datetime.datetime.utcfromtimestamp(
                 validity_timestamp)

         origin_url = 'https://github.com/%s' % repo_fullname

         super().run(repo_path, origin_url, authority_id, validity)


 class LoadGitHubRepositoryReleases(LoadGitHubRepository):
     """Import a GitHub repository to Software Heritage, only with releases"""

     task_queue = 'swh_loader_git_express'

     def __init__(self):
         super(self.__class__, self).__init__()
         self.config.update({
             'send_contents': False,
             'send_directories': False,
             'send_revisions': False,
             'send_releases': True,
             'send_occurrences': False,
         })


 class LoadGitHubRepositoryContents(LoadGitHubRepository):
     """Import a GitHub repository to Software Heritage, only with contents"""

     task_queue = 'swh_loader_git_express'

     def __init__(self):
         super(self.__class__, self).__init__()
         self.config.update({
             'send_contents': True,
             'send_directories': False,
             'send_revisions': False,
             'send_releases': False,
             'send_occurrences': False,
         })
diff --git a/version.txt b/version.txt
index 57370b7..8700f50 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-v0.0.7-0-gee29257
\ No newline at end of file
+v0.0.8-0-g5e2e56f
\ No newline at end of file
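
Usage note (not part of the patch above): the two user-visible changes in 0.0.8 are that the `authority` setting is now an authority UUID string instead of an integer id, and that each load is now bracketed by a fetch-history record (`storage.fetch_history_start()` before the refs are listed, `storage.fetch_history_end()` with per-type object counts once the load finishes, or with zero counts for an empty repository). The sketch below shows how a load could be driven directly from Python with the new-style configuration; it mirrors bin/swh-loader-git, the paths, URL and UUID are the illustrative defaults from the README, and the `content_size_limit` value is an assumption (the key is read by `bulk_send_blobs()` but is not part of the `DEFAULT_CONFIG` shown in this diff).

```
#!/usr/bin/env python3
# Minimal sketch of driving the 0.0.8 BulkLoader directly.
# Values are illustrative, not prescriptive; a running swh.storage
# (>= 0.0.15) server is assumed at the given URL.

from swh.loader.git import BulkLoader

config = {
    'storage_class': 'remote_storage',
    'storage_args': ['http://localhost:5000/'],

    # Since 0.0.8, the authority is a UUID string, not an integer id.
    'authority': '5f4d4c51-498a-4e28-88b3-b3e4e8396cba',
    'validity': '2015-01-01 00:00:00+00',

    'send_contents': True,
    'send_directories': True,
    'send_revisions': True,
    'send_releases': True,
    'send_occurrences': True,

    'content_packet_size': 100000,
    'content_packet_size_bytes': 1024 * 1024 * 1024,
    'directory_packet_size': 25000,
    'revision_packet_size': 100000,
    'release_packet_size': 100000,
    'occurrence_packet_size': 100000,

    # Read by bulk_send_blobs() but absent from DEFAULT_CONFIG, so it must
    # be provided when building the configuration by hand (assumed value).
    'content_size_limit': 100 * 1024 * 1024,
}

loader = BulkLoader(config)

# process() now opens a fetch_history entry for the origin before listing
# refs and closes it with the counts of contents, directories, revisions,
# releases and occurrences once the repository has been loaded.
loader.process('/tmp/git_repo',
               'https://github.com/hylang/hy',
               config['authority'],
               config['validity'])
```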