class GitLoader(base.BaseLoader):
    """Loader importing the objects of a git repository found on disk."""

    CONFIG_BASE_FILENAME = 'loader/git-loader'

    def prepare(self, origin_url, directory, fetch_date):
        """Record the origin url, open the repository stored in `directory`
        and record the fetch date."""
        self.origin_url = origin_url
        self.repo = dulwich.repo.Repo(directory)
        self.fetch_date = fetch_date

    def get_origin(self):
        """Return the origin built from the url given to prepare()."""
        origin = converters.origin_url_to_origin(self.origin_url)
        return origin

    def iter_objects(self):
        """Yield the id of every object of the repository: packed objects
        first, then loose objects, then objects from alternates."""
        store = self.repo.object_store

        for pack in store.packs:
            # Entries are walked sorted on their second field (the offset).
            entries = sorted(pack.index.iterentries(),
                             key=lambda entry: entry[1])
            for sha, _offset, _crc32 in entries:
                yield hashutil.hash_to_bytehex(sha)

        yield from store._iter_loose_objects()
        yield from store._iter_alternate_objects()

    def fetch_data(self):
        """Group the ids of all repository objects by object type name and
        store the mapping in self.type_to_ids."""
        ids_by_type = defaultdict(list)
        for object_id in self.iter_objects():
            ids_by_type[self.repo[object_id].type_name].append(object_id)

        # Only published once the whole repository has been walked.
        self.type_to_ids = ids_by_type

    def has_contents(self):
        """Tell whether at least one blob was found"""
        blob_ids = self.type_to_ids[b'blob']
        return bool(blob_ids)

    def get_contents(self):
        """Yield one converted content per blob found"""
        size_limit = self.config['content_size_limit']
        for blob_id in self.type_to_ids[b'blob']:
            blob = self.repo[blob_id]
            yield converters.dulwich_blob_to_content(
                blob, log=self.log,
                max_content_size=size_limit,
                origin_id=self.origin_id)

    def has_directories(self):
        """Tell whether at least one tree was found"""
        tree_ids = self.type_to_ids[b'tree']
        return bool(tree_ids)

    def get_directories(self):
        """Yield one converted directory per tree found"""
        for tree_id in self.type_to_ids[b'tree']:
            tree = self.repo[tree_id]
            yield converters.dulwich_tree_to_directory(tree, log=self.log)

    def has_revisions(self):
        """Tell whether at least one commit was found"""
        commit_ids = self.type_to_ids[b'commit']
        return bool(commit_ids)

    def get_revisions(self):
        """Yield one converted revision per commit found"""
        for commit_id in self.type_to_ids[b'commit']:
            commit = self.repo[commit_id]
            yield converters.dulwich_commit_to_revision(commit, log=self.log)

    def has_releases(self):
        """Tell whether at least one tag object was found"""
        tag_ids = self.type_to_ids[b'tag']
        return bool(tag_ids)

    def get_releases(self):
        """Yield one converted release per tag object found"""
        for tag_id in self.type_to_ids[b'tag']:
            tag = self.repo[tag_id]
            yield converters.dulwich_tag_to_release(tag, log=self.log)

    def has_occurrences(self):
        """Occurrences are always considered for loading"""
        return True

    def get_occurrences(self):
        """Yield one occurrence dict per reference of the repository"""
        repository, origin, visit_id = self.repo, self.origin_id, self.visit

        references = repository.refs.as_dict()
        for branch, target_id in references.items():
            type_name = repository[target_id].type_name
            kind = converters.DULWICH_TYPES[type_name]
            occurrence = {
                'branch': branch,
                'origin': origin,
                'target': hashutil.bytehex_to_hash(target_id),
                'target_type': kind,
                'visit': visit_id,
            }
            yield occurrence

    def get_fetch_history_result(self):
        """Return the per-type object counts to store in fetch_history"""
        ids_by_type = self.type_to_ids
        result = {
            'contents': len(ids_by_type[b'blob']),
            'directories': len(ids_by_type[b'tree']),
            'revisions': len(ids_by_type[b'commit']),
            'releases': len(ids_by_type[b'tag']),
            'occurrences': len(self.repo.refs.allkeys()),
        }
        return result

    def save_data(self):
        """Nothing to save: the data is already available locally"""

    def eventful(self):
        """A load is always reported as eventful"""
        return True
class GitLoaderFromArchive(GitLoader):
    """Load a git repository out of an archive.

    The archive is uncompressed in a temporary location, the resulting
    repository is loaded as GitLoader does, and the temporary location
    is removed once the load is over, whether it succeeded or not.

    """
    CONFIG_BASE_FILENAME = 'loader/zip-git-loader'

    # Class-level defaults: load() reads these attributes in its cleanup
    # step, which must not fail when prepare() never got to set them
    # (e.g. the archive could not be uncompressed).
    temp_dir = None
    repo_path = None
    project_name = None

    def prepare(self, origin_url, archive_path, fetch_date):
        """1. Uncompress the archive in temporary location.
           2. Prepare as the GitLoader does

        Args:
            origin_url: url of the origin being loaded
            archive_path: path to the archive holding the git repository
            fetch_date: date of the fetch, passed through to GitLoader

        """
        self.temp_dir, self.repo_path = utils.init_git_repo_from_archive(
            archive_path)
        self.project_name = os.path.basename(self.repo_path)

        self.log.info('Project %s - Uncompressing archive %s at %s',
                      self.project_name, os.path.basename(archive_path),
                      self.repo_path)
        super().prepare(origin_url, self.repo_path, fetch_date)

    def load(self, *args, **kwargs):
        """1. Load as GitLoader does
           2. Finally clean up temporary location

        Returns:
            Whatever the parent load() returns (callers such as the
            scheduler task forward that value).

        """
        try:
            # Propagate the parent's result instead of dropping it.
            return super().load(*args, **kwargs)
        finally:
            # temp_dir is still None when prepare() did not run or failed
            # early; in that case there is nothing to remove.
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
            self.log.info('Project %s - Done injecting %s',
                          self.project_name, self.repo_path)


if __name__ == '__main__':
    import logging
    import sys

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(process)d %(message)s'
    )
    loader = GitLoader()

    origin_url = sys.argv[1]
    directory = sys.argv[2]
    fetch_date = datetime.datetime.now(tz=datetime.timezone.utc)

    print(loader.load(origin_url, directory, fetch_date))
# TODO: rename to LoadRemoteGitRepository
class UpdateGitRepository(Task):
    """Task importing a git repository from a remote location"""
    task_queue = 'swh_loader_git'

    def run(self, repo_url, base_url=None):
        """Load the repository at `repo_url` through the bulk updater"""
        updater = BulkUpdater()
        updater.log = self.log

        result = updater.load(repo_url, base_url)
        return result


class LoadDiskGitRepository(Task):
    """Task importing a git repository from disk"""
    task_queue = 'swh_loader_git'

    def run(self, origin_url, directory, date):
        """Load the git repository cloned in `directory` from `origin_url`;
        `date` is a date string parsed before being given to the loader."""
        disk_loader = GitLoader()
        disk_loader.log = self.log

        fetch_date = dateutil.parser.parse(date)
        return disk_loader.load(origin_url, directory, fetch_date)


class UncompressAndLoadDiskGitRepository(Task):
    """Task importing a git repository from a zip archive"""
    task_queue = 'swh_loader_git_express'

    def run(self, origin_url, archive_path, date):
        """Load the git repository stored in the archive `archive_path`.

        The whole work (uncompressing in a temporary folder, loading,
        cleaning up) is delegated to GitLoaderFromArchive; `date` is a
        date string parsed before being given to the loader.

        """
        archive_loader = GitLoaderFromArchive()
        archive_loader.log = self.log

        fetch_date = dateutil.parser.parse(date)
        return archive_loader.load(origin_url, archive_path, fetch_date)


class ReaderGitRepository(Task):
    task_queue = 'swh_reader_git'

    def run(self, repo_url, base_url=None):
        """Read the git repository at `repo_url` and send its sha1 to
        archival.

        `base_url` is accepted but not forwarded to the reader.

        """
        reader = GitSha1RemoteReaderAndSendToQueue()
        reader.log = self.log

        result = reader.load(repo_url)
        return result
""" loader = GitSha1RemoteReaderAndSendToQueue() loader.log = self.log return loader.load(repo_url) diff --git a/swh/loader/git/utils.py b/swh/loader/git/utils.py new file mode 100644 index 0000000..ac4bd49 --- /dev/null +++ b/swh/loader/git/utils.py @@ -0,0 +1,48 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import shutil +import tempfile + +from subprocess import call + + +def init_git_repo_from_archive(archive_path, root_temp_dir='/tmp'): + """Given a path to an archive containing a git repository. + Uncompress that archive to a temporary location and returns the path. + + If any problem whatsoever is raised, clean up the temporary location. + + Returns: + A tuple: + - temporary folder: containing the mounted repository + - repo_path, path to the mounted repository inside the temporary folder + + Raises: + ValueError in case of failure to run the command to uncompress + + """ + project_name = os.path.basename(os.path.dirname(archive_path)) + temp_dir = tempfile.mkdtemp(suffix='.swh.loader.git', + prefix='tmp.', + dir=root_temp_dir) + + try: + repo_path = os.path.join(temp_dir, project_name) + + # create the repository that will be loaded with the dump + cmd = ['unzip', '-q', '-o', archive_path, '-d', temp_dir] + r = call(cmd) + + if r != 0: + raise ValueError( + 'Failed to uncompress git repository for %s' % + project_name) + + return temp_dir, repo_path + except Exception as e: + shutil.rmtree(temp_dir) + raise e