diff --git a/PKG-INFO b/PKG-INFO index 5a55c46..a3e4f67 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.28 +Version: 0.0.29 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 5a55c46..a3e4f67 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.28 +Version: 0.0.29 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py index 175b90b..76be640 100644 --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -1,181 +1,185 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import dulwich.repo import os import shutil from collections import defaultdict from swh.core import hashutil from . import base, converters, utils class GitLoader(base.BaseLoader): """Load a git repository from a directory. """ CONFIG_BASE_FILENAME = 'loader/git-loader' def prepare(self, origin_url, directory, fetch_date): self.origin_url = origin_url self.repo = dulwich.repo.Repo(directory) self.fetch_date = fetch_date def get_origin(self): """Get the origin that is currently being loaded""" return converters.origin_url_to_origin(self.origin_url) def iter_objects(self): object_store = self.repo.object_store for pack in object_store.packs: objs = list(pack.index.iterentries()) objs.sort(key=lambda x: x[1]) for sha, offset, crc32 in objs: yield hashutil.hash_to_bytehex(sha) yield from object_store._iter_loose_objects() yield from object_store._iter_alternate_objects() def fetch_data(self): """Fetch the data from the data source""" type_to_ids = defaultdict(list) for oid in self.iter_objects(): type_name = self.repo[oid].type_name type_to_ids[type_name].append(oid) self.type_to_ids = type_to_ids def has_contents(self): """Checks whether we need to load contents""" return bool(self.type_to_ids[b'blob']) def get_contents(self): """Get the contents that need to be loaded""" max_content_size = self.config['content_size_limit'] for oid in self.type_to_ids[b'blob']: yield converters.dulwich_blob_to_content( self.repo[oid], log=self.log, max_content_size=max_content_size, origin_id=self.origin_id) def has_directories(self): """Checks whether we need to load directories""" return bool(self.type_to_ids[b'tree']) def get_directories(self): """Get the directories that need to be loaded""" for oid in self.type_to_ids[b'tree']: yield converters.dulwich_tree_to_directory( self.repo[oid], log=self.log) def has_revisions(self): """Checks whether we need to load revisions""" return bool(self.type_to_ids[b'commit']) def get_revisions(self): """Get the revisions that need to be loaded""" for oid in self.type_to_ids[b'commit']: yield converters.dulwich_commit_to_revision( self.repo[oid], log=self.log) def has_releases(self): """Checks whether we need to load releases""" return bool(self.type_to_ids[b'tag']) def get_releases(self): """Get the releases that need to be loaded""" for oid in self.type_to_ids[b'tag']: yield converters.dulwich_tag_to_release( self.repo[oid], log=self.log) def has_occurrences(self): """Checks whether we need to load occurrences""" return True def get_occurrences(self): """Get the occurrences that need to be loaded""" repo = self.repo origin_id = self.origin_id visit = self.visit for ref, target in repo.refs.as_dict().items(): target_type_name = repo[target].type_name target_type = converters.DULWICH_TYPES[target_type_name] yield { 'branch': ref, 'origin': origin_id, 'target': hashutil.bytehex_to_hash(target), 'target_type': target_type, 'visit': visit, } def get_fetch_history_result(self): """Return the data to store in fetch_history for the current loader""" return { 'contents': len(self.type_to_ids[b'blob']), 'directories': len(self.type_to_ids[b'tree']), 'revisions': len(self.type_to_ids[b'commit']), 'releases': len(self.type_to_ids[b'tag']), 'occurrences': len(self.repo.refs.allkeys()), } def save_data(self): """We already have the data locally, no need to save it""" pass def eventful(self): """Whether the load was eventful""" return True class GitLoaderFromArchive(GitLoader): """Load a git repository from an archive. """ - CONFIG_BASE_FILENAME = 'loader/archive-git-loader' + def project_name_from_archive(self, archive_path): + """Compute the project name from the archive's path. + + """ + return os.path.basename(os.path.dirname(archive_path)) def prepare(self, origin_url, archive_path, fetch_date): """1. Uncompress the archive in temporary location. 2. Prepare as the GitLoader does 3. Load as GitLoader does """ + project_name = self.project_name_from_archive(archive_path) self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( - archive_path) - self.project_name = os.path.basename(self.repo_path) + project_name, archive_path) self.log.info('Project %s - Uncompressing archive %s at %s' % ( - self.project_name, os.path.basename(archive_path), self.repo_path)) + origin_url, os.path.basename(archive_path), self.repo_path)) super().prepare(origin_url, self.repo_path, fetch_date) def cleanup(self): """Cleanup the temporary location (if it exists). """ if self.temp_dir and os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) self.log.info('Project %s - Done injecting %s' % ( - self.project_name, self.repo_path)) + self.origin_url, self.repo_path)) if __name__ == '__main__': import logging import sys logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) loader = GitLoader() origin_url = sys.argv[1] directory = sys.argv[2] fetch_date = datetime.datetime.now(tz=datetime.timezone.utc) print(loader.load(origin_url, directory, fetch_date)) diff --git a/swh/loader/git/reader.py b/swh/loader/git/reader.py index 634e23d..6646e63 100644 --- a/swh/loader/git/reader.py +++ b/swh/loader/git/reader.py @@ -1,204 +1,255 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import click +from collections import defaultdict import logging +import pprint -from collections import defaultdict +import click from swh.core import hashutil, utils from .updater import BulkUpdater, SWHRepoRepresentation from . import converters class SWHRepoFullRepresentation(SWHRepoRepresentation): """Overridden representation of a swh repository to permit to read completely the remote repository. """ def __init__(self, storage, origin_id, occurrences=None): self.storage = storage self._parents_cache = {} self._type_cache = {} self.heads = set() def determine_wants(self, refs): """Filter the remote references to figure out which ones Software Heritage needs. In this particular context, we want to know everything. """ if not refs: return [] for target in refs.values(): self.heads.add(target) return self.filter_unwanted_refs(refs).values() def find_remote_ref_types_in_swh(self, remote_refs): """Find the known swh remote. In that particular context, we know nothing. """ return {} class DummyGraphWalker(object): """Dummy graph walker which claims that the client doesn’t have any objects. """ def ack(self, sha): pass def next(self): pass def __next__(self): pass -class GitSha1RemoteReader(BulkUpdater): - """Read sha1 git from a remote repository and dump only repository's - content sha1 as list. - - """ +class BaseGitRemoteReader(BulkUpdater): CONFIG_BASE_FILENAME = 'loader/git-remote-reader' ADDITIONAL_CONFIG = { 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), 'pack_storage_base': ('str', ''), # don't want to store packs so empty 'next_task': ( 'dict', { 'queue': 'swh.storage.archiver.tasks.SWHArchiverToBackendTask', 'batch_size': 100, 'destination': 'azure' } ) } def __init__(self): super().__init__(SWHRepoFullRepresentation) self.next_task = self.config['next_task'] self.batch_size = self.next_task['batch_size'] self.task_destination = self.next_task['queue'] self.destination = self.next_task['destination'] def graph_walker(self): return DummyGraphWalker() def prepare(self, origin_url, base_url=None): """Only retrieve information about the origin, set everything else to empty. """ - ori = converters.origin_url_to_origin(origin_url) - self.origin = self.storage.origin_get(ori) - self.origin_id = self.origin['id'] + self.origin = converters.origin_url_to_origin(origin_url) + self.origin_id = 0 self.base_occurrences = [] - self.base_origin_id = self.origin['id'] + self.base_origin_id = 0 + + def keep_object(self, obj): + """Do we want to keep this object or not?""" + raise NotImplementedError('Please implement keep_object') + + def get_id_and_data(self, obj): + """Get the id, type and data of the given object""" + raise NotImplementedError('Please implement get_id_and_data') def list_pack(self, pack_data, pack_size): """Override list_pack to only keep contents' sha1. Returns: id_to_type (dict): keys are sha1, values are their associated type type_to_ids (dict): keys are types, values are list of associated - ids (sha1 for blobs) + data (sha1 for blobs) """ + self.data = {} id_to_type = {} type_to_ids = defaultdict(set) inflater = self.get_inflater() for obj in inflater: - type = obj.type_name - if type != b'blob': # don't keep other types + if not self.keep_object(obj): continue - # compute the sha1 (obj.id is the sha1_git) - data = obj.as_raw_string() - hashes = hashutil.hashdata(data, {'sha1'}) - oid = hashes['sha1'] + object_id, type, data = self.get_id_and_data(obj) - id_to_type[oid] = type - type_to_ids[type].add(oid) + id_to_type[object_id] = type + type_to_ids[type].add(object_id) + self.data[object_id] = data return id_to_type, type_to_ids def load(self, *args, **kwargs): """Override the loading part which simply reads the repository's contents' sha1. Returns: Returns the list of discovered sha1s for that origin. """ - try: - self.prepare(*args, **kwargs) - except: - self.log.error('Unknown repository, skipping...') - return [] - + self.prepare(*args, **kwargs) self.fetch_data() - return self.type_to_ids[b'blob'] + + +class GitSha1RemoteReader(BaseGitRemoteReader): + """Read sha1 git from a remote repository and dump only repository's + content sha1 as list. + + """ + def keep_object(self, obj): + """Only keep blobs""" + return obj.type_name == b'blob' + + def get_id_and_data(self, obj): + """We want to store only object identifiers""" + # compute the sha1 (obj.id is the sha1_git) + data = obj.as_raw_string() + hashes = hashutil.hashdata(data, {'sha1'}) + oid = hashes['sha1'] + return (oid, b'blob', oid) class GitSha1RemoteReaderAndSendToQueue(GitSha1RemoteReader): """Read sha1 git from a remote repository and dump only repository's content sha1 as list and send batch of those sha1s to a celery queue for consumption. """ def load(self, *args, **kwargs): """Retrieve the list of sha1s for a particular origin and send those sha1s as group of sha1s to a specific queue. """ - data = super().load(*args, **kwargs) + super().load(*args, **kwargs) + + data = self.type_to_ids[b'blob'] from swh.scheduler.celery_backend.config import app try: # optional dependency from swh.storage.archiver import tasks # noqa except ImportError: pass from celery import group task_destination = app.tasks[self.task_destination] groups = [] for ids in utils.grouper(data, self.batch_size): sig_ids = task_destination.s(destination=self.destination, batch=list(ids)) groups.append(sig_ids) group(groups).delay() return data -@click.command() -@click.option('--origin-url', help='Origin\'s url') -@click.option('--send/--nosend', default=False, help='Origin\'s url') -def main(origin_url, send): +class GitCommitRemoteReader(BaseGitRemoteReader): + def keep_object(self, obj): + return obj.type_name == b'commit' + + def get_id_and_data(self, obj): + return obj.id, b'commit', converters.dulwich_commit_to_revision(obj) + + def load(self, *args, **kwargs): + super().load(*args, **kwargs) + return self.data + + +@click.group() +@click.option('--origin-url', help='Origin url') +@click.pass_context +def main(ctx, origin_url): logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) + ctx.obj['origin_url'] = origin_url + + +@main.command() +@click.option('--send/--nosend', default=False, help='Origin\'s url') +@click.pass_context +def blobs(ctx, send): + origin_url = ctx.obj['origin_url'] if send: loader = GitSha1RemoteReaderAndSendToQueue() ids = loader.load(origin_url) print('%s sha1s were sent to queue' % len(ids)) return loader = GitSha1RemoteReader() ids = loader.load(origin_url) if ids: for oid in ids: print(hashutil.hash_to_hex(oid)) +@main.command() +@click.option('--ids-only', is_flag=True, help='print ids only') +@click.pass_context +def commits(ctx, ids_only): + origin_url = ctx.obj['origin_url'] + + reader = GitCommitRemoteReader() + commits = reader.load(origin_url) + for commit_id, commit in commits.items(): + if ids_only: + print(commit_id.decode()) + else: + pprint.pprint(commit) + + if __name__ == '__main__': - main() + main(obj={}) diff --git a/swh/loader/git/utils.py b/swh/loader/git/utils.py index ac4bd49..e5b2825 100644 --- a/swh/loader/git/utils.py +++ b/swh/loader/git/utils.py @@ -1,48 +1,49 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import tempfile from subprocess import call -def init_git_repo_from_archive(archive_path, root_temp_dir='/tmp'): +def init_git_repo_from_archive(project_name, archive_path, + root_temp_dir='/tmp'): """Given a path to an archive containing a git repository. + Uncompress that archive to a temporary location and returns the path. If any problem whatsoever is raised, clean up the temporary location. - Returns: + Args: + project_name (str): Project's name + archive_path (str): Full path to the archive + root_temp_dir (str): Optional temporary directory mount point + (default to /tmp) + + Returns A tuple: - temporary folder: containing the mounted repository - repo_path, path to the mounted repository inside the temporary folder - Raises: + Raises ValueError in case of failure to run the command to uncompress """ - project_name = os.path.basename(os.path.dirname(archive_path)) - temp_dir = tempfile.mkdtemp(suffix='.swh.loader.git', - prefix='tmp.', - dir=root_temp_dir) + temp_dir = tempfile.mkdtemp( + suffix='.swh.loader.git', prefix='tmp.', dir=root_temp_dir) try: - repo_path = os.path.join(temp_dir, project_name) - # create the repository that will be loaded with the dump - cmd = ['unzip', '-q', '-o', archive_path, '-d', temp_dir] - r = call(cmd) - + r = call(['unzip', '-q', '-o', archive_path, '-d', temp_dir]) if r != 0: - raise ValueError( - 'Failed to uncompress git repository for %s' % - project_name) + raise ValueError('Failed to uncompress archive %s' % archive_path) + repo_path = os.path.join(temp_dir, project_name) return temp_dir, repo_path except Exception as e: shutil.rmtree(temp_dir) raise e diff --git a/version.txt b/version.txt index 7f0ac6c..b39233c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.28-0-g1e8df3b \ No newline at end of file +v0.0.29-0-gdbd5ca3 \ No newline at end of file