diff --git a/debian/control b/debian/control
index 6876512..49926cb 100644
--- a/debian/control
+++ b/debian/control
@@ -1,31 +1,31 @@
 Source: swh-loader-git
 Maintainer: Software Heritage developers
 Section: python
 Priority: optional
 Build-Depends: debhelper (>= 9),
                dh-python (>= 2),
                python3-all,
                python3-click,
                python3-dulwich (>= 0.18.7~),
                python3-nose,
                python3-retrying,
                python3-setuptools,
                python3-swh.core (>= 0.0.7~),
-               python3-swh.loader.core (>= 0.0.22),
+               python3-swh.loader.core (>= 0.0.28),
                python3-swh.model (>= 0.0.15~),
                python3-swh.scheduler (>= 0.0.14~),
                python3-swh.storage (>= 0.0.83~),
                python3-vcversioner
 Standards-Version: 3.9.6
 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/
 
 Package: python3-swh.loader.git
 Architecture: all
 Depends: python3-swh.core (>= 0.0.7~),
-         python3-swh.loader.core (>= 0.0.22~),
+         python3-swh.loader.core (>= 0.0.28~),
          python3-swh.model (>= 0.0.15~),
          python3-swh.scheduler (>= 0.0.14~),
          python3-swh.storage (>= 0.0.83~),
          ${misc:Depends},
          ${python3:Depends}
 Description: Software Heritage Git loader
diff --git a/requirements-swh.txt b/requirements-swh.txt
index 187f8ca..8cc4f47 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 0.0.7
-swh.loader.core >= 0.0.22
+swh.loader.core >= 0.0.28
 swh.model >= 0.0.15
 swh.scheduler >= 0.0.14
 swh.storage >= 0.0.83
diff --git a/swh/loader/git/base.py b/swh/loader/git/base.py
deleted file mode 100644
index c135a68..0000000
--- a/swh/loader/git/base.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import abc
-import os
-
-from swh.loader.core.loader import SWHLoader
-
-
-class BaseLoader(SWHLoader):
-    """This base class is a pattern for loaders.
-
-    The external calling convention is as such:
-
-    - instantiate the class once (loads storage and the configuration)
-    - for each origin, call load with the origin-specific arguments (for
-      instance, an origin URL).
- - load calls several methods that must be implemented in subclasses: - - - prepare(\*args, \**kwargs) prepares the loader for the new origin - - get_origin gets the origin object associated to the current loader - - fetch_data downloads the necessary data from the origin - - get_{contents,directories,revisions,releases,occurrences} retrieve each - kind of object from the origin - - has\_* checks whether there are some objects to load for that object type - - get_fetch_history_result retrieves the data to insert in the - fetch_history table once the load was successful - - cleanup cleans up an eventual state installed for computations - - eventful returns whether the load was eventful or not - - """ - DEFAULT_CONFIG = { - 'storage': ('dict', { - 'cls': 'remote', - 'args': { - 'url': 'http://localhost:5002/' - }, - }), - 'send_contents': ('bool', True), - 'send_directories': ('bool', True), - 'send_revisions': ('bool', True), - 'send_releases': ('bool', True), - 'send_occurrences': ('bool', True), - - 'save_data': ('bool', False), - 'save_data_path': ('str', ''), - - 'content_packet_size': ('int', 10000), - 'content_packet_size_bytes': ('int', 1024 * 1024 * 1024), - 'directory_packet_size': ('int', 25000), - 'revision_packet_size': ('int', 100000), - 'release_packet_size': ('int', 100000), - 'occurrence_packet_size': ('int', 100000), - } - - def __init__(self): - super().__init__(logging_class='swh.loader.git.BulkLoader') - - # Make sure the config is sane - if self.config['save_data']: - path = self.config['save_data_path'] - os.stat(path) - if not os.access(path, os.R_OK | os.W_OK): - raise PermissionError("Permission denied: %r" % path) - - self.visit_date = None # possibly overridden in self.prepare method - - @abc.abstractmethod - def has_contents(self): - """Checks whether we need to load contents""" - pass - - def get_contents(self): - """Get the contents that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_directories(self): - """Checks whether we need to load directories""" - pass - - def get_directories(self): - """Get the directories that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_revisions(self): - """Checks whether we need to load revisions""" - - def get_revisions(self): - """Get the revisions that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_releases(self): - """Checks whether we need to load releases""" - return True - - def get_releases(self): - """Get the releases that need to be loaded""" - raise NotImplementedError - - @abc.abstractmethod - def has_occurrences(self): - """Checks whether we need to load occurrences""" - pass - - def get_occurrences(self): - """Get the occurrences that need to be loaded""" - raise NotImplementedError - - def get_fetch_history_result(self): - """Return the data to store in fetch_history for the current loader""" - raise NotImplementedError - - def eventful(self): - """Whether the load was eventful""" - raise NotImplementedError - - def save_data(self): - """Save the data associated to the current load""" - raise NotImplementedError - - def get_save_data_path(self): - """The path to which we save the data""" - if not hasattr(self, '__save_data_path'): - origin_id = self.origin_id - year = str(self.visit_date.year) - - path = os.path.join( - self.config['save_data_path'], - "%04d" % (origin_id % 10000), - "%08d" % origin_id, - year, - ) - - os.makedirs(path, exist_ok=True) - self.__save_data_path = path - - return 
self.__save_data_path - - def cleanup(self): - """Clean up an eventual state installed for computations. - Nothing specific for the loader-git is needed. - - """ - pass - - def store_data(self): - """Store data fetched from the git repository. - - """ - if self.config['save_data']: - self.save_data() - - if self.config['send_contents'] and self.has_contents(): - self.send_batch_contents(self.get_contents()) - if self.config['send_directories'] and self.has_directories(): - self.send_batch_directories(self.get_directories()) - if self.config['send_revisions'] and self.has_revisions(): - self.send_batch_revisions(self.get_revisions()) - if self.config['send_releases'] and self.has_releases(): - self.send_batch_releases(self.get_releases()) - if self.config['send_occurrences'] and self.has_occurrences(): - self.send_batch_occurrences(self.get_occurrences()) diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py index 785ee4a..b7a2720 100644 --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -1,295 +1,299 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import dulwich.repo import os import shutil from dulwich.errors import ObjectFormatException, EmptyFileException from collections import defaultdict from swh.model import hashutil -from . import base, converters, utils +from swh.loader.core.loader import SWHStatelessLoader +from . import converters, utils -class GitLoader(base.BaseLoader): +class GitLoader(SWHStatelessLoader): """Load a git repository from a directory. """ CONFIG_BASE_FILENAME = 'loader/git-loader' + def __init__(self, config=None): + super().__init__(logging_class='swh.loader.git.Loader', config=config) + def prepare(self, origin_url, directory, visit_date): self.origin_url = origin_url self.origin = self.get_origin() self.repo = dulwich.repo.Repo(directory) self.visit_date = visit_date def get_origin(self): """Get the origin that is currently being loaded""" return converters.origin_url_to_origin(self.origin_url) def iter_objects(self): object_store = self.repo.object_store for pack in object_store.packs: objs = list(pack.index.iterentries()) objs.sort(key=lambda x: x[1]) for sha, offset, crc32 in objs: yield hashutil.hash_to_bytehex(sha) yield from object_store._iter_loose_objects() yield from object_store._iter_alternate_objects() def _check(self, obj): """Check the object's repository representation. If any errors in check exists, an ObjectFormatException is raised. Args: obj (object): Dulwich object read from the repository. """ obj.check() from dulwich.objects import Commit, Tag try: # For additional checks on dulwich objects with date # for now, only checks on *time if isinstance(obj, Commit): commit_time = obj._commit_time utils.check_date_time(commit_time) author_time = obj._author_time utils.check_date_time(author_time) elif isinstance(obj, Tag): tag_time = obj._tag_time utils.check_date_time(tag_time) except Exception as e: raise ObjectFormatException(e) def get_object(self, oid): """Given an object id, return the object if it is found and not malformed in some way. 
Args: oid (bytes): the object's identifier Returns: The object if found without malformation """ try: # some errors are raised when reading the object obj = self.repo[oid] # some we need to check ourselves self._check(obj) except KeyError: _id = oid.decode('utf-8') self.log.warn('object %s not found, skipping' % _id, extra={ 'swh_type': 'swh_loader_git_missing_object', 'swh_object_id': _id, 'origin_id': self.origin_id, }) return None except ObjectFormatException: _id = oid.decode('utf-8') self.log.warn('object %s malformed, skipping' % _id, extra={ 'swh_type': 'swh_loader_git_missing_object', 'swh_object_id': _id, 'origin_id': self.origin_id, }) return None except EmptyFileException: _id = oid.decode('utf-8') self.log.warn('object %s corrupted (empty file), skipping' % _id, extra={ 'swh_type': 'swh_loader_git_missing_object', 'swh_object_id': _id, 'origin_id': self.origin_id, }) else: return obj def fetch_data(self): """Fetch the data from the data source""" type_to_ids = defaultdict(list) for oid in self.iter_objects(): obj = self.get_object(oid) if not obj: continue type_name = obj.type_name type_to_ids[type_name].append(oid) self.type_to_ids = type_to_ids def has_contents(self): """Checks whether we need to load contents""" return bool(self.type_to_ids[b'blob']) def get_content_ids(self): """Get the content identifiers from the git repository""" for oid in self.type_to_ids[b'blob']: yield converters.dulwich_blob_to_content_id(self.repo[oid]) def get_contents(self): """Get the contents that need to be loaded""" max_content_size = self.config['content_size_limit'] missing_contents = set(self.storage.content_missing( self.get_content_ids(), 'sha1_git')) for oid in missing_contents: yield converters.dulwich_blob_to_content( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log, max_content_size=max_content_size, origin_id=self.origin_id) def has_directories(self): """Checks whether we need to load directories""" return bool(self.type_to_ids[b'tree']) def get_directory_ids(self): """Get the directory identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tree']) def get_directories(self): """Get the directories that need to be loaded""" missing_dirs = set(self.storage.directory_missing( sorted(self.get_directory_ids()))) for oid in missing_dirs: yield converters.dulwich_tree_to_directory( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log) def has_revisions(self): """Checks whether we need to load revisions""" return bool(self.type_to_ids[b'commit']) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'commit']) def get_revisions(self): """Get the revisions that need to be loaded""" missing_revs = set(self.storage.revision_missing( sorted(self.get_revision_ids()))) for oid in missing_revs: yield converters.dulwich_commit_to_revision( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log) def has_releases(self): """Checks whether we need to load releases""" return bool(self.type_to_ids[b'tag']) def get_release_ids(self): """Get the release identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tag']) def get_releases(self): """Get the releases that need to be loaded""" missing_rels = set(self.storage.release_missing( sorted(self.get_release_ids()))) for oid in missing_rels: yield converters.dulwich_tag_to_release( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log) 
def has_occurrences(self): """Checks whether we need to load occurrences""" return True def get_occurrences(self): """Get the occurrences that need to be loaded""" origin_id = self.origin_id visit = self.visit ref_objs = ((refs, target, self.get_object(target)) for refs, target in self.repo.refs.as_dict().items() if self.get_object(target)) for ref, target, obj in ref_objs: target_type_name = obj.type_name target_type = converters.DULWICH_TYPES[target_type_name] yield { 'branch': ref, 'origin': origin_id, 'target': hashutil.bytehex_to_hash(target), 'target_type': target_type, 'visit': visit, } def get_fetch_history_result(self): """Return the data to store in fetch_history for the current loader""" return { 'contents': len(self.type_to_ids[b'blob']), 'directories': len(self.type_to_ids[b'tree']), 'revisions': len(self.type_to_ids[b'commit']), 'releases': len(self.type_to_ids[b'tag']), 'occurrences': len(self.repo.refs.allkeys()), } def save_data(self): """We already have the data locally, no need to save it""" pass def eventful(self): """Whether the load was eventful""" return True class GitLoaderFromArchive(GitLoader): """Load a git repository from an archive. """ def project_name_from_archive(self, archive_path): """Compute the project name from the archive's path. """ return os.path.basename(os.path.dirname(archive_path)) def prepare(self, origin_url, archive_path, visit_date): """1. Uncompress the archive in temporary location. 2. Prepare as the GitLoader does 3. Load as GitLoader does """ project_name = self.project_name_from_archive(archive_path) self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( project_name, archive_path) self.log.info('Project %s - Uncompressing archive %s at %s' % ( origin_url, os.path.basename(archive_path), self.repo_path)) super().prepare(origin_url, self.repo_path, visit_date) def cleanup(self): """Cleanup the temporary location (if it exists). """ if self.temp_dir and os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) self.log.info('Project %s - Done injecting %s' % ( self.origin_url, self.repo_path)) if __name__ == '__main__': import logging import sys logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) loader = GitLoader() origin_url = sys.argv[1] directory = sys.argv[2] visit_date = datetime.datetime.now(tz=datetime.timezone.utc) print(loader.load(origin_url, directory, visit_date)) diff --git a/swh/loader/git/updater.py b/swh/loader/git/updater.py index bb02b54..2782e88 100644 --- a/swh/loader/git/updater.py +++ b/swh/loader/git/updater.py @@ -1,480 +1,481 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import BytesIO import datetime import logging import os import pickle import sys from collections import defaultdict import dulwich.client from dulwich.object_store import ObjectStoreGraphWalker from dulwich.pack import PackData, PackInflater from urllib.parse import urlparse from swh.model import hashutil - -from . import base, converters +from swh.loader.core.loader import SWHStatelessLoader +from . 
import converters class SWHRepoRepresentation: """Repository representation for a Software Heritage origin.""" def __init__(self, storage, origin_id, occurrences=None): self.storage = storage self._parents_cache = {} self._type_cache = {} if origin_id: self.heads = set(self._cache_heads(origin_id, occurrences)) else: self.heads = set() def _fill_parents_cache(self, commits): """When querying for a commit's parents, we fill the cache to a depth of 1000 commits.""" root_revs = self._encode_for_storage(commits) for rev, parents in self.storage.revision_shortlog(root_revs, 1000): rev_id = hashutil.hash_to_bytehex(rev) if rev_id not in self._parents_cache: self._parents_cache[rev_id] = [ hashutil.hash_to_bytehex(parent) for parent in parents ] for rev in commits: if rev not in self._parents_cache: self._parents_cache[rev] = [] def _cache_heads(self, origin_id, occurrences): """Return all the known head commits for `origin_id`""" if not occurrences: occurrences = self.storage.occurrence_get(origin_id) return self._decode_from_storage( occurrence['target'] for occurrence in occurrences ) def get_parents(self, commit): """get the parent commits for `commit`""" # Prime the parents cache if not self._parents_cache and self.heads: self._fill_parents_cache(self.heads) if commit not in self._parents_cache: self._fill_parents_cache([commit]) return self._parents_cache[commit] def get_heads(self): return self.heads @staticmethod def _encode_for_storage(objects): return [hashutil.bytehex_to_hash(object) for object in objects] @staticmethod def _decode_from_storage(objects): return set(hashutil.hash_to_bytehex(object) for object in objects) def graph_walker(self): return ObjectStoreGraphWalker(self.get_heads(), self.get_parents) @staticmethod def filter_unwanted_refs(refs): """Filter the unwanted references from refs""" ret = {} for ref, val in refs.items(): if ref.endswith(b'^{}'): # Peeled refs make the git protocol explode continue elif ref.startswith(b'refs/pull/') and ref.endswith(b'/merge'): # We filter-out auto-merged GitHub pull requests continue else: ret[ref] = val return ret def determine_wants(self, refs): """Filter the remote references to figure out which ones Software Heritage needs. """ if not refs: return [] # Find what objects Software Heritage has refs = self.find_remote_ref_types_in_swh(refs) # Cache the objects found in swh as existing heads for target in refs.values(): if target['target_type'] is not None: self.heads.add(target['target']) ret = set() for target in self.filter_unwanted_refs(refs).values(): if target['target_type'] is None: # The target doesn't exist in Software Heritage, let's retrieve # it. ret.add(target['target']) return list(ret) def get_stored_objects(self, objects): return self.storage.object_find_by_sha1_git( self._encode_for_storage(objects)) def find_remote_ref_types_in_swh(self, remote_refs): """Parse the remote refs information and list the objects that exist in Software Heritage. 
""" all_objs = set(remote_refs.values()) - set(self._type_cache) type_by_id = {} for id, objs in self.get_stored_objects(all_objs).items(): id = hashutil.hash_to_bytehex(id) if objs: type_by_id[id] = objs[0]['type'] self._type_cache.update(type_by_id) ret = {} for ref, id in remote_refs.items(): ret[ref] = { 'target': id, 'target_type': self._type_cache.get(id), } return ret -class BulkUpdater(base.BaseLoader): +class BulkUpdater(SWHStatelessLoader): """A bulk loader for a git repository""" CONFIG_BASE_FILENAME = 'loader/git-updater' ADDITIONAL_CONFIG = { 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), } - def __init__(self, repo_representation=SWHRepoRepresentation): + def __init__(self, repo_representation=SWHRepoRepresentation, config=None): """Initialize the bulk updater. Args: repo_representation: swh's repository representation which is in charge of filtering between known and remote data. """ - super().__init__() + super().__init__(logging_class='swh.loader.git.BulkLoader', + config=config) self.repo_representation = repo_representation def fetch_pack_from_origin(self, origin_url, base_origin_id, base_occurrences, do_activity): """Fetch a pack from the origin""" pack_buffer = BytesIO() base_repo = self.repo_representation(self.storage, base_origin_id, base_occurrences) parsed_uri = urlparse(origin_url) path = parsed_uri.path if not path.endswith('.git'): path += '.git' client = dulwich.client.TCPGitClient(parsed_uri.netloc, thin_packs=False) size_limit = self.config['pack_size_bytes'] def do_pack(data, pack_buffer=pack_buffer, limit=size_limit, origin_url=origin_url): cur_size = pack_buffer.tell() would_write = len(data) if cur_size + would_write > limit: raise IOError('Pack file too big for repository %s, ' 'limit is %d bytes, current size is %d, ' 'would write %d' % (origin_url, limit, cur_size, would_write)) pack_buffer.write(data) remote_refs = client.fetch_pack(path.encode('ascii'), base_repo.determine_wants, base_repo.graph_walker(), do_pack, progress=do_activity) if remote_refs: local_refs = base_repo.find_remote_ref_types_in_swh(remote_refs) else: local_refs = remote_refs = {} pack_buffer.flush() pack_size = pack_buffer.tell() pack_buffer.seek(0) return { 'remote_refs': base_repo.filter_unwanted_refs(remote_refs), 'local_refs': local_refs, 'pack_buffer': pack_buffer, 'pack_size': pack_size, } def list_pack(self, pack_data, pack_size): id_to_type = {} type_to_ids = defaultdict(set) inflater = self.get_inflater() for obj in inflater: type, id = obj.type_name, obj.id id_to_type[id] = type type_to_ids[type].add(id) return id_to_type, type_to_ids def prepare(self, origin_url, base_url=None): self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) origin = converters.origin_url_to_origin(origin_url) base_origin = converters.origin_url_to_origin(base_url) base_occurrences = [] base_origin_id = origin_id = None db_origin = self.storage.origin_get(origin) if db_origin: base_origin_id = origin_id = db_origin['id'] if origin_id: base_occurrences = self.storage.occurrence_get(origin_id) if base_url and not base_occurrences: base_origin = self.storage.origin_get(base_origin) if base_origin: base_origin_id = base_origin['id'] base_occurrences = self.storage.occurrence_get(base_origin_id) self.base_occurrences = list(sorted(base_occurrences, key=lambda occ: occ['branch'])) self.base_origin_id = base_origin_id self.origin = origin def get_origin(self): return self.origin def fetch_data(self): def do_progress(msg): sys.stderr.buffer.write(msg) sys.stderr.flush() fetch_info = 
self.fetch_pack_from_origin( self.origin['url'], self.base_origin_id, self.base_occurrences, do_progress) self.pack_buffer = fetch_info['pack_buffer'] self.pack_size = fetch_info['pack_size'] self.remote_refs = fetch_info['remote_refs'] self.local_refs = fetch_info['local_refs'] origin_url = self.origin['url'] self.log.info('Listed %d refs for repo %s' % ( len(self.remote_refs), origin_url), extra={ 'swh_type': 'git_repo_list_refs', 'swh_repo': origin_url, 'swh_num_refs': len(self.remote_refs), }) # We want to load the repository, walk all the objects id_to_type, type_to_ids = self.list_pack(self.pack_buffer, self.pack_size) self.id_to_type = id_to_type self.type_to_ids = type_to_ids def save_data(self): """Store a pack for archival""" write_size = 8192 pack_dir = self.get_save_data_path() pack_name = "%s.pack" % self.visit_date.isoformat() refs_name = "%s.refs" % self.visit_date.isoformat() with open(os.path.join(pack_dir, pack_name), 'xb') as f: while True: r = self.pack_buffer.read(write_size) if not r: break f.write(r) self.pack_buffer.seek(0) with open(os.path.join(pack_dir, refs_name), 'xb') as f: pickle.dump(self.remote_refs, f) def get_inflater(self): """Reset the pack buffer and get an object inflater from it""" self.pack_buffer.seek(0) return PackInflater.for_pack_data( PackData.from_file(self.pack_buffer, self.pack_size)) def has_contents(self): return bool(self.type_to_ids[b'blob']) def get_content_ids(self): """Get the content identifiers from the git repository""" for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue yield converters.dulwich_blob_to_content_id(raw_obj) def get_contents(self): """Format the blobs from the git repository as swh contents""" max_content_size = self.config['content_size_limit'] missing_contents = set(self.storage.content_missing( self.get_content_ids(), 'sha1_git')) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue if raw_obj.sha().digest() not in missing_contents: continue yield converters.dulwich_blob_to_content( raw_obj, log=self.log, max_content_size=max_content_size, origin_id=self.origin_id) def has_directories(self): return bool(self.type_to_ids[b'tree']) def get_directory_ids(self): """Get the directory identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tree']) def get_directories(self): """Format the trees as swh directories""" missing_dirs = set(self.storage.directory_missing( sorted(self.get_directory_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tree': continue if raw_obj.sha().digest() not in missing_dirs: continue yield converters.dulwich_tree_to_directory(raw_obj, log=self.log) def has_revisions(self): return bool(self.type_to_ids[b'commit']) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'commit']) def get_revisions(self): """Format commits as swh revisions""" missing_revs = set(self.storage.revision_missing( sorted(self.get_revision_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'commit': continue if raw_obj.sha().digest() not in missing_revs: continue yield converters.dulwich_commit_to_revision(raw_obj, log=self.log) def has_releases(self): return bool(self.type_to_ids[b'tag']) def get_release_ids(self): """Get the release identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tag']) def 
get_releases(self): """Retrieve all the release objects from the git repository""" missing_rels = set(self.storage.release_missing( sorted(self.get_release_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tag': continue if raw_obj.sha().digest() not in missing_rels: continue yield converters.dulwich_tag_to_release(raw_obj, log=self.log) def has_occurrences(self): return bool(self.remote_refs) def get_occurrences(self): origin_id = self.origin_id visit = self.visit ret = [] for ref in self.remote_refs: ret_ref = self.local_refs[ref].copy() ret_ref.update({ 'branch': ref, 'origin': origin_id, 'visit': visit, }) if not ret_ref['target_type']: target_type = self.id_to_type[ret_ref['target']] ret_ref['target_type'] = converters.DULWICH_TYPES[target_type] ret_ref['target'] = hashutil.bytehex_to_hash(ret_ref['target']) ret.append(ret_ref) return ret def get_fetch_history_result(self): return { 'contents': len(self.type_to_ids[b'blob']), 'directories': len(self.type_to_ids[b'tree']), 'revisions': len(self.type_to_ids[b'commit']), 'releases': len(self.type_to_ids[b'tag']), 'occurrences': len(self.remote_refs), } def eventful(self): """The load was eventful if the current occurrences are different to the ones we retrieved at the beginning of the run""" current_occurrences = list(sorted( self.storage.occurrence_get(self.origin_id), key=lambda occ: occ['branch'], )) return self.base_occurrences != current_occurrences if __name__ == '__main__': logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) bulkupdater = BulkUpdater() origin_url = sys.argv[1] base_url = origin_url if len(sys.argv) > 2: base_url = sys.argv[2] print(bulkupdater.load(origin_url, base_url))
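
For reference, both entry points can be driven directly, mirroring the __main__ blocks in the patch. A minimal sketch, assuming a storage service reachable through the default loader configuration; the repository URLs and the local clone path are made up:

    import datetime

    from swh.loader.git.loader import GitLoader
    from swh.loader.git.updater import BulkUpdater

    visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

    # Directory loader: origin URL, path of a local clone, visit date.
    print(GitLoader().load('git://example.org/repo.git', '/tmp/repo',
                           visit_date))

    # Remote updater: origin URL, plus an optional base origin whose known
    # occurrences seed the graph walker (defaults to the origin URL itself).
    print(BulkUpdater().load('git://example.org/repo.git',
                             'git://example.org/repo-fork.git'))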
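
GitLoaderFromArchive follows the same calling convention, except that prepare() takes the path of an archive to uncompress instead of a checkout directory, and the project name is derived from the archive's parent directory. A sketch under that assumption, with a hypothetical archive path:

    import datetime

    from swh.loader.git.loader import GitLoaderFromArchive

    GitLoaderFromArchive().load(
        'git://example.org/project.git',         # origin URL
        '/srv/archives/project/project.tar.gz',  # uncompressed to a temp dir
        datetime.datetime.now(tz=datetime.timezone.utc))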
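
On the updater side, SWHRepoRepresentation.filter_unwanted_refs() drops peeled refs (which make the git protocol explode) and GitHub's auto-merged pull-request refs before deciding what to fetch. A small worked example of that filtering, using made-up object ids:

    from swh.loader.git.updater import SWHRepoRepresentation

    refs = {
        b'refs/heads/master':  b'aa' * 20,
        b'refs/tags/v1.0':     b'bb' * 20,
        b'refs/tags/v1.0^{}':  b'cc' * 20,   # peeled tag: filtered out
        b'refs/pull/42/merge': b'dd' * 20,   # GitHub auto-merge: filtered out
        b'refs/pull/42/head':  b'ee' * 20,   # pull request head: kept
    }

    kept = SWHRepoRepresentation.filter_unwanted_refs(refs)
    assert set(kept) == {b'refs/heads/master', b'refs/tags/v1.0',
                         b'refs/pull/42/head'}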