diff --git a/PKG-INFO b/PKG-INFO index 27725ad..941c78d 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,95 +1,95 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.0.41 +Version: 0.0.42 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git Description: swh-loader-git ============== The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ ### Runtime - python3 - python3-dulwich - python3-retrying - python3-swh.core - python3-swh.model - python3-swh.storage - python3-swh.scheduler ### Test - python3-nose Requirements ------------ - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via dulwich Configuration ------------- You can run the loader or the updater directly by calling: ``` python3 -m swh.loader.git.{loader,updater} ``` ### Location Both tools expect a configuration file. Either one of the following location: - /etc/softwareheritage/ - ~/.config/swh/ - ~/.swh/ Note: Will call that location $SWH_CONFIG_PATH ### Configuration sample $SWH_CONFIG_PATH/loader/git-{loader,updater}.yml: ``` storage: cls: remote args: url: http://localhost:5002/ ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/debian/changelog b/debian/changelog index 50f8d7d..1fe224b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,309 +1,310 @@ -swh-loader-git (0.0.41-1~swh1~bpo9+1) stretch-swh; urgency=medium +swh-loader-git (0.0.42-1~swh1) unstable-swh; urgency=medium - * Rebuild for stretch-backports. + * Release swh.loader.git v0.0.42 + * Fix critical bug in incremental loading - -- Nicolas Dandrimont Thu, 11 Oct 2018 16:26:27 +0200 + -- Nicolas Dandrimont Thu, 11 Oct 2018 17:19:07 +0200 swh-loader-git (0.0.41-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.41 * Use explicit keyword argument for base_url in the load task -- Nicolas Dandrimont Thu, 11 Oct 2018 16:26:27 +0200 swh-loader-git (0.0.40-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.40 * Improve python packaging * Make the loader more robust against holes in the history caused by * buggy imports * Allow ignoring the history to make a full load -- Nicolas Dandrimont Tue, 09 Oct 2018 16:28:14 +0200 swh-loader-git (0.0.39-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.39 * Avoid walking the history of large git repos, which takes a long time * Really save packfiles -- Nicolas Dandrimont Thu, 20 Sep 2018 17:22:17 +0200 swh-loader-git (0.0.38-1~swh1) unstable-swh; urgency=medium * v0.0.38 * Improve origin_visit initialization step * Properly sandbox the prepare statement so that if it breaks, we can * update appropriately the visit with the correct status -- Antoine R. Dumont (@ardumont) Wed, 07 Mar 2018 11:39:30 +0100 swh-loader-git (0.0.37-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.37 * Remove spurious debug print -- Nicolas Dandrimont Tue, 06 Feb 2018 16:00:40 +0100 swh-loader-git (0.0.36-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.36 * Update to use snapshots instead of occurrences * Use dulwich get_transport_and_path rather than hardcode the tcp transport -- Nicolas Dandrimont Tue, 06 Feb 2018 14:42:36 +0100 swh-loader-git (0.0.35-1~swh1) unstable-swh; urgency=medium * v0.0.35 * swh.loader.git.loader: Warn when object is corrupted and continue * swh.loader.git.loader: Add structured data to the log message regarding skipping objects * swh.loader.git.loader: Force further checks on objects * swh.loader.git.loader: Unify reading object from the repository * swh.loader.git.loader: Warn when object malformed and continue * swh.loader.git.loader: Trap missing object id and continue * swh.loader.git.base: Reuse swh.loader.core base loader * swh.loader.git.converters: Fix release time conversion issue when no date provided -- Antoine R. Dumont (@ardumont) Mon, 18 Dec 2017 12:08:01 +0100 swh-loader-git (0.0.34-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git version 0.0.34 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 20:12:11 +0200 swh-loader-git (0.0.33-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.33 * make the updater's parent commit cache more useful -- Nicolas Dandrimont Fri, 15 Sep 2017 18:45:41 +0200 swh-loader-git (0.0.32-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git 0.0.32 * Update tasks to new swh.scheduler API -- Nicolas Dandrimont Mon, 12 Jun 2017 18:04:50 +0200 swh-loader-git (0.0.31-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.31 * Migrate from swh.core.hashutil to swh.model.hashutil * Only send objects that are actually missing -- Nicolas Dandrimont Fri, 17 Mar 2017 17:40:17 +0100 swh-loader-git (0.0.30-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.30 * Fix handling of mergetag headers -- Nicolas Dandrimont Thu, 09 Mar 2017 11:30:08 +0100 swh-loader-git (0.0.29-1~swh1) unstable-swh; urgency=medium * v0.0.29 * GitLoaderFromArchive: Use the same configuration file as * GitLoader (permit to deploy both as the same unit) * git reader: Refactor to allow listing revisions as well as contents -- Antoine R. Dumont (@ardumont) Mon, 20 Feb 2017 11:32:24 +0100 swh-loader-git (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * loader: Fix fetch_date override -- Antoine R. Dumont (@ardumont) Wed, 15 Feb 2017 18:43:32 +0100 swh-loader-git (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * Add loader-git from archive -- Antoine R. Dumont (@ardumont) Tue, 14 Feb 2017 18:56:52 +0100 swh-loader-git (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * Add a git loader able to deal with git repository in archive -- Antoine R. Dumont (@ardumont) Tue, 14 Feb 2017 16:24:50 +0100 swh-loader-git (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * Fix to permit to actually pass the fetch date as parameter for * the loading git disk loader -- Antoine R. Dumont (@ardumont) Fri, 10 Feb 2017 17:34:35 +0100 swh-loader-git (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * Update storage configuration reading -- Antoine R. Dumont (@ardumont) Thu, 15 Dec 2016 18:40:29 +0100 swh-loader-git (0.0.23-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.23 * Make the save_data mechanism generic -- Nicolas Dandrimont Fri, 02 Dec 2016 15:34:05 +0100 swh-loader-git (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * Improve reader to permit to use it as analyzer tool -- Antoine R. Dumont (@ardumont) Fri, 04 Nov 2016 10:37:24 +0100 swh-loader-git (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * Improve the reader git to load all contents from a pack. * Improve to avoid unnecessary readings from db -- Antoine R. Dumont (@ardumont) Wed, 26 Oct 2016 17:06:12 +0200 swh-loader-git (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * Add new reader git task -- Antoine R. Dumont (@ardumont) Tue, 25 Oct 2016 18:40:17 +0200 swh-loader-git (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Update git loaders to register origin_visit's state -- Antoine R. Dumont (@ardumont) Tue, 23 Aug 2016 16:34:15 +0200 swh-loader-git (0.0.18-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.18 * Properly handle skipped contents -- Nicolas Dandrimont Fri, 19 Aug 2016 18:12:44 +0200 swh-loader-git (0.0.16-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.16 * Add exist_ok to packfile cache directory creation -- Nicolas Dandrimont Mon, 01 Aug 2016 15:53:07 +0200 swh-loader-git (0.0.15-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.15 * Absence of remote refs doesn't throw an error in updater -- Nicolas Dandrimont Wed, 15 Jun 2016 01:20:37 +0200 swh-loader-git (0.0.14-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.14 * Add a disk loader using dulwich * Rework the loader logic to use a single pattern for both loaders * Allow caching of packfiles for the remote loader -- Nicolas Dandrimont Tue, 14 Jun 2016 18:10:21 +0200 swh-loader-git (0.0.13-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.13 * Update for latest schema revision -- Nicolas Dandrimont Fri, 08 Apr 2016 16:46:41 +0200 swh-loader-git (0.0.12-1~swh1) unstable-swh; urgency=medium * Release swh-loader-git v0.0.12 * Update to use new swh.storage api for object listing * Add a size limit to packfiles * Return a proper eventfulness for empty repositories * Do not crawl the pack file if unnecessary -- Nicolas Dandrimont Thu, 25 Feb 2016 18:21:34 +0100 swh-loader-git (0.0.11-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.11 * Implement git updater -- Nicolas Dandrimont Fri, 19 Feb 2016 19:13:22 +0100 swh-loader-git (0.0.10-1~swh1) unstable-swh; urgency=medium * Prepare swh.loader.git release v0.0.10 * Update for swh.model * Use new swh.storage -- Nicolas Dandrimont Mon, 07 Dec 2015 18:59:46 +0100 swh-loader-git (0.0.9-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.9 * Close fetch_history on failure too -- Nicolas Dandrimont Wed, 04 Nov 2015 10:54:37 +0100 swh-loader-git (0.0.8-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.8 * New database schema (v028) * Populate fetch_history (T121) -- Nicolas Dandrimont Tue, 27 Oct 2015 18:11:26 +0100 swh-loader-git (0.0.7-1~swh1) unstable-swh; urgency=medium * Prepare swh.loader.git v0.0.7 deployment -- Nicolas Dandrimont Mon, 19 Oct 2015 12:37:09 +0200 swh-loader-git (0.0.6-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.6 -- Nicolas Dandrimont Fri, 09 Oct 2015 17:50:35 +0200 swh-loader-git (0.0.5-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.5 -- Nicolas Dandrimont Tue, 06 Oct 2015 17:42:11 +0200 swh-loader-git (0.0.4-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.4 -- Nicolas Dandrimont Fri, 02 Oct 2015 14:54:04 +0200 swh-loader-git (0.0.3-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.3 -- Nicolas Dandrimont Thu, 01 Oct 2015 11:36:28 +0200 swh-loader-git (0.0.2-1~swh1) unstable-swh; urgency=medium * Prepare deploying swh.loader.git v0.0.2 -- Nicolas Dandrimont Tue, 29 Sep 2015 17:22:09 +0200 swh-loader-git (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * Tagging swh.loader.git v0.0.1 -- Nicolas Dandrimont Fri, 25 Sep 2015 16:04:00 +0200 diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 27725ad..941c78d 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,95 +1,95 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.0.41 +Version: 0.0.42 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git Description: swh-loader-git ============== The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ ### Runtime - python3 - python3-dulwich - python3-retrying - python3-swh.core - python3-swh.model - python3-swh.storage - python3-swh.scheduler ### Test - python3-nose Requirements ------------ - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via dulwich Configuration ------------- You can run the loader or the updater directly by calling: ``` python3 -m swh.loader.git.{loader,updater} ``` ### Location Both tools expect a configuration file. Either one of the following location: - /etc/softwareheritage/ - ~/.config/swh/ - ~/.swh/ Note: Will call that location $SWH_CONFIG_PATH ### Configuration sample $SWH_CONFIG_PATH/loader/git-{loader,updater}.yml: ``` storage: cls: remote args: url: http://localhost:5002/ ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/loader/git/updater.py b/swh/loader/git/updater.py index 6d6135c..53f7502 100644 --- a/swh/loader/git/updater.py +++ b/swh/loader/git/updater.py @@ -1,487 +1,487 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import dulwich.client import logging import os import pickle import sys from collections import defaultdict from io import BytesIO from dulwich.object_store import ObjectStoreGraphWalker from dulwich.pack import PackData, PackInflater from swh.model import hashutil from swh.loader.core.loader import SWHStatelessLoader from . import converters class SWHRepoRepresentation: """Repository representation for a Software Heritage origin.""" def __init__(self, storage, origin_id, base_snapshot=None, ignore_history=False): self.storage = storage self._parents_cache = {} self._type_cache = {} self.ignore_history = ignore_history if origin_id and not ignore_history: self.heads = set(self._cache_heads(origin_id, base_snapshot)) else: self.heads = set() def _fill_parents_cache(self, commits): """When querying for a commit's parents, we fill the cache to a depth of 1000 commits.""" root_revs = self._encode_for_storage(commits) for rev, parents in self.storage.revision_shortlog(root_revs, 1000): rev_id = hashutil.hash_to_bytehex(rev) if rev_id not in self._parents_cache: self._parents_cache[rev_id] = [ hashutil.hash_to_bytehex(parent) for parent in parents ] for rev in commits: if rev not in self._parents_cache: self._parents_cache[rev] = [] def _cache_heads(self, origin_id, base_snapshot): """Return all the known head commits for `origin_id`""" _git_types = ['content', 'directory', 'revision', 'release'] if not base_snapshot: return [] snapshot_targets = set() for target in base_snapshot['branches'].values(): if target and target['target_type'] in _git_types: snapshot_targets.add(target['target']) - for id, objs in self.get_stored_objects( - self._decode_from_storage(snapshot_targets) - ).items(): + decoded_targets = self._decode_from_storage(snapshot_targets) + + for id, objs in self.get_stored_objects(decoded_targets).items(): if not objs: logging.warn('Missing head: %s' % hashutil.hash_to_hex(id)) return [] - return snapshot_targets + return decoded_targets def get_parents(self, commit): """Bogus method to prevent expensive recursion, at the expense of less efficient downloading""" return [] def get_heads(self): return self.heads @staticmethod def _encode_for_storage(objects): return [hashutil.bytehex_to_hash(object) for object in objects] @staticmethod def _decode_from_storage(objects): return set(hashutil.hash_to_bytehex(object) for object in objects) def graph_walker(self): return ObjectStoreGraphWalker(self.get_heads(), self.get_parents) @staticmethod def filter_unwanted_refs(refs): """Filter the unwanted references from refs""" ret = {} for ref, val in refs.items(): if ref.endswith(b'^{}'): # Peeled refs make the git protocol explode continue elif ref.startswith(b'refs/pull/') and ref.endswith(b'/merge'): # We filter-out auto-merged GitHub pull requests continue else: ret[ref] = val return ret def determine_wants(self, refs): """Filter the remote references to figure out which ones Software Heritage needs. """ if not refs: return [] # Find what objects Software Heritage has refs = self.find_remote_ref_types_in_swh(refs) # Cache the objects found in swh as existing heads for target in refs.values(): if target['target_type'] is not None: self.heads.add(target['target']) ret = set() for target in self.filter_unwanted_refs(refs).values(): if target['target_type'] is None: # The target doesn't exist in Software Heritage, let's retrieve # it. ret.add(target['target']) return list(ret) def get_stored_objects(self, objects): if self.ignore_history: return {} return self.storage.object_find_by_sha1_git( self._encode_for_storage(objects)) def find_remote_ref_types_in_swh(self, remote_refs): """Parse the remote refs information and list the objects that exist in Software Heritage. """ all_objs = set(remote_refs.values()) - set(self._type_cache) type_by_id = {} for id, objs in self.get_stored_objects(all_objs).items(): id = hashutil.hash_to_bytehex(id) if objs: type_by_id[id] = objs[0]['type'] self._type_cache.update(type_by_id) ret = {} for ref, id in remote_refs.items(): ret[ref] = { 'target': id, 'target_type': self._type_cache.get(id), } return ret class BulkUpdater(SWHStatelessLoader): """A bulk loader for a git repository""" CONFIG_BASE_FILENAME = 'loader/git-updater' ADDITIONAL_CONFIG = { 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), } def __init__(self, repo_representation=SWHRepoRepresentation, config=None): """Initialize the bulk updater. Args: repo_representation: swh's repository representation which is in charge of filtering between known and remote data. """ super().__init__(logging_class='swh.loader.git.BulkLoader', config=config) self.repo_representation = repo_representation def fetch_pack_from_origin(self, origin_url, base_origin_id, base_snapshot, do_activity): """Fetch a pack from the origin""" pack_buffer = BytesIO() base_repo = self.repo_representation( storage=self.storage, origin_id=base_origin_id, base_snapshot=base_snapshot, ignore_history=self.ignore_history, ) client, path = dulwich.client.get_transport_and_path(origin_url, thin_packs=False) size_limit = self.config['pack_size_bytes'] def do_pack(data, pack_buffer=pack_buffer, limit=size_limit, origin_url=origin_url): cur_size = pack_buffer.tell() would_write = len(data) if cur_size + would_write > limit: raise IOError('Pack file too big for repository %s, ' 'limit is %d bytes, current size is %d, ' 'would write %d' % (origin_url, limit, cur_size, would_write)) pack_buffer.write(data) remote_refs = client.fetch_pack(path, base_repo.determine_wants, base_repo.graph_walker(), do_pack, progress=do_activity) if remote_refs: local_refs = base_repo.find_remote_ref_types_in_swh(remote_refs) else: local_refs = remote_refs = {} pack_buffer.flush() pack_size = pack_buffer.tell() pack_buffer.seek(0) return { 'remote_refs': base_repo.filter_unwanted_refs(remote_refs), 'local_refs': local_refs, 'pack_buffer': pack_buffer, 'pack_size': pack_size, } def list_pack(self, pack_data, pack_size): id_to_type = {} type_to_ids = defaultdict(set) inflater = self.get_inflater() for obj in inflater: type, id = obj.type_name, obj.id id_to_type[id] = type type_to_ids[type].add(id) return id_to_type, type_to_ids def prepare_origin_visit(self, origin_url, **kwargs): self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) self.origin = converters.origin_url_to_origin(origin_url) def prepare(self, origin_url, base_url=None, ignore_history=False): base_origin_id = origin_id = self.origin_id prev_snapshot = None if not ignore_history: prev_snapshot = self.storage.snapshot_get_latest(origin_id) if base_url and not prev_snapshot: base_origin = converters.origin_url_to_origin(base_url) base_origin = self.storage.origin_get(base_origin) if base_origin: base_origin_id = base_origin['id'] prev_snapshot = self.storage.snapshot_get_latest( base_origin_id ) self.base_snapshot = prev_snapshot self.base_origin_id = base_origin_id self.ignore_history = ignore_history def fetch_data(self): def do_progress(msg): sys.stderr.buffer.write(msg) sys.stderr.flush() fetch_info = self.fetch_pack_from_origin( self.origin['url'], self.base_origin_id, self.base_snapshot, do_progress) self.pack_buffer = fetch_info['pack_buffer'] self.pack_size = fetch_info['pack_size'] self.remote_refs = fetch_info['remote_refs'] self.local_refs = fetch_info['local_refs'] origin_url = self.origin['url'] self.log.info('Listed %d refs for repo %s' % ( len(self.remote_refs), origin_url), extra={ 'swh_type': 'git_repo_list_refs', 'swh_repo': origin_url, 'swh_num_refs': len(self.remote_refs), }) # We want to load the repository, walk all the objects id_to_type, type_to_ids = self.list_pack(self.pack_buffer, self.pack_size) self.id_to_type = id_to_type self.type_to_ids = type_to_ids def save_data(self): """Store a pack for archival""" write_size = 8192 pack_dir = self.get_save_data_path() pack_name = "%s.pack" % self.visit_date.isoformat() refs_name = "%s.refs" % self.visit_date.isoformat() with open(os.path.join(pack_dir, pack_name), 'xb') as f: self.pack_buffer.seek(0) while True: r = self.pack_buffer.read(write_size) if not r: break f.write(r) self.pack_buffer.seek(0) with open(os.path.join(pack_dir, refs_name), 'xb') as f: pickle.dump(self.remote_refs, f) def get_inflater(self): """Reset the pack buffer and get an object inflater from it""" self.pack_buffer.seek(0) return PackInflater.for_pack_data( PackData.from_file(self.pack_buffer, self.pack_size)) def has_contents(self): return bool(self.type_to_ids[b'blob']) def get_content_ids(self): """Get the content identifiers from the git repository""" for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue yield converters.dulwich_blob_to_content_id(raw_obj) def get_contents(self): """Format the blobs from the git repository as swh contents""" max_content_size = self.config['content_size_limit'] missing_contents = set(self.storage.content_missing( self.get_content_ids(), 'sha1_git')) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue if raw_obj.sha().digest() not in missing_contents: continue yield converters.dulwich_blob_to_content( raw_obj, log=self.log, max_content_size=max_content_size, origin_id=self.origin_id) def has_directories(self): return bool(self.type_to_ids[b'tree']) def get_directory_ids(self): """Get the directory identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tree']) def get_directories(self): """Format the trees as swh directories""" missing_dirs = set(self.storage.directory_missing( sorted(self.get_directory_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tree': continue if raw_obj.sha().digest() not in missing_dirs: continue yield converters.dulwich_tree_to_directory(raw_obj, log=self.log) def has_revisions(self): return bool(self.type_to_ids[b'commit']) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'commit']) def get_revisions(self): """Format commits as swh revisions""" missing_revs = set(self.storage.revision_missing( sorted(self.get_revision_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'commit': continue if raw_obj.sha().digest() not in missing_revs: continue yield converters.dulwich_commit_to_revision(raw_obj, log=self.log) def has_releases(self): return bool(self.type_to_ids[b'tag']) def get_release_ids(self): """Get the release identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tag']) def get_releases(self): """Retrieve all the release objects from the git repository""" missing_rels = set(self.storage.release_missing( sorted(self.get_release_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tag': continue if raw_obj.sha().digest() not in missing_rels: continue yield converters.dulwich_tag_to_release(raw_obj, log=self.log) def get_snapshot(self): branches = {} for ref in self.remote_refs: ret_ref = self.local_refs[ref].copy() if not ret_ref['target_type']: target_type = self.id_to_type[ret_ref['target']] ret_ref['target_type'] = converters.DULWICH_TYPES[target_type] ret_ref['target'] = hashutil.bytehex_to_hash(ret_ref['target']) branches[ref] = ret_ref self.snapshot = converters.branches_to_snapshot(branches) return self.snapshot def get_fetch_history_result(self): return { 'contents': len(self.type_to_ids[b'blob']), 'directories': len(self.type_to_ids[b'tree']), 'revisions': len(self.type_to_ids[b'commit']), 'releases': len(self.type_to_ids[b'tag']), } def load_status(self): """The load was eventful if the current snapshot is different to the one we retrieved at the beginning of the run""" eventful = False if self.base_snapshot: eventful = self.snapshot['id'] != self.base_snapshot['id'] else: eventful = bool(self.snapshot['branches']) return {'status': ('eventful' if eventful else 'uneventful')} if __name__ == '__main__': import click logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) @click.command() @click.option('--origin-url', help='Origin url', required=True) @click.option('--base-url', default=None, help='Optional Base url') @click.option('--ignore-history/--no-ignore-history', help='Ignore the repository history', default=False) def main(origin_url, base_url, ignore_history): return BulkUpdater().load( origin_url, base_url=base_url, ignore_history=ignore_history, ) main() diff --git a/version.txt b/version.txt index 3cd46a1..0276c56 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.41-0-g8586650 \ No newline at end of file +v0.0.42-0-g4874dcd \ No newline at end of file