swh/loader/git/loader.py
- This file was copied to swh/loader/git/from_disk.py: the previous content of loader.py (the from-disk GitLoader and GitLoaderFromArchive classes) now lives there, while loader.py itself becomes the remote git loader shown below.
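For orientation, the loader is driven through the load() entry point provided by the core loader (UnbufferedLoader), exactly as the __main__ block at the bottom of the file does. A minimal sketch of an invocation, assuming default configuration and using a purely illustrative repository URL:

    from swh.loader.git.loader import GitLoader

    loader = GitLoader()
    # base_url optionally points at an already-archived origin used to seed the
    # negotiation; ignore_history forces a full fetch.
    result = loader.load('https://example.org/some/repo.git',
                         base_url=None, ignore_history=False)
    print(result)  # expected to carry the 'eventful'/'uneventful' status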
# Copyright (C) 2016-2018  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import dulwich.client
import logging
import os
import pickle
import sys

from collections import defaultdict
from io import BytesIO

from dulwich.object_store import ObjectStoreGraphWalker
from dulwich.pack import PackData, PackInflater

from swh.model import hashutil
from swh.loader.core.loader import UnbufferedLoader
from swh.storage.algos.snapshot import snapshot_get_all_branches

from . import converters
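# Module layout (descriptive note): RepoRepresentation models what the archive
# already knows about an origin (its known heads and object types) and drives
# dulwich's fetch-pack negotiation through determine_wants() and graph_walker();
# GitLoader performs the visit itself: it fetches a pack from the origin, lists
# the objects it contains by type, converts them with swh.loader.git.converters,
# and builds a snapshot from the advertised refs.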
class RepoRepresentation:
    """Repository representation for a Software Heritage origin."""
    def __init__(self, storage, origin_id, base_snapshot=None,
                 ignore_history=False):
        self.storage = storage

        self._parents_cache = {}
        self._type_cache = {}

        self.ignore_history = ignore_history

        if origin_id and not ignore_history:
            self.heads = set(self._cache_heads(origin_id, base_snapshot))
        else:
            self.heads = set()

    def _fill_parents_cache(self, commits):
        """When querying for a commit's parents, we fill the cache to a depth of 1000
        commits."""
        root_revs = self._encode_for_storage(commits)
        for rev, parents in self.storage.revision_shortlog(root_revs, 1000):
            rev_id = hashutil.hash_to_bytehex(rev)
            if rev_id not in self._parents_cache:
                self._parents_cache[rev_id] = [
                    hashutil.hash_to_bytehex(parent) for parent in parents
                ]

        for rev in commits:
            if rev not in self._parents_cache:
                self._parents_cache[rev] = []

    def _cache_heads(self, origin_id, base_snapshot):
        """Return all the known head commits for `origin_id`"""
        _git_types = ['content', 'directory', 'revision', 'release']

        if not base_snapshot:
            return []

        snapshot_targets = set()
        for target in base_snapshot['branches'].values():
            if target and target['target_type'] in _git_types:
                snapshot_targets.add(target['target'])

        decoded_targets = self._decode_from_storage(snapshot_targets)

        for id, objs in self.get_stored_objects(decoded_targets).items():
            if not objs:
                logging.warn('Missing head: %s' % hashutil.hash_to_hex(id))
                return []

        return decoded_targets

    def get_parents(self, commit):
        """Bogus method to prevent expensive recursion, at the expense of less
        efficient downloading"""
        return []

    def get_heads(self):
        return self.heads

    @staticmethod
    def _encode_for_storage(objects):
        return [hashutil.bytehex_to_hash(object) for object in objects]

    @staticmethod
    def _decode_from_storage(objects):
        return set(hashutil.hash_to_bytehex(object) for object in objects)

    def graph_walker(self):
        return ObjectStoreGraphWalker(self.get_heads(), self.get_parents)

    @staticmethod
    def filter_unwanted_refs(refs):
        """Filter the unwanted references from refs"""
        ret = {}
        for ref, val in refs.items():
            if ref.endswith(b'^{}'):
                # Peeled refs make the git protocol explode
                continue
            elif ref.startswith(b'refs/pull/') and ref.endswith(b'/merge'):
                # We filter-out auto-merged GitHub pull requests
                continue
            else:
                ret[ref] = val

        return ret

    def determine_wants(self, refs):
        """Filter the remote references to figure out which ones
        Software Heritage needs.
        """
        if not refs:
            return []
        # Find what objects Software Heritage has
        refs = self.find_remote_ref_types_in_swh(refs)

        # Cache the objects found in swh as existing heads
        for target in refs.values():
            if target['target_type'] is not None:
                self.heads.add(target['target'])

        ret = set()
        for target in self.filter_unwanted_refs(refs).values():
            if target['target_type'] is None:
                # The target doesn't exist in Software Heritage, let's retrieve
                # it.
                ret.add(target['target'])

        return list(ret)

    def get_stored_objects(self, objects):
        """Find which of these objects were stored in the archive.

        Do the request in packets to avoid a server timeout.
        """
        if self.ignore_history:
            return {}

        packet_size = 1000

        ret = {}
        query = []
        for object in objects:
            query.append(object)
            if len(query) >= packet_size:
                ret.update(
                    self.storage.object_find_by_sha1_git(
                        self._encode_for_storage(query)
                    )
                )
                query = []
        if query:
            ret.update(
                self.storage.object_find_by_sha1_git(
                    self._encode_for_storage(query)
                )
            )
        return ret

    def find_remote_ref_types_in_swh(self, remote_refs):
        """Parse the remote refs information and list the objects that exist in
        Software Heritage.
        """
        all_objs = set(remote_refs.values()) - set(self._type_cache)
        type_by_id = {}

        for id, objs in self.get_stored_objects(all_objs).items():
            id = hashutil.hash_to_bytehex(id)
            if objs:
                type_by_id[id] = objs[0]['type']

        self._type_cache.update(type_by_id)

        ret = {}
        for ref, id in remote_refs.items():
            ret[ref] = {
                'target': id,
                'target_type': self._type_cache.get(id),
            }
        return ret
class GitLoader(UnbufferedLoader):
    """A bulk loader for a git repository"""
    CONFIG_BASE_FILENAME = 'loader/git'

    ADDITIONAL_CONFIG = {
        'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024),
    }

    def __init__(self, repo_representation=RepoRepresentation, config=None):
        """Initialize the bulk updater.

        Args:
            repo_representation: swh's repository representation
                which is in charge of filtering between known and remote
                data.

        """
        super().__init__(logging_class='swh.loader.git.BulkLoader',
                         config=config)
        self.repo_representation = repo_representation

    def fetch_pack_from_origin(self, origin_url, base_origin_id,
                               base_snapshot, do_activity):
        """Fetch a pack from the origin"""
        pack_buffer = BytesIO()

        base_repo = self.repo_representation(
            storage=self.storage,
            origin_id=base_origin_id,
            base_snapshot=base_snapshot,
            ignore_history=self.ignore_history,
        )

        client, path = dulwich.client.get_transport_and_path(origin_url,
                                                             thin_packs=False)

        size_limit = self.config['pack_size_bytes']

        def do_pack(data,
                    pack_buffer=pack_buffer,
                    limit=size_limit,
                    origin_url=origin_url):
            cur_size = pack_buffer.tell()
            would_write = len(data)
            if cur_size + would_write > limit:
                raise IOError('Pack file too big for repository %s, '
                              'limit is %d bytes, current size is %d, '
                              'would write %d' %
                              (origin_url, limit, cur_size, would_write))

            pack_buffer.write(data)

        remote_refs = client.fetch_pack(path,
                                        base_repo.determine_wants,
                                        base_repo.graph_walker(),
                                        do_pack,
                                        progress=do_activity).refs

        if remote_refs:
            local_refs = base_repo.find_remote_ref_types_in_swh(remote_refs)
        else:
            local_refs = remote_refs = {}

        pack_buffer.flush()
        pack_size = pack_buffer.tell()
        pack_buffer.seek(0)

        return {
            'remote_refs': base_repo.filter_unwanted_refs(remote_refs),
            'local_refs': local_refs,
            'pack_buffer': pack_buffer,
            'pack_size': pack_size,
        }
    def list_pack(self, pack_data, pack_size):
        id_to_type = {}
        type_to_ids = defaultdict(set)

        inflater = self.get_inflater()

        for obj in inflater:
            type, id = obj.type_name, obj.id
            id_to_type[id] = type
            type_to_ids[type].add(id)

        return id_to_type, type_to_ids

    def prepare_origin_visit(self, origin_url, **kwargs):
        self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
        self.origin = converters.origin_url_to_origin(origin_url)

    def get_full_snapshot(self, origin_id):
        prev_snapshot = self.storage.snapshot_get_latest(origin_id)
        if prev_snapshot and prev_snapshot.pop('next_branch', None):
            return snapshot_get_all_branches(self.storage,
                                             prev_snapshot['id'])

        return prev_snapshot

    def prepare(self, origin_url, base_url=None, ignore_history=False):
        base_origin_id = origin_id = self.origin_id

        prev_snapshot = None

        if not ignore_history:
            prev_snapshot = self.get_full_snapshot(origin_id)

        if base_url and not prev_snapshot:
            base_origin = converters.origin_url_to_origin(base_url)
            base_origin = self.storage.origin_get(base_origin)
            if base_origin:
                base_origin_id = base_origin['id']
                prev_snapshot = self.get_full_snapshot(base_origin_id)

        self.base_snapshot = prev_snapshot
        self.base_origin_id = base_origin_id
        self.ignore_history = ignore_history
    def fetch_data(self):
        def do_progress(msg):
            sys.stderr.buffer.write(msg)
            sys.stderr.flush()

        fetch_info = self.fetch_pack_from_origin(
            self.origin['url'], self.base_origin_id, self.base_snapshot,
            do_progress)

        self.pack_buffer = fetch_info['pack_buffer']
        self.pack_size = fetch_info['pack_size']

        self.remote_refs = fetch_info['remote_refs']
        self.local_refs = fetch_info['local_refs']

        origin_url = self.origin['url']

        self.log.info('Listed %d refs for repo %s' % (
            len(self.remote_refs), origin_url), extra={
                'swh_type': 'git_repo_list_refs',
                'swh_repo': origin_url,
                'swh_num_refs': len(self.remote_refs),
            })

        # We want to load the repository, walk all the objects
        id_to_type, type_to_ids = self.list_pack(self.pack_buffer,
                                                 self.pack_size)

        self.id_to_type = id_to_type
        self.type_to_ids = type_to_ids
    def save_data(self):
        """Store a pack for archival"""
        write_size = 8192
        pack_dir = self.get_save_data_path()

        pack_name = "%s.pack" % self.visit_date.isoformat()
        refs_name = "%s.refs" % self.visit_date.isoformat()

        with open(os.path.join(pack_dir, pack_name), 'xb') as f:
            self.pack_buffer.seek(0)
            while True:
                r = self.pack_buffer.read(write_size)
                if not r:
                    break
                f.write(r)

        self.pack_buffer.seek(0)

        with open(os.path.join(pack_dir, refs_name), 'xb') as f:
            pickle.dump(self.remote_refs, f)

    def get_inflater(self):
        """Reset the pack buffer and get an object inflater from it"""
        self.pack_buffer.seek(0)
        return PackInflater.for_pack_data(
            PackData.from_file(self.pack_buffer, self.pack_size))
    def has_contents(self):
        return bool(self.type_to_ids[b'blob'])

    def get_content_ids(self):
        """Get the content identifiers from the git repository"""
        for raw_obj in self.get_inflater():
            if raw_obj.type_name != b'blob':
                continue

            yield converters.dulwich_blob_to_content_id(raw_obj)

    def get_contents(self):
        """Format the blobs from the git repository as swh contents"""
        max_content_size = self.config['content_size_limit']

        missing_contents = set(self.storage.content_missing(
            self.get_content_ids(), 'sha1_git'))

        for raw_obj in self.get_inflater():
            if raw_obj.type_name != b'blob':
                continue

            if raw_obj.sha().digest() not in missing_contents:
                continue

            yield converters.dulwich_blob_to_content(
                raw_obj, log=self.log, max_content_size=max_content_size,
                origin_id=self.origin_id)
    def has_directories(self):
        return bool(self.type_to_ids[b'tree'])

    def get_directory_ids(self):
        """Get the directory identifiers from the git repository"""
        return (hashutil.hash_to_bytes(id.decode())
                for id in self.type_to_ids[b'tree'])

    def get_directories(self):
        """Format the trees as swh directories"""
        missing_dirs = set(self.storage.directory_missing(
            sorted(self.get_directory_ids())))

        for raw_obj in self.get_inflater():
            if raw_obj.type_name != b'tree':
                continue

            if raw_obj.sha().digest() not in missing_dirs:
                continue

            yield converters.dulwich_tree_to_directory(raw_obj, log=self.log)
    def has_revisions(self):
        return bool(self.type_to_ids[b'commit'])

    def get_revision_ids(self):
        """Get the revision identifiers from the git repository"""
        return (hashutil.hash_to_bytes(id.decode())
                for id in self.type_to_ids[b'commit'])

    def get_revisions(self):
        """Format commits as swh revisions"""
        missing_revs = set(self.storage.revision_missing(
            sorted(self.get_revision_ids())))

        for raw_obj in self.get_inflater():
            if raw_obj.type_name != b'commit':
                continue

            if raw_obj.sha().digest() not in missing_revs:
                continue

            yield converters.dulwich_commit_to_revision(raw_obj, log=self.log)
    def has_releases(self):
        return bool(self.type_to_ids[b'tag'])

    def get_release_ids(self):
        """Get the release identifiers from the git repository"""
        return (hashutil.hash_to_bytes(id.decode())
                for id in self.type_to_ids[b'tag'])

    def get_releases(self):
        """Retrieve all the release objects from the git repository"""
        missing_rels = set(self.storage.release_missing(
            sorted(self.get_release_ids())))

        for raw_obj in self.get_inflater():
            if raw_obj.type_name != b'tag':
                continue

            if raw_obj.sha().digest() not in missing_rels:
                continue

            yield converters.dulwich_tag_to_release(raw_obj, log=self.log)
    def get_snapshot(self):
        branches = {}

        for ref in self.remote_refs:
            ret_ref = self.local_refs[ref].copy()
            if not ret_ref['target_type']:
                target_type = self.id_to_type[ret_ref['target']]
                ret_ref['target_type'] = converters.DULWICH_TYPES[target_type]

            ret_ref['target'] = hashutil.bytehex_to_hash(ret_ref['target'])

            branches[ref] = ret_ref

        self.snapshot = converters.branches_to_snapshot(branches)
        return self.snapshot
    def get_fetch_history_result(self):
        return {
            'contents': len(self.type_to_ids[b'blob']),
            'directories': len(self.type_to_ids[b'tree']),
            'revisions': len(self.type_to_ids[b'commit']),
            'releases': len(self.type_to_ids[b'tag']),
        }
    def load_status(self):
        """The load was eventful if the current snapshot is different to
        the one we retrieved at the beginning of the run"""
        eventful = False

        if self.base_snapshot:
            eventful = self.snapshot['id'] != self.base_snapshot['id']
        else:
            eventful = bool(self.snapshot['branches'])

        return {'status': ('eventful' if eventful else 'uneventful')}
if __name__ == '__main__':
    import click

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(process)d %(message)s'
    )

    @click.command()
    @click.option('--origin-url', help='Origin url', required=True)
    @click.option('--base-url', default=None, help='Optional Base url')
    @click.option('--ignore-history/--no-ignore-history',
                  help='Ignore the repository history', default=False)
    def main(origin_url, base_url, ignore_history):
        return GitLoader().load(
            origin_url,
            base_url=base_url,
            ignore_history=ignore_history,
        )

    main()
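As a point of reference, get_snapshot() hands converters.branches_to_snapshot a mapping from each advertised ref to its resolved target. A sketch of the expected shape, with illustrative ref names and placeholder hashes:

    branches = {
        b'refs/heads/master': {
            'target': b'\x01' * 20,   # sha1_git of the commit (placeholder)
            'target_type': 'revision',
        },
        b'refs/tags/v1.0.0': {
            'target': b'\x02' * 20,   # sha1_git of the annotated tag (placeholder)
            'target_type': 'release',
        },
    }

The resulting snapshot exposes 'branches' and an 'id', which is what load_status() compares against the base snapshot to decide whether the visit was eventful.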