Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/loader.py
# Copyright (C) 2016-2018 The Software Heritage developers | # Copyright (C) 2016-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import dulwich.client | import dulwich.client | ||||
import logging | import logging | ||||
import os | import os | ||||
▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines | class GitLoader(UnbufferedLoader): | ||||
CONFIG_BASE_FILENAME = 'loader/git' | CONFIG_BASE_FILENAME = 'loader/git' | ||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), | 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), | ||||
} | } | ||||
visit_type = 'git' | visit_type = 'git' | ||||
def __init__(self, repo_representation=RepoRepresentation, config=None): | def __init__(self, url, base_url=None, ignore_history=False, | ||||
repo_representation=RepoRepresentation, config=None): | |||||
"""Initialize the bulk updater. | """Initialize the bulk updater. | ||||
Args: | Args: | ||||
repo_representation: swh's repository representation | repo_representation: swh's repository representation | ||||
which is in charge of filtering between known and remote | which is in charge of filtering between known and remote | ||||
data. | data. | ||||
""" | """ | ||||
super().__init__(logging_class='swh.loader.git.BulkLoader', | super().__init__(logging_class='swh.loader.git.BulkLoader', | ||||
config=config) | config=config) | ||||
self.origin_url = url | |||||
self.base_url = base_url | |||||
self.ignore_history = ignore_history | |||||
self.repo_representation = repo_representation | self.repo_representation = repo_representation | ||||
def fetch_pack_from_origin(self, origin_url, | def fetch_pack_from_origin(self, origin_url, | ||||
base_snapshot, do_activity): | base_snapshot, do_activity): | ||||
"""Fetch a pack from the origin""" | """Fetch a pack from the origin""" | ||||
pack_buffer = BytesIO() | pack_buffer = BytesIO() | ||||
base_repo = self.repo_representation( | base_repo = self.repo_representation( | ||||
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines | def list_pack(self, pack_data, pack_size): | ||||
for obj in inflater: | for obj in inflater: | ||||
type, id = obj.type_name, obj.id | type, id = obj.type_name, obj.id | ||||
id_to_type[id] = type | id_to_type[id] = type | ||||
type_to_ids[type].add(id) | type_to_ids[type].add(id) | ||||
return id_to_type, type_to_ids | return id_to_type, type_to_ids | ||||
def prepare_origin_visit(self, origin_url, **kwargs): | def prepare_origin_visit(self, *args, **kwargs): | ||||
self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) | self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) | ||||
self.origin = converters.origin_url_to_origin(origin_url) | self.origin = converters.origin_url_to_origin(self.origin_url) | ||||
def get_full_snapshot(self, origin_url): | def get_full_snapshot(self, origin_url): | ||||
prev_snapshot = self.storage.snapshot_get_latest(origin_url) | prev_snapshot = self.storage.snapshot_get_latest(origin_url) | ||||
if prev_snapshot and prev_snapshot.pop('next_branch', None): | if prev_snapshot and prev_snapshot.pop('next_branch', None): | ||||
return snapshot_get_all_branches(self.storage, prev_snapshot['id']) | return snapshot_get_all_branches(self.storage, prev_snapshot['id']) | ||||
return prev_snapshot | return prev_snapshot | ||||
def prepare(self, origin_url, base_url=None, ignore_history=False): | def prepare(self, *args, **kwargs): | ||||
base_origin_url = origin_url = self.origin['url'] | base_origin_url = origin_url = self.origin['url'] | ||||
prev_snapshot = None | prev_snapshot = None | ||||
if not ignore_history: | if not self.ignore_history: | ||||
prev_snapshot = self.get_full_snapshot(origin_url) | prev_snapshot = self.get_full_snapshot(origin_url) | ||||
if base_url and not prev_snapshot: | if self.base_url and not prev_snapshot: | ||||
base_origin = converters.origin_url_to_origin(base_url) | base_origin = converters.origin_url_to_origin(self.base_url) | ||||
base_origin = self.storage.origin_get(base_origin) | base_origin = self.storage.origin_get(base_origin) | ||||
if base_origin: | if base_origin: | ||||
base_origin_url = base_origin['url'] | base_origin_url = base_origin['url'] | ||||
prev_snapshot = self.get_full_snapshot(base_origin_url) | prev_snapshot = self.get_full_snapshot(base_origin_url) | ||||
self.base_snapshot = prev_snapshot | self.base_snapshot = prev_snapshot | ||||
self.base_origin_url = base_origin_url | self.base_origin_url = base_origin_url | ||||
self.ignore_history = ignore_history | |||||
def fetch_data(self): | def fetch_data(self): | ||||
def do_progress(msg): | def do_progress(msg): | ||||
sys.stderr.buffer.write(msg) | sys.stderr.buffer.write(msg) | ||||
sys.stderr.flush() | sys.stderr.flush() | ||||
fetch_info = self.fetch_pack_from_origin( | fetch_info = self.fetch_pack_from_origin( | ||||
self.origin['url'], self.base_snapshot, | self.origin['url'], self.base_snapshot, | ||||
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines | if __name__ == '__main__': | ||||
) | ) | ||||
@click.command() | @click.command() | ||||
@click.option('--origin-url', help='Origin url', required=True) | @click.option('--origin-url', help='Origin url', required=True) | ||||
@click.option('--base-url', default=None, help='Optional Base url') | @click.option('--base-url', default=None, help='Optional Base url') | ||||
@click.option('--ignore-history/--no-ignore-history', | @click.option('--ignore-history/--no-ignore-history', | ||||
help='Ignore the repository history', default=False) | help='Ignore the repository history', default=False) | ||||
def main(origin_url, base_url, ignore_history): | def main(origin_url, base_url, ignore_history): | ||||
return GitLoader().load( | loader = GitLoader( | ||||
origin_url, | origin_url, | ||||
base_url=base_url, | base_url=base_url, | ||||
ignore_history=ignore_history, | ignore_history=ignore_history, | ||||
) | ) | ||||
return loader.load() | |||||
main() | main() |