diff --git a/PKG-INFO b/PKG-INFO index 6a074f3..64a7e74 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index e35b8b0..f0dde19 100644 --- a/debian/control +++ b/debian/control @@ -1,27 +1,28 @@ Source: swh-loader-git Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-dulwich, python3-retrying, python3-setuptools, + python3-click, python3-swh.core (>= 0.0.7~), python3-swh.model (>= 0.0.3~), python3-swh.scheduler, python3-swh.storage (>= 0.0.50~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/ Package: python3-swh.loader.git Architecture: all Depends: python3-swh.core (>= 0.0.7~), python3-swh.storage (>= 0.0.50~), python3-swh.model (>= 0.0.3~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Git loader diff --git a/requirements.txt b/requirements.txt index ef55e34..e7b0a99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ dulwich retrying vcversioner +click swh.core >= 0.0.7 swh.model >= 0.0.3 swh.scheduler swh.storage >= 0.0.50 diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 6a074f3..64a7e74 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.git.egg-info/SOURCES.txt b/swh.loader.git.egg-info/SOURCES.txt index 4f2b5af..6209d26 100644 --- a/swh.loader.git.egg-info/SOURCES.txt +++ b/swh.loader.git.egg-info/SOURCES.txt @@ -1,37 +1,38 @@ .gitignore .gitmodules AUTHORS LICENSE MANIFEST.in Makefile README requirements.txt setup.py version.txt bin/dir-git-repo-meta.sh debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/attic/api-backend-protocol.txt docs/attic/git-loading-design.txt resources/local-loader-git.ini resources/remote-loader-git.ini resources/updater.ini resources/test/back.ini resources/test/db-manager.ini scratch/analyse-profile.py swh.loader.git.egg-info/PKG-INFO swh.loader.git.egg-info/SOURCES.txt swh.loader.git.egg-info/dependency_links.txt swh.loader.git.egg-info/requires.txt swh.loader.git.egg-info/top_level.txt swh/loader/git/__init__.py swh/loader/git/base.py swh/loader/git/converters.py swh/loader/git/loader.py +swh/loader/git/reader.py swh/loader/git/tasks.py swh/loader/git/updater.py swh/loader/git/tests/test_converters.py \ No newline at end of file diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index 4447db0..c6ecc1e 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,7 +1,8 @@ +click dulwich retrying swh.core>=0.0.7 swh.model>=0.0.3 swh.scheduler swh.storage>=0.0.50 vcversioner diff --git a/swh/loader/git/reader.py b/swh/loader/git/reader.py new file mode 100644 index 0000000..b069665 --- /dev/null +++ b/swh/loader/git/reader.py @@ -0,0 +1,130 @@ +# Copyright (C) 2016 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import click +import datetime + +from collections import defaultdict + +from swh.core import hashutil, utils + +from .updater import BulkUpdater +from .loader import GitLoader + + +class GitSha1Reader(GitLoader): + """Disk git sha1 reader. Only read and dump sha1s in stdout. + + """ + def fetch_data(self): + """Fetch the data from the data source""" + for oid in self.iter_objects(): + type_name = self.repo[oid].type_name + if type_name != b'blob': + continue + yield hashutil.hex_to_hash(oid.decode('utf-8')) + + def load(self, *args, **kwargs): + self.prepare(*args, **kwargs) + yield from self.fetch_data() + + +class GitSha1RemoteReader(BulkUpdater): + """Disk git sha1 reader to dump only repo's content sha1 list. + + """ + CONFIG_BASE_FILENAME = 'loader/git-remote-reader' + + ADDITIONAL_CONFIG = { + 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), + 'pack_storage_base': ('str', ''), # don't want to store packs so empty + 'next_task': ( + 'dict', { + 'queue': 'swh.storage.archiver.tasks.SWHArchiverToBackendTask', + 'batch_size': 100, + 'destination': 'azure' + } + ) + } + + def __init__(self): + super().__init__() + self.next_task = self.config['next_task'] + self.batch_size = self.next_task['batch_size'] + self.task_destination = self.next_task.get('queue') + self.destination = self.next_task['destination'] + + def list_pack(self, pack_data, pack_size): + """Override list_pack to only keep blobs data. + + """ + id_to_type = {} + type_to_ids = defaultdict(set) + + inflater = self.get_inflater() + + for obj in inflater: + type, id = obj.type_name, obj.id + if type != b'blob': # don't keep other types + continue + oid = hashutil.hex_to_hash(id.decode('utf-8')) + id_to_type[oid] = type + type_to_ids[type].add(oid) + + return id_to_type, type_to_ids + + def load(self, *args, **kwargs): + self.prepare(*args, **kwargs) + origin = self.get_origin() + self.origin_id = self.send_origin(origin) + + self.fetch_data() + data = self.id_to_type.keys() + if not self.task_destination: # to stdout + yield from data + return + + from swh.scheduler.celery_backend.config import app + try: + # optional dependency + from swh.storage.archiver import tasks # noqa + except ImportError: + pass + from celery import group + + task_destination = app.tasks[self.task_destination] + groups = [] + for ids in utils.grouper(data, self.batch_size): + sig_ids = task_destination.s(destination=self.destination, + batch=list(ids)) + groups.append(sig_ids) + group(groups).delay() + + +@click.command() +@click.option('--origin-url', help='Origin\'s url') +@click.option('--source', help='origin\'s source url (disk or remote)') +def main(origin_url, source): + import logging + + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s %(process)d %(message)s' + ) + + if source.startswith('/'): + loader = GitSha1Reader() + fetch_date = datetime.datetime.now(tz=datetime.timezone.utc) + ids = loader.load(origin_url, source, fetch_date) + else: + loader = GitSha1RemoteReader() + ids = loader.load(origin_url, source) + + for oid in ids: + print(oid) + + +if __name__ == '__main__': + main() diff --git a/swh/loader/git/tasks.py b/swh/loader/git/tasks.py index 98e2333..752e6fc 100644 --- a/swh/loader/git/tasks.py +++ b/swh/loader/git/tasks.py @@ -1,38 +1,53 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import dateutil.parser from swh.scheduler.task import Task from .loader import GitLoader from .updater import BulkUpdater +from .reader import GitSha1RemoteReader # TODO: rename to LoadRemoteGitRepository class UpdateGitRepository(Task): """Import a git repository from a remote location""" task_queue = 'swh_loader_git' def run(self, repo_url, base_url=None): """Import a git repository""" loader = BulkUpdater() loader.log = self.log return loader.load(repo_url, base_url) class LoadDiskGitRepository(Task): """Import a git repository from disk""" task_queue = 'swh_loader_git' def run(self, origin_url, directory, date): """Import a git repository, cloned in `directory` from `origin_url` at `date`.""" loader = GitLoader() loader.log = self.log return loader.load(origin_url, directory, dateutil.parser.parse(date)) + + +class ReaderGitRepository(Task): + task_queue = 'swh_reader_git' + + def run(self, repo_url, base_url=None): + """Read a git repository from a remote location and send sha1 to + archival. + + """ + loader = GitSha1RemoteReader() + loader.log = self.log + + loader.load(repo_url, base_url) diff --git a/version.txt b/version.txt index 4be919b..3ac068d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.19-0-gb534921 \ No newline at end of file +v0.0.20-0-g86928fa \ No newline at end of file