diff --git a/debian/control b/debian/control
index e35b8b0..f0dde19 100644
--- a/debian/control
+++ b/debian/control
@@ -1,27 +1,28 @@
 Source: swh-loader-git
 Maintainer: Software Heritage developers
 Section: python
 Priority: optional
 Build-Depends: debhelper (>= 9),
                dh-python,
                python3-all,
                python3-nose,
                python3-dulwich,
                python3-retrying,
                python3-setuptools,
+               python3-click,
                python3-swh.core (>= 0.0.7~),
                python3-swh.model (>= 0.0.3~),
                python3-swh.scheduler,
                python3-swh.storage (>= 0.0.50~),
                python3-vcversioner
 Standards-Version: 3.9.6
 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/
 
 Package: python3-swh.loader.git
 Architecture: all
 Depends: python3-swh.core (>= 0.0.7~),
          python3-swh.storage (>= 0.0.50~),
          python3-swh.model (>= 0.0.3~),
          ${misc:Depends},
          ${python3:Depends}
 Description: Software Heritage Git loader
diff --git a/requirements.txt b/requirements.txt
index ef55e34..e7b0a99 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
 dulwich
 retrying
 vcversioner
+click
 swh.core >= 0.0.7
 swh.model >= 0.0.3
 swh.scheduler
 swh.storage >= 0.0.50
diff --git a/swh/loader/git/reader.py b/swh/loader/git/reader.py
new file mode 100644
index 0000000..69def1b
--- /dev/null
+++ b/swh/loader/git/reader.py
@@ -0,0 +1,123 @@
+# Copyright (C) 2016 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+import logging
+
+from collections import defaultdict
+
+import click
+
+from .updater import BulkUpdater
+from .loader import GitLoader
+
+logger = logging.getLogger(__name__)
+
+
+class GitSha1Reader(GitLoader):
+    """Disk git sha1 reader. Only read the sha1s of the blobs.
+
+    """
+    def fetch_data(self):
+        """Yield the id of each blob among the iterated objects."""
+        for oid in self.iter_objects():
+            if self.repo[oid].type_name == b'blob':
+                yield oid
+
+    def load(self, *args, **kwargs):
+        """Prepare the loader, then yield each blob id decoded to str.
+
+        All arguments are forwarded to prepare(). This is a best-effort
+        dump: an error ends the iteration, but it is logged rather than
+        silently discarded.
+
+        """
+        self.prepare(*args, **kwargs)
+        try:
+            for oid in self.fetch_data():
+                yield oid.decode('utf-8')
+        except Exception:
+            logger.exception('Failed to read sha1s from disk repository')
+
+
+class GitSha1RemoteReader(BulkUpdater):
+    """Remote git sha1 reader to dump only repo's content sha1 list.
+
+    """
+    def list_pack(self, pack_data, pack_size):
+        """Override list_pack to only keep blobs data.
+
+        pack_data and pack_size are not used: the objects are read from
+        get_inflater().
+
+        Returns:
+            a pair (id_to_type, type_to_ids) holding blobs only.
+
+        """
+        id_to_type = {}
+        type_to_ids = defaultdict(set)
+
+        for obj in self.get_inflater():
+            # obj_type/obj_id: avoid shadowing the type and id builtins.
+            obj_type, obj_id = obj.type_name, obj.id
+            if obj_type != b'blob':
+                continue
+            id_to_type[obj_id] = obj_type
+            type_to_ids[obj_type].add(obj_id)
+
+        return id_to_type, type_to_ids
+
+    def load(self, *args, **kwargs):
+        """Prepare, send the origin, then yield each blob id as str.
+
+        All arguments are forwarded to prepare(). This is a best-effort
+        dump: an error ends the iteration, but it is logged rather than
+        silently discarded.
+
+        """
+        self.prepare(*args, **kwargs)
+        origin = self.get_origin()
+        self.origin_id = self.send_origin(origin)
+
+        try:
+            # NOTE(review): presumably fetch_data() fills self.id_to_type
+            # through list_pack(); confirm against BulkUpdater.
+            self.fetch_data()
+            for oid in self.id_to_type:
+                yield oid.decode('utf-8')
+        except Exception:
+            logger.exception('Failed to read sha1s from remote repository')
+
+
+@click.command()
+@click.option('--origin-url', help='Origin\'s url')
+@click.option('--source', required=True,
+              help='origin\'s source url (disk or remote)')
+def main(origin_url, source):
+    """Print the sha1 of each blob of a git repository, one per line.
+
+    A source starting with '/' is read from disk, any other source is
+    handled by the remote reader.
+
+    """
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='%(asctime)s %(process)d %(message)s'
+    )
+
+    if source.startswith('/'):
+        loader = GitSha1Reader()
+        fetch_date = datetime.datetime.now(tz=datetime.timezone.utc)
+        sha1s = loader.load(origin_url, source, fetch_date)
+    else:
+        loader = GitSha1RemoteReader()
+        sha1s = loader.load(origin_url, source)
+
+    for sha1 in sha1s:
+        print(sha1)
+
+
+if __name__ == '__main__':
+    main()