diff --git a/.gitignore b/.gitignore index f91c832..7594571 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,14 @@ .eggs/ /sgloader/__pycache__/ /dataset/ *.pyc /.coverage /scratch/swhgitloader.cProfile /scratch/swhgitloader.profile /scratch/save.p *.egg-info version.txt /resources/repo-linux-to-load.ini /resources/repo-to-load.ini +build/ +dist/ diff --git a/PKG-INFO b/PKG-INFO index c627de4..6e5adcf 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,95 @@ -Metadata-Version: 1.0 +Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.0.39 +Version: 0.0.40 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Description: UNKNOWN +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate +Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git +Description: swh-loader-git + ============== + + The Software Heritage Git Loader is a tool and a library to walk a local + Git repository and inject into the SWH dataset all contained files that + weren't known before. + + License + ------- + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation, either version 3 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + Public License for more details. + + See top-level LICENSE file for the full text of the GNU General Public + License along with this program. + + Dependencies + ------------ + + ### Runtime + + - python3 + - python3-dulwich + - python3-retrying + - python3-swh.core + - python3-swh.model + - python3-swh.storage + - python3-swh.scheduler + + ### Test + + - python3-nose + + Requirements + ------------ + + - implementation language, Python3 + - coding guidelines: conform to PEP8 + - Git access: via dulwich + + Configuration + ------------- + + You can run the loader or the updater directly by calling: + ``` + python3 -m swh.loader.git.{loader,updater} + ``` + + ### Location + + Both tools expect a configuration file. + + Either one of the following location: + - /etc/softwareheritage/ + - ~/.config/swh/ + - ~/.swh/ + + Note: Will call that location $SWH_CONFIG_PATH + + ### Configuration sample + + $SWH_CONFIG_PATH/loader/git-{loader,updater}.yml: + ``` + storage: + cls: remote + args: + url: http://localhost:5002/ + ``` + Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) +Classifier: Operating System :: OS Independent +Classifier: Development Status :: 5 - Production/Stable +Description-Content-Type: text/markdown +Provides-Extra: testing diff --git a/README b/README deleted file mode 100644 index a95a5f9..0000000 --- a/README +++ /dev/null @@ -1,82 +0,0 @@ -The Software Heritage Git Loader is a tool and a library to walk a local -Git repository and inject into the SWH dataset all contained files that -weren't known before. 
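Aside: the configuration and module invocation documented in the regenerated PKG-INFO above can also be driven from Python. A minimal sketch only, assuming a reachable storage, an existing $SWH_CONFIG_PATH/loader/git-updater.yml, and a placeholder origin URL; the keyword arguments mirror the `BulkUpdater().load(...)` call added to swh/loader/git/updater.py later in this diff.
```
# Rough programmatic equivalent of:
#   python3 -m swh.loader.git.updater --origin-url <url>
# The origin URL below is a placeholder, not a real repository.
from swh.loader.git.updater import BulkUpdater

updater = BulkUpdater()
# base_url / ignore_history correspond to the --base-url and
# --ignore-history CLI options introduced in this patch.
updater.load('https://example.org/some/repository.git',
             base_url=None, ignore_history=False)
```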
- -License -======= - -This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation, either version 3 of the License, or (at your -option) any later version. - -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -Public License for more details. - -See top-level LICENSE file for the full text of the GNU General Public -License along with this program. - -Dependencies -============ - -Runtime -------- - -- python3 -- python3-dulwich -- python3-retrying -- python3-swh.core -- python3-swh.model -- python3-swh.storage -- python3-swh.scheduler - -Test ----- - -- python3-nose - -Requirements -============ - -- implementation language, Python3 -- coding guidelines: conform to PEP8 -- Git access: via dulwich - -Configuration -============= - -You can run the loader or the updater directly by calling python3 -m swh.loader.git.{loader,updater}. - -Both tools expect a configuration file in .ini format to be present in ~/.config/swh/loader/git-{loader,updater}.ini - -The configuration file contains the following directives: - -``` -[main] -# the storage class used. one of remote_storage, local_storage -storage_class = remote_storage - -# arguments passed to the storage class -# for remote_storage: URI of the storage server -storage_args = http://localhost:5002/ - -# for local_storage: database connection string and root of the -# storage, comma separated -# storage_args = dbname=softwareheritage-dev, /tmp/swh/storage - -# Whether to send the given types of objects -send_contents = True -send_directories = True -send_revisions = True -send_releases = True -send_snapshot = True - -# The size of the packets sent to storage for each kind of object -content_packet_size = 100000 -content_packet_size_bytes = 1073741824 -directory_packet_size = 25000 -revision_packet_size = 100000 -release_packet_size = 100000 -``` diff --git a/README.md b/README.md new file mode 100644 index 0000000..5048f90 --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +swh-loader-git +============== + +The Software Heritage Git Loader is a tool and a library to walk a local +Git repository and inject into the SWH dataset all contained files that +weren't known before. + +License +------- + +This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your +option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +See top-level LICENSE file for the full text of the GNU General Public +License along with this program. 
+ +Dependencies +------------ + +### Runtime + +- python3 +- python3-dulwich +- python3-retrying +- python3-swh.core +- python3-swh.model +- python3-swh.storage +- python3-swh.scheduler + +### Test + +- python3-nose + +Requirements +------------ + +- implementation language, Python3 +- coding guidelines: conform to PEP8 +- Git access: via dulwich + +Configuration +------------- + +You can run the loader or the updater directly by calling: +``` +python3 -m swh.loader.git.{loader,updater} +``` + +### Location + +Both tools expect a configuration file. + +It may live in any one of the following locations: +- /etc/softwareheritage/ +- ~/.config/swh/ +- ~/.swh/ + +Note: that location is referred to below as $SWH_CONFIG_PATH + +### Configuration sample + +$SWH_CONFIG_PATH/loader/git-{loader,updater}.yml: +``` +storage: + cls: remote + args: + url: http://localhost:5002/ +``` diff --git a/debian/control b/debian/control index 8b42bdc..83cfa83 100644 --- a/debian/control +++ b/debian/control @@ -1,31 +1,31 @@ Source: swh-loader-git Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-click, python3-dulwich (>= 0.18.7~), python3-nose, python3-retrying, python3-setuptools, python3-swh.core (>= 0.0.7~), python3-swh.loader.core (>= 0.0.32), - python3-swh.model (>= 0.0.15~), + python3-swh.model (>= 0.0.27~), python3-swh.scheduler (>= 0.0.14~), python3-swh.storage (>= 0.0.83~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/ Package: python3-swh.loader.git Architecture: all Depends: python3-swh.core (>= 0.0.7~), python3-swh.loader.core (>= 0.0.32~), - python3-swh.model (>= 0.0.15~), + python3-swh.model (>= 0.0.27~), python3-swh.scheduler (>= 0.0.14~), python3-swh.storage (>= 0.0.83~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Git loader diff --git a/docs/index.rst b/docs/index.rst index cef3d76..4b1ed20 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,17 +1,19 @@ .. _swh-loader-git: -Software Heritage - Development Documentation -============================================= +Software Heritage - Git loader +============================== + +Loader for `Git `_ repositories. + ..
toctree:: :maxdepth: 2 :caption: Contents: - Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` diff --git a/requirements-swh.txt b/requirements-swh.txt index be87c8b..5e2352e 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 swh.loader.core >= 0.0.32 -swh.model >= 0.0.15 +swh.model >= 0.0.27 swh.scheduler >= 0.0.14 swh.storage >= 0.0.83 diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..f3c7e8e --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1 @@ +nose diff --git a/setup.py b/setup.py index 6472a33..8a4ca7f 100755 --- a/setup.py +++ b/setup.py @@ -1,30 +1,65 @@ #!/usr/bin/env python3 +# Copyright (C) 2015-2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information from setuptools import setup, find_packages +from os import path +from io import open + +here = path.abspath(path.dirname(__file__)) + +# Get the long description from the README file +with open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + + +def parse_requirements(name=None): + if name: + reqf = 'requirements-%s.txt' % name + else: + reqf = 'requirements.txt' -def parse_requirements(): requirements = [] - for reqf in ('requirements.txt', 'requirements-swh.txt'): - with open(reqf) as f: - for line in f.readlines(): - line = line.strip() - if not line or line.startswith('#'): - continue - requirements.append(line) + if not path.exists(reqf): + return requirements + + with open(reqf) as f: + for line in f.readlines(): + line = line.strip() + if not line or line.startswith('#'): + continue + requirements.append(line) return requirements setup( name='swh.loader.git', description='Software Heritage git loader', + long_description=long_description, + long_description_content_type='text/markdown', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDG/', packages=find_packages(), scripts=[], - install_requires=parse_requirements(), + install_requires=parse_requirements() + parse_requirements('swh'), setup_requires=['vcversioner'], + extras_require={'testing': parse_requirements('test')}, vcversioner={}, include_package_data=True, + classifiers=[ + "Programming Language :: Python :: 3", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Operating System :: OS Independent", + "Development Status :: 5 - Production/Stable", + ], + project_urls={ + 'Bug Reports': 'https://forge.softwareheritage.org/maniphest', + 'Funding': 'https://www.softwareheritage.org/donate', + 'Source': 'https://forge.softwareheritage.org/source/swh-loader-git', + }, ) diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index c627de4..6e5adcf 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,10 +1,95 @@ -Metadata-Version: 1.0 +Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.0.39 +Version: 0.0.40 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Description: UNKNOWN +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, 
https://www.softwareheritage.org/donate +Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git +Description: swh-loader-git + ============== + + The Software Heritage Git Loader is a tool and a library to walk a local + Git repository and inject into the SWH dataset all contained files that + weren't known before. + + License + ------- + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation, either version 3 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + Public License for more details. + + See top-level LICENSE file for the full text of the GNU General Public + License along with this program. + + Dependencies + ------------ + + ### Runtime + + - python3 + - python3-dulwich + - python3-retrying + - python3-swh.core + - python3-swh.model + - python3-swh.storage + - python3-swh.scheduler + + ### Test + + - python3-nose + + Requirements + ------------ + + - implementation language, Python3 + - coding guidelines: conform to PEP8 + - Git access: via dulwich + + Configuration + ------------- + + You can run the loader or the updater directly by calling: + ``` + python3 -m swh.loader.git.{loader,updater} + ``` + + ### Location + + Both tools expect a configuration file. + + Either one of the following location: + - /etc/softwareheritage/ + - ~/.config/swh/ + - ~/.swh/ + + Note: Will call that location $SWH_CONFIG_PATH + + ### Configuration sample + + $SWH_CONFIG_PATH/loader/git-{loader,updater}.yml: + ``` + storage: + cls: remote + args: + url: http://localhost:5002/ + ``` + Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) +Classifier: Operating System :: OS Independent +Classifier: Development Status :: 5 - Production/Stable +Description-Content-Type: text/markdown +Provides-Extra: testing diff --git a/swh.loader.git.egg-info/SOURCES.txt b/swh.loader.git.egg-info/SOURCES.txt index bd9bf47..0bcb145 100644 --- a/swh.loader.git.egg-info/SOURCES.txt +++ b/swh.loader.git.egg-info/SOURCES.txt @@ -1,47 +1,48 @@ .gitignore .gitmodules AUTHORS LICENSE MANIFEST.in Makefile -README +README.md requirements-swh.txt +requirements-test.txt requirements.txt setup.py version.txt bin/dir-git-repo-meta.sh debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder docs/attic/api-backend-protocol.txt docs/attic/git-loading-design.txt resources/local-loader-git.ini resources/remote-loader-git.ini resources/updater.ini resources/test/back.ini resources/test/db-manager.ini swh/__init__.py swh.loader.git.egg-info/PKG-INFO swh.loader.git.egg-info/SOURCES.txt swh.loader.git.egg-info/dependency_links.txt swh.loader.git.egg-info/requires.txt swh.loader.git.egg-info/top_level.txt swh/loader/__init__.py swh/loader/git/__init__.py swh/loader/git/converters.py swh/loader/git/loader.py swh/loader/git/reader.py swh/loader/git/tasks.py swh/loader/git/updater.py swh/loader/git/utils.py swh/loader/git/tests/test_converters.py swh/loader/git/tests/test_utils.py \ No newline at end of file 
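Aside: the requires.txt update below is generated from the requirements files that the reworked `parse_requirements()` helper in setup.py reads. A minimal sketch of that parsing rule, using a hypothetical `read_requirements()` name so as not to shadow the real helper.
```
# Same parsing rule as setup.py's parse_requirements(): one requirement per
# line, blank lines and '#' comments skipped, missing files tolerated.
from os import path

def read_requirements(reqf):
    reqs = []
    if not path.exists(reqf):
        return reqs
    with open(reqf) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):
                reqs.append(line)
    return reqs

# install_requires combines requirements.txt and requirements-swh.txt;
# extras_require={'testing': ...} comes from requirements-test.txt (nose).
install_requires = (read_requirements('requirements.txt')
                    + read_requirements('requirements-swh.txt'))
testing_requires = read_requirements('requirements-test.txt')
```
The new 'testing' extra means the test dependency can be pulled in with an extras install (e.g. `pip install swh.loader.git[testing]`), matching the `Provides-Extra: testing` line in the regenerated PKG-INFO.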
diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index abb8619..5e1c03f 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,9 +1,12 @@ click dulwich>=0.18.7 retrying swh.core>=0.0.7 swh.loader.core>=0.0.32 -swh.model>=0.0.15 +swh.model>=0.0.27 swh.scheduler>=0.0.14 swh.storage>=0.0.83 vcversioner + +[testing] +nose diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index bcdea5c..a69aeb7 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,240 +1,238 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" -from swh.model import hashutil, identifiers +from swh.model import identifiers +from swh.model.hashutil import ( + DEFAULT_ALGORITHMS, hash_to_hex, hash_to_bytes, MultiHash +) -HASH_ALGORITHMS = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'} +HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {'sha1_git'} def origin_url_to_origin(origin_url): """Format a pygit2.Repository as an origin suitable for swh.storage""" return { 'type': 'git', 'url': origin_url, } def dulwich_blob_to_content_id(blob): """Convert a dulwich blob to a Software Heritage content id""" - if blob.type_name != b'blob': return size = blob.raw_length() - ret = { - 'sha1_git': blob.sha().digest(), - 'length': size, - } - data = blob.as_raw_string() - ret.update(hashutil.hash_data(data, HASH_ALGORITHMS)) - - return ret + hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest() + hashes['sha1_git'] = blob.sha().digest() + hashes['length'] = size + return hashes def dulwich_blob_to_content(blob, log=None, max_content_size=None, origin_id=None): """Convert a dulwich blob to a Software Heritage content""" if blob.type_name != b'blob': return ret = dulwich_blob_to_content_id(blob) size = ret['length'] if max_content_size: if size > max_content_size: - id = hashutil.hash_to_hex(ret['sha1_git']) + id = hash_to_hex(ret['sha1_git']) if log: log.info('Skipping content %s, too large (%s > %s)' % (id, size, max_content_size), extra={ 'swh_type': 'loader_git_content_skip', 'swh_id': id, 'swh_size': size, }) ret['status'] = 'absent' ret['reason'] = 'Content too large' ret['origin'] = origin_id return ret data = blob.as_raw_string() ret['data'] = data ret['status'] = 'visible' return ret def dulwich_tree_to_directory(tree, log=None): """Format a tree as a directory""" if tree.type_name != b'tree': return ret = { 'id': tree.sha().digest(), } entries = [] ret['entries'] = entries entry_mode_map = { 0o040000: 'dir', 0o160000: 'rev', 0o100644: 'file', 0o100755: 'file', 0o120000: 'file', } for entry in tree.iteritems(): entries.append({ 'type': entry_mode_map.get(entry.mode, 'file'), 'perms': entry.mode, 'name': entry.path, - 'target': hashutil.hash_to_bytes(entry.sha.decode('ascii')), + 'target': hash_to_bytes(entry.sha.decode('ascii')), }) return ret def parse_author(name_email): """Parse an author line""" if name_email is None: return None try: open_bracket = name_email.index(b'<') except ValueError: name = email = None else: raw_name = name_email[:open_bracket] raw_email = name_email[open_bracket+1:] if not raw_name: name = None elif raw_name.endswith(b' '): name = raw_name[:-1] else: name = raw_name try: close_bracket = raw_email.index(b'>') except 
ValueError: email = None else: email = raw_email[:close_bracket] return { 'name': name, 'email': email, 'fullname': name_email, } def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc): """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" return { 'timestamp': timestamp, 'offset': timezone // 60, 'negative_utc': timezone_neg_utc if timezone == 0 else None, } def dulwich_commit_to_revision(commit, log=None): if commit.type_name != b'commit': return ret = { 'id': commit.sha().digest(), 'author': parse_author(commit.author), 'date': dulwich_tsinfo_to_timestamp( commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, ), 'committer': parse_author(commit.committer), 'committer_date': dulwich_tsinfo_to_timestamp( commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, ), 'type': 'git', 'directory': bytes.fromhex(commit.tree.decode()), 'message': commit.message, 'metadata': None, 'synthetic': False, 'parents': [bytes.fromhex(p.decode()) for p in commit.parents], } git_metadata = [] if commit.encoding is not None: git_metadata.append(['encoding', commit.encoding]) if commit.mergetag: for mergetag in commit.mergetag: raw_string = mergetag.as_raw_string() assert raw_string.endswith(b'\n') git_metadata.append(['mergetag', raw_string[:-1]]) if commit.extra: git_metadata.extend([k.decode('utf-8'), v] for k, v in commit.extra) if commit.gpgsig: git_metadata.append(['gpgsig', commit.gpgsig]) if git_metadata: ret['metadata'] = { 'extra_headers': git_metadata, } return ret DULWICH_TYPES = { b'blob': 'content', b'tree': 'directory', b'commit': 'revision', b'tag': 'release', } def dulwich_tag_to_release(tag, log=None): if tag.type_name != b'tag': return target_type, target = tag.object ret = { 'id': tag.sha().digest(), 'name': tag.name, 'target': bytes.fromhex(target.decode()), 'target_type': DULWICH_TYPES[target_type.type_name], 'message': tag._message, 'metadata': None, 'synthetic': False, } if tag.tagger: ret['author'] = parse_author(tag.tagger) if not tag.tag_time: ret['date'] = None else: ret['date'] = dulwich_tsinfo_to_timestamp( tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, ) else: ret['author'] = ret['date'] = None return ret def branches_to_snapshot(branches): snapshot = {'branches': branches} snapshot_id = identifiers.snapshot_identifier(snapshot) snapshot['id'] = identifiers.identifier_to_bytes(snapshot_id) return snapshot diff --git a/swh/loader/git/reader.py b/swh/loader/git/reader.py index 2491530..2da2b7f 100644 --- a/swh/loader/git/reader.py +++ b/swh/loader/git/reader.py @@ -1,258 +1,258 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import logging import pprint import click from swh.core import utils -from swh.model import hashutil +from swh.model.hashutil import MultiHash, hash_to_hex from .updater import BulkUpdater, SWHRepoRepresentation from . import converters class SWHRepoFullRepresentation(SWHRepoRepresentation): """Overridden representation of a swh repository to permit to read completely the remote repository. 
""" def __init__(self, storage, origin_id, occurrences=None): self.storage = storage self._parents_cache = {} self._type_cache = {} self.heads = set() def determine_wants(self, refs): """Filter the remote references to figure out which ones Software Heritage needs. In this particular context, we want to know everything. """ if not refs: return [] for target in refs.values(): self.heads.add(target) return self.filter_unwanted_refs(refs).values() def find_remote_ref_types_in_swh(self, remote_refs): """Find the known swh remote. In that particular context, we know nothing. """ return {} class DummyGraphWalker(object): """Dummy graph walker which claims that the client doesn’t have any objects. """ def ack(self, sha): pass def next(self): pass def __next__(self): pass class BaseGitRemoteReader(BulkUpdater): CONFIG_BASE_FILENAME = 'loader/git-remote-reader' ADDITIONAL_CONFIG = { 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), 'pack_storage_base': ('str', ''), # don't want to store packs so empty 'next_task': ( 'dict', { 'queue': 'swh.storage.archiver.tasks.SWHArchiverToBackendTask', 'batch_size': 100, 'destination': 'azure' } ) } def __init__(self): super().__init__(SWHRepoFullRepresentation) self.next_task = self.config['next_task'] self.batch_size = self.next_task['batch_size'] self.task_destination = self.next_task['queue'] self.destination = self.next_task['destination'] def graph_walker(self): return DummyGraphWalker() def prepare_origin_visit(self, origin_url, base_url=None): self.origin = converters.origin_url_to_origin(origin_url) self.origin_id = 0 def prepare(self, origin_url, base_url=None): """Only retrieve information about the origin, set everything else to empty. """ self.base_occurrences = [] self.base_origin_id = 0 def keep_object(self, obj): """Do we want to keep this object or not?""" raise NotImplementedError('Please implement keep_object') def get_id_and_data(self, obj): """Get the id, type and data of the given object""" raise NotImplementedError('Please implement get_id_and_data') def list_pack(self, pack_data, pack_size): """Override list_pack to only keep contents' sha1. Returns: id_to_type (dict): keys are sha1, values are their associated type type_to_ids (dict): keys are types, values are list of associated data (sha1 for blobs) """ self.data = {} id_to_type = {} type_to_ids = defaultdict(set) inflater = self.get_inflater() for obj in inflater: if not self.keep_object(obj): continue object_id, type, data = self.get_id_and_data(obj) id_to_type[object_id] = type type_to_ids[type].add(object_id) self.data[object_id] = data return id_to_type, type_to_ids def load(self, *args, **kwargs): """Override the loading part which simply reads the repository's contents' sha1. Returns: Returns the list of discovered sha1s for that origin. """ self.prepare(*args, **kwargs) self.fetch_data() class GitSha1RemoteReader(BaseGitRemoteReader): """Read sha1 git from a remote repository and dump only repository's content sha1 as list. 
""" def keep_object(self, obj): """Only keep blobs""" return obj.type_name == b'blob' def get_id_and_data(self, obj): """We want to store only object identifiers""" # compute the sha1 (obj.id is the sha1_git) data = obj.as_raw_string() - hashes = hashutil.hash_data(data, {'sha1'}) + hashes = MultiHash.from_data(data, {'sha1'}).digest() oid = hashes['sha1'] return (oid, b'blob', oid) class GitSha1RemoteReaderAndSendToQueue(GitSha1RemoteReader): """Read sha1 git from a remote repository and dump only repository's content sha1 as list and send batch of those sha1s to a celery queue for consumption. """ def load(self, *args, **kwargs): """Retrieve the list of sha1s for a particular origin and send those sha1s as group of sha1s to a specific queue. """ super().load(*args, **kwargs) data = self.type_to_ids[b'blob'] from swh.scheduler.celery_backend.config import app try: # optional dependency from swh.storage.archiver import tasks # noqa except ImportError: pass from celery import group task_destination = app.tasks[self.task_destination] groups = [] for ids in utils.grouper(data, self.batch_size): sig_ids = task_destination.s(destination=self.destination, batch=list(ids)) groups.append(sig_ids) group(groups).delay() return data class GitCommitRemoteReader(BaseGitRemoteReader): def keep_object(self, obj): return obj.type_name == b'commit' def get_id_and_data(self, obj): return obj.id, b'commit', converters.dulwich_commit_to_revision(obj) def load(self, *args, **kwargs): super().load(*args, **kwargs) return self.data @click.group() @click.option('--origin-url', help='Origin url') @click.pass_context def main(ctx, origin_url): logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) ctx.obj['origin_url'] = origin_url @main.command() @click.option('--send/--nosend', default=False, help='Origin\'s url') @click.pass_context def blobs(ctx, send): origin_url = ctx.obj['origin_url'] if send: loader = GitSha1RemoteReaderAndSendToQueue() ids = loader.load(origin_url) print('%s sha1s were sent to queue' % len(ids)) return loader = GitSha1RemoteReader() ids = loader.load(origin_url) if ids: for oid in ids: - print(hashutil.hash_to_hex(oid)) + print(hash_to_hex(oid)) @main.command() @click.option('--ids-only', is_flag=True, help='print ids only') @click.pass_context def commits(ctx, ids_only): origin_url = ctx.obj['origin_url'] reader = GitCommitRemoteReader() commits = reader.load(origin_url) for commit_id, commit in commits.items(): if ids_only: print(commit_id.decode()) else: pprint.pprint(commit) if __name__ == '__main__': main(obj={}) diff --git a/swh/loader/git/updater.py b/swh/loader/git/updater.py index 421d26a..6d6135c 100644 --- a/swh/loader/git/updater.py +++ b/swh/loader/git/updater.py @@ -1,458 +1,487 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import dulwich.client import logging import os import pickle import sys from collections import defaultdict from io import BytesIO from dulwich.object_store import ObjectStoreGraphWalker from dulwich.pack import PackData, PackInflater from swh.model import hashutil from swh.loader.core.loader import SWHStatelessLoader from . 
import converters class SWHRepoRepresentation: """Repository representation for a Software Heritage origin.""" - def __init__(self, storage, origin_id, base_snapshot=None): + def __init__(self, storage, origin_id, base_snapshot=None, + ignore_history=False): self.storage = storage self._parents_cache = {} self._type_cache = {} - if origin_id: + self.ignore_history = ignore_history + + if origin_id and not ignore_history: self.heads = set(self._cache_heads(origin_id, base_snapshot)) else: self.heads = set() def _fill_parents_cache(self, commits): """When querying for a commit's parents, we fill the cache to a depth of 1000 commits.""" root_revs = self._encode_for_storage(commits) for rev, parents in self.storage.revision_shortlog(root_revs, 1000): rev_id = hashutil.hash_to_bytehex(rev) if rev_id not in self._parents_cache: self._parents_cache[rev_id] = [ hashutil.hash_to_bytehex(parent) for parent in parents ] for rev in commits: if rev not in self._parents_cache: self._parents_cache[rev] = [] def _cache_heads(self, origin_id, base_snapshot): """Return all the known head commits for `origin_id`""" - if not base_snapshot: - base_snapshot = self.storage.snapshot_get_latest(origin_id) + _git_types = ['content', 'directory', 'revision', 'release'] - if base_snapshot: - return self._decode_from_storage( - target['target'] - for target in base_snapshot['branches'].values() - ) - else: + if not base_snapshot: return [] + snapshot_targets = set() + for target in base_snapshot['branches'].values(): + if target and target['target_type'] in _git_types: + snapshot_targets.add(target['target']) + + for id, objs in self.get_stored_objects( + self._decode_from_storage(snapshot_targets) + ).items(): + if not objs: + logging.warn('Missing head: %s' % hashutil.hash_to_hex(id)) + return [] + + return snapshot_targets + def get_parents(self, commit): """Bogus method to prevent expensive recursion, at the expense of less efficient downloading""" return [] def get_heads(self): return self.heads @staticmethod def _encode_for_storage(objects): return [hashutil.bytehex_to_hash(object) for object in objects] @staticmethod def _decode_from_storage(objects): return set(hashutil.hash_to_bytehex(object) for object in objects) def graph_walker(self): return ObjectStoreGraphWalker(self.get_heads(), self.get_parents) @staticmethod def filter_unwanted_refs(refs): """Filter the unwanted references from refs""" ret = {} for ref, val in refs.items(): if ref.endswith(b'^{}'): # Peeled refs make the git protocol explode continue elif ref.startswith(b'refs/pull/') and ref.endswith(b'/merge'): # We filter-out auto-merged GitHub pull requests continue else: ret[ref] = val return ret def determine_wants(self, refs): """Filter the remote references to figure out which ones Software Heritage needs. """ if not refs: return [] # Find what objects Software Heritage has refs = self.find_remote_ref_types_in_swh(refs) # Cache the objects found in swh as existing heads for target in refs.values(): if target['target_type'] is not None: self.heads.add(target['target']) ret = set() for target in self.filter_unwanted_refs(refs).values(): if target['target_type'] is None: # The target doesn't exist in Software Heritage, let's retrieve # it. 
ret.add(target['target']) return list(ret) def get_stored_objects(self, objects): + if self.ignore_history: + return {} + return self.storage.object_find_by_sha1_git( self._encode_for_storage(objects)) def find_remote_ref_types_in_swh(self, remote_refs): """Parse the remote refs information and list the objects that exist in Software Heritage. """ all_objs = set(remote_refs.values()) - set(self._type_cache) type_by_id = {} for id, objs in self.get_stored_objects(all_objs).items(): id = hashutil.hash_to_bytehex(id) if objs: type_by_id[id] = objs[0]['type'] self._type_cache.update(type_by_id) ret = {} for ref, id in remote_refs.items(): ret[ref] = { 'target': id, 'target_type': self._type_cache.get(id), } return ret class BulkUpdater(SWHStatelessLoader): """A bulk loader for a git repository""" CONFIG_BASE_FILENAME = 'loader/git-updater' ADDITIONAL_CONFIG = { 'pack_size_bytes': ('int', 4 * 1024 * 1024 * 1024), } def __init__(self, repo_representation=SWHRepoRepresentation, config=None): """Initialize the bulk updater. Args: repo_representation: swh's repository representation which is in charge of filtering between known and remote data. """ super().__init__(logging_class='swh.loader.git.BulkLoader', config=config) self.repo_representation = repo_representation def fetch_pack_from_origin(self, origin_url, base_origin_id, base_snapshot, do_activity): """Fetch a pack from the origin""" pack_buffer = BytesIO() - base_repo = self.repo_representation(self.storage, base_origin_id, - base_snapshot) + base_repo = self.repo_representation( + storage=self.storage, + origin_id=base_origin_id, + base_snapshot=base_snapshot, + ignore_history=self.ignore_history, + ) client, path = dulwich.client.get_transport_and_path(origin_url, thin_packs=False) size_limit = self.config['pack_size_bytes'] def do_pack(data, pack_buffer=pack_buffer, limit=size_limit, origin_url=origin_url): cur_size = pack_buffer.tell() would_write = len(data) if cur_size + would_write > limit: raise IOError('Pack file too big for repository %s, ' 'limit is %d bytes, current size is %d, ' 'would write %d' % (origin_url, limit, cur_size, would_write)) pack_buffer.write(data) remote_refs = client.fetch_pack(path, base_repo.determine_wants, base_repo.graph_walker(), do_pack, progress=do_activity) if remote_refs: local_refs = base_repo.find_remote_ref_types_in_swh(remote_refs) else: local_refs = remote_refs = {} pack_buffer.flush() pack_size = pack_buffer.tell() pack_buffer.seek(0) return { 'remote_refs': base_repo.filter_unwanted_refs(remote_refs), 'local_refs': local_refs, 'pack_buffer': pack_buffer, 'pack_size': pack_size, } def list_pack(self, pack_data, pack_size): id_to_type = {} type_to_ids = defaultdict(set) inflater = self.get_inflater() for obj in inflater: type, id = obj.type_name, obj.id id_to_type[id] = type type_to_ids[type].add(id) return id_to_type, type_to_ids - def prepare_origin_visit(self, origin_url, base_url=None): + def prepare_origin_visit(self, origin_url, **kwargs): self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) self.origin = converters.origin_url_to_origin(origin_url) - def prepare(self, origin_url, base_url=None): + def prepare(self, origin_url, base_url=None, ignore_history=False): base_origin_id = origin_id = self.origin_id - prev_snapshot = self.storage.snapshot_get_latest(origin_id) + + prev_snapshot = None + + if not ignore_history: + prev_snapshot = self.storage.snapshot_get_latest(origin_id) if base_url and not prev_snapshot: base_origin = converters.origin_url_to_origin(base_url) 
base_origin = self.storage.origin_get(base_origin) if base_origin: base_origin_id = base_origin['id'] prev_snapshot = self.storage.snapshot_get_latest( base_origin_id ) self.base_snapshot = prev_snapshot self.base_origin_id = base_origin_id + self.ignore_history = ignore_history def fetch_data(self): def do_progress(msg): sys.stderr.buffer.write(msg) sys.stderr.flush() fetch_info = self.fetch_pack_from_origin( self.origin['url'], self.base_origin_id, self.base_snapshot, do_progress) self.pack_buffer = fetch_info['pack_buffer'] self.pack_size = fetch_info['pack_size'] self.remote_refs = fetch_info['remote_refs'] self.local_refs = fetch_info['local_refs'] origin_url = self.origin['url'] self.log.info('Listed %d refs for repo %s' % ( len(self.remote_refs), origin_url), extra={ 'swh_type': 'git_repo_list_refs', 'swh_repo': origin_url, 'swh_num_refs': len(self.remote_refs), }) # We want to load the repository, walk all the objects id_to_type, type_to_ids = self.list_pack(self.pack_buffer, self.pack_size) self.id_to_type = id_to_type self.type_to_ids = type_to_ids def save_data(self): """Store a pack for archival""" write_size = 8192 pack_dir = self.get_save_data_path() pack_name = "%s.pack" % self.visit_date.isoformat() refs_name = "%s.refs" % self.visit_date.isoformat() with open(os.path.join(pack_dir, pack_name), 'xb') as f: self.pack_buffer.seek(0) while True: r = self.pack_buffer.read(write_size) if not r: break f.write(r) self.pack_buffer.seek(0) with open(os.path.join(pack_dir, refs_name), 'xb') as f: pickle.dump(self.remote_refs, f) def get_inflater(self): """Reset the pack buffer and get an object inflater from it""" self.pack_buffer.seek(0) return PackInflater.for_pack_data( PackData.from_file(self.pack_buffer, self.pack_size)) def has_contents(self): return bool(self.type_to_ids[b'blob']) def get_content_ids(self): """Get the content identifiers from the git repository""" for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue yield converters.dulwich_blob_to_content_id(raw_obj) def get_contents(self): """Format the blobs from the git repository as swh contents""" max_content_size = self.config['content_size_limit'] missing_contents = set(self.storage.content_missing( self.get_content_ids(), 'sha1_git')) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'blob': continue if raw_obj.sha().digest() not in missing_contents: continue yield converters.dulwich_blob_to_content( raw_obj, log=self.log, max_content_size=max_content_size, origin_id=self.origin_id) def has_directories(self): return bool(self.type_to_ids[b'tree']) def get_directory_ids(self): """Get the directory identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tree']) def get_directories(self): """Format the trees as swh directories""" missing_dirs = set(self.storage.directory_missing( sorted(self.get_directory_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tree': continue if raw_obj.sha().digest() not in missing_dirs: continue yield converters.dulwich_tree_to_directory(raw_obj, log=self.log) def has_revisions(self): return bool(self.type_to_ids[b'commit']) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'commit']) def get_revisions(self): """Format commits as swh revisions""" missing_revs = set(self.storage.revision_missing( sorted(self.get_revision_ids()))) for raw_obj in self.get_inflater(): if 
raw_obj.type_name != b'commit': continue if raw_obj.sha().digest() not in missing_revs: continue yield converters.dulwich_commit_to_revision(raw_obj, log=self.log) def has_releases(self): return bool(self.type_to_ids[b'tag']) def get_release_ids(self): """Get the release identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b'tag']) def get_releases(self): """Retrieve all the release objects from the git repository""" missing_rels = set(self.storage.release_missing( sorted(self.get_release_ids()))) for raw_obj in self.get_inflater(): if raw_obj.type_name != b'tag': continue if raw_obj.sha().digest() not in missing_rels: continue yield converters.dulwich_tag_to_release(raw_obj, log=self.log) def get_snapshot(self): branches = {} for ref in self.remote_refs: ret_ref = self.local_refs[ref].copy() if not ret_ref['target_type']: target_type = self.id_to_type[ret_ref['target']] ret_ref['target_type'] = converters.DULWICH_TYPES[target_type] ret_ref['target'] = hashutil.bytehex_to_hash(ret_ref['target']) branches[ref] = ret_ref self.snapshot = converters.branches_to_snapshot(branches) return self.snapshot def get_fetch_history_result(self): return { 'contents': len(self.type_to_ids[b'blob']), 'directories': len(self.type_to_ids[b'tree']), 'revisions': len(self.type_to_ids[b'commit']), 'releases': len(self.type_to_ids[b'tag']), } def load_status(self): """The load was eventful if the current snapshot is different to the one we retrieved at the beginning of the run""" eventful = False if self.base_snapshot: eventful = self.snapshot['id'] != self.base_snapshot['id'] else: eventful = bool(self.snapshot['branches']) return {'status': ('eventful' if eventful else 'uneventful')} if __name__ == '__main__': import click logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(process)d %(message)s' ) @click.command() - @click.option('--origin-url', help='Origin url') + @click.option('--origin-url', help='Origin url', required=True) @click.option('--base-url', default=None, help='Optional Base url') - def main(origin_url, base_url): - return BulkUpdater().load(origin_url, base_url=base_url) + @click.option('--ignore-history/--no-ignore-history', + help='Ignore the repository history', default=False) + def main(origin_url, base_url, ignore_history): + return BulkUpdater().load( + origin_url, + base_url=base_url, + ignore_history=ignore_history, + ) main() diff --git a/version.txt b/version.txt index c5fe5c4..b868166 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.39-0-geadefcb \ No newline at end of file +v0.0.40-0-g7156c46 \ No newline at end of file
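Closing note on the recurring code change: converters.py and reader.py replace `hashutil.hash_data()` with `MultiHash`. A minimal sketch of the new call pattern used by `dulwich_blob_to_content_id()` and `GitSha1RemoteReader.get_id_and_data()`; the blob payload is a made-up example.
```
# New hashing pattern: MultiHash.from_data(...).digest() returns a dict of
# raw digests for the requested algorithms.
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex

HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {'sha1_git'}  # as in converters.py

data = b'example blob payload'  # placeholder content
hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest()
print({algo: hash_to_hex(digest) for algo, digest in hashes.items()})
```
Separately, the new `--ignore-history/--no-ignore-history` option on `python3 -m swh.loader.git.updater` makes the updater skip the previously stored snapshot heads (`SWHRepoRepresentation` starts with an empty head set and `get_stored_objects()` returns `{}`), so a full pack is fetched from the origin.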