def _check(self, obj):
    """Check the object's repository representation.

    Runs the object's own ``check()`` and then, for commits and tags,
    verifies that their timestamps can be converted to a datetime.

    Args:
        obj (object): Dulwich object read from the repository.

    Raises:
        ObjectFormatException: if a commit/author/tag time fails the
            date check. Errors raised by ``obj.check()`` itself are
            propagated untouched.

    """
    # Function-scope import, as in the original code; kept local so this
    # block does not depend on a change to the module-level imports.
    from dulwich.objects import Commit, Tag

    obj.check()
    try:
        # Additional checks on dulwich objects carrying dates;
        # for now, only the *time fields are checked.
        if isinstance(obj, Commit):
            utils.check_date_time(obj._commit_time)
            utils.check_date_time(obj._author_time)
        elif isinstance(obj, Tag):
            utils.check_date_time(obj._tag_time)
    except Exception as e:
        # Deliberately broad: any failure while validating the dates is
        # reported as a malformed object so get_object() can skip it.
        # Chain the cause so the original error stays in the traceback.
        raise ObjectFormatException(e) from e


def get_object(self, oid):
    """Given an object id, return the object if it is found and not
    malformed in some way.

    Args:
        oid (bytes): the object's identifier

    Returns:
        The object if found without malformation, None otherwise (a
        warning is logged in that case).

    """
    try:
        # some errors are raised when reading the object
        obj = self.repo[oid]
        # some we need to check ourselves
        self._check(obj)
    except KeyError:
        # Logger.warn is a deprecated alias; use warning() and let the
        # logger format the message lazily.
        self.log.warning('object %s not found, skipping',
                         oid.decode('utf-8'))
        return None
    except ObjectFormatException:
        self.log.warning('object %s malformed, skipping',
                         oid.decode('utf-8'))
        return None
    else:
        return obj
def get_directory_ids(self):
    """Get the directory identifiers from the git repository.

    Returns:
        A generator over the result of ``hashutil.hash_to_bytes`` applied
        to each decoded id recorded under ``b'tree'`` in
        ``self.type_to_ids``.

    """
    # `oid` rather than `id`: avoids shadowing the builtin and matches the
    # naming used by the other methods of the loader.
    return (hashutil.hash_to_bytes(oid.decode())
            for oid in self.type_to_ids[b'tree'])
def cleanup(self):
    """Cleanup the temporary location (if it exists).

    Safe to call even when ``prepare()`` failed before setting
    ``temp_dir``, ``repo_path`` or ``origin_url``: missing attributes are
    treated as None instead of raising AttributeError.

    """
    temp_dir = getattr(self, 'temp_dir', None)
    if temp_dir and os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    # Lazy %-style arguments: the logger formats the message only if the
    # record is actually emitted.
    self.log.info('Project %s - Done injecting %s',
                  getattr(self, 'origin_url', None),
                  getattr(self, 'repo_path', None))
def check_date_time(timestamp):
    """Check date time for overflow errors.

    Args:
        timestamp (int, float or None): Timestamp in seconds since the
            epoch. None means "no date" and is accepted as is.

    Returns:
        None. The function is only called for its validation side
        effect.

    Raises:
        ValueError, OverflowError or OSError: any error raised by the
            ``datetime.datetime.fromtimestamp`` conversion.

    """
    # Only a missing value is skipped; 0 is a legitimate timestamp (the
    # epoch) and goes through the same conversion as any other value.
    if timestamp is None:
        return None
    datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)