# -*- makefile -*-
#
# Makefile.tests: helper targets for the swh-loader-git test suite.
#
# DB, NOSE, BINDIR, SWH_BACK, FLAG and FOLLOW_LOG are not defined in this
# file; presumably they come from the including Makefile or from the command
# line -- TODO confirm.

NOSEFLAGS=--nologcapture -v
# Recursive (=) assignment: $(DB) is expanded when DB_TEST is used, not when
# this file is parsed.
DB_TEST=$(DB)-test
TESTDIR = ./swh/loader/git/tests

# Every target below is a command, not a file to build.
.PHONY: test-connect-db test-create-db test-drop-db test-clean-db \
        test-clean test-prepare test-log-back test-check-meta test-run-back \
        test-http test-swhrepo test-api test-api-post-per-type \
        test-api-content test-api-directory test-api-revision \
        test-api-release test-api-occurrence test-api-home test-api-origin \
        test-api-person test-date test-remote-loader test-local-loader \
        test-loaders

# --- test database helpers -------------------------------------------------

test-connect-db:
	psql $(DB_TEST)

# $(MAKE) instead of a literal `make` so -n, -j and command-line variables
# are propagated to the sub-make.
test-create-db:
	$(MAKE) create-db DB=$(DB_TEST)

test-drop-db:
	$(MAKE) drop-db DB=$(DB_TEST)

test-clean-db:
	$(MAKE) clean-db SWH_DB_MANAGER_CONFIG=./resources/test/db-manager.ini

# --- scratch directory and logs --------------------------------------------

test-clean:
	rm -rf /tmp/swh-loader-git/test/

test-prepare:
	mkdir -p /tmp/swh-loader-git/test/

test-log-back:
	tail $(FOLLOW_LOG) /tmp/swh-loader-git/test/log/back.log

test-check-meta:
	@echo "DB $(DB_TEST) metadata:"
	@$(BINDIR)/db-git-repo-meta.sh $(DB_TEST)
	@echo

test-run-back:
	$(SWH_BACK) $(FLAG) --config ./resources/test/back.ini

# --- nose test suites ------------------------------------------------------

test-http:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_http.py

test-swhrepo:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_swhrepo.py

test-api:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api*.py

test-api-post-per-type:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_post_*.py

test-api-content:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_content.py

test-api-directory:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_directory.py

test-api-revision:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_revision.py

test-api-release:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_release.py

test-api-occurrence:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_occurrence.py

test-api-home:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_home.py

test-api-origin:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_origin.py

test-api-person:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_person.py

# Replaces the former test-file target (test_file.py -> test_date.py).
test-date:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_date.py

test-remote-loader:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_remote_loader.py

test-local-loader:
	$(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_local_loader.py

test-loaders: test-local-loader test-remote-loader
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Timezone-aware date helpers for the git loader (swh/loader/git/date.py)."""

from datetime import timedelta, datetime, tzinfo


class FixedOffset(tzinfo):
    """Fixed offset in minutes east from UTC, without daylight saving time.

    Args:
        offset: offset from UTC in minutes; positive values are east of UTC,
            negative values west of it.
        name: label returned by tzname(); may be None.

    Both arguments have defaults so that the class can be instantiated
    without arguments. tzinfo's copy/pickle support rebuilds an instance by
    calling the class with no arguments and then restoring the saved
    attributes, so copy.deepcopy() of a datetime carrying a FixedOffset only
    works when the constructor accepts that call.

    """

    def __init__(self, offset=0, name=None):
        self.__offset = timedelta(minutes=offset)
        self.__name = name

    def utcoffset(self, dt):
        """Return the fixed offset as a timedelta; dt is ignored."""
        return self.__offset

    def tzname(self, dt):
        """Return the name given at construction time; dt is ignored."""
        return self.__name

    def dst(self, dt):
        """Return a zero timedelta: this zone never applies DST."""
        return timedelta(0)


def ts_to_datetime(timestamp, offset):
    """Convert a timestamp to a timezone-aware datetime.

    Args:
        timestamp: POSIX timestamp (seconds since the epoch).
        offset: offset of the target timezone in minutes east of UTC.

    Returns:
        A datetime for that instant, expressed in a FixedOffset timezone
        named 'swh' with the given offset.

    """
    return datetime.fromtimestamp(timestamp, tz=FixedOffset(offset, 'swh'))
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Parse a local git repository into an in-memory model
(swh/loader/git/git.py).

Dates are produced by swh.loader.git.date; the former local helpers
date_format(), now() and timestamp_to_string() no longer exist here.
"""

import glob
import logging
import os
import subprocess
import time  # NOTE(review): looks unused in this module -- confirm

import pygit2

from pygit2 import GIT_REF_OID
from pygit2 import GIT_OBJ_COMMIT, GIT_OBJ_TREE, GIT_SORT_TOPOLOGICAL
from enum import Enum  # NOTE(review): looks unused in this module -- confirm

from swh.core import hashutil
from swh.loader.git import date
from swh.loader.git.data import swhrepo
from swh.loader.git.storage import storage


def list_objects_from_packfile_index(packfile_index):
    """List the objects indexed by this packfile.

    Args:
        packfile_index: path to a packfile index (.idx) file.

    Yields:
        For every line printed by `git show-index`, its second
        whitespace-separated field (the object id), as a str.

    """
    # Both the index file and the child process are released when the
    # generator is exhausted or closed.
    with open(packfile_index, 'rb') as input_file, \
            subprocess.Popen(
                ['/usr/bin/git', 'show-index'],
                stdin=input_file,
                stdout=subprocess.PIPE,
            ) as process:
        for line in process.stdout:
            yield line.decode('utf-8', 'ignore').split()[1]


def list_objects(repo):
    """List the objects in a given repository.

    Args:
        repo: repository object; only its `path` attribute is read, and the
            `objects` directory is looked up directly below that path.

    Yields:
        Object ids: first those listed by every `*.idx` file found in
        `objects/pack`, then the loose objects, whose id is rebuilt from the
        2-character directory name and the 38-character file name.

    """
    objects_dir = os.path.join(repo.path, 'objects')
    objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38)
    packfile_dir = os.path.join(objects_dir, 'pack')

    if os.path.isdir(packfile_dir):
        for packfile_index in os.listdir(packfile_dir):
            if not packfile_index.endswith('.idx'):
                # Not an index file
                continue
            packfile_index_path = os.path.join(packfile_dir, packfile_index)
            yield from list_objects_from_packfile_index(packfile_index_path)

    for object_file in glob.glob(objects_glob):
        yield ''.join(object_file.split(os.path.sep)[-2:])


# Hash algorithms computed for every content (blob) found while parsing.
HASH_ALGORITHMS = ['sha1', 'sha256']


def parse(repo_path):
    """Given a repository path, parse and return a memory model of such
    repository.

    Args:
        repo_path: path handed to pygit2.Repository.

    Returns:
        The swhrepo.SWHRepo instance filled with the origin and, for every
        reference of the repository, the releases/occurrences, revisions,
        directories, contents and persons reachable from it.

    """
    def read_signature(signature):
        """Format a signature as 'name <email>'."""
        return '%s <%s>' % (signature.name, signature.email)

    def treewalk(repo, tree):
        """Walk a tree top-down, one yield per directory (the original
        docstring compares this to `os.path`; presumably `os.walk` was
        meant).

        Entries already recorded in the enclosing `swh_repo` are skipped.

        Yields:
            (tree, dir_entry_dirs, dir_entry_files, trees, contents) for
            `tree` itself, then the same tuples for every sub-tree,
            recursively.

        """
        trees, contents, dir_entry_dirs, dir_entry_files = [], [], [], []
        for tree_entry in tree:
            # `swh_repo` is the memory model created further down in parse().
            if swh_repo.already_visited(tree_entry.hex):
                logging.debug('tree_entry %s already visited, skipped',
                              tree_entry.hex)
                continue

            obj = repo.get(tree_entry.oid)
            if obj is None:  # or obj.type == GIT_OBJ_COMMIT:
                # Object not available in this repository: presumably a
                # submodule commit.
                logging.warning('skip submodule-commit %s', tree_entry.hex)
                continue  # submodule!

            dir_entry = {'name': tree_entry.name,
                         'type': storage.Type.directory_entry,
                         'target-sha1': obj.hex,
                         'perms': tree_entry.filemode,
                         'atime': None,
                         'mtime': None,
                         'ctime': None}

            if obj.type == GIT_OBJ_TREE:
                logging.debug('found tree %s', tree_entry.hex)
                trees.append(tree_entry)
                dir_entry_dirs.append(dir_entry)
            else:
                logging.debug('found content %s', tree_entry.hex)
                data = obj.data
                hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
                contents.append({'id': hashes['sha1'],
                                 'type': storage.Type.content,
                                 'git-sha1': obj.hex,
                                 'content-sha256': hashes['sha256'],
                                 'content': data,
                                 'size': obj.size})
                dir_entry_files.append(dir_entry)

        # The current directory is yielded before descending, so the caller
        # records it before its sub-directories are walked.
        yield tree, dir_entry_dirs, dir_entry_files, trees, contents
        for tree_entry in trees:
            yield from treewalk(repo, repo[tree_entry.oid])

    def walk_tree(repo, swh_repo, rev):
        """Walk the rev revision's directories.

        Adds to swh_repo the contents and directories of rev's tree, the
        revision itself and its author/committer, then returns swh_repo.

        """
        for dir_root, dir_entry_dirs, dir_entry_files, _, contents_ref \
                in treewalk(repo, rev.tree):
            for content_ref in contents_ref:
                swh_repo.add_content(content_ref)

            swh_repo.add_directory({'id': dir_root.hex,
                                    'type': storage.Type.directory,
                                    'entry-dirs': dir_entry_dirs,
                                    'entry-files': dir_entry_files})

        revision_parent_sha1s = list(map(str, rev.parent_ids))

        author = {'name': rev.author.name,
                  'email': rev.author.email,
                  'type': storage.Type.person}
        committer = {'name': rev.committer.name,
                     'email': rev.committer.email,
                     'type': storage.Type.person}

        swh_repo.add_revision({'id': rev.hex,
                               'type': storage.Type.revision,
                               'date': date.ts_to_datetime(
                                   rev.commit_time,
                                   rev.commit_time_offset),
                               'author_date': date.ts_to_datetime(
                                   rev.author.time,
                                   rev.author.offset),
                               'directory': rev.tree.hex,
                               'message': rev.message,
                               'committer': committer,
                               'author': author,
                               'parent-sha1s': revision_parent_sha1s
                               })

        swh_repo.add_person(read_signature(rev.author), author)
        swh_repo.add_person(read_signature(rev.committer), committer)

        return swh_repo

    def walk_revision_from(repo, swh_repo, head_rev):
        """Walk the rev history log from head_rev.
        - repo is the current repository
        - swh_repo is the memory model being filled
        - head_rev is the latest rev to start from.

        Revisions already recorded in swh_repo are skipped.

        """
        for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL):
            sha1 = rev.hex
            if swh_repo.already_visited(sha1):
                logging.debug('commit %s already visited, skipped', sha1)
                continue

            swh_repo = walk_tree(repo, swh_repo, rev)

        return swh_repo

    repo = pygit2.Repository(repo_path)
    # memory model
    swh_repo = swhrepo.SWHRepo()
    # add origin
    origin = {'type': 'git',
              'url': 'file://' + repo.path}
    swh_repo.add_origin(origin)
    # add references and crawl them
    for ref_name in repo.listall_references():
        logging.info('walk reference %s', ref_name)
        ref = repo.lookup_reference(ref_name)

        # Compare by value: identity (`is`) on the constant only works by
        # accident of the interpreter.
        if ref.type == GIT_REF_OID:
            head_rev = repo[ref.target]
        else:
            head_rev = ref.peel(GIT_OBJ_COMMIT)

        if isinstance(head_rev, pygit2.Tag):
            # Tag object: record a release and start the crawl from the
            # object the tag points to.
            head_start = head_rev.get_object()
            taggerSig = head_rev.tagger
            author = {'name': taggerSig.name,
                      'email': taggerSig.email,
                      'type': storage.Type.person}
            release = {'id': head_rev.hex,
                       'type': storage.Type.release,
                       'revision': head_rev.target.hex,
                       'name': ref_name,
                       'date': date.ts_to_datetime(taggerSig.time,
                                                   taggerSig.offset),
                       'author': author,
                       'comment': head_rev.message}

            swh_repo.add_release(release)
            swh_repo.add_person(read_signature(taggerSig), author)
        else:
            swh_repo.add_occurrence({'id': head_rev.hex,
                                     'revision': head_rev.hex,
                                     'branch': ref_name,
                                     'url-origin': origin['url'],
                                     'type': storage.Type.occurrence})
            head_start = head_rev

        # crawl commits and trees
        walk_revision_from(repo, swh_repo, head_start)

    return swh_repo
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Tests for swh.loader.git.date (swh/loader/git/tests/test_date.py)."""

import unittest

from nose.tools import istest

from swh.loader.git import date


class DateTestCase(unittest.TestCase):
    """Check that date.ts_to_datetime applies the requested UTC offset."""

    # Instant shared by every check below: 2015-06-16 10:07:34 UTC.
    TIMESTAMP = 1434449254

    # NOTE(review): the name `new_swhrepo` looks copied from another test
    # module; it is kept so existing test selectors keep working.
    @istest
    def new_swhrepo(self):
        # (offset in minutes east of UTC, expected str() of the result)
        cases = [
            (120, '2015-06-16 12:07:34+02:00'),
            (60, '2015-06-16 11:07:34+01:00'),
            (0, '2015-06-16 10:07:34+00:00'),
        ]
        for offset, expected in cases:
            # when
            actual = date.ts_to_datetime(self.TIMESTAMP, offset)

            # then
            self.assertEqual(str(actual), expected)

    @istest
    def ts_to_datetime_west_of_utc(self):
        # when
        actual = date.ts_to_datetime(self.TIMESTAMP, -300)

        # then
        self.assertEqual(str(actual), '2015-06-16 05:07:34-05:00')