diff --git a/Makefile b/Makefile index 02f9793..a06e171 100644 --- a/Makefile +++ b/Makefile @@ -1,154 +1,157 @@ FLAKE = flake8 BINDIR = bin SRCDIR = swh REPO_PATH=../debsources # add -v for example FLAG= NOSE = nosetests3 TESTFLAGS = -s TESTDIR = ./swh/tests DB=softwareheritage-dev DB_TEST=$(DB)-test SWH_LOADER=$(BINDIR)/swh-git-loader SWH_DB_MANAGER=$(BINDIR)/swh-db-manager SWH_BACK=$(BINDIR)/swh-backend # could use cProfile PROFILE_TYPE=profile FOLLOW_LOG=-f +# Adapt python-path to use other modules +_PYPATH=`pwd`:`pwd`/../swh-core + deps: apt-get install -y \ python3 \ python3-pygit2 \ python3-psycopg2 \ python3-nose \ python3-flask \ python3-requests \ python3-retrying \ ipython3 clean: rm -rf /tmp/swh-git-loader/content-storage cleandb: clean - PYTHONPATH=`pwd` $(SWH_DB_MANAGER) $(FLAG) cleandb + PYTHONPATH=$(_PYPATH) $(SWH_DB_MANAGER) $(FLAG) cleandb run-remote: - PYTHONPATH=`pwd` $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH) + PYTHONPATH=`pwd`:`pwd`/../swh-core $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH) run-local: - PYTHONPATH=`pwd` $(SWH_LOADER) $(FLAG) --config ./resources/local-git-loader.ini load $(REPO_PATH) + PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/local-git-loader.ini load $(REPO_PATH) run: # works with the default ~/.config/swh/git-loader.ini file - PYTHONPATH=`pwd` $(SWH_LOADER) $(FLAG) load $(REPO_PATH) + PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) load $(REPO_PATH) run-back: - PYTHONPATH=`pwd` $(SWH_BACK) $(FLAG) + PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG) check: $(FLAKE) $(BINDIR) $(SRCDIR) profile-run: - PYTHONPATH=`pwd` python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py + PYTHONPATH=$(_PYPATH) python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py profile-stats: - PYTHONPATH=`pwd` ./scratch/analyse-profile.py + PYTHONPATH=$(_PYPATH) ./scratch/analyse-profile.py test-run-back: - PYTHONPATH=`pwd` $(SWH_BACK) $(FLAG) --config ./resources/test/back.ini + PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG) --config ./resources/test/back.ini test: - $(NOSE) $(TESTFLAGS) $(TESTDIR) + PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR) + +test-remote-loader: + PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_remote_loader.py + +test-local-loader: + PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_local_loader.py test-http: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_http.py test-swhmap: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhmap.py -test-remote-loader: - $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_remote_loader.py - -test-local-loader: - $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_local_loader.py - test-api: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api*.py test-api-post-per-type: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_post_*.py test-api-content: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_content.py test-api-directory: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_directory.py test-api-revision: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_revision.py test-api-release: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_release.py test-api-occurrence: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_occurrence.py test-api-home: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_home.py test-api-origin: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_origin.py test-api-person: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_person.py test-api-pickle: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_pickle.py test-file: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_file.py connect-db: psql -d $(DB) create-db: cd ../swh-sql && make clean initdb drop-db: cd ../swh-sql && make clean dropdb test-connect-db: psql -d $(DB_TEST) test-create-db: cd ../swh-sql && make clean initdb DBNAME=$(DB_TEST) test-drop-db: cd ../swh-sql && make clean dropdb DBNAME=$(DB_TEST) check-meta: @echo "Repository: $(REPO_PATH)" @echo "Git metadata:" @$(BINDIR)/dir-git-repo-meta.sh $(REPO_PATH) @echo @echo "DB metadata:" @$(BINDIR)/db-git-repo-meta.sh $(DB) $(REPO_PATH) @echo log-loader: tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/sgloader.log log-back: tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/back.log coverage: - $(NOSE) --with-coverage $(SRCDIR) -v --cover-package=$(SRCDIR) + PYTHONPATH=$(_PYPATH) $(NOSE) --with-coverage $(SRCDIR) -v --cover-package=$(SRCDIR) diff --git a/swh/gitloader/git.py b/swh/gitloader/git.py index f1183a1..1c95a16 100644 --- a/swh/gitloader/git.py +++ b/swh/gitloader/git.py @@ -1,180 +1,182 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import pygit2 import time from datetime import datetime from pygit2 import GIT_REF_OID from pygit2 import GIT_OBJ_COMMIT, GIT_OBJ_TREE, GIT_SORT_TOPOLOGICAL from enum import Enum -from swh import hash +from swh.core import hashutil +#from swh import hash from swh.data import swhrepo class DirectoryTypeEntry(Enum): """Types of git objects. """ file = 'file' directory = 'directory' def date_format(d): """d is expected to be a datetime object. """ return time.strftime("%a, %d %b %Y %H:%M:%S +0000", d.timetuple()) def now(): """Cheat time values.""" return date_format(datetime.utcnow()) def timestamp_to_string(timestamp): """Convert a timestamps to string. """ return date_format(datetime.utcfromtimestamp(timestamp)) def parse(repo_path): """Given a repository path, parse and return a memory model of such repository.""" def read_signature(signature): return '%s <%s>' % (signature.name, signature.email) def treewalk(repo, tree): """Walk a tree with the same implementation as `os.path`. Returns: tree, trees, blobs """ trees, blobs, dir_entries = [], [], [] for tree_entry in tree: obj = repo.get(tree_entry.oid) if obj is None: logging.warn('skip submodule-commit %s' % tree_entry.hex) continue # submodule! if obj.type == GIT_OBJ_TREE: logging.debug('found tree %s' % tree_entry.hex) nature = DirectoryTypeEntry.directory.value trees.append(tree_entry) else: logging.debug('found content %s' % tree_entry.hex) data = obj.data nature = DirectoryTypeEntry.file.value + hashes = hashutil.hashdata(data, algorithms=['sha1', 'sha256']) blobs.append({'sha1': obj.hex, - 'content-sha1': hash.hash1(data).hexdigest(), - 'content-sha256': hash.hash256(data).hexdigest(), + 'content-sha1': hashes['sha1'], + 'content-sha256': hashes['sha256'], 'content': data, # FIXME: add pointer to data on disk? 'size': obj.size}) logging.debug('(name: %s, tgt: %s, nat: %s, perms: %s, parent: %s) ' % # noqa (tree_entry.name, obj.hex, nature, tree_entry.filemode, tree.hex)) dir_entries.append({'name': tree_entry.name, 'target-sha1': obj.hex, 'nature': nature, 'perms': tree_entry.filemode, 'atime': now(), # FIXME: use real data 'mtime': now(), # FIXME: use real data 'ctime': now(), # FIXME: use real data 'parent': tree.hex}) yield tree, dir_entries, trees, blobs for tree_entry in trees: for x in treewalk(repo, repo[tree_entry.oid]): yield x def walk_tree(repo, swh_repo, rev): """Walk the rev revision's directories. """ if swh_repo.already_visited(rev.hex): logging.debug('commit %s already visited, skipped' % rev.hex) return swh_repo for dir_root, dir_entries, _, contents_ref in treewalk(repo, rev.tree): for content_ref in contents_ref: swh_repo.add_content(content_ref) swh_repo.add_directory({'sha1': dir_root.hex, 'content': dir_root.read_raw(), # FIXME: add pointer to data on disk? 'entries': dir_entries}) revision_parent_sha1s = list(map(str, rev.parent_ids)) author = {'name': rev.author.name, 'email': rev.author.email} committer = {'name': rev.committer.name, 'email': rev.committer.email} swh_repo.add_revision({'sha1': rev.hex, 'content': rev.read_raw(), # FIXME: add pointer to data on disk? 'date': timestamp_to_string(rev.commit_time), 'directory': rev.tree.hex, 'message': rev.message, 'committer': committer, 'author': author, 'parent-sha1s': revision_parent_sha1s }) swh_repo.add_person(read_signature(rev.author), author) swh_repo.add_person(read_signature(rev.committer), committer) return swh_repo def walk_revision_from(repo, swh_repo, head_rev): """Walk the rev history log from head_rev. - repo is the current repository - rev is the latest rev to start from. """ for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL): swh_repo = walk_tree(repo, swh_repo, rev) return swh_repo repo = pygit2.Repository(repo_path) # memory model swh_repo = swhrepo.SWHRepo() # add origin origin = {'type': 'git', 'url': 'file://' + repo.path} swh_repo.add_origin(origin) # add references and crawl them for ref_name in repo.listall_references(): logging.info('walk reference %s' % ref_name) ref = repo.lookup_reference(ref_name) head_rev = repo[ref.target] \ if ref.type is GIT_REF_OID \ else ref.peel(GIT_OBJ_COMMIT) # noqa if isinstance(head_rev, pygit2.Tag): head_start = head_rev.get_object() taggerSig = head_rev.tagger author = {'name': taggerSig.name, 'email': taggerSig.email} release = {'sha1': head_rev.hex, 'content': head_rev.read_raw(), # FIXME: add pointer to data on disk? 'revision': head_rev.target.hex, 'name': ref_name, 'date': now(), # FIXME: find the tag's date, 'author': author, 'comment': head_rev.message} swh_repo.add_release(release) swh_repo.add_person(read_signature(taggerSig), author) else: swh_repo.add_occurrence({'sha1': head_rev.hex, 'reference': ref_name, 'url-origin': origin['url']}) head_start = head_rev # crawl commits and trees walk_revision_from(repo, swh_repo, head_start) return swh_repo diff --git a/swh/hash.py b/swh/hash.py deleted file mode 100644 index a0103c4..0000000 --- a/swh/hash.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import hashlib -import binascii - - -def sha1_bin(hexsha1): - """Compute the sha1's binary format from an hexadecimal format string. - """ - return binascii.unhexlify(hexsha1) - - -def sha1_hex(binsha1): - """Compute the sha1's binary format from an hexadecimal format string. - """ - return binascii.hexlify(binsha1) - - -def hash1(data): - """Given some data, compute the hash ready object of such data. - Return the reference object but not the computation. - """ - sha1 = hashlib.sha1() - sha1.update(data) - return sha1 - - -def hash256(data): - """Given some data, compute the hash ready object of such data. - Return the reference object but not the computation. - """ - sha2 = hashlib.sha256() - sha2.update(data) - return sha2 - - -def blob_sha1(blob_data): - """Compute the sha1 of the blob's data. - blob_data is the blob's data uncompressed. - """ - return sha1('blob', blob_data) - - -def sha1(type, data): - """Compute the sha1 of a data. - `type` must be git compliant: tree, blob, commit, tag. - `data` must be uncompressed adequate data for the corresponding type. - - Inspired by pygit2's test utils code. - https://github.com/libgit2/pygit2/blob/74b81bf18076555fb12369d5f20e4282214116d3/test/utils.py#L50-L56 - http://stackoverflow.com/questions/552659/assigning-git-sha1s-without-git - """ - git_format_data = ('%s %d\0%s' % (type, len(data), data)).encode() - - return hash1(git_format_data) diff --git a/swh/tests/test_hash.py b/swh/tests/test_hash.py deleted file mode 100644 index 52875be..0000000 --- a/swh/tests/test_hash.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh import hash - - -@attr('slow') -class TestHash(unittest.TestCase): - @istest - def compute_sha1_data(self): - # given - # when - actual_sha1 = hash.hash1(b'some data') - - # then - self.assertEquals( - actual_sha1.hexdigest(), - 'baf34551fecb48acc3da868eb85e1b6dac9de356', - "Result should be the result of `echo -n 'some data' | sha1sum`") - - @istest - def compute_sha1_data2(self): - # given - # when - actual_sha1 = hash.hash1(b'some other data') - - # then - self.assertEquals( - actual_sha1.hexdigest(), - '7bd8e7cb8e1e8b7b2e94b472422512935c9d4519', - """Result should be the result of - `echo -n 'some other data' | sha1sum`""") - - @istest - def compute_blob_sha1(self): - # given - # when - sha1 = hash.blob_sha1('some blob data') - sha1hex = sha1.hexdigest() - - # then - self.assertEquals(sha1hex, - '895018df42621eed0b1e42fd9b8fa12d8f534f38', - """Result should be the result of - `echo -en 'blob 14\0some blob data' | sha1sum`""") - - @istest - def compute_blob_sha1_2(self): - # given - # when - sha1 = hash.blob_sha1('some other blob data') - sha1hex = sha1.hexdigest() - - # then - self.assertEquals( - sha1hex, - '25388c1e2102249d5b8dd33dd9bb4a0e1cc95e62', - """Result should be the result of - `echo -en 'blob 20\0some other blob data' | sha1sum`""")