diff --git a/swh/gitloader/loader.py b/swh/gitloader/loader.py index c76cb56..d222ee0 100644 --- a/swh/gitloader/loader.py +++ b/swh/gitloader/loader.py @@ -1,53 +1,53 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import os from swh.gitloader import git, remote_store, local_store _load_to_back_fn = {'remote': remote_store.load_to_back - ,'local': local_store.load_to_back + ,'local': local_store.prepare_and_load_to_back } def check_user_conf(conf): """Check the user's configuration and rejects if problems. """ action = conf['action'] if action != 'load': return 'skip unknown action %s' % action backend_type = conf['backend-type'] if backend_type not in _load_to_back_fn: return 'skip unknown backend-type %s (only `remote`, `local` supported)' % backend_type repo_path = conf['repo_path'] if not os.path.exists(repo_path): return 'Repository %s does not exist.' % repo_path return None def load(conf): """According to action, load the repo_path. used configuration keys: - action: requested action - repo_path: git repository path ('load' action only) - backend-type: backend access's type (remote or local) - backend: url access to backend api """ error_msg = check_user_conf(conf) if error_msg: logging.error(error_msg) raise Exception(error_msg) repo_path = conf['repo_path'] logging.info('load repo_path %s' % repo_path) swhrepo = git.parse(repo_path) _load_to_back_fn[conf['backend-type']](conf['backend'], swhrepo) diff --git a/swh/gitloader/local_store.py b/swh/gitloader/local_store.py index f3a742f..badc980 100644 --- a/swh/gitloader/local_store.py +++ b/swh/gitloader/local_store.py @@ -1,86 +1,91 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.storage import store, db, service from swh.conf import reader # FIXME: duplicated from bin/swh-backend... # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/back.ini' # default configuration DEFAULT_CONF = { 'content_storage_dir': ('string', '/tmp/swh-git-loader/content-storage'), 'log_dir': ('string', '/tmp/swh-git-loader/log'), 'db_url': ('string', 'dbname=softwareheritage-dev'), 'storage_compression': ('bool', None), 'folder_depth': ('int', 4), 'debug': ('bool', None), 'port': ('int', 5000) } def store_only_new(db_conn, conf, obj_type, obj): """Store object if not already present. """ obj.update({'type': obj_type}) if not store.find(db_conn, obj): store.add(db_conn, conf, obj) _obj_to_persist_fn = {store.Type.revision: service.add_revisions} def store_unknown_objects(db_conn, conf, obj_type, swhmap): """Load objects to the backend. """ sha1s = swhmap.keys() # have: filter unknown obj unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s) if not unknown_obj_sha1s: return True # seen: now store in backend persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects) obj_fulls = map(swhmap.get, unknown_obj_sha1s) return persist_fn(db_conn, conf, obj_type, obj_fulls) -def load_to_back(backend_setup_file, swhrepo): - """Load to the backend the repository swhrepo. +def load_to_back(conf, swh_repo): + """Load to the backend the repository swh_repo. """ - # Read the configuration file (no check yet) - conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF) - with db.connect(conf['db_url']) as db_conn: # First, store/retrieve the origin identifier # FIXME: should be done by the cloner worker (which is not yet plugged # on the right swh db ftm) - service.add_origin(db_conn, swhrepo.get_origin()) + service.add_origin(db_conn, swh_repo.get_origin()) # First reference all unknown persons service.add_persons(db_conn, conf, store.Type.person, - swhrepo.get_persons()) + swh_repo.get_persons()) res = store_unknown_objects(db_conn, conf, store.Type.content, - swhrepo.get_contents()) + swh_repo.get_contents()) if res: res = store_unknown_objects(db_conn, conf, store.Type.directory, - swhrepo.get_directories()) + swh_repo.get_directories()) if res: res = store_unknown_objects(db_conn, conf, store.Type.revision, - swhrepo.get_revisions()) + swh_repo.get_revisions()) if res: # brutally send all remaining occurrences service.add_objects(db_conn, conf, store.Type.occurrence, - swhrepo.get_occurrences()) + swh_repo.get_occurrences()) # and releases (the idea here is that compared to existing # objects, the quantity is less) service.add_objects(db_conn, conf, store.Type.release, - swhrepo.get_releases()) + swh_repo.get_releases()) + + +def prepare_and_load_to_back(backend_setup_file, swh_repo): + # Read the configuration file (no check yet) + conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF) + reader.prepare_folders(conf['content_storage_dir']) + load_to_back(conf, swh_repo) + diff --git a/swh/tests/test_local_loader.py b/swh/tests/test_local_loader.py index 0e14561..4c017a9 100644 --- a/swh/tests/test_local_loader.py +++ b/swh/tests/test_local_loader.py @@ -1,229 +1,231 @@ # coding: utf-8 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pygit2 import tempfile import shutil from nose.plugins.attrib import attr from nose.tools import istest from swh.storage import db, models from swh.gitloader import loader from swh.conf import reader import test_initdb from test_git_utils import create_commit_with_content, create_tag @attr('slow') class TestRemoteLoader(unittest.TestCase): def setUp(self): """Initialize a git repository for the remaining test to manipulate. """ tmp_git_folder_path = tempfile.mkdtemp(prefix='test-sgloader.', dir='/tmp') self.tmp_git_repo = pygit2.init_repository(tmp_git_folder_path) conf_back = reader.read('./resources/test/back.ini', {'port': ('int', 9999)}) self.db_url = conf_back['db_url'] + self.content_storage_dir = conf_back['content_storage_dir'] self.conf = { 'action': 'load', 'repo_path': self.tmp_git_repo.workdir, 'backend-type': 'local', 'backend': './resources/test/back.ini' } test_initdb.prepare_db(self.db_url) def tearDown(self): """Destroy the test git repository. """ shutil.rmtree(self.tmp_git_repo.workdir) + shutil.rmtree(self.content_storage_dir, ignore_errors=True) @istest def should_fail_on_bad_action(self): # when try: loader.load({'action': 'unknown'}) except: pass @istest def should_fail_on_inexistant_folder(self): # when try: loader.load({'action': 'load', 'repo_path': 'something-that-definitely-does-not-exist'}) except: pass @istest def should_fail_on_inexistant_backend_type(self): # when try: loader.load({'action': 'load', 'repo_path': '.', 'backend-type': 'unknown'}) # only local or remote supported except: pass @istest def remote_loader(self): """Trigger loader and make sure everything is ok. """ # given commit0 = create_commit_with_content(self.tmp_git_repo, 'blob 0', 'commit msg 0') commit1 = create_commit_with_content(self.tmp_git_repo, 'blob 1', 'commit msg 1', [commit0.hex]) commit2 = create_commit_with_content(self.tmp_git_repo, 'blob 2', 'commit msg 2', [commit1.hex]) commit3 = create_commit_with_content(self.tmp_git_repo, None, 'commit msg 3', [commit2.hex]) commit4 = create_commit_with_content(self.tmp_git_repo, 'blob 4', 'commit msg 4', [commit3.hex]) # when loader.load(self.conf) # then with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 5, "Should be 5 commits") self.assertEquals( models.count_directories(db_conn), 5, "Should be 5 trees") self.assertEquals( models.count_contents(db_conn), 4, "Should be 4 blobs as we created one commit without data!") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 1, "Should be 1 reference (master) so 1 occurrence.") # given commit5 = create_commit_with_content(self.tmp_git_repo, 'new blob 5', 'commit msg 5', [commit4.hex]) commit6 = create_commit_with_content(self.tmp_git_repo, 'new blob and last 6', 'commit msg 6', [commit5.hex]) commit7 = create_commit_with_content(self.tmp_git_repo, 'new blob 7', 'commit msg 7', [commit6.hex]) # when loader.load(self.conf) # then with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 8, "Should be 5+3 == 8 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 5+3 == 8 trees") self.assertEquals( models.count_contents(db_conn), 7, "Should be 4+3 == 7 blobs") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 2, "Should be 1 reference which changed twice so 2 occurrences (master changed).") # given create_commit_with_content(self.tmp_git_repo, None, 'commit 8 with parent 2', [commit7.hex]) # when loader.load(self.conf) # then with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 9, "Should be 8+1 == 9 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 8 trees (new commit without blob so no new tree)") self.assertEquals( models.count_contents(db_conn), 7, "Should be 7 blobs (new commit without new blob)") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 3, "Should be 1 reference which changed thrice so 3 occurrences (master changed again).") self.assertEquals( models.count_person(db_conn), 2, "1 author + 1 committer") # add tag create_tag(self.tmp_git_repo, '0.0.1', commit5, 'bad ass release 0.0.1, towards infinity...') create_tag(self.tmp_git_repo, '0.0.2', commit7, 'release 0.0.2... and beyond') loader.load(self.conf) # then with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 9, "Should be 8+1 == 9 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 8 trees (new commit without blob so no new tree)") self.assertEquals( models.count_contents(db_conn), 7, "Should be 7 blobs (new commit without new blob)") self.assertEquals( models.count_release(db_conn), 2, "Should be 2 annotated tags so 2 releases") self.assertEquals( models.count_occurrence(db_conn), 3, "master did not change this time so still 3 occurrences") self.assertEquals( models.count_person(db_conn), 3, "1 author + 1 committer + 1 tagger")