diff --git a/bin/swh-backend b/bin/swh-backend index 0124211..4bee07c 100755 --- a/bin/swh-backend +++ b/bin/swh-backend @@ -1,58 +1,58 @@ #!/usr/bin/env python3 # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import logging import os +from swh.core.conf import reader from swh.loader.git.backend import api -from swh.loader.git.conf import reader from swh.storage.objstorage import ObjStorage # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/back.ini' # default configuration DEFAULT_CONF = { 'content_storage_dir' : ('string', '/tmp/swh-loader-git/content-storage'), 'log_dir' : ('string', '/tmp/swh-loader-git/log'), 'db_url' : ('string', 'dbname=softwareheritage-dev'), 'folder_depth' : ('int' , 4), 'debug' : ('bool' , None), 'host' : ('string', '127.0.0.1'), 'port' : ('int' , 5000) } def parse_args(): """Parse the configuration for the cli. """ cli = argparse.ArgumentParser( description='Parse git repository objects to load them into DB.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') cli.add_argument('--config', '-c', help='configuration file path') args = cli.parse_args() return args if __name__ == '__main__': args = parse_args() conf = reader.read(args.config or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'log_dir', 'content_storage_dir') conf.update({ 'objstorage': ObjStorage(conf['content_storage_dir'], conf['folder_depth']) }) logging.basicConfig(filename=os.path.join(conf['log_dir'], 'back.log'), level=logging.DEBUG if args.verbose else logging.INFO) api.run(conf) diff --git a/bin/swh-db-manager b/bin/swh-db-manager index a690114..079b3c7 100755 --- a/bin/swh-db-manager +++ b/bin/swh-db-manager @@ -1,56 +1,57 @@ #!/usr/bin/env python3 # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import argparse import logging import os - +from swh.core.conf import reader from swh.loader.git import manager -from swh.loader.git.conf import reader + # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/db-manager.ini' # default configuration (can be overriden by the DEFAULT_CONF_FILE) DEFAULT_CONF = { 'log_dir': ('string', '/tmp/swh-loader-git/log'), 'db_url' : ('string', 'dbname=softwareheritage-dev') } def parse_args(): """Parse the configuration for the cli. """ cli = argparse.ArgumentParser( description='Parse git repository objects to load them into DB.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') cli.add_argument('--config', '-c', help='configuration file path') subcli = cli.add_subparsers(dest='action') subcli.add_parser('initdb', help='initialize DB') subcli.add_parser('cleandb', help='clean DB') args = cli.parse_args() if not args.action: cli.error('no action given') return args if __name__ == '__main__': args = parse_args() conf = reader.read(args.config or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'log_dir') logging.basicConfig(filename=os.path.join(conf['log_dir'], 'db-manager.log'), level=logging.DEBUG if args.verbose else logging.INFO) manager.manage(args.action, conf['db_url']) diff --git a/bin/swh-loader-git b/bin/swh-loader-git index 3d6ae75..9913310 100755 --- a/bin/swh-loader-git +++ b/bin/swh-loader-git @@ -1,67 +1,69 @@ #!/usr/bin/env python3 # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import argparse import logging import os +from swh.core.conf import reader from swh.loader.git import loader -from swh.loader.git.conf import reader + # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/loader-git.ini' # default configuration (can be overriden by the DEFAULT_CONF_FILE) DEFAULT_CONF = { 'log_dir': ('string', '/tmp/swh-loader-git/log'), 'backend-type': ('string', 'remote'), 'backend': ('string', 'http://localhost:5000'), } # Another example of configuration: # DEFAULT_CONF = { # 'log_dir': ('string', '/tmp/swh-loader-git/log'), # 'backend-type': ('string', 'local'), # 'backend': ('string', '~/.config/swh/back.ini'), # } def parse_args(): """Parse the CLI arguments. """ cli = argparse.ArgumentParser( description='Parse git repository objects to load them into DB.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') cli.add_argument('--config', '-c', help='configuration file path') subcli = cli.add_subparsers(dest='action') load_cli = subcli.add_parser('load', help='load Git repo into DB') load_cli.add_argument('repository', help='Git repository path') args = cli.parse_args() if not args.action: cli.error('no action given') return args if __name__ == '__main__': args = parse_args() conf = reader.read(args.config or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'log_dir') conf['action'] = args.action conf['repo_path'] = args.repository logging.basicConfig(filename=os.path.join(conf['log_dir'], 'sgloader.log'), level=logging.DEBUG if args.verbose else logging.INFO) loader.load(conf) diff --git a/swh/loader/git/conf/reader.py b/swh/loader/git/conf/reader.py deleted file mode 100755 index f332883..0000000 --- a/swh/loader/git/conf/reader.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import configparser -import os - - -_map_convert_fn = {'int': int, - 'bool': lambda x: x == 'true'} # conversion per type - - -def read(conf_file, default_conf=None): - """Read the user's configuration file. - Fill in the gap using `default_conf`. -`default_conf` is similar to this: -DEFAULT_CONF = { - 'a': ('string', '/tmp/swh-loader-git/log'), - 'b': ('string', 'dbname=swhloadergit') - 'c': ('bool', true) - 'e': ('bool', None) - 'd': ('int', 10) -} - - """ - config = configparser.ConfigParser(defaults=default_conf) - config.read(os.path.expanduser(conf_file)) - conf = config._sections['main'] - - # remaining missing default configuration key are set - # also type conversion is enforced for underneath layer - for key in default_conf: - nature_type, default_value = default_conf[key] - val = conf.get(key, None) - if not val: # fallback to default value - conf[key] = default_value - else: # value present but in string format, force type conversion - conf[key] = _map_convert_fn.get(nature_type, lambda x: x)(val) - - return conf - - -def prepare_folders(conf, *keys): - """Prepare the folder mentioned in config under keys. - """ - def makedir(folder): - if not os.path.exists(folder): - os.makedirs(folder) - - for key in keys: - makedir(conf[key]) diff --git a/swh/loader/git/local_store.py b/swh/loader/git/local_store.py index d5ecfd7..2e0446c 100644 --- a/swh/loader/git/local_store.py +++ b/swh/loader/git/local_store.py @@ -1,108 +1,109 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + +from swh.core.conf import reader from swh.loader.git.storage import storage, db, service -from swh.loader.git.conf import reader from swh.storage.objstorage import ObjStorage # FIXME: duplicated from bin/swh-backend... # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/back.ini' # default configuration DEFAULT_CONF = { 'content_storage_dir': ('string', '/tmp/swh-loader-git/content-storage'), 'log_dir': ('string', '/tmp/swh-loader-git/log'), 'db_url': ('string', 'dbname=softwareheritage-dev'), 'folder_depth': ('int', 4), 'debug': ('bool', None), 'host': ('string', '127.0.0.1'), 'port': ('int', 5000) } def store_only_new(db_conn, conf, obj_type, obj): """Store object if not already present. """ if not storage.find(db_conn, obj['id'], obj_type): storage.add(db_conn, conf, obj) _obj_to_persist_fn = {storage.Type.revision: service.add_revisions} def store_unknown_objects(db_conn, conf, obj_type, swhmap): """Load objects to the backend. """ sha1s = swhmap.keys() # have: filter unknown obj unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s) if not unknown_obj_sha1s: return True # seen: now store in backend persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects) obj_fulls = map(swhmap.get, unknown_obj_sha1s) return persist_fn(db_conn, conf, obj_type, obj_fulls) def load_to_back(conf, swh_repo): """Load to the backend the repository swh_repo. """ with db.connect(conf['db_url']) as db_conn: # First, store/retrieve the origin identifier # FIXME: should be done by the cloner worker (which is not yet plugged # on the right swh db ftm) service.add_origin(db_conn, swh_repo.get_origin()) # First reference all unknown persons service.add_persons(db_conn, conf, storage.Type.person, swh_repo.get_persons()) res = store_unknown_objects(db_conn, conf, storage.Type.content, swh_repo.get_contents()) if res: res = store_unknown_objects(db_conn, conf, storage.Type.directory, swh_repo.get_directories()) if res: res = store_unknown_objects(db_conn, conf, storage.Type.revision, swh_repo.get_revisions()) if res: # brutally send all remaining occurrences service.add_objects(db_conn, conf, storage.Type.occurrence, swh_repo.get_occurrences()) # and releases (the idea here is that compared to existing # objects, the quantity is less) service.add_objects(db_conn, conf, storage.Type.release, swh_repo.get_releases()) def prepare_and_load_to_back(backend_setup_file, swh_repo): """Prepare and load to back the swh_repo. backend-setup-file is the backend's setup to load to access the db and file storage. """ # Read the configuration file (no check yet) conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'content_storage_dir') conf.update({ 'objstorage': ObjStorage(conf['content_storage_dir'], conf['folder_depth']) }) load_to_back(conf, swh_repo) diff --git a/swh/loader/git/tests/test_local_loader.py b/swh/loader/git/tests/test_local_loader.py index c6f92dc..04daa46 100644 --- a/swh/loader/git/tests/test_local_loader.py +++ b/swh/loader/git/tests/test_local_loader.py @@ -1,249 +1,249 @@ # coding: utf-8 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pygit2 import tempfile import shutil from nose.plugins.attrib import attr from nose.tools import istest +from swh.core.conf import reader from swh.loader.git.storage import db, models from swh.loader.git import loader -from swh.loader.git.conf import reader import test_initdb from test_utils import list_files_from from test_git_utils import create_commit_with_content, create_tag @attr('slow') class TestLocalLoader(unittest.TestCase): def setUp(self): """Initialize a git repository for the remaining test to manipulate. """ tmp_git_folder_path = tempfile.mkdtemp(prefix='test-sgloader.', dir='/tmp') self.tmp_git_repo = pygit2.init_repository(tmp_git_folder_path) self.conf_back = reader.read('./resources/test/back.ini', {'port': ('int', 9999)}) self.db_url = self.conf_back['db_url'] self.conf = { 'action': 'load', 'repo_path': self.tmp_git_repo.workdir, 'backend-type': 'local', 'backend': './resources/test/back.ini' } def init_db_setup(self): """Initialize a git repository for the remaining test to manipulate. """ test_initdb.prepare_db(self.db_url) def tearDown(self): """Destroy the test git repository. """ shutil.rmtree(self.tmp_git_repo.workdir) shutil.rmtree(self.conf_back['content_storage_dir'], ignore_errors=True) @istest def should_fail_on_bad_action(self): # when try: loader.load({'action': 'unknown'}) except: pass @istest def should_fail_on_inexistant_folder(self): # when try: loader.load({'action': 'load', 'repo_path': 'something-that-definitely-does-not-exist'}) except: pass @istest def should_fail_on_inexistant_backend_type(self): # when try: loader.load({'action': 'load', 'repo_path': '.', 'backend-type': 'unknown'}) # only local or remote supported except: pass @istest def local_loader(self): """Trigger loader and make sure everything is ok. """ self.init_db_setup() # given commit0 = create_commit_with_content(self.tmp_git_repo, 'blob 0', 'commit msg 0') commit1 = create_commit_with_content(self.tmp_git_repo, 'blob 1', 'commit msg 1', [commit0.hex]) commit2 = create_commit_with_content(self.tmp_git_repo, 'blob 2', 'commit msg 2', [commit1.hex]) commit3 = create_commit_with_content(self.tmp_git_repo, None, 'commit msg 3', [commit2.hex]) commit4 = create_commit_with_content(self.tmp_git_repo, 'blob 4', 'commit msg 4', [commit3.hex]) # when loader.load(self.conf) # then nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) self.assertEquals(nb_files, 4, "4 blobs.") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 5, "Should be 5 commits") self.assertEquals( models.count_directories(db_conn), 5, "Should be 5 trees") self.assertEquals( models.count_contents(db_conn), 4, "Should be 4 blobs as we created one commit without data!") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 1, "Should be 1 reference (master) so 1 occurrence.") # given commit5 = create_commit_with_content(self.tmp_git_repo, 'new blob 5', 'commit msg 5', [commit4.hex]) commit6 = create_commit_with_content(self.tmp_git_repo, 'new blob and last 6', 'commit msg 6', [commit5.hex]) commit7 = create_commit_with_content(self.tmp_git_repo, 'new blob 7', 'commit msg 7', [commit6.hex]) # when loader.load(self.conf) # then nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) self.assertEquals(nb_files, 4+3, "3 new blobs.") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 8, "Should be 5+3 == 8 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 5+3 == 8 trees") self.assertEquals( models.count_contents(db_conn), 7, "Should be 4+3 == 7 blobs") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 2, "Should be 1 reference which changed twice so 2 occurrences (master changed).") # given create_commit_with_content(self.tmp_git_repo, None, 'commit 8 with parent 2', [commit7.hex]) # when loader.load(self.conf) # then nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) self.assertEquals(nb_files, 7, "no new blob.") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 9, "Should be 8+1 == 9 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 8 trees (new commit without blob so no new tree)") self.assertEquals( models.count_contents(db_conn), 7, "Should be 7 blobs (new commit without new blob)") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 3, "Should be 1 reference which changed thrice so 3 occurrences (master changed again).") self.assertEquals( models.count_person(db_conn), 2, "1 author + 1 committer") # add tag create_tag(self.tmp_git_repo, '0.0.1', commit5, 'bad ass release 0.0.1, towards infinity...') create_tag(self.tmp_git_repo, '0.0.2', commit7, 'release 0.0.2... and beyond') loader.load(self.conf) # then nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) self.assertEquals(nb_files, 7, "no new blob.") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 9, "Should be 8+1 == 9 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 8 trees (new commit without blob so no new tree)") self.assertEquals( models.count_contents(db_conn), 7, "Should be 7 blobs (new commit without new blob)") self.assertEquals( models.count_release(db_conn), 2, "Should be 2 annotated tags so 2 releases") self.assertEquals( models.count_occurrence(db_conn), 3, "master did not change this time so still 3 occurrences") self.assertEquals( models.count_person(db_conn), 3, "1 author + 1 committer + 1 tagger") diff --git a/swh/loader/git/tests/test_remote_loader.py b/swh/loader/git/tests/test_remote_loader.py index 971f83e..dcaf401 100644 --- a/swh/loader/git/tests/test_remote_loader.py +++ b/swh/loader/git/tests/test_remote_loader.py @@ -1,252 +1,252 @@ # coding: utf-8 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pygit2 import tempfile import shutil import os from nose.plugins.attrib import attr from nose.tools import istest +from swh.core.conf import reader from swh.loader.git.storage import db, models from swh.loader.git import loader -from swh.loader.git.conf import reader import test_initdb from test_git_utils import create_commit_with_content, create_tag from test_utils import list_files_from @attr('slow') class TestRemoteLoader(unittest.TestCase): def setUp(self): tmp_git_folder_path = tempfile.mkdtemp(prefix='test-sgloader.', dir='/tmp') self.tmp_git_repo = pygit2.init_repository(tmp_git_folder_path) self.conf = reader.read('./resources/test/back.ini', {'port': ('int', 9999)}) self.db_url = self.conf['db_url'] self.conf.update({ 'action': 'load', 'repo_path': self.tmp_git_repo.workdir, 'backend-type': 'remote', 'backend': 'http://localhost:%s' % self.conf['port'] }) # Not the remote loader in charge of creating the folder, so we do it if not os.path.exists(self.conf['content_storage_dir']): os.mkdir(self.conf['content_storage_dir']) def init_db_setup(self): """Initialize a git repository for the remaining test to manipulate. """ test_initdb.prepare_db(self.db_url) def tearDown(self): """Destroy the test git repository. """ shutil.rmtree(self.tmp_git_repo.workdir) shutil.rmtree(self.conf['content_storage_dir']) @istest def should_fail_on_bad_action(self): # when try: loader.load({'action': 'unknown'}) except: # FIXME assert raises pass @istest def should_fail_on_inexistant_folder(self): # when try: loader.load({'action': 'load', 'repo_path': 'something-that-definitely-does-not-exist'}) except: pass @istest def should_fail_on_inexistant_backend_type(self): # when try: loader.load({'action': 'load', 'repo_path': '.', 'backend-type': 'unknown'}) # only local or remote supported except: pass @istest def remote_loader(self): """Trigger loader and make sure everything is ok. """ # given self.init_db_setup() # given commit0 = create_commit_with_content(self.tmp_git_repo, 'blob 0', 'commit msg 0') commit1 = create_commit_with_content(self.tmp_git_repo, 'blob 1', 'commit msg 1', [commit0.hex]) commit2 = create_commit_with_content(self.tmp_git_repo, 'blob 2', 'commit msg 2', [commit1.hex]) commit3 = create_commit_with_content(self.tmp_git_repo, None, 'commit msg 3', [commit2.hex]) commit4 = create_commit_with_content(self.tmp_git_repo, 'blob 4', 'commit msg 4', [commit3.hex]) # when loader.load(self.conf) # then nb_files = len(list_files_from(self.conf['content_storage_dir'])) self.assertEquals(nb_files, 4, "4 blobs") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 5, "Should be 5 commits") self.assertEquals( models.count_directories(db_conn), 5, "Should be 5 trees") self.assertEquals( models.count_contents(db_conn), 4, "Should be 4 blobs as we created one commit without data!") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 1, "Should be 1 reference (master) so 1 occurrence.") # given commit5 = create_commit_with_content(self.tmp_git_repo, 'new blob 5', 'commit msg 5', [commit4.hex]) commit6 = create_commit_with_content(self.tmp_git_repo, 'new blob and last 6', 'commit msg 6', [commit5.hex]) commit7 = create_commit_with_content(self.tmp_git_repo, 'new blob 7', 'commit msg 7', [commit6.hex]) # when loader.load(self.conf) # then nb_files = len(list_files_from(self.conf['content_storage_dir'])) self.assertEquals(nb_files, 4+3, "3 new blobs") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 8, "Should be 5+3 == 8 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 5+3 == 8 trees") self.assertEquals( models.count_contents(db_conn), 7, "Should be 4+3 == 7 blobs") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 2, "Should be 1 reference which changed twice so 2 occurrences (master changed).") # given create_commit_with_content(self.tmp_git_repo, None, 'commit 8 with parent 2', [commit7.hex]) # when loader.load(self.conf) # then nb_files = len(list_files_from(self.conf['content_storage_dir'])) self.assertEquals(nb_files, 7, "no new blob") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 9, "Should be 8+1 == 9 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 8 trees (new commit without blob so no new tree)") self.assertEquals( models.count_contents(db_conn), 7, "Should be 7 blobs (new commit without new blob)") self.assertEquals( models.count_release(db_conn), 0, "No tag created so 0 release.") self.assertEquals( models.count_occurrence(db_conn), 3, "Should be 1 reference which changed thrice so 3 occurrences (master changed again).") self.assertEquals( models.count_person(db_conn), 2, "1 author + 1 committer") # add tag create_tag(self.tmp_git_repo, '0.0.1', commit5, 'bad ass release 0.0.1, towards infinity...') create_tag(self.tmp_git_repo, '0.0.2', commit7, 'release 0.0.2... and beyond') loader.load(self.conf) # then nb_files = len(list_files_from(self.conf['content_storage_dir'])) self.assertEquals(nb_files, 7, "no new blob") with db.connect(self.db_url) as db_conn: self.assertEquals( models.count_revisions(db_conn), 9, "Should be 8+1 == 9 commits now") self.assertEquals( models.count_directories(db_conn), 8, "Should be 8 trees (new commit without blob so no new tree)") self.assertEquals( models.count_contents(db_conn), 7, "Should be 7 blobs (new commit without new blob)") self.assertEquals( models.count_release(db_conn), 2, "Should be 2 annotated tags so 2 releases") self.assertEquals( models.count_occurrence(db_conn), 3, "master did not change this time so still 3 occurrences") self.assertEquals( models.count_person(db_conn), 3, "1 author + 1 committer + 1 tagger")