diff --git a/README b/README index 381b2d8..1f54ae7 100644 --- a/README +++ b/README @@ -1,221 +1,218 @@ The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ======= This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ============ Runtime ------- - python3 - python3-psycopg2 - python3-pygit2 Test ---- - python3-nose Requirements ============ - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via libgit2/pygit - cache: implemented as Postgres tables Configuration ============= swh-git-loader depends on some tools, here are the configuration files for those: swh-db-manager -------------- This is solely a tool in charge of db cleanup now. Create a configuration file in **\~/.config/db-manager.ini** ``` {.ini} [main] # Where to store the logs log_dir = swh-git-loader/log # url access to db db_url = dbname=swhgitloader ``` See for the db url's schema swh-git-loader -------------- Create a configuration file in **\~/.config/swh/git-loader.ini**: ``` {.ini} [main] # Where to store the logs log_dir = /tmp/swh-git-loader/log # how to access the backend (remote or local) backend-type = remote # backend-type remote: url access to api rest's backend # backend-type local: configuration file to backend file .ini (cf. back.ini file) backend = http://localhost:5000 ``` Note: - [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect) - the configuration file can be changed in the CLI with the flag \`-c \\` or \`--config-file \\` swh-backend ----------- Backend api. This Create a configuration file in **\~/.config/swh/back.ini**: ``` {.ini} [main] # where to store blob on disk content_storage_dir = /tmp/swh-git-loader/content-storage # Where to store the logs log_dir = swh-git-loader/log # url access to db: dbname= (port= user= pass=) db_url = dbname=swhgitloader -# activate the compression for each vcs stored object -# storage_compression = true - # compute folder's depth on disk aa/bb/cc/dd # folder_depth = 2 # Debugger (for dev only) debug = true # server port to listen to requests port = 6000 ``` See for the db url's schema Run === Environment initialization -------------------------- ``` {.bash} export PYTHONPATH=`pwd`:$PYTHONPATH ``` Backend ------- ### With initialization This depends on swh-sql repository, so: ``` {.bash} cd /path/to/swh-sql && make clean initdb DBNAME=softwareheritage-dev ``` Using the Makefile eases: ``` {.bash} make drop-db create-db run-back FOLLOW_LOG=-f ``` ### without initialization Running the backend. ``` {.bash} ./bin/swh-backend -v ``` With makefile: ``` {.bash} make run-back FOLLOW_LOG=-f ``` Help ---- ``` {.bash} bin/swh-git-loader --help bin/swh-db-manager --help ``` Parse a repository from a clean slate ------------------------------------- Clean and initialize the model then parse the repository git: ``` {.bash} bin/swh-db-manager cleandb bin/swh-git-loader load /path/to/git/repo ``` For ease: ``` {.bash} time make cleandb run REPO_PATH=~/work/inria/repo/swh-git-cloner ``` Parse an existing repository ---------------------------- ``` {.bash} bin/swh-git-loader load /path/to/git/repo ``` Clean data ---------- This will truncate the relevant table in the schema ``` {.bash} bin/swh-db-manager cleandb ``` For ease: ``` {.bash} make cleandb ``` Init data --------- ``` {.bash} make drop-db create-db ``` diff --git a/bin/swh-backend b/bin/swh-backend index 78b6014..0f399d3 100755 --- a/bin/swh-backend +++ b/bin/swh-backend @@ -1,54 +1,53 @@ #!/usr/bin/env python3 # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import logging import os from swh.backend import api from swh.conf import reader # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/back.ini' # default configuration DEFAULT_CONF = { 'content_storage_dir' : ('string', '/tmp/swh-git-loader/content-storage'), 'log_dir' : ('string', '/tmp/swh-git-loader/log'), 'db_url' : ('string', 'dbname=softwareheritage-dev'), - 'storage_compression' : ('bool' , None), 'folder_depth' : ('int' , 4), 'debug' : ('bool' , None), 'port' : ('int' , 5000) } def parse_args(): """Parse the configuration for the cli. """ cli = argparse.ArgumentParser( description='Parse git repository objects to load them into DB.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') cli.add_argument('--config', '-c', help='configuration file path') args = cli.parse_args() return args if __name__ == '__main__': args = parse_args() conf = reader.read(args.config or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'log_dir', 'content_storage_dir') logging.basicConfig(filename=os.path.join(conf['log_dir'], 'back.log'), level=logging.DEBUG if args.verbose else logging.INFO) api.run(conf) diff --git a/resources/test/back.ini b/resources/test/back.ini index e633835..070b29a 100644 --- a/resources/test/back.ini +++ b/resources/test/back.ini @@ -1,22 +1,19 @@ [main] # where to store blob on disk content_storage_dir = /tmp/swh-git-loader/test/content-storage # Where to store the logs log_dir = /tmp/swh-git-loader/test/log # url access to db: dbname= (port= user= pass=) db_url = dbname=softwareheritage-dev-test -# activate the compression for each stored object -#storage_compression = true - # compute folder's depth on disk aa/bb/cc/dd #folder_depth = 4 # Debugger (for dev only) debug = true # server port to listen to requests port = 5001 diff --git a/scratch/profile-swhgitloader.py b/scratch/profile-swhgitloader.py index 6194fb0..43361eb 100755 --- a/scratch/profile-swhgitloader.py +++ b/scratch/profile-swhgitloader.py @@ -1,30 +1,29 @@ #!/usr/bin/env python3 # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from swh.gitloader import loader conf = { 'db_url': 'dbname=swhgitloader', # 'repository': os.path.expanduser('./test-repo'), 'repository': os.path.expanduser('../debsources'), 'content_storage_dir': '/tmp/swh-git-loader/content-storage', - 'folder_depth': 4, - 'storage_compression': None, + 'folder_depth': 4 } conf['action'] = 'cleandb' loader.load(conf) conf['action'] = 'initdb' loader.load(conf) conf['action'] = 'load' loader.load(conf) diff --git a/swh/gitloader/local_store.py b/swh/gitloader/local_store.py index 71e5f9c..3363366 100644 --- a/swh/gitloader/local_store.py +++ b/swh/gitloader/local_store.py @@ -1,91 +1,89 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.store import store, db, service from swh.conf import reader # FIXME: duplicated from bin/swh-backend... # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/back.ini' # default configuration DEFAULT_CONF = { 'content_storage_dir': ('string', '/tmp/swh-git-loader/content-storage'), 'log_dir': ('string', '/tmp/swh-git-loader/log'), 'db_url': ('string', 'dbname=softwareheritage-dev'), - 'storage_compression': ('bool', None), 'folder_depth': ('int', 4), 'debug': ('bool', None), 'port': ('int', 5000) } def store_only_new(db_conn, conf, obj_type, obj): """Store object if not already present. """ obj.update({'type': obj_type}) if not store.find(db_conn, obj): store.add(db_conn, conf, obj) _obj_to_persist_fn = {store.Type.revision: service.add_revisions} def store_unknown_objects(db_conn, conf, obj_type, swhmap): """Load objects to the backend. """ sha1s = swhmap.keys() # have: filter unknown obj unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s) if not unknown_obj_sha1s: return True # seen: now store in backend persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects) obj_fulls = map(swhmap.get, unknown_obj_sha1s) return persist_fn(db_conn, conf, obj_type, obj_fulls) def load_to_back(conf, swh_repo): """Load to the backend the repository swh_repo. """ with db.connect(conf['db_url']) as db_conn: # First, store/retrieve the origin identifier # FIXME: should be done by the cloner worker (which is not yet plugged # on the right swh db ftm) service.add_origin(db_conn, swh_repo.get_origin()) # First reference all unknown persons service.add_persons(db_conn, conf, store.Type.person, swh_repo.get_persons()) res = store_unknown_objects(db_conn, conf, store.Type.content, swh_repo.get_contents()) if res: res = store_unknown_objects(db_conn, conf, store.Type.directory, swh_repo.get_directories()) if res: res = store_unknown_objects(db_conn, conf, store.Type.revision, swh_repo.get_revisions()) if res: # brutally send all remaining occurrences service.add_objects(db_conn, conf, store.Type.occurrence, swh_repo.get_occurrences()) # and releases (the idea here is that compared to existing # objects, the quantity is less) service.add_objects(db_conn, conf, store.Type.release, swh_repo.get_releases()) def prepare_and_load_to_back(backend_setup_file, swh_repo): # Read the configuration file (no check yet) conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'content_storage_dir') load_to_back(conf, swh_repo) - diff --git a/swh/tests/test_utils.py b/swh/tests/test_utils.py index 56f833d..790e376 100644 --- a/swh/tests/test_utils.py +++ b/swh/tests/test_utils.py @@ -1,55 +1,54 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import time import os import shutil import tempfile from swh.backend import api import test_initdb def now(): """Build the date as of now in the api's format. """ return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) def list_files_from(root_path): """Compute the list of files from root_path. """ f = [] for (dirpath, dirnames, filenames) in os.walk(root_path): f.extend(filenames) return f def app_client(db_url="dbname=softwareheritage-dev-test"): """Setup the application ready for testing. """ content_storage_dir = tempfile.mkdtemp(prefix='test-swh-git-loader.', dir='/tmp') api.app.config['conf'] = {'db_url': db_url, 'content_storage_dir': content_storage_dir, 'log_dir': '/tmp/swh-git-loader/log', 'folder_depth': 2, - 'storage_compression': None, 'debug': 'true'} api.app.config['TESTING'] = True app = api.app.test_client() test_initdb.prepare_db(db_url) return app, db_url, content_storage_dir def app_client_teardown(content_storage_dir): """Tear down app client's context. """ shutil.rmtree(content_storage_dir, ignore_errors=True)