diff --git a/README b/README
index 1f54ae7..6abc2f1 100644
--- a/README
+++ b/README
@@ -1,218 +1,221 @@
The Software Heritage Git Loader is a tool and a library to walk a local Git
repository and inject into the SWH dataset all contained files that weren't
known before.

License
=======

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

See the top-level LICENSE file for the full text of the GNU General Public
License along with this program.

Dependencies
============

Runtime
-------

- python3
- python3-psycopg2
- python3-pygit2

Test
----

- python3-nose

Requirements
============

- implementation language: Python3
- coding guidelines: conform to PEP8
- Git access: via libgit2/pygit2
- cache: implemented as Postgres tables

Configuration
=============

swh-git-loader depends on some other tools; here are their configuration
files.

swh-db-manager
--------------

This tool is now solely in charge of db cleanup.

Create a configuration file in **\~/.config/db-manager.ini**:

``` {.ini}
[main]

# Where to store the logs
log_dir = swh-git-loader/log

# url access to db
db_url = dbname=swhgitloader
```

See the [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect)
for the db_url's schema.

swh-git-loader
--------------

Create a configuration file in **\~/.config/swh/git-loader.ini**:

``` {.ini}
[main]

# Where to store the logs
log_dir = /tmp/swh-git-loader/log

# how to access the backend (remote or local)
backend-type = remote

# backend-type remote: url access to the rest api's backend
# backend-type local: path to the backend's .ini configuration file (cf. back.ini)
backend = http://localhost:5000
```

Note:

- [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect)
- the configuration file can be changed on the CLI with the `-c` or
  `--config-file` flag

swh-backend
-----------

The backend api. Create a configuration file in **\~/.config/swh/back.ini**:

``` {.ini}
[main]

# where to store blobs on disk
content_storage_dir = /tmp/swh-git-loader/content-storage

# Where to store the logs
log_dir = swh-git-loader/log

# url access to db: dbname= (port= user= pass=)
db_url = dbname=swhgitloader

# compute folder's depth on disk aa/bb/cc/dd
# folder_depth = 2

+# To open to the world, 0.0.0.0
+#host = 127.0.0.1
+
# Debugger (for dev only)
debug = true

# server port to listen to requests
port = 6000
```

See the [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect)
for the db_url's schema.
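The db_url values above use the psycopg2/libpq connection-string format
(space-separated key=value pairs). As a quick sanity check of a db_url before
pointing the tools at it, here is a minimal sketch relying only on the standard
psycopg2 API; the database name is the one from the sample configuration above:

``` {.python}
import psycopg2

# Same format as the db_url entries above; host=, port=, user= and
# password= can be appended as further key=value pairs.
db_url = 'dbname=swhgitloader'

with psycopg2.connect(db_url) as db_conn:
    with db_conn.cursor() as cur:
        cur.execute('SELECT version()')   # trivial connectivity check
        print(cur.fetchone()[0])
```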
Run
===

Environment initialization
--------------------------

``` {.bash}
export PYTHONPATH=`pwd`:$PYTHONPATH
```

Backend
-------

### With initialization

This depends on the swh-sql repository, so:

``` {.bash}
cd /path/to/swh-sql && make clean initdb DBNAME=softwareheritage-dev
```

The Makefile makes this easier:

``` {.bash}
make drop-db create-db run-back FOLLOW_LOG=-f
```

### Without initialization

Run the backend:

``` {.bash}
./bin/swh-backend -v
```

With the Makefile:

``` {.bash}
make run-back FOLLOW_LOG=-f
```

Help
----

``` {.bash}
bin/swh-git-loader --help
bin/swh-db-manager --help
```

Parse a repository from a clean slate
-------------------------------------

Clean and initialize the model, then parse the git repository:

``` {.bash}
bin/swh-db-manager cleandb
bin/swh-git-loader load /path/to/git/repo
```

For convenience:

``` {.bash}
time make cleandb run REPO_PATH=~/work/inria/repo/swh-git-cloner
```

Parse an existing repository
----------------------------

``` {.bash}
bin/swh-git-loader load /path/to/git/repo
```

Clean data
----------

This truncates the relevant tables in the schema:

``` {.bash}
bin/swh-db-manager cleandb
```

For convenience:

``` {.bash}
make cleandb
```

Init data
---------

``` {.bash}
make drop-db create-db
```
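Once the backend is up, a quick way to check that it answers is to hit the root
route, which returns a short banner (see `swh/backend/api.py` below). A minimal
sketch, assuming the host and port from the sample back.ini above:

``` {.python}
import urllib.request

# Adjust host/port to match your ~/.config/swh/back.ini (port = 6000 above)
with urllib.request.urlopen('http://127.0.0.1:6000/') as resp:
    print(resp.status, resp.read().decode())   # expected: 200 Dev SWH API
```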
""" cli = argparse.ArgumentParser( description='Parse git repository objects to load them into DB.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') cli.add_argument('--config', '-c', help='configuration file path') args = cli.parse_args() return args if __name__ == '__main__': args = parse_args() conf = reader.read(args.config or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'log_dir', 'content_storage_dir') conf.update({ 'objstorage': ObjStorage(conf['content_storage_dir'], conf['folder_depth']) }) logging.basicConfig(filename=os.path.join(conf['log_dir'], 'back.log'), level=logging.DEBUG if args.verbose else logging.INFO) api.run(conf) diff --git a/resources/test/back.ini b/resources/test/back.ini index 070b29a..726a1e1 100644 --- a/resources/test/back.ini +++ b/resources/test/back.ini @@ -1,19 +1,22 @@ [main] # where to store blob on disk content_storage_dir = /tmp/swh-git-loader/test/content-storage # Where to store the logs log_dir = /tmp/swh-git-loader/test/log # url access to db: dbname= (port= user= pass=) db_url = dbname=softwareheritage-dev-test # compute folder's depth on disk aa/bb/cc/dd #folder_depth = 4 +# To open to the world, 0.0.0.0 +#host = 127.0.0.1 + # Debugger (for dev only) debug = true # server port to listen to requests port = 5001 diff --git a/swh/backend/api.py b/swh/backend/api.py index b7cebdc..768f858 100755 --- a/swh/backend/api.py +++ b/swh/backend/api.py @@ -1,293 +1,298 @@ #!/usr/bin/env python3 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from flask import Flask, Response, make_response, request from swh.store import store, db, service from swh.protocols import serial # api's definition app = Flask(__name__) def read_request_payload(request): """Read the request's payload. """ # TODO: Check the signed pickled data? return serial.load(request.stream) def write_response(data): """Write response from data. """ return Response(serial.dumps(data), mimetype=serial.MIMETYPE) @app.route('/') def hello(): """A simple api to define what the server is all about. FIXME: A redirect towards a static page defining the routes would be nice. """ return 'Dev SWH API' # from uri to type _uri_types = {'revisions': store.Type.revision, 'directories': store.Type.directory, 'contents': store.Type.content, 'releases': store.Type.release, 'occurrences': store.Type.occurrence} def _do_action_with_payload(conf, action_fn, uri_type, id, map_result_fn): uri_type_ok = _uri_types.get(uri_type, None) if uri_type_ok is None: return make_response('Bad request!', 400) vcs_object = read_request_payload(request) vcs_object.update({'id': id, 'type': uri_type_ok}) return action_fn(conf, vcs_object, map_result_fn) # occurrence type is not dealt the same way _post_all_uri_types = {'revisions': store.Type.revision, 'directories': store.Type.directory, 'contents': store.Type.content} @app.route('/vcs//', methods=['POST']) def filter_unknowns_type(uri_type): """Filters unknown sha1 to the backend and returns them. """ if request.headers.get('Content-Type') != serial.MIMETYPE: return make_response('Bad request. Expected %s data!' % serial.MIMETYPE, 400) obj_type = _post_all_uri_types.get(uri_type) if obj_type is None: return make_response('Bad request. 
diff --git a/resources/test/back.ini b/resources/test/back.ini
index 070b29a..726a1e1 100644
--- a/resources/test/back.ini
+++ b/resources/test/back.ini
@@ -1,19 +1,22 @@
[main]

# where to store blob on disk
content_storage_dir = /tmp/swh-git-loader/test/content-storage

# Where to store the logs
log_dir = /tmp/swh-git-loader/test/log

# url access to db: dbname= (port= user= pass=)
db_url = dbname=softwareheritage-dev-test

# compute folder's depth on disk aa/bb/cc/dd
#folder_depth = 4

+# To open to the world, 0.0.0.0
+#host = 127.0.0.1
+
# Debugger (for dev only)
debug = true

# server port to listen to requests
port = 5001
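The new host option defaults to 127.0.0.1, which keeps the API bound to the
local machine; uncommenting it and setting 0.0.0.0 exposes it on all
interfaces, exactly as the app.run() change in swh/backend/api.py below does.
A standalone Flask sketch of the same binding behaviour, with made-up values:

``` {.python}
from flask import Flask

app = Flask(__name__)


@app.route('/')
def hello():
    return 'Dev SWH API'


if __name__ == '__main__':
    # 127.0.0.1 keeps the server private to this machine;
    # 0.0.0.0 opens it to the world (as the back.ini comment says).
    app.run(host='127.0.0.1', port=5001, debug=True)
```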
diff --git a/swh/backend/api.py b/swh/backend/api.py
index b7cebdc..768f858 100755
--- a/swh/backend/api.py
+++ b/swh/backend/api.py
@@ -1,293 +1,298 @@
#!/usr/bin/env python3

# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging

from flask import Flask, Response, make_response, request

from swh.store import store, db, service
from swh.protocols import serial

# api's definition
app = Flask(__name__)


def read_request_payload(request):
    """Read the request's payload.
    """
    # TODO: Check the signed pickled data?
    return serial.load(request.stream)


def write_response(data):
    """Write a response from data.
    """
    return Response(serial.dumps(data), mimetype=serial.MIMETYPE)


@app.route('/')
def hello():
    """A simple api route stating what the server is all about.
    FIXME: A redirect towards a static page defining the routes would be nice.
    """
    return 'Dev SWH API'


# from uri to type
_uri_types = {'revisions': store.Type.revision,
              'directories': store.Type.directory,
              'contents': store.Type.content,
              'releases': store.Type.release,
              'occurrences': store.Type.occurrence}


def _do_action_with_payload(conf, action_fn, uri_type, id, map_result_fn):
    uri_type_ok = _uri_types.get(uri_type, None)
    if uri_type_ok is None:
        return make_response('Bad request!', 400)

    vcs_object = read_request_payload(request)
    vcs_object.update({'id': id,
                       'type': uri_type_ok})

    return action_fn(conf, vcs_object, map_result_fn)


# occurrence type is not dealt with the same way
_post_all_uri_types = {'revisions': store.Type.revision,
                       'directories': store.Type.directory,
                       'contents': store.Type.content}


@app.route('/vcs/<uri_type>/', methods=['POST'])
def filter_unknowns_type(uri_type):
    """Filter the posted sha1s and return those unknown to the backend.
    """
    if request.headers.get('Content-Type') != serial.MIMETYPE:
        return make_response('Bad request. Expected %s data!'
                             % serial.MIMETYPE, 400)

    obj_type = _post_all_uri_types.get(uri_type)
    if obj_type is None:
        return make_response('Bad request. Type not supported!', 400)

    sha1s = read_request_payload(request)
    config = app.config['conf']

    with db.connect(config['db_url']) as db_conn:
        unknowns_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s)
        if unknowns_sha1s is None:
            return make_response('Bad request!', 400)
        else:
            return write_response(unknowns_sha1s)


@app.route('/vcs/persons/', methods=['POST'])
def post_person():
    """Post a person.
    """
    if request.headers.get('Content-Type') != serial.MIMETYPE:
        return make_response('Bad request. Expected %s data!'
                             % serial.MIMETYPE, 400)

    origin = read_request_payload(request)
    config = app.config['conf']

    with db.connect(config['db_url']) as db_conn:
        try:
            person_found = service.find_person(db_conn, origin)
            if person_found:
                return write_response(person_found)
            else:
                return make_response('Person not found!', 404)
        except:
            return make_response('Bad request!', 400)


@app.route('/origins/', methods=['POST'])
def post_origin():
    """Post an origin.
    """
    if request.headers.get('Content-Type') != serial.MIMETYPE:
        return make_response('Bad request. Expected %s data!'
                             % serial.MIMETYPE, 400)

    origin = read_request_payload(request)
    config = app.config['conf']

    with db.connect(config['db_url']) as db_conn:
        try:
            origin_found = service.find_origin(db_conn, origin)
            if origin_found:
                return write_response(origin_found)
            else:
                return make_response('Origin not found!', 404)
        except:
            return make_response('Bad request!', 400)


@app.route('/origins/', methods=['PUT'])
def put_origin():
    """Create an origin or return it if it already exists.
    """
    if request.headers.get('Content-Type') != serial.MIMETYPE:
        return make_response('Bad request. Expected %s data!'
                             % serial.MIMETYPE, 400)

    origin = read_request_payload(request)
    config = app.config['conf']

    with db.connect(config['db_url']) as db_conn:
        try:
            origin_found = service.add_origin(db_conn, origin)
            return write_response(origin_found)  # FIXME: 204
        except:
            return make_response('Bad request!', 400)


@app.route('/vcs/persons/', methods=['PUT'])
def put_all_persons():
    """Store or update the given persons.
    FIXME: Refactor same behavior with `put_all`.
    """
    if request.headers.get('Content-Type') != serial.MIMETYPE:
        return make_response('Bad request. Expected %s data!'
                             % serial.MIMETYPE, 400)

    payload = read_request_payload(request)
    obj_type = store.Type.person

    config = app.config['conf']

    with db.connect(config['db_url']) as db_conn:
        service.add_persons(db_conn, config, obj_type, payload)
        return make_response('Successful creation!', 204)


@app.route('/vcs/revisions/', methods=['PUT'])
def put_all_revisions():
    """Store or update the given revisions.
    FIXME: Refactor same behavior with `put_all`.
    """
    if request.headers.get('Content-Type') != serial.MIMETYPE:
        return make_response('Bad request. Expected %s data!'
                             % serial.MIMETYPE, 400)

    payload = read_request_payload(request)
    obj_type = store.Type.revision

    config = app.config['conf']

    with db.connect(config['db_url']) as db_conn:
        service.add_revisions(db_conn, config, obj_type, payload)
        return make_response('Successful creation!', 204)


@app.route('/vcs/<uri_type>/', methods=['PUT'])
def put_all(uri_type):
    """Store or update the given objects (uri_type in {contents, directories,
    releases}).
    """
    if request.headers.get('Content-Type') != serial.MIMETYPE:
        return make_response('Bad request. Expected %s data!'
                             % serial.MIMETYPE, 400)

    payload = read_request_payload(request)
    obj_type = _uri_types[uri_type]

    config = app.config['conf']

    with db.connect(config['db_url']) as db_conn:
        service.add_objects(db_conn, config, obj_type, payload)
        return make_response('Successful creation!', 204)


def add_object(config, vcs_object, map_result_fn):
    """Add an object to storage.
    - config is the configuration needed for the backend to execute the query
    - vcs_object is the object to look for in the backend
    - map_result_fn is a mapping function which takes the backend's result
      and transforms its output accordingly

    This function returns an http response with the result.
    """
    type = vcs_object['type']
    id = vcs_object['id']

    logging.debug('store %s %s' % (type, id))
    with db.connect(config['db_url']) as db_conn:
        if store.find(db_conn, vcs_object):
            logging.debug('update %s %s' % (id, type))
            return make_response('Successful update!', 200)  # immutable
        else:
            logging.debug('store %s %s' % (id, type))
            res = store.add(db_conn, config, vcs_object)
            if res is None:
                return make_response('Bad request!', 400)
            elif res is False:
                logging.error('store %s %s' % (id, type))
                return make_response('Internal server error!', 500)
            else:
                return make_response(map_result_fn(id, res), 204)


def _do_lookup(conf, uri_type, id, map_result_fn):
    """Look up an object of type uri_type by its sha1 id.
    - conf is the configuration needed for the backend to execute the query
    - uri_type and id identify the object to look for in the backend
    - map_result_fn is a mapping function which takes the backend's result
      and transforms its output accordingly

    This function returns an http response with the result.
    """
    uri_type_ok = _uri_types.get(uri_type, None)
    if not uri_type_ok:
        return make_response('Bad request!', 400)

    vcs_object = {'id': id, 'type': uri_type_ok}
    with db.connect(conf['db_url']) as db_conn:
        res = store.find(db_conn, vcs_object)
        if res:
            return write_response(map_result_fn(id, res))  # 200
        return make_response('Not found!', 404)


@app.route('/vcs/occurrences/<id>')
def list_occurrences_for(id):
    """Return the occurrences pointing to the revision id.
    """
    return _do_lookup(app.config['conf'],
                      'occurrences',
                      id,
                      lambda _, result: list(map(lambda col: col[1], result)))


@app.route('/vcs/<uri_type>/<id>')
def object_exists_p(uri_type, id):
    """Check whether the object with sha1 id, of type uri_type, exists.
    """
    return _do_lookup(app.config['conf'],
                      uri_type,
                      id,
                      lambda sha1, _: {'id': sha1})


@app.route('/vcs/<uri_type>/<id>', methods=['PUT'])
def put_object(uri_type, id):
    """Put an object in storage.
    """
    return _do_action_with_payload(app.config['conf'],
                                   add_object,
                                   uri_type,
                                   id,
                                   lambda _1, _2: 'Successful Creation!')  # FIXME: use id or result instead


def run(conf):
    """Run the api's server.

    conf is a dictionary of keywords:
    - 'db_url' the db url's access (through psycopg2 format)
    - 'content_storage_dir' revisions/directories/contents storage on disk
+    - 'host' to override the default 127.0.0.1 (use 0.0.0.0 to open the
+      server to the world)
    - 'port' to override the default of 5000 (from the underlying layer: flask)
    - 'debug' activate the verbose logs
    """
    # app.config is the app's state (accessible)
    app.config.update({'conf': conf})

-    app.run(port=conf.get('port', None), debug=conf['debug'] == 'true')
+
+    app.run(host=conf['host'],
+            port=conf.get('port', None),
+            debug=conf['debug'] == 'true')
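For completeness, here is a sketch of how a client could exercise the
PUT /vcs/<uri_type>/<id> route above. It reuses the serial helpers shown in
this file for the payload encoding; the sha1, the payload keys and the
host/port (taken from the sample back.ini) are made up for illustration, since
the exact object layout expected by the store is not shown in this diff:

``` {.python}
import urllib.request

from swh.protocols import serial

# Hypothetical sha1 and payload, for illustration only.
sha1 = '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
payload = {'content': b'hello world'}

req = urllib.request.Request(
    'http://127.0.0.1:6000/vcs/contents/%s' % sha1,
    data=serial.dumps(payload),
    headers={'Content-Type': serial.MIMETYPE},
    method='PUT')

with urllib.request.urlopen(req) as resp:
    print(resp.status)   # 204 on creation, 200 if the object already exists
```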
diff --git a/swh/gitloader/local_store.py b/swh/gitloader/local_store.py
index 29f15d0..7b815db 100644
--- a/swh/gitloader/local_store.py
+++ b/swh/gitloader/local_store.py
@@ -1,95 +1,96 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.store import store, db, service
from swh.conf import reader
from swh.storage.objstorage import ObjStorage

# FIXME: duplicated from bin/swh-backend...
# Default configuration file
DEFAULT_CONF_FILE = '~/.config/swh/back.ini'

# default configuration
DEFAULT_CONF = {
    'content_storage_dir': ('string', '/tmp/swh-git-loader/content-storage'),
    'log_dir': ('string', '/tmp/swh-git-loader/log'),
    'db_url': ('string', 'dbname=softwareheritage-dev'),
    'folder_depth': ('int', 4),
    'debug': ('bool', None),
+    'host': ('string', '127.0.0.1'),
    'port': ('int', 5000)
}


def store_only_new(db_conn, conf, obj_type, obj):
    """Store object if not already present.
    """
    obj.update({'type': obj_type})
    if not store.find(db_conn, obj):
        store.add(db_conn, conf, obj)


_obj_to_persist_fn = {store.Type.revision: service.add_revisions}


def store_unknown_objects(db_conn, conf, obj_type, swhmap):
    """Load objects to the backend.
    """
    sha1s = swhmap.keys()

    # have: filter unknown obj
    unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s)
    if not unknown_obj_sha1s:
        return True

    # seen: now store in backend
    persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects)
    obj_fulls = map(swhmap.get, unknown_obj_sha1s)
    return persist_fn(db_conn, conf, obj_type, obj_fulls)
""" with db.connect(conf['db_url']) as db_conn: # First, store/retrieve the origin identifier # FIXME: should be done by the cloner worker (which is not yet plugged # on the right swh db ftm) service.add_origin(db_conn, swh_repo.get_origin()) # First reference all unknown persons service.add_persons(db_conn, conf, store.Type.person, swh_repo.get_persons()) res = store_unknown_objects(db_conn, conf, store.Type.content, swh_repo.get_contents()) if res: res = store_unknown_objects(db_conn, conf, store.Type.directory, swh_repo.get_directories()) if res: res = store_unknown_objects(db_conn, conf, store.Type.revision, swh_repo.get_revisions()) if res: # brutally send all remaining occurrences service.add_objects(db_conn, conf, store.Type.occurrence, swh_repo.get_occurrences()) # and releases (the idea here is that compared to existing # objects, the quantity is less) service.add_objects(db_conn, conf, store.Type.release, swh_repo.get_releases()) def prepare_and_load_to_back(backend_setup_file, swh_repo): # Read the configuration file (no check yet) conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'content_storage_dir') conf.update({ 'objstorage': ObjStorage(conf['content_storage_dir'], conf['folder_depth']) }) load_to_back(conf, swh_repo)