Rename the loader configuration files from *git-loader.ini to *loader-git.ini
(Makefile.local targets, README, and the default path in bin/swh-loader-git).

diff --git a/Makefile.local b/Makefile.local
index 5d566ee..091bf4b 100644
--- a/Makefile.local
+++ b/Makefile.local
@@ -1,95 +1,95 @@
 # -*- makefile -*-
 
 FLAKE = flake8
 BINDIR = bin
 SRCDIR = swh
 REPO_PATH=./swh-loader-git-testdata
 
 # add -v for example
 FLAG=
 
 DB=softwareheritage-dev
 
 SWH_LOADER=$(BINDIR)/swh-loader-git
 SWH_DB_MANAGER=$(BINDIR)/swh-db-manager
 SWH_BACK=$(BINDIR)/swh-backend
 SQL_FOLDER=../swh-storage/sql/
 
 # could use cProfile
 PROFILE_TYPE=profile
 
 FOLLOW_LOG=-f
 
 # Adapt python-path to use other swh modules
 _PYPATH=`pwd`:`pwd`/../swh-core:`pwd`/../swh-storage
 
 deps:
 	apt-get install -y \
 		python3 \
 		python3-pygit2 \
 		python3-psycopg2 \
 		python3-nose \
 		python3-flask \
 		python3-requests \
 		python3-retrying \
 		ipython3
 
 cover:
 	PYTHONPATH=$(_PYPATH) make coverage
 
 clean:
 	rm -rf /tmp/swh-loader-git/content-storage
 
 prepare:
 	mkdir -p /tmp/swh-loader-git/content-storage
 
 cleandb: clean
 	PYTHONPATH=$(_PYPATH) $(SWH_DB_MANAGER) $(FLAG) cleandb
 
 run-remote:
-	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH)
+	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/remote-loader-git.ini load $(REPO_PATH)
 
 run-local:
-	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/local-git-loader.ini load $(REPO_PATH)
+	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/local-loader-git.ini load $(REPO_PATH)
 
 run:
-	# works with the default ~/.config/swh/git-loader.ini file
+	# works with the default ~/.config/swh/loader-git.ini file
 	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) load $(REPO_PATH)
 
 run-back:
 	PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG)
 
 connect-db:
 	psql -d $(DB)
 
 create-db:
 	cd $(SQL_FOLDER) && make clean initdb
 
 drop-db:
 	cd $(SQL_FOLDER) && make clean dropdb
 
 check-meta:
 	@echo "Repository: $(REPO_PATH)"
 	@echo "Git metadata:"
 	@$(BINDIR)/dir-git-repo-meta.sh $(REPO_PATH)
 	@echo
 	@echo "DB metadata:"
 	@$(BINDIR)/db-git-repo-meta.sh $(DB)
 	@echo
 
 log-loader:
 	tail $(FOLLOW_LOG) /tmp/swh-loader-git/log/sgloader.log
 
 log-back:
 	tail $(FOLLOW_LOG) /tmp/swh-loader-git/log/back.log
 
 profile-run:
 	PYTHONPATH=$(_PYPATH) python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py
 
 profile-stats:
 	PYTHONPATH=$(_PYPATH) ./scratch/analyse-profile.py
 
 include Makefile.tests
diff --git a/README b/README
index bf78590..7d1f815 100644
--- a/README
+++ b/README
@@ -1,221 +1,221 @@
 The Software Heritage Git Loader is a tool and a library to walk a local
 Git repository and inject into the SWH dataset all contained files that
 weren't known before.
 
 License
 =======
 
 This program is free software: you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published by the
 Free Software Foundation, either version 3 of the License, or (at your
 option) any later version.
 
 This program is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 Public License for more details.
 
 See top-level LICENSE file for the full text of the GNU General Public
 License along with this program.
 
 Dependencies
 ============
 
 Runtime
 -------
 
 -   python3
 -   python3-psycopg2
 -   python3-pygit2
 
 Test
 ----
 
 -   python3-nose
 
 Requirements
 ============
 
 -   implementation language, Python3
 -   coding guidelines: conform to PEP8
 -   Git access: via libgit2/pygit
 -   cache: implemented as Postgres tables
 
 Configuration
 =============
 
 swh-loader-git depends on some tools, here are the configuration files
 for those:
 
 swh-db-manager
 --------------
 
 This is solely a tool in charge of db cleanup now.
 
 Create a configuration file in **\~/.config/db-manager.ini**
 
 ``` {.ini}
 [main]
 
 # Where to store the logs
 log_dir = swh-loader-git/log
 
 # url access to db
 db_url = dbname=swhgitloader
 ```
 
 See <http://initd.org/psycopg/docs/module.html#psycopg2.connect> for the
 db url's schema
 
 swh-loader-git
 --------------
 
-Create a configuration file in **\~/.config/swh/git-loader.ini**:
+Create a configuration file in **\~/.config/swh/loader-git.ini**:
 
 ``` {.ini}
 [main]
 
 # Where to store the logs
 log_dir = /tmp/swh-loader-git/log
 
 # how to access the backend (remote or local)
 backend-type = remote
 
 # backend-type remote: url access to api rest's backend
 # backend-type local: configuration file to backend file .ini (cf. back.ini file)
 backend = http://localhost:5000
 ```
 
 Note:
 
 -   [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect)
 -   the configuration file can be changed in the CLI with the flag
     `-c <config-file>` or `--config <config-file>`
 
 swh-backend
 -----------
 
 Backend API.
 
 Create a configuration file in **\~/.config/swh/back.ini**:
 
 ``` {.ini}
 [main]
 
 # where to store blob on disk
 content_storage_dir = /tmp/swh-loader-git/content-storage
 
 # Where to store the logs
 log_dir = swh-loader-git/log
 
 # url access to db: dbname= (host= port= user= password=)
 db_url = dbname=swhgitloader
 
 # compute folder's depth on disk aa/bb/cc/dd
 # folder_depth = 2
 
 # To open to the world, 0.0.0.0
 #host = 127.0.0.1
 
 # Debugger (for dev only)
 debug = true
 
 # server port to listen to requests
 port = 6000
 ```
 
 See <http://initd.org/psycopg/docs/module.html#psycopg2.connect> for the
 db url's schema
 
 Run
 ===
 
 Environment initialization
 --------------------------
 
 ``` {.bash}
 export PYTHONPATH=`pwd`:$PYTHONPATH
 ```
 
 Backend
 -------
 
 ### With initialization
 
 This depends on swh-sql repository, so:
 
 ``` {.bash}
 cd /path/to/swh-sql && make clean initdb DBNAME=softwareheritage-dev
 ```
 
 Using the Makefile eases:
 
 ``` {.bash}
 make drop-db create-db run-back FOLLOW_LOG=-f
 ```
 
 ### Without initialization
 
 Running the backend.
 
``` {.bash} ./bin/swh-backend -v ``` With makefile: ``` {.bash} make run-back FOLLOW_LOG=-f ``` Help ---- ``` {.bash} bin/swh-loader-git --help bin/swh-db-manager --help ``` Parse a repository from a clean slate ------------------------------------- Clean and initialize the model then parse the repository git: ``` {.bash} bin/swh-db-manager cleandb bin/swh-loader-git load /path/to/git/repo ``` For ease: ``` {.bash} time make cleandb run REPO_PATH=~/work/inria/repo/swh-git-cloner ``` Parse an existing repository ---------------------------- ``` {.bash} bin/swh-loader-git load /path/to/git/repo ``` Clean data ---------- This will truncate the relevant table in the schema ``` {.bash} bin/swh-db-manager cleandb ``` For ease: ``` {.bash} make cleandb ``` Init data --------- ``` {.bash} make drop-db create-db ``` diff --git a/bin/swh-loader-git b/bin/swh-loader-git index a4d3374..27fdcab 100755 --- a/bin/swh-loader-git +++ b/bin/swh-loader-git @@ -1,67 +1,67 @@ #!/usr/bin/env python3 # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import logging import os from swh.gitloader import loader from swh.conf import reader # Default configuration file -DEFAULT_CONF_FILE = '~/.config/swh/git-loader.ini' +DEFAULT_CONF_FILE = '~/.config/swh/loader-git.ini' # default configuration (can be overriden by the DEFAULT_CONF_FILE) DEFAULT_CONF = { 'log_dir': ('string', '/tmp/swh-loader-git/log'), 'backend-type': ('string', 'remote'), 'backend': ('string', 'http://localhost:5000'), } # Another example of configuration: # DEFAULT_CONF = { # 'log_dir': ('string', '/tmp/swh-loader-git/log'), # 'backend-type': ('string', 'local'), # 'backend': ('string', '~/.config/swh/back.ini'), # } def parse_args(): """Parse the CLI arguments. 
""" cli = argparse.ArgumentParser( description='Parse git repository objects to load them into DB.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') cli.add_argument('--config', '-c', help='configuration file path') subcli = cli.add_subparsers(dest='action') load_cli = subcli.add_parser('load', help='load Git repo into DB') load_cli.add_argument('repository', help='Git repository path') args = cli.parse_args() if not args.action: cli.error('no action given') return args if __name__ == '__main__': args = parse_args() conf = reader.read(args.config or DEFAULT_CONF_FILE, DEFAULT_CONF) reader.prepare_folders(conf, 'log_dir') conf['action'] = args.action conf['repo_path'] = args.repository logging.basicConfig(filename=os.path.join(conf['log_dir'], 'sgloader.log'), level=logging.DEBUG if args.verbose else logging.INFO) loader.load(conf)