diff --git a/AUTHORS b/AUTHORS index 3c44b3c..72ac9eb 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,23 +1,24 @@ Authors ======= Below you can find a list of contributors to swh-loader-git and copyright owners of code that has become part of swh-loader-git. They've contributed in a variety of ways and this software wouldn't exist without them. Thank you! (For actual copyright notices, please refer to the individual source files and the Git repository.) Original authors ---------------- * Stefano Zacchiroli * Antoine R. Dumont +* Nicolas Dandrimont Code contributors ----------------- * Contribute and ADD YOUR NAME HERE! diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 08ebc95..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include Makefile -include requirements.txt -include version.txt diff --git a/Makefile.local b/Makefile.local deleted file mode 100644 index 7390c37..0000000 --- a/Makefile.local +++ /dev/null @@ -1,94 +0,0 @@ -# -*- makefile -*- - -FLAKE = flake8 -BINDIR = bin -SRCDIR = swh -REPO_PATH=./swh-loader-git-testdata - -# add -v for example -FLAG= - -DB=softwareheritage-dev - -SWH_LOADER=$(BINDIR)/swh-loader-git -SWH_DB_MANAGER=$(BINDIR)/swh-db-manager -SWH_BACK=$(BINDIR)/swh-backend - -SQL_FOLDER=../swh-storage/sql/ - -# could use cProfile -PROFILE_TYPE=profile - -FOLLOW_LOG=-f - -SWH_DB_MANAGER_CONFIG=~/.config/swh/db-manager.ini - -deps: - apt-get install -y \ - python3 \ - python3-pygit2 \ - python3-psycopg2 \ - python3-nose \ - python3-flask \ - python3-requests \ - python3-retrying \ - ipython3 - -clean: - rm -rf /tmp/swh-loader-git/content-storage - -prepare: - mkdir -p /tmp/swh-loader-git/content-storage - -clean-db: clean - $(SWH_DB_MANAGER) $(FLAG) --config $(SWH_DB_MANAGER_CONFIG) cleandb - -init-db: - $(SWH_DB_MANAGER) $(FLAG) --config $(SWH_DB_MANAGER_CONFIG) initdb - -run-remote: - $(SWH_LOADER) $(FLAG) --config ./resources/remote-loader-git.ini load $(REPO_PATH) - -run-local: - $(SWH_LOADER) $(FLAG) --config ./resources/local-loader-git.ini load $(REPO_PATH) - -run: - # works with the default ~/.config/swh/loader-git.ini file - $(SWH_LOADER) $(FLAG) load $(REPO_PATH) - -run-back: - $(SWH_BACK) $(FLAG) - -connect-db: - psql -d $(DB) - -create-db: - make -C $(SQL_FOLDER) clean filldb DBNAME=$(DB) - -drop-db: - make -C $(SQL_FOLDER) clean dropdb DBNAME=$(DB) - -check-meta: - @echo "Repository: $(REPO_PATH)" - - @echo "Git metadata:" - @$(BINDIR)/dir-git-repo-meta.sh $(REPO_PATH) - @echo - - @echo "DB metadata:" - @$(BINDIR)/db-git-repo-meta.sh $(DB) - @echo - -log-loader: - tail $(FOLLOW_LOG) /tmp/swh-loader-git/log/sgloader.log - -log-back: - tail $(FOLLOW_LOG) /tmp/swh-loader-git/log/back.log - -profile-run: - python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py - -profile-stats: - ./scratch/analyse-profile.py - -include Makefile.tests diff --git a/Makefile.tests b/Makefile.tests deleted file mode 100644 index 9456da1..0000000 --- a/Makefile.tests +++ /dev/null @@ -1,83 +0,0 @@ -# -*- makefile -*- -NOSEFLAGS=--nologcapture -v -DB_TEST=$(DB)-test -TESTDIR = ./swh/loader/git/tests - -test-connect-db: - psql $(DB_TEST) - -test-create-db: - make create-db DB=$(DB_TEST) - -test-drop-db: - make drop-db DB=$(DB_TEST) - -test-clean-db: - make clean-db SWH_DB_MANAGER_CONFIG=./resources/test/db-manager.ini - -test-init-db: - make init-db SWH_DB_MANAGER_CONFIG=./resources/test/db-manager.ini - -test-clean: - rm -rf /tmp/swh-loader-git/test/ - -test-prepare: - mkdir -p /tmp/swh-loader-git/test/ - 
-test-log-back:
-        tail $(FOLLOW_LOG) /tmp/swh-loader-git/test/log/back.log
-
-test-check-meta:
-        @echo "DB $(DB_TEST) metadata:"
-        @$(BINDIR)/db-git-repo-meta.sh $(DB_TEST)
-        @echo
-
-test-run-back:
-        $(SWH_BACK) $(FLAG) --config ./resources/test/back.ini
-
-test-http:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_http.py
-
-test-swhrepo:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_swhrepo.py
-
-test-api:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api*.py
-
-test-api-post-per-type:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_post_*.py
-
-test-api-content:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_content.py
-
-test-api-directory:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_directory.py
-
-test-api-revision:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_revision.py
-
-test-api-release:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_release.py
-
-test-api-occurrence:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_occurrence.py
-
-test-api-home:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_home.py
-
-test-api-origin:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_origin.py
-
-test-api-person:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_api_person.py
-
-test-date:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_date.py
-
-test-remote-loader:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_remote_loader.py
-
-test-local-loader:
-        $(NOSE) $(NOSEFLAGS) $(TESTDIR)/test_local_loader.py
-
-test-loaders: test-local-loader test-remote-loader
diff --git a/README b/README
index fd46b79..7d1f815 100644
--- a/README
+++ b/README
@@ -1,229 +1,221 @@
The Software Heritage Git Loader is a tool and a library to walk a local
Git repository and inject into the SWH dataset all contained files that
weren't known before.

License
=======

This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

See top-level LICENSE file for the full text of the GNU General Public
License along with this program.

Dependencies
============

Runtime
-------

-- python3
-- python3-pygit2
-- python3-psycopg2
-- python3-flask
-- python3-requests
-- python3-retrying
+- python3
+- python3-psycopg2
+- python3-pygit2

Test
----

- python3-nose

Requirements
============

- implementation language, Python3
- coding guidelines: conform to PEP8
- Git access: via libgit2/pygit2
- cache: implemented as Postgres tables

Configuration
=============

swh-loader-git depends on some tools; here are the configuration files
for those:

swh-db-manager
--------------

-This is solely a db cleanup tool (which will fade away)
+This tool is now solely in charge of db cleanup.

-Create a configuration file in **~/.config/db-manager.ini**
+Create a configuration file in **\~/.config/db-manager.ini**

-```
+``` {.ini}
[main]

# Where to store the logs
log_dir = swh-loader-git/log

# url access to db
db_url = dbname=swhgitloader
```

See the [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect)
for the db url's schema.

swh-loader-git
--------------

-The loader comes in two forms:
-- one client which parses and loads directly into swh's backend (db + storage);
-- one client which parses the repository and loads it to swh's remote server.
-
-## local
-
-Create a configuration file in **~/.config/swh/loader-git.ini**:
-
-```
-[main]
-# Where to store the logs
-log_dir = /tmp/swh-loader-git/log
-
-# how to access the backend (remote or local)
-backend-type = local
-
-# backend-type local: configuration file to backend file .ini (cf. back.ini file)
-backend = ~/.config/swh/back.ini
-```
-
-Note: See swh-backend's configuration file.
-
-## remote
-
-Create a configuration file in **~/.config/swh/loader-git.ini**:
+Create a configuration file in **\~/.config/swh/loader-git.ini**:

-```
+``` {.ini}
[main]

# Where to store the logs
log_dir = /tmp/swh-loader-git/log

# how to access the backend (remote or local)
backend-type = remote

# backend-type remote: url access to api rest's backend
+# backend-type local: configuration file to backend file .ini (cf. back.ini file)
backend = http://localhost:5000
```

Note:

- [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect)
- the configuration file can be changed in the CLI with the flag
  `-c <path>` or `--config-file <path>`

swh-backend
-----------

Backend api.

-Create a configuration file in **~/.config/swh/back.ini**:
+Create a configuration file in **\~/.config/swh/back.ini**:

-```
+``` {.ini}
[main]

# where to store blob on disk
content_storage_dir = /tmp/swh-loader-git/content-storage

# Where to store the logs
log_dir = swh-loader-git/log

# url access to db: dbname= (host= port= user= password=)
db_url = dbname=swhgitloader

# compute folder's depth on disk aa/bb/cc/dd
# folder_depth = 2

# To open to the world, 0.0.0.0
#host = 127.0.0.1

# Debugger (for dev only)
debug = true

# server port to listen to requests
port = 6000
```

See the [DB url DSL](http://initd.org/psycopg/docs/module.html#psycopg2.connect)
for the db url's schema.

Run
===

Environment initialization
--------------------------

-The PYTHONPATH must be set adequately.
-The tools depend on:
-- swh-environment/swh-core
-- swh-environment/swh-storage
-
-Note: see swh-environment/pythonpath.sh
-
-Help
-----
-
-```
-bin/swh-backend --help
-bin/swh-loader-git --help
-bin/swh-db-manager --help
+``` {.bash}
+export PYTHONPATH=`pwd`:$PYTHONPATH
```

Backend
-------

-The backend server depends on the object storage and the db.
-The db depends on the actual sql schema defined in
-swh-environment/swh-storage/sql/*.sql.
-

### With initialization

-The Makefile.local usually is a good place to start with:
+This depends on the swh-sql repository, so:

-```
-make create-db (DB=softwareheritage-dev)
+``` {.bash}
+cd /path/to/swh-sql && make clean initdb DBNAME=softwareheritage-dev
```

-Note:
-- This delegates the creation to the `swh-storage/sql/` Makefile.
-- Between parentheses, the optional and default values. Override them
-  according to your needs.
+The Makefile makes this easier:
+
+``` {.bash}
+make drop-db create-db run-back FOLLOW_LOG=-f
+```

### Without initialization

Running the backend:

-```
-./bin/swh-backend -v (--config ~/.config/swh/back.ini)
+``` {.bash}
+./bin/swh-backend -v
```

-Note: Between parentheses, the optional and default values.
-Override them according to your needs.
-

With makefile:

-```
+``` {.bash}
make run-back FOLLOW_LOG=-f
```

-Parse a repository:
--------------------
+Help
+----
+
+``` {.bash}
+bin/swh-loader-git --help
+bin/swh-db-manager --help
+```
+
+Parse a repository from a clean slate
+-------------------------------------

-Parse and load the repository /path/to/git/repo:
+Clean and initialize the model, then parse the git repository:

+``` {.bash}
+bin/swh-db-manager cleandb
+bin/swh-loader-git load /path/to/git/repo
```
-bin/swh-loader-git -c (~/.config/swh/git-loader.ini) load /path/to/git/repo
+
+For convenience:
+
+``` {.bash}
+time make cleandb run REPO_PATH=~/work/inria/repo/swh-git-cloner
```

-Scratch data
-------------
+Parse an existing repository
+----------------------------

+``` {.bash}
+bin/swh-loader-git load /path/to/git/repo
```
-make drop-db create-db
+
+Clean data
+----------
+
+This will truncate the relevant tables in the schema.
+
+``` {.bash}
+bin/swh-db-manager cleandb
+```
+
+For convenience:
+
+``` {.bash}
+make cleandb
+```

-Note: it only deals with the db, not with the filesystem.
+Init data
+---------
+
+``` {.bash}
+make drop-db create-db
+```
diff --git a/bin/db-git-repo-meta.sh b/bin/db-git-repo-meta.sh
deleted file mode 100755
index ca83c99..0000000
--- a/bin/db-git-repo-meta.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-
-# Use: $0 <db>
-# Computes the number of revisions, directories and contents in the db.
-
-DB=$1
-
-count() {
-    DB=$1
-    QUERY=$2
-    psql -d $1 --command "$QUERY;" | tail -3 | head -1
-}
-
-NB_CONTENTS=$(count $DB "select count(*) from content;")
-NB_DIRECTORIES=$(count $DB "select count(*) from directory;")
-NB_DIRECTORY_ENTRIES_DIR=$(count $DB "select count(*) from directory_entry_dir;")
-NB_DIRECTORY_ENTRIES_FILE=$(count $DB "select count(*) from directory_entry_file;")
-NB_DIRECTORY_ENTRIES_REV=$(count $DB "select count(*) from directory_entry_rev;")
-NB_REVISIONS=$(count $DB "select count(*) from revision;")
-NB_RELEASES=$(count $DB "select count(*) from release;")
-NB_PERSONS=$(count $DB "select count(*) from person;")
-
-cat< submodule are not in repo object
-            print('submodule revision: %s' % tree_entry.hex)
-        elif obj.type == GIT_OBJ_TREE:
-            print('tree')
-        else:
-            print('blob')
+        print(repo.get(tree_entry.oid))
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 638ef17..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-[flake8]
-# ignore = E226,E302,E41
-# max-line-length = 79
-exclude = swh/tests/*
-# max-complexity = 10
-# source: http://flake8.readthedocs.org/en/latest/config.html?highlight=ignore
diff --git a/setup.py b/setup.py
deleted file mode 100755
index af07550..0000000
--- a/setup.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env python3
-
-from setuptools import setup
-
-
-def parse_requirements():
-    requirements = []
-    with open('requirements.txt') as f:
-        for line in f.readlines():
-            line = line.strip()
-            if not line or line.startswith('#'):
-                continue
-            requirements.append(line)
-
-    return requirements
-
-
-setup(
-    name='swh.loader.git',
-    description='Software Heritage loader git utilities',
-    author='Software Heritage developers',
-    author_email='swh-devel@inria.fr',
-    url='https://forge.softwareheritage.org/diffusion/DLDG/',
-    packages=['swh.loader.git', 'swh.loader.git.tests'],
-    scripts=['bin/swh-backend', 'bin/swh-db-manager', 'bin/swh-loader-git'],
-    install_requires=parse_requirements(),
-    setup_requires=['vcversioner'],
-    vcversioner={},
-    include_package_data=True,
-)
diff --git a/swh-loader-git-testdata b/swh-loader-git-testdata index
e1d099b..d566a50 160000 --- a/swh-loader-git-testdata +++ b/swh-loader-git-testdata @@ -1 +1 @@ -Subproject commit e1d099bab92e8b70b1460922697eb8314f7210c5 +Subproject commit d566a501787b67450d5545675c9548b5c660c145 diff --git a/swh/loader/git/__init__.py b/swh/loader/git/__init__.py new file mode 100644 index 0000000..fdffa2a --- /dev/null +++ b/swh/loader/git/__init__.py @@ -0,0 +1 @@ +# placeholder diff --git a/swh/loader/git/backend/api.py b/swh/loader/git/backend/api.py deleted file mode 100755 index 0def40c..0000000 --- a/swh/loader/git/backend/api.py +++ /dev/null @@ -1,292 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import logging - -from flask import Flask, Response, make_response, request - -from swh.loader.git.storage import storage, db, service -from swh.loader.git.protocols import serial -from swh.core import hashutil - - -# api's definition -app = Flask(__name__) - - -def read_request_payload(request): - """Read the request's payload. - - """ # TODO: Check the signed pickled data? - return serial.load(request.stream) - - -def write_response(data): - """Write response from data. - - """ - return Response(serial.dumps(data), mimetype=serial.MIMETYPE) - - -@app.route('/') -def hello(): - """A simple api to define what the server is all about. - FIXME: A redirect towards a static page defining the routes would be nice. - - """ - return 'Dev SWH API' - - -# from uri to type -_uri_types = { - 'revisions': storage.Type.revision, - 'directories': storage.Type.directory, - 'contents': storage.Type.content, - 'releases': storage.Type.release, - 'occurrences': storage.Type.occurrence, - 'persons': storage.Type.person -} - - -def _do_action_with_payload(conf, action_fn, uri_type, id, map_result_fn): - uri_type_ok = _uri_types.get(uri_type, None) - if uri_type_ok is None: - return make_response('Bad request!', 400) - - vcs_object = read_request_payload(request) - - try: - id_ = hashutil.hex_to_hash(id) - except: - return make_response('Bad request!', 400) - - vcs_object.update({'id': id_, - 'type': uri_type_ok}) - return action_fn(conf, vcs_object, map_result_fn) - - -# occurrence type is not dealt the same way -_post_all_uri_types = { - 'revisions': storage.Type.revision, - 'directories': storage.Type.directory, - 'contents': storage.Type.content -} - - -@app.route('/vcs//', methods=['POST']) -def filter_unknowns_type(uri_type): - """Filters unknown sha1 to the backend and returns them. - - """ - if request.headers.get('Content-Type') != serial.MIMETYPE: - return make_response('Bad request. Expected ' - '%s data!' % serial.MIMETYPE, 400) - - obj_type = _post_all_uri_types.get(uri_type) - if obj_type is None: - return make_response('Bad request. Type not supported!', 400) - - sha1s = read_request_payload(request) - config = app.config['conf'] - - with db.connect(config['db_url']) as db_conn: - unknowns_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s) - if unknowns_sha1s is None: - return make_response('Bad request!', 400) - else: - return write_response(unknowns_sha1s) - - -@app.route('/vcs/persons/', methods=['POST']) -def post_person(): - """Find a person. - - """ - if request.headers.get('Content-Type') != serial.MIMETYPE: - return make_response('Bad request. Expected ' - '%s data!' 
% serial.MIMETYPE, 400) - - origin = read_request_payload(request) - config = app.config['conf'] - - with db.connect(config['db_url']) as db_conn: - try: - person_found = service.find_person(db_conn, origin) - if person_found: - return write_response(person_found) - else: - return make_response('Person not found!', 404) - except: - return make_response('Bad request!', 400) - - -@app.route('/vcs/origins/', methods=['POST']) -def post_origin(): - """Find an origin. - - """ - if request.headers.get('Content-Type') != serial.MIMETYPE: - return make_response('Bad request. Expected ' - '%s data!' % serial.MIMETYPE, 400) - - origin = read_request_payload(request) - config = app.config['conf'] - - with db.connect(config['db_url']) as db_conn: - try: - origin_found = service.find_origin(db_conn, origin) - if origin_found: - return write_response(origin_found) - else: - return make_response('Origin not found!', 404) - except: - return make_response('Bad request!', 400) - - -@app.route('/vcs/origins/', methods=['PUT']) -def put_origin(): - """Create an origin or returns it if already existing. - - """ - if request.headers.get('Content-Type') != serial.MIMETYPE: - return make_response('Bad request. Expected ' - '%s data!' % serial.MIMETYPE, 400) - - origin = read_request_payload(request) - config = app.config['conf'] - - with db.connect(config['db_url']) as db_conn: - try: - origin_found = service.add_origin(db_conn, origin) - return write_response(origin_found) # FIXME: 204 - except: - return make_response('Bad request!', 400) - - -@app.route('/vcs//', methods=['PUT']) -def put_all(uri_type): - """Store/update objects (uri_type in {contents, directories, releases}). - - """ - if request.headers.get('Content-Type') != serial.MIMETYPE: - return make_response('Bad request. Expected ' - '%s data!' % serial.MIMETYPE, 400) - - payload = read_request_payload(request) - obj_type = _uri_types[uri_type] - - config = app.config['conf'] - - with db.connect(config['db_url']) as db_conn: - service.persist(db_conn, config, obj_type, payload) - - return make_response('Successful creation!', 204) - - -def add_object(config, vcs_object, map_result_fn): - """Add object in storage. - - config is the configuration needed for the backend to execute query - - vcs_object is the object to look for in the backend - - map_result_fn is a mapping function which takes the backend's result - and transform its output accordingly. - - This function returns an http response of the result. - - """ - type = vcs_object['type'] - id = vcs_object['id'] - logging.debug('storage %s %s' % (type, id)) - - with db.connect(config['db_url']) as db_conn: - res = service.persist(db_conn, config, type, [vcs_object]) - return make_response(map_result_fn(id, res), 204) - - -def _do_lookup(conf, uri_type, id, map_result_fn): - """Looking up type object with sha1. - - config is the configuration needed for the backend to execute query - - vcs_object is the object to look for in the backend - - map_result_fn is a mapping function which takes the backend's result - and transform its output accordingly. - - This function returns an http response of the result. 
- - """ - uri_type_ok = _uri_types.get(uri_type, None) - if not uri_type_ok: - return make_response('Bad request!', 400) - - try: - id_ = hashutil.hex_to_hash(id) - except: - return make_response('Not found!', 404) - - with db.connect(conf['db_url']) as db_conn: - res = storage.find(db_conn, id_, uri_type_ok) - if res: - return write_response(map_result_fn(id, res)) # 200 - return make_response('Not found!', 404) - - -@app.route('/vcs/occurrences/') -def list_occurrences_for(id): - """Return the occurrences pointing to the revision id. - - """ - return _do_lookup(app.config['conf'], - 'occurrences', - id, - lambda _, result: list(map(lambda col: col[1], result))) - - -@app.route('/vcs//') -def object_exists_p(uri_type, id): - """Assert if the object with sha1 id, of type uri_type, exists. - - """ - return _do_lookup(app.config['conf'], - uri_type, - id, - lambda sha1, _: {'id': sha1}) - - -@app.route('/vcs//', methods=['PUT']) -def put_object(uri_type, id): - """Put an object in storage. - - """ - return _do_action_with_payload(app.config['conf'], - add_object, - uri_type, - id, - # FIXME: use id or result instead - lambda sha1, _2: sha1) - - -def run(conf): - """Run the api's server. - conf is a dictionary of keywords: - - 'db_url' the db url's access (through psycopg2 format) - - 'content_storage_dir' revisions/directories/contents storage on disk - - 'host' to override the default 127.0.0.1 to open or not the server to - the world - - 'port' to override the default of 5000 (from the underlying layer: - flask) - - 'debug' activate the verbose logs - - """ - print("""SWH Api run -host: %s -port: %s -debug: %s""" % (conf['host'], conf.get('port', None), conf['debug'])) - - # app.config is the app's state (accessible) - app.config.update({'conf': conf}) - - app.run(host=conf['host'], - port=conf.get('port', None), - debug=conf['debug'] == 'true') diff --git a/swh/loader/git/client/http.py b/swh/loader/git/client/http.py deleted file mode 100755 index 64dcb4d..0000000 --- a/swh/loader/git/client/http.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import requests - -from retrying import retry - -from swh.loader.git.retry import policy -from swh.loader.git.storage import storage -from swh.loader.git.protocols import serial - - -session_swh = requests.Session() - - -def compute_simple_url(base_url, type): - """Compute the api url. - - """ - return '%s%s' % (base_url, type) - - -@retry(retry_on_exception=policy.retry_if_connection_error, - wrap_exception=True, - stop_max_attempt_number=3) -def execute(map_type_url, - method_fn, - base_url, - obj_type, - data, - result_fn=lambda result: result.ok): - """Execute a query to the backend. - - map_type_url is a map of {type: url backend} - - method_fn is swh_session.post or swh_session.put - - base_url is the base url of the backend - - obj_type is the nature of the data - - data is the data to send to the backend - - result_fn is a function which takes the response - result and do something with it. The default function - is to return if the server is ok or not. 
- - """ - if not data: - return data - - res = method_fn(compute_simple_url(base_url, map_type_url[obj_type]), - data=serial.dumps(data), - headers={'Content-Type': serial.MIMETYPE}) - return result_fn(res) - - -# url mapping for lookup -url_lookup_per_type = { - storage.Type.origin: "/vcs/origins/", - storage.Type.content: "/vcs/contents/", - storage.Type.directory: "/vcs/directories/", - storage.Type.revision: "/vcs/revisions/", -} - - -def post(base_url, obj_type, obj_sha1s): - """Retrieve the objects of type type with sha1 sha1hex. - - """ - return execute(url_lookup_per_type, - session_swh.post, - base_url, - obj_type, - obj_sha1s, - result_fn=lambda res: serial.loads(res.content)) - - -# url mapping for storage -url_store_per_type = { - storage.Type.origin: "/vcs/origins/", - storage.Type.content: "/vcs/contents/", - storage.Type.directory: "/vcs/directories/", - storage.Type.revision: "/vcs/revisions/", - storage.Type.release: "/vcs/releases/", - storage.Type.occurrence: "/vcs/occurrences/", - storage.Type.person: "/vcs/persons/", -} - - -def put(base_url, obj_type, obj): - """Given an obj (map, simple object) of obj_type, PUT it in the backend. - - """ - return execute(url_store_per_type, - session_swh.put, - base_url, - obj_type, - obj) diff --git a/swh/loader/git/data/swhrepo.py b/swh/loader/git/data/swhrepo.py deleted file mode 100644 index 90c6951..0000000 --- a/swh/loader/git/data/swhrepo.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -class SWHRepo(): - """Structure with: - - sha1s as list - - map indexed by sha1 - """ - def __init__(self): - self.origin = {} - self.releases = [] - self.occurrences = [] - self.contents = {} - self.directories = {} - self.revisions = {} - self.persons = {} - self.visited = set() - - def add_origin(self, origin): - self.origin = origin - - def get_origin(self): - return self.origin - - def add_release(self, release): - self.releases.append(release) - - def get_releases(self): - return self.releases - - def add_occurrence(self, occurrence): - self.occurrences.append(occurrence) - - def get_occurrences(self): - return self.occurrences - - def add_content(self, content_ref): - sha1 = content_ref['id'] - self.contents[sha1] = content_ref - self.visited.add(sha1) - - def get_contents(self): - return self.contents - - def add_directory(self, directory): - sha1 = directory['id'] - self.directories[sha1] = directory - self.visited.add(sha1) - - def get_directories(self): - return self.directories - - def add_revision(self, revision): - sha1 = revision['id'] - self.revisions[sha1] = revision - self.visited.add(sha1) - - def add_person(self, id, person): - self.persons[id] = person - - def get_persons(self): - return self.persons.values() - - def already_visited(self, sha1): - return sha1 in self.visited - - def get_revisions(self): - return self.revisions diff --git a/swh/loader/git/date.py b/swh/loader/git/date.py deleted file mode 100644 index e0f7a31..0000000 --- a/swh/loader/git/date.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import time -from datetime import timedelta, 
datetime, tzinfo - - -class FixedOffset(tzinfo): - """Fixed offset in minutes east from UTC. - - """ - def __init__(self, offset, name): - self.__offset = timedelta(minutes = offset) - self.__name = name - - def utcoffset(self, dt): - return self.__offset - - def tzname(self, dt): - return self.__name - - def dst(self, dt): - return timedelta(0) - - -_cache_tz = {} - - -def ts_to_str(timestamp, offset): - """Convert a timestamp to string. - - """ - if offset in _cache_tz: - _tz = _cache_tz[offset] - else: - _tz = FixedOffset(offset, 'swh') - _cache_tz[offset] = _tz - - dt = datetime.fromtimestamp(timestamp, _tz) - return str(dt) - - -def now(): - """Build the date as of now in the api's format. - - """ - return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) diff --git a/swh/loader/git/git.py b/swh/loader/git/git.py index 3a6829b..0e92131 100644 --- a/swh/loader/git/git.py +++ b/swh/loader/git/git.py @@ -1,227 +1,411 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime import glob import logging import os import subprocess -import time import pygit2 -from pygit2 import GIT_REF_OID -from pygit2 import GIT_OBJ_COMMIT, GIT_OBJ_TREE, GIT_SORT_TOPOLOGICAL -from enum import Enum +from collections import defaultdict +from pygit2 import Oid +from pygit2 import GIT_OBJ_BLOB, GIT_OBJ_TREE, GIT_OBJ_COMMIT, GIT_OBJ_TAG from swh.core import hashutil -from swh.loader.git import date -from swh.loader.git.data import swhrepo -from swh.loader.git.storage import storage -SWH_AUTHORITY = 'softwareheritage' +def format_date(signature): + """Convert the date from a signature to a datetime""" + return datetime.datetime.fromtimestamp(signature.time, + datetime.timezone.utc) def list_objects_from_packfile_index(packfile_index): - """List the objects indexed by this packfile. - + """List the objects indexed by this packfile, in packfile offset + order. """ input_file = open(packfile_index, 'rb') + with subprocess.Popen( ['/usr/bin/git', 'show-index'], stdin=input_file, stdout=subprocess.PIPE, ) as process: + + data = [] + for line in process.stdout.readlines(): - obj_id = line.decode('utf-8', 'ignore').split()[1] - yield obj_id + # git show-index returns the line as: + # () + line_components = line.split() + offset = int(line_components[0]) + object_id = line_components[1] + data.append((offset, object_id)) -def list_objects(repo): - """List the objects in a given repository. + yield from (Oid(hex=object_id.decode('ascii')) + for _, object_id in sorted(data)) - """ + input_file.close() + + +def simple_list_objects(repo): + """List the objects in a given repository. 
Watch out for duplicates!""" objects_dir = os.path.join(repo.path, 'objects') + # Git hashes are 40-character long objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38) packfile_dir = os.path.join(objects_dir, 'pack') if os.path.isdir(packfile_dir): for packfile_index in os.listdir(packfile_dir): if not packfile_index.endswith('.idx'): # Not an index file continue packfile_index_path = os.path.join(packfile_dir, packfile_index) yield from list_objects_from_packfile_index(packfile_index_path) for object_file in glob.glob(objects_glob): - yield ''.join(object_file.split(os.path.sep)[-2:]) + # Rebuild the object id as the last two components of the path + yield Oid(hex=''.join(object_file.split(os.path.sep)[-2:])) + + +def list_objects(repo): + """List the objects in a given repository, removing duplicates""" + seen = set() + for oid in simple_list_objects(repo): + if oid not in seen: + yield oid + seen.add(oid) + + +def get_objects_per_object_type(repo): + """Get all the (pygit2-parsed) objects from repo per object type""" + objects_per_object_type = defaultdict(list) + + for object_id in list_objects(repo): + object = repo[object_id] + objects_per_object_type[object.type].append(object_id) + + return objects_per_object_type HASH_ALGORITHMS = ['sha1', 'sha256'] -def parse(repo_path): - """Given a repository path, parse and return a memory model of such - repository. +def send_in_packets(source_list, formatter, sender, packet_size): + """Send objects from `source_list`, passed through `formatter`, by the + `sender`, in packets of `packet_size` objects """ - def read_signature(signature): - return '%s <%s>' % (signature.name, signature.email) + formatted_objects = [] + for obj in source_list: + formatted_object = formatter(obj) + if formatted_object: + formatted_objects.append(formatted_object) + if len(formatted_objects) >= packet_size: + sender(formatted_objects) + formatted_objects = [] - def treewalk(repo, tree): - """Walk a tree with the same implementation as `os.path`. - Returns: tree, trees, contents - """ - trees, contents = [], [] - dir_entry_dirs, dir_entry_files, dir_entry_revs = [], [], [] - - for tree_entry in tree: - tree_sha1 = hashutil.hex_to_hash(tree_entry.hex) - if swh_repo.already_visited(tree_sha1): - logging.debug('tree_entry %s already visited,' - ' skipped' % tree_entry.hex) - continue + sender(formatted_objects) - dir_entry = {'name': tree_entry.name, - 'type': storage.Type.directory_entry, - 'target-sha1': tree_sha1, - 'perms': tree_entry.filemode, - 'atime': None, - 'mtime': None, - 'ctime': None} - - obj = repo.get(tree_entry.oid) - - if obj is None: # submodule - logging.debug('found rev %s' % tree_entry.hex) - dir_entry_revs.append(dir_entry) - elif obj.type == GIT_OBJ_TREE: - logging.debug('found tree %s' % tree_entry.hex) - trees.append(tree_entry) - dir_entry_dirs.append(dir_entry) - else: - logging.debug('found content %s' % tree_entry.hex) - data = obj.data - hashes = hashutil.hashdata(data, HASH_ALGORITHMS) - contents.append({'id': hashes['sha1'], - 'type': storage.Type.content, - 'git-sha1': hashutil.hex_to_hash(obj.hex), - 'content-sha256': hashes['sha256'], - 'content': data, - 'size': obj.size}) - dir_entry_files.append(dir_entry) - - yield tree, dir_entry_dirs, dir_entry_files, dir_entry_revs, trees, contents - for tree_entry in trees: - for x in treewalk(repo, repo[tree_entry.oid]): - yield x - - def walk_tree(repo, swh_repo, rev): - """Walk the rev revision's directories. 
+class BulkLoader: + """A bulk loader for a git repository""" + def __init__(self, config): + self.config = config + + if self.config['storage_class'] == 'remote_storage': + from swh.storage.remote_storage import RemoteStorage as Storage + else: + from swh.storage import Storage + + self.storage = Storage(*self.config['storage_args']) + + self.repo = pygit2.Repository(config['repo_path']) + + self.log = logging.getLogger('swh.loader.git.BulkLoader') + + def send_contents(self, content_list): + """Actually send properly formatted contents to the database""" + self.log.info("Sending %d contents" % len(content_list)) + self.storage.content_add(content_list) + self.log.info("Done sending %d contents" % len(content_list)) + + def send_directories(self, directory_list): + """Actually send properly formatted directories to the database""" + self.log.info("Sending %d directories" % len(directory_list)) + self.storage.directory_add(directory_list) + self.log.info("Done sending %d directories" % len(directory_list)) + + def send_revisions(self, revision_list): + """Actually send properly formatted revisions to the database""" + self.log.info("Sending %d revisions" % len(revision_list)) + self.storage.revision_add(revision_list) + self.log.info("Done sending %d revisions" % len(revision_list)) + + def send_releases(self, release_list): + """Actually send properly formatted releases to the database""" + self.log.info("Sending %d releases" % len(release_list)) + self.storage.release_add(release_list) + self.log.info("Done sending %d releases" % len(release_list)) + + def send_occurrences(self, occurrence_list): + """Actually send properly formatted occurrences to the database""" + self.log.info("Sending %d occurrences" % len(occurrence_list)) + self.storage.occurrence_add(occurrence_list) + self.log.info("Done sending %d occurrences" % len(occurrence_list)) + + def blob_to_content(self, id): + """Format a blob as a content""" + blob = self.repo[id] + data = blob.data + hashes = hashutil.hashdata(data, HASH_ALGORITHMS) + return { + 'sha1_git': id.raw, + 'sha1': hashes['sha1'], + 'sha256': hashes['sha256'], + 'data': data, + 'length': blob.size, + } + + def tree_to_directory(self, id): + """Format a tree as a directory""" + ret = { + 'id': id.raw, + } + entries = [] + ret['entries'] = entries + + entry_type_map = { + 'tree': 'dir', + 'blob': 'file', + 'commit': 'rev', + } + + for entry in self.repo[id]: + entries.append({ + 'type': entry_type_map[entry.type], + 'perms': entry.filemode, + 'name': entry.name, + 'target': entry.id.raw, + 'atime': None, + 'mtime': None, + 'ctime': None, + }) + + return ret + + def commit_to_revision(self, id): + """Format a commit as a revision""" + commit = self.repo[id] + + author = commit.author + committer = commit.committer + return { + 'id': id.raw, + 'date': format_date(author), + 'date_offset': author.offset, + 'committer_date': format_date(committer), + 'committer_date_offset': committer.offset, + 'type': 'git', + 'directory': commit.tree_id.raw, + 'message': commit.raw_message, + 'author_name': author.name, + 'author_email': author.email, + 'committer_name': committer.name, + 'committer_email': committer.email, + 'parents': [p.raw for p in commit.parent_ids], + } + + def annotated_tag_to_release(self, id): + """Format an annotated tag as a release""" + tag = self.repo[id] + + tag_pointer = self.repo[tag.target] + if tag_pointer.type != GIT_OBJ_COMMIT: + self.log.warn("Ignoring tag %s pointing at %s %s" % ( + tag.id.hex, tag_pointer.__class__.__name__, + 
tag_pointer.id.hex)) + return + + author = tag.tagger + + if not author: + self.log.warn("Tag %s has no author, using default values" + % id.hex) + author_name = '' + author_email = '' + date = None + date_offset = 0 + else: + author_name = author.name + author_email = author.email + date = format_date(author) + date_offset = author.offset + + return { + 'id': id.raw, + 'date': date, + 'date_offset': date_offset, + 'revision': tag.target.raw, + 'comment': tag.message.encode('utf-8'), + 'author_name': author_name, + 'author_email': author_email, + } + + def ref_to_occurrence(self, ref): + """Format a reference as an occurrence""" + ref = ref.copy() + + ref.update(origin=self.origin, authority=self.config['authority'], + validity=self.config['validity']) + + return ref + + def get_origin(self): + origin = { + 'type': 'git', + 'url': 'file://%s' % self.config['repo_path'], + } + + origin['id'] = self.storage.origin_get(origin) + + return origin + + def create_origin(self): + origin = self.get_origin() + id = origin['id'] + + if not id: + id = self.storage.origin_add_one(origin) + + self.origin = id + + def bulk_send_blobs(self, blob_dict): + """Format blobs as swh contents and send them to the database""" + packet_size = self.config['content_packet_size'] + + send_in_packets(blob_dict, self.blob_to_content, + self.send_contents, packet_size) + + def bulk_send_trees(self, tree_dict): + """Format trees as swh directories and send them to the database""" + packet_size = self.config['directory_packet_size'] + + send_in_packets(tree_dict, self.tree_to_directory, + self.send_directories, packet_size) + + def bulk_send_commits(self, commit_dict): + """Format commits as swh revisions and send them to the database""" + packet_size = self.config['revision_packet_size'] + + send_in_packets(commit_dict, self.commit_to_revision, + self.send_revisions, packet_size) + + def bulk_send_annotated_tags(self, tag_dict): + """Format annotated tags (pygit2.Tag objects) as swh releases and send + them to the database """ - for dir_root, dir_entry_dirs, dir_entry_files, dir_entry_revs, _, contents_ref \ - in treewalk(repo, rev.tree): - - for content_ref in contents_ref: - swh_repo.add_content(content_ref) - - swh_repo.add_directory({'id': hashutil.hex_to_hash(dir_root.hex), - 'type': storage.Type.directory, - 'entry-dirs': dir_entry_dirs, - 'entry-files': dir_entry_files, - 'entry-revs': dir_entry_revs}) - - revision_parent_sha1s = list(map(lambda x: hashutil.hex_to_hash(str(x)), rev.parent_ids)) - - author = {'name': rev.author.name, - 'email': rev.author.email, - 'type': storage.Type.person} - committer = {'name': rev.committer.name, - 'email': rev.committer.email, - 'type': storage.Type.person} - - swh_repo.add_revision({'id': hashutil.hex_to_hash(rev.hex), - 'type': storage.Type.revision, - 'date': date.ts_to_str( - rev.author.time, - rev.author.offset), - 'committer-date': date.ts_to_str( - rev.commit_time, - rev.commit_time_offset), - 'directory': hashutil.hex_to_hash(rev.tree.hex), - 'message': rev.message, - 'committer': committer, - 'author': author, - 'parent-sha1s': revision_parent_sha1s - }) - - swh_repo.add_person(read_signature(rev.author), author) - swh_repo.add_person(read_signature(rev.committer), committer) - - return swh_repo - - def walk_revision_from(repo, swh_repo, head_rev): - """Walk the rev history log from head_rev. - - repo is the current repository - - rev is the latest rev to start from. 
+ packet_size = self.config['release_packet_size'] + send_in_packets(tag_dict, self.annotated_tag_to_release, + self.send_releases, packet_size) + + def bulk_send_refs(self, refs): + """Format git references as swh occurrences and send them to the + database """ - for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL): - sha1 = hashutil.hex_to_hash(rev.hex) - if swh_repo.already_visited(sha1): - logging.debug('commit %s already visited, skipped' % sha1) - continue - swh_repo = walk_tree(repo, swh_repo, rev) - - return swh_repo - - repo = pygit2.Repository(repo_path) - # memory model - swh_repo = swhrepo.SWHRepo() - # add origin - origin = {'type': 'git', - 'url': 'file://' + repo.path} - swh_repo.add_origin(origin) - # add references and crawl them - for ref_name in repo.listall_references(): - logging.info('walk reference %s' % ref_name) - ref = repo.lookup_reference(ref_name) - - head_rev = repo[ref.target] \ - if ref.type is GIT_REF_OID \ - else ref.peel(GIT_OBJ_COMMIT) # noqa - - if isinstance(head_rev, pygit2.Tag): - head_start = head_rev.get_object() - taggerSig = head_rev.tagger - author = {'name': taggerSig.name, - 'email': taggerSig.email, - 'type': storage.Type.person} - release = {'id': hashutil.hex_to_hash(head_rev.hex), - 'type': storage.Type.release, - 'revision': hashutil.hex_to_hash(head_rev.target.hex), - 'name': ref_name, - 'date': date.ts_to_str(taggerSig.time, - taggerSig.offset), - 'author': author, - 'comment': head_rev.message} - - swh_repo.add_release(release) - swh_repo.add_person(read_signature(taggerSig), author) + packet_size = self.config['occurrence_packet_size'] + + send_in_packets(refs, self.ref_to_occurrence, + self.send_occurrences, packet_size) + + def list_repo(self): + self.log.info("Started listing %s" % self.config['repo_path']) + self.objects = get_objects_per_object_type(self.repo) + + refs = [] + ref_names = self.repo.listall_references() + for ref_name in ref_names: + ref = self.repo.lookup_reference(ref_name) + target = ref.target + + if not isinstance(target, Oid): + self.log.debug("Peeling symbolic ref %s pointing at %s" % ( + ref_name, ref.target)) + target_obj = ref.peel() + else: + target_obj = self.repo[target] + + if target_obj.type == GIT_OBJ_TAG: + self.log.debug("Peeling ref %s pointing at tag %s" % ( + ref_name, target_obj.name)) + target_obj = ref.peel() + + if not target_obj.type == GIT_OBJ_COMMIT: + self.log.info("Skipping ref %s pointing to %s %s" % ( + ref_name, target_obj.__class__.__name__, + target_obj.id.hex)) + + refs.append({ + 'branch': ref_name, + 'revision': target_obj.id.raw, + }) + + self.objects['refs'] = refs + + self.log.info("Done listing the objects in %s: %d contents, " + "%d directories, %d revisions, %d releases, " + "%d occurrences" % ( + self.config['repo_path'], + len(self.objects[GIT_OBJ_BLOB]), + len(self.objects[GIT_OBJ_TREE]), + len(self.objects[GIT_OBJ_COMMIT]), + len(self.objects[GIT_OBJ_TAG]), + len(self.objects['refs']) + )) + + def load_repo(self): + if self.config['create_origin']: + self.create_origin() + else: + self.log.info('Not creating origin, pulling id from config') + self.origin = self.config['origin'] + + if not self.objects['refs']: + self.log.info('Skipping empty repository') + return + + if self.config['send_contents']: + self.bulk_send_blobs(self.objects[GIT_OBJ_BLOB]) + else: + self.log.info('Not sending contents') + + if self.config['send_directories']: + self.bulk_send_trees(self.objects[GIT_OBJ_TREE]) else: - swh_repo.add_occurrence({'id': hashutil.hex_to_hash(head_rev.hex), - 
'revision': hashutil.hex_to_hash(head_rev.hex), - 'authority': SWH_AUTHORITY, - 'branch': ref_name, - 'url-origin': origin['url'], - 'type': storage.Type.occurrence}) - head_start = head_rev - - # crawl commits and trees - walk_revision_from(repo, swh_repo, head_start) - - return swh_repo + self.log.info('Not sending directories') + + if self.config['send_revisions']: + self.bulk_send_commits(self.objects[GIT_OBJ_COMMIT]) + else: + self.log.info('Not sending revisions') + + if self.config['send_releases']: + self.bulk_send_annotated_tags(self.objects[GIT_OBJ_TAG]) + else: + self.log.info('Not sending releases') + + if self.config['send_occurrences']: + self.bulk_send_refs(self.objects['refs']) + else: + self.log.info('Not sending occurrences') + + def process(self): + self.list_repo() + self.load_repo() diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py deleted file mode 100644 index 0a6dfeb..0000000 --- a/swh/loader/git/loader.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import logging -import os - -from swh.loader.git import git, remote_store, local_store - - -_load_to_back = { - 'remote': remote_store.load_to_back, - 'local': local_store.prepare_and_load_to_back, -} - - -def check_user_conf(conf): - """Check the user's configuration and rejects if problems. - - """ - action = conf['action'] - if action != 'load': - return 'skip unknown action %s' % action - - backend_type = conf['backend-type'] - if backend_type not in _load_to_back: - return ('skip unknown backend-type %s (only ' - '`remote`, `local` supported)' % backend_type) - - repo_path = conf['repo_path'] - if not os.path.exists(repo_path): - return 'Repository %s does not exist.' % repo_path - - return None - - -def load(conf): - """According to action, load the repo_path. - - used configuration keys: - - action: requested action - - repo_path: git repository path ('load' action only) - - backend-type: backend access's type (remote or local) - - backend: url access to backend api - - """ - error_msg = check_user_conf(conf) - if error_msg: - logging.error(error_msg) - raise Exception(error_msg) - - repo_path = conf['repo_path'] - logging.info('load repo_path %s' % repo_path) - - swhrepo = git.parse(repo_path) - loader = _load_to_back[conf['backend-type']] - - loader(conf['backend'], swhrepo) diff --git a/swh/loader/git/local_store.py b/swh/loader/git/local_store.py deleted file mode 100644 index 13ba945..0000000 --- a/swh/loader/git/local_store.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -from swh.core import config -from swh.loader.git.storage import storage, db, service -from swh.storage.objstorage import ObjStorage - - -# FIXME: duplicated from bin/swh-backend... 
-# Default configuration file -DEFAULT_CONF_FILE = '~/.config/swh/back.ini' - - -# default configuration -DEFAULT_CONF = { - 'content_storage_dir': ('string', '/tmp/swh-loader-git/content-storage'), - 'log_dir': ('string', '/tmp/swh-loader-git/log'), - 'db_url': ('string', 'dbname=softwareheritage-dev'), - 'folder_depth': ('int', 4), - 'debug': ('bool', None), - 'host': ('string', '127.0.0.1'), - 'port': ('int', 5000) -} - - -def store_only_new(db_conn, conf, obj_type, obj): - """Store object if not already present. - - """ - if not storage.find(db_conn, obj['id'], obj_type): - storage.add(db_conn, conf, obj) - - -_obj_to_persist_fn = {storage.Type.revision: service.add_revisions} - - -def store_unknown_objects(db_conn, conf, obj_type, swhmap): - """Load objects to the backend. - - """ - sha1s = swhmap.keys() - - # have: filter unknown obj - unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s) - if not unknown_obj_sha1s: - return True - - # seen: now store in backend - persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects) - obj_fulls = map(swhmap.get, unknown_obj_sha1s) - return persist_fn(db_conn, conf, obj_type, obj_fulls) - - -def load_to_back(conf, swh_repo): - """Load to the backend the repository swh_repo. - - """ - with db.connect(conf['db_url']) as db_conn: - # First, store/retrieve the origin identifier - # FIXME: should be done by the cloner worker (which is not yet plugged - # on the right swh db ftm) - service.add_origin(db_conn, swh_repo.get_origin()) - - # First reference all unknown persons - service.add_persons(db_conn, conf, storage.Type.person, - swh_repo.get_persons()) - - res = store_unknown_objects(db_conn, conf, - storage.Type.content, - swh_repo.get_contents()) - if res: - res = store_unknown_objects(db_conn, conf, - storage.Type.directory, - swh_repo.get_directories()) - if res: - res = store_unknown_objects(db_conn, conf, - storage.Type.revision, - swh_repo.get_revisions()) - if res: - # brutally send all remaining occurrences - service.add_objects(db_conn, conf, - storage.Type.occurrence, - swh_repo.get_occurrences()) - - # and releases (the idea here is that compared to existing - # objects, the quantity is less) - service.add_objects(db_conn, conf, - storage.Type.release, - swh_repo.get_releases()) - - -def prepare_and_load_to_back(backend_setup_file, swh_repo): - """Prepare and load to back the swh_repo. - backend-setup-file is the backend's setup to load to access the db and file - storage. - - """ - # Read the configuration file (no check yet) - conf = config.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF) - config.prepare_folders(conf, 'content_storage_dir') - conf.update({ - 'objstorage': ObjStorage(conf['content_storage_dir'], - conf['folder_depth']) - }) - - load_to_back(conf, swh_repo) diff --git a/swh/loader/git/manager.py b/swh/loader/git/manager.py deleted file mode 100755 index fd48ce7..0000000 --- a/swh/loader/git/manager.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import logging - -from swh.loader.git.storage import db, models - - -def manage(action, db_url): - """According to action, load the repository. 
- - used configuration keys: - - action: requested action [cleandb|initdb] - - """ - with db.connect(db_url) as db_conn: - if action == 'cleandb': - logging.info('clean database') - models.cleandb(db_conn) - elif action == 'initdb': - logging.info('initialize database') - models.initdb(db_conn) - else: - logging.warn('skip unknown-action %s' % action) diff --git a/swh/loader/git/protocols/serial.py b/swh/loader/git/protocols/serial.py deleted file mode 100755 index b6c232b..0000000 --- a/swh/loader/git/protocols/serial.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pickle -from io import BytesIO - - -MIMETYPE = "application/octet-stream" - - -def load(file_or_handle): - """Read a pickled object from the opened file_or_handle object. - - """ - return pickle.load(file_or_handle) - - -def loads(obj): - """Read a pickled object from bytes object. - - """ - if obj == b'': - return obj - return pickle.loads(obj) - - -def dumps(obj): - """Return the pickle representation of the obj. - - """ - return pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL) - - -def dumps_as_stream(obj): - """Return the pickle representation of the obj as stream. - - """ - return pickle.dump(obj, BytesIO(), protocol=pickle.HIGHEST_PROTOCOL) diff --git a/swh/loader/git/remote_store.py b/swh/loader/git/remote_store.py deleted file mode 100644 index 92f3c96..0000000 --- a/swh/loader/git/remote_store.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from swh.loader.git.storage import storage -from swh.loader.git.client import http - - -def store_unknown_objects(back_url, obj_type, swhmap): - """Load objects to the backend. - - """ - sha1s = list(swhmap.keys()) - # have: filter unknown obj - unknown_obj_sha1s = http.post(back_url, obj_type, sha1s) - if not unknown_obj_sha1s: - return True - - # store unknown objects - return http.put(back_url, obj_type, map(swhmap.get, unknown_obj_sha1s)) - - -def load_to_back(back_url, swh_repo): - """Load to the back_url the repository swh_repo. 
- - """ - # First, store/retrieve the origin identifier - # FIXME: should be done by the cloner worker (which is not yet plugged on - # the right swh db ftm) - http.put(back_url, - obj_type=storage.Type.origin, - obj=swh_repo.get_origin()) - - http.put(back_url, - obj_type=storage.Type.person, - obj=list(swh_repo.get_persons())) - - # let the backend and api discuss what's really needed - # - first this worker sends the checksums - # - then the backend answers the checksums it does not know - # - then the worker sends only what the backend does not know per - # object type basis - res = store_unknown_objects(back_url, storage.Type.content, - swh_repo.get_contents()) - - if res: - res = store_unknown_objects(back_url, storage.Type.directory, - swh_repo.get_directories()) - if res: - res = store_unknown_objects(back_url, storage.Type.revision, - swh_repo.get_revisions()) - if res: - # brutally send all remaining occurrences - http.put(back_url, - storage.Type.occurrence, - swh_repo.get_occurrences()) - - # and releases (the idea here is that compared to existing - # other objects, the quantity is less) - http.put(back_url, - storage.Type.release, - swh_repo.get_releases()) - - # FIXME: deal with collision failures which should be raised by backend. diff --git a/swh/loader/git/retry/policy.py b/swh/loader/git/retry/policy.py deleted file mode 100644 index 2fca5d8..0000000 --- a/swh/loader/git/retry/policy.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from requests import ConnectionError - - -def retry_if_io_error(exc): - """Return True if IOError, - False otherwise. - """ - return isinstance(exc, IOError) - - -def retry_if_connection_error(exc): - """Return True if ConnectionError, - False otherwise. - """ - return isinstance(exc, ConnectionError) diff --git a/swh/loader/git/storage/db.py b/swh/loader/git/storage/db.py deleted file mode 100644 index 4bea13d..0000000 --- a/swh/loader/git/storage/db.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import psycopg2 - - -connect = psycopg2.connect - - -def execute(cur, query_params, trace=None): - """Execute the query_params. - query_params is expected to be either: - - a sql query (string) - - a tuple (sql query, params) - """ - if isinstance(query_params, str): - if trace: - print("query: '%s'" % query_params) - cur.execute(query_params) - else: - if trace: - print("query: ", cur.mogrify(*query_params).decode()) - cur.execute(*query_params) - - -def copy_from(cur, file, table): - """Copy the content of a file to the db in the table table. - """ - cur.copy_from(file, table) - - -def insert(db_conn, query_params, trace=None): - """Execute an insertion and returns the identifier. - Expect an insert query with the right returning clause. - No check is done. - """ - with db_conn.cursor() as cur: - execute(cur, query_params, trace) - result = cur.fetchone() - return result[0] - - -def query_execute(db_conn, query_params, trace=None): - """Execute one query. - Type of sql queries: insert, delete, drop, create... 
- query_params is expected to be either: - - a sql query (string) - - a tuple (sql query, params) - """ - with db_conn.cursor() as cur: - return execute(cur, query_params, trace) - - -def queries_execute(db_conn, queries_params, trace=None): - """Execute multiple queries without any result expected. - Type of sql queries: insert, delete, drop, create... - query_params is expected to be a list of mixed: - - sql query (string) - - tuple (sql query, params) - """ - with db_conn.cursor() as cur: - for query_params in queries_params: - execute(cur, query_params, trace) - - -def query_fetchone(db_conn, query_params, trace=None): - """Execute sql query which returns one result. - query_params is expected to be either: - - a sql query (string) - - a tuple (sql query, params) - """ - with db_conn.cursor() as cur: - return fetchone(cur, query_params, trace) - - -def fetchone(cur, query_params, trace=None): - """Execute sql query and returns one result. - """ - execute(cur, query_params, trace) - return cur.fetchone() - - -def query_fetch(db_conn, query_params, trace=None): - """Execute sql query which returns results. - query_params is expected to be either: - - a sql query (string) - - a tuple (sql query, params) - """ - with db_conn.cursor() as cur: - execute(cur, query_params, trace) - return cur.fetchall() diff --git a/swh/loader/git/storage/models.py b/swh/loader/git/storage/models.py deleted file mode 100644 index d75caa8..0000000 --- a/swh/loader/git/storage/models.py +++ /dev/null @@ -1,449 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from enum import Enum - -from . import db -from swh.loader.git import date - - -class Type(Enum): - """Types of git objects. - - """ - occurrence = 'occurrence' # ~git branch - release = 'release' # ~git annotated tag - revision = 'revision' # ~git commit - directory = 'directory' # ~git tree - directory_entry = 'directory_entry' # ~git tree_entry - content = 'content' # ~git blob - origin = 'origin' - person = 'person' # committer, tagger, author - - -def initdb(db_conn): - """For retrocompatibility. - - """ - db.query_execute(db_conn, - """INSERT INTO organization(name, description, homepage) - VALUES('softwareheritage', - 'Software Heritage', - 'http://www.softwareheritage.org');""") - - -def cleandb(db_conn): - """Clean up DB. - - """ - db.queries_execute(db_conn, [ - 'TRUNCATE TABLE content CASCADE;', - 'TRUNCATE TABLE organization CASCADE;', - 'TRUNCATE TABLE list_history CASCADE;', - 'TRUNCATE TABLE origin CASCADE;', - 'TRUNCATE TABLE fetch_history CASCADE;', - 'TRUNCATE TABLE project CASCADE;', - 'TRUNCATE TABLE project_history CASCADE;', - 'TRUNCATE TABLE directory CASCADE;', - 'TRUNCATE TABLE directory_entry_dir CASCADE;', - 'TRUNCATE TABLE directory_list_dir CASCADE;', - 'TRUNCATE TABLE directory_entry_file CASCADE;', - 'TRUNCATE TABLE directory_list_file CASCADE;', - 'TRUNCATE TABLE person CASCADE;', - 'TRUNCATE TABLE revision CASCADE;', - 'TRUNCATE TABLE revision_history CASCADE;', - 'TRUNCATE TABLE occurrence_history CASCADE;', - 'TRUNCATE TABLE occurrence CASCADE;', - 'TRUNCATE TABLE release CASCADE;', - ]) - - -def add_origin(db_conn, url, type): - """Insert origin and returns the newly inserted id. 
- - """ - return db.insert(db_conn, - ("""INSERT INTO origin (type, url) - VALUES (%s, %s) - RETURNING id""", - (type, url))) - - -def add_person(db_conn, name, email): - """Insert author and returns the newly inserted id. - - """ - return db.insert(db_conn, - ("""INSERT INTO person (name, email) - VALUES (%s, %s) - RETURNING id""", - (name, email))) - - -def add_content(db_conn, sha1, sha1_git, sha256_content, size): - """Insert a new content. - - """ - db.query_execute(db_conn, - ("""INSERT INTO content (sha1, sha1_git, sha256, length) - VALUES (%s, %s, %s, %s)""", - (sha1, sha1_git, sha256_content, size))) - - -def add_directory(db_conn, obj_sha): - """Insert a new directory. - - """ - return db.insert(db_conn, - ("""INSERT INTO directory (id) - VALUES (%s) - RETURNING id""", - (obj_sha,))) - - -def add_directory_entry_dir(db_conn, name, sha, perms, - atime, mtime, ctime, parent_id): - """Insert a new directory entry dir. - - """ - dir_entry_id = db.insert(db_conn, - ("""INSERT INTO directory_entry_dir - (name, target, perms, atime, mtime, ctime) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id""", - (name, sha, perms, atime, mtime, ctime))) - db.query_execute(db_conn, - ("""INSERT INTO directory_list_dir - (dir_id, entry_id) - VALUES (%s, %s)""", - (parent_id, dir_entry_id))) - - -def add_directory_entry_file(db_conn, name, sha, perms, - atime, mtime, ctime, parent_id): - """Insert a new directory entry file. - - """ - dir_entry_id = db.insert(db_conn, - ("""INSERT INTO directory_entry_file - (name, target, perms, atime, mtime, ctime) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id""", - (name, sha, perms, atime, mtime, ctime))) - - db.query_execute(db_conn, - ("""INSERT INTO directory_list_file - (dir_id, entry_id) - VALUES (%s, %s)""", - (parent_id, dir_entry_id))) - - -def add_directory_entry_rev(db_conn, name, sha, perms, - atime, mtime, ctime, parent_id): - """Insert a new directory entry rev. - - """ - dir_entry_id = db.insert(db_conn, - ("""INSERT INTO directory_entry_rev - (name, target, perms, atime, mtime, ctime) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id""", - (name, sha, perms, atime, mtime, ctime))) - - db.query_execute(db_conn, - ("""INSERT INTO directory_list_rev - (dir_id, entry_id) - VALUES (%s, %s)""", - (parent_id, dir_entry_id))) - - -def add_revision(db_conn, sha, date, committer_date, directory, message, author, - committer, parent_shas=None): - """Insert a revision. - - """ - db.query_execute( - db_conn, - ("""INSERT INTO revision - (id, date, committer_date, type, directory, message, - author, - committer) - VALUES (%s, %s, %s, %s, %s, %s, - (select id from person where name=%s and email=%s), - (select id from person where name=%s and email=%s))""", - (sha, date, committer_date, 'git', directory, message, - author['name'], author['email'], - committer['name'], committer['email']))) - - -def add_revision_history(db_conn, tuple_parents): - """Store the revision history graph. - - """ - tuples = ','.join(["('%s'::bytea,'%s'::bytea, %s)" % t for t in tuple_parents]) - query = 'INSERT INTO revision_history ' + \ - '(id, parent_id, parent_rank) VALUES ' + tuples - db.query_execute(db_conn, query) - - -def add_release(db_conn, obj_sha, revision, date, name, comment, author): - """Insert a release. 
- - """ - db.query_execute( - db_conn, - ("""INSERT INTO release (id, revision, date, name, comment, author) - VALUES (%s, %s, %s, %s, %s, - (SELECT id FROM person WHERE name=%s AND email=%s))""", - (obj_sha, revision, date, name, comment, author['name'], - author['email']))) - - -def find_occurrence_history(cur, url_origin, branch, authority): - """Is there some occurrence still active already existing for origin, - branch and authority? - - """ - return db.fetchone( - cur, - ("""SELECT revision - FROM occurrence_history - INNER JOIN origin ori ON ori.id = origin - INNER JOIN organization org ON org.id = authority - WHERE branch=%s - AND org.name=%s - AND ori.url=%s - AND upper(validity) is null""", - (branch, authority, url_origin))) - - -def close_occurrence_history(cur, url_origin, branch, revision, authority): - """Close an occurrence history. - - """ - return db.execute( - cur, - ("""UPDATE occurrence_history - SET validity=tstzrange(lower(validity), %s, '[]') - WHERE branch=%s - AND authority=(SELECT id FROM organization where name=%s) - AND origin=(SELECT id FROM origin where url=%s) - """, - (date.now(), branch, authority, url_origin))) - - -def create_new_occurrence_history(cur, url_origin, branch, revision, authority): - """Create a new entry in occurrence_history. - - """ - db.execute( - cur, - ("""INSERT INTO occurrence_history - (origin, - branch, revision, - authority, - validity) - VALUES ((select id from origin where url=%s), - %s, %s, - (select id from organization where name=%s), - tstzrange(%s, NULL))""", - (url_origin, branch, revision, authority, date.now()))) - - - -def add_occurrence_history(db_conn, url_origin, branch, revision, authority): - """Insert an occurrence. - Check if occurrence history already present. - If present do nothing, otherwise insert - - """ - with db_conn.cursor() as cur: - # is there an already occurrence that exists - occ = find_occurrence_history(cur, url_origin, branch, authority) - - if not occ: # none exists, so we create one - create_new_occurrence_history(cur, url_origin, branch, revision, - authority) - elif occ[0] != revision: # one exists but the revision is new, we - # close the old one and add a new one - close_occurrence_history(cur, url_origin, branch, occ[0], authority) - create_new_occurrence_history(cur, url_origin, branch, revision, - authority) - else: # one exists on the same revision, we do nothing - pass - - -def find_revision(db_conn, obj_sha): - """Find a revision by its obj_sha. - - """ - return find_object(db_conn, obj_sha, Type.revision) - - -def find_directory(db_conn, obj_sha): - """Find a directory by its obj_sha. - - """ - return find_object(db_conn, obj_sha, Type.directory) - - -def find_content(db_conn, obj_sha): - """Find a content by its obj_sha. - - """ - return find_object(db_conn, obj_sha, Type.content, column='sha1') - - -def find_occurrences_for_revision(db_conn, revision, type): - """Find all occurences for a specific revisions. - type is not used (implementation detail). - - """ - return db.query_fetch(db_conn, ("""SELECT * - FROM occurrence_history - WHERE revision=%s""", - (revision,))) - - -def find_origin(db_conn, origin_url, origin_type): - """Find all origins matching an url and an origin type. - - """ - return db.query_fetchone(db_conn, ("""SELECT * - FROM origin - WHERE url=%s - AND type=%s""", - (origin_url, origin_type))) - - -def find_person(db_conn, email, name): - """Find a person uniquely identified by email and name. 
- - """ - return db.query_fetchone(db_conn, ("""SELECT id - FROM person - WHERE email=%s - AND name=%s""", - (email, name))) - - -def find_occurrence(cur, branch, revision, url_origin): - """Find an ocurrence with branch pointing on valid revision for date. - - """ - return db.fetchone( - cur, - ("""SELECT * - FROM occurrence oc - WHERE branch=%s - AND revision=%s - AND origin = (select id from origin where url = %s)""", - (branch, revision, url_origin))) - - -def find_object(db_conn, obj_sha, obj_type, column='id'): - """Find an object of obj_type by its obj_sha. - - """ - table = obj_type if isinstance(obj_type, str) else obj_type.value - query = 'select ' + column + ' from ' + table + ' where ' + column + '=%s' - return db.query_fetchone(db_conn, (query, (obj_sha,))) - - -def filter_unknown_objects(db_conn, file_sha1s, - table_to_filter, tbl_tmp_name, - column_to_filter='id', nature_column='sha1_git'): - """Given a list of sha1s, filter the unknown object between this list and - the content of the table table_to_filter. - tbl_tmp_name is the temporary table used to filter. - - """ - with db_conn.cursor() as cur: - # explicit is better than implicit - # simply creating the temporary table seems to be enough - db.execute(cur, """CREATE TEMPORARY TABLE IF NOT EXISTS %s( - %s %s) - ON COMMIT DELETE ROWS;""" % - (tbl_tmp_name, column_to_filter, nature_column)) - db.copy_from(cur, file_sha1s, tbl_tmp_name) - db.execute(cur, '(SELECT %s FROM %s) EXCEPT (SELECT %s FROM %s);' % - (column_to_filter, tbl_tmp_name, - column_to_filter, table_to_filter)) - return cur.fetchall() - - -def find_unknown_revisions(db_conn, file_sha1s): - """Filter unknown revisions from file_sha1s. - - """ - return filter_unknown_objects(db_conn, file_sha1s, - 'revision', 'filter_sha1_revision') - - -def find_unknown_directories(db_conn, file_sha1s): - """Filter unknown directories from file_sha1s. - - """ - return filter_unknown_objects(db_conn, file_sha1s, - 'directory', 'filter_sha1_directory') - - -def find_unknown_contents(db_conn, file_sha1s): - """Filter unknown contents from file_sha1s. - - """ - return filter_unknown_objects(db_conn, file_sha1s, - 'content', - 'filter_sha1_content', - 'sha1', 'sha1') - - -def _count_objects(db_conn, type): - """Count the number of a given type object. - - """ - return db.query_fetchone(db_conn, 'SELECT count(*) FROM ' + type.value)[0] - - -def count_revisions(db_conn): - """Count the number of revisions. - - """ - return _count_objects(db_conn, Type.revision) - - -def count_directories(db_conn): - """Count the number of directories. - - """ - return _count_objects(db_conn, Type.directory) - - -def count_contents(db_conn): - """Count the number of contents. - - """ - return _count_objects(db_conn, Type.content) - - -def count_occurrence(db_conn): - """Count the number of occurrence. - - """ - return db.query_fetchone(db_conn, 'SELECT count(*) FROM occurrence_history')[0] - - -def count_release(db_conn): - """Count the number of occurrence. - - """ - return _count_objects(db_conn, Type.release) - - -def count_person(db_conn): - """Count the number of occurrence. 
- - """ - return _count_objects(db_conn, Type.person) diff --git a/swh/loader/git/storage/service.py b/swh/loader/git/storage/service.py deleted file mode 100644 index f18b7b8..0000000 --- a/swh/loader/git/storage/service.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from . import storage - - -filter_unknowns_type = storage.find_unknowns - - -def find_origin(db_conn, origin): - """Find origin. - - """ - orig_found = storage.find_origin(db_conn, origin) - return None if not orig_found else {'id': orig_found[0]} - - -def find_person(db_conn, person): - """Find person. - - """ - person_found = storage.find_person(db_conn, person) - return None if not person_found else {'id': person_found[0]} - - -def add_origin(db_conn, origin): - """Add origin if not already existing. - - """ - orig_found = storage.find_origin(db_conn, origin) - id = orig_found[0] if orig_found else storage.add_origin(db_conn, origin) - return {'id': id} - - -def add_revisions(db_conn, conf, obj_type, objs): - """Add Revisions. - - """ - tuple_parents = [] - for obj in objs: # iterate over objects of type uri_type - obj_id = obj['id'] - obj_found = storage.find(db_conn, obj_id, obj_type) - if not obj_found: - storage.add(db_conn, conf, obj_id, obj_type, obj) - - # deal with revision history - par_shas = obj.get('parent-sha1s', None) - if par_shas: - parent_rank = [(obj_id, parent, rank) - for (rank, parent) in enumerate(par_shas)] - tuple_parents.extend(parent_rank) - - storage.add_revision_history(db_conn, tuple_parents) - - return True - - -def add_persons(db_conn, conf, obj_type, objs): - """Add persons. - conf, obj_type are not used (implementation detail.) - - """ - for obj in objs: - obj_found = storage.find_person(db_conn, obj) - if not obj_found: - storage.add_person(db_conn, obj) - - return True - - -def add_occurrences(db_conn, conf, obj_type, objs): - """Add occurrences. - - """ - res = [] - for obj in objs: # iterate over objects of type uri_type - obj = storage._add_occurrence(db_conn, obj, obj['id']) - res.append(obj) - - return res - - -# dispatch map to add in storage with fs or not -_add_fn = { - storage.Type.content: storage.add_with_fs_storage, -} - - -def add_objects(db_conn, conf, obj_type, objs): - """Add objects if not already present in the storage. - - """ - add_fn = _add_fn.get(obj_type, storage.add) - res = [] - for obj in objs: # iterate over objects of type uri_type - obj_id = obj['id'] - obj_found = storage.find(db_conn, obj_id, obj_type) - if not obj_found: - obj = add_fn(db_conn, conf, obj_id, obj_type, obj) - res.append(obj) - else: - res.append(obj_found) - - return res - - -_persist_fn = { - storage.Type.person: add_persons, - storage.Type.revision: add_revisions, - storage.Type.occurrence: add_occurrences -} - - -def persist(db_conn, conf, obj_type, objs): - """Generic call to persist persons, revisions or other objects. 
- - """ - persist_fn = _persist_fn.get(obj_type, add_objects) - return persist_fn(db_conn, conf, obj_type, objs) diff --git a/swh/loader/git/storage/storage.py b/swh/loader/git/storage/storage.py deleted file mode 100755 index 7a2c1ff..0000000 --- a/swh/loader/git/storage/storage.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from io import StringIO - -from swh.loader.git.storage import models -from swh.core import hashutil - -Type = models.Type - - -_find_object = {Type.occurrence: models.find_occurrences_for_revision, - Type.content: lambda *args: models.find_object(*args, - column='sha1')} - -hex_to_sha1 = lambda x: '\\\\x%s' % hashutil.hash_to_hex(x) - -def find(db_conn, id, type): - """Find an object according to its id and type. - """ - return _find_object.get(type, models.find_object)(db_conn, id, type) - - -_find_unknown = {Type.revision: models.find_unknown_revisions, - Type.content: models.find_unknown_contents, - Type.directory: models.find_unknown_directories} - - -def find_unknowns(db_conn, obj_type, sha1s_hex): - """Given a list of sha1s, return the non presents one in storage. - """ - def row_to_sha1(row): - """Convert a row (memoryview) to a string sha1. - """ - return bytes(row[0]) - - cpy_data_buffer = StringIO() - vals = '\n'.join(map(hex_to_sha1, - sha1s_hex)) - cpy_data_buffer.write(vals) - cpy_data_buffer.seek(0) # move file cursor back at start of file - - find_unknown_fn = _find_unknown[obj_type] - unknowns = find_unknown_fn(db_conn, cpy_data_buffer) - cpy_data_buffer.close() - return list(map(row_to_sha1, unknowns)) # hack: force resolution for remote loader - - -def _add_content(db_conn, vcs_object, id): - """Add a blob to storage. - Designed to be wrapped in a db transaction. - Returns: - - the sha1 if everything went alright. - - None if something went wrong - Writing exceptions can also be raised and expected to be handled by the - caller. - """ - models.add_content(db_conn, - id, - vcs_object['git-sha1'], - vcs_object['content-sha256'], - vcs_object['size']) - return id - - -def _add_directory(db_conn, vcs_object, id): - """Add a directory to storage. - Designed to be wrapped in a db transaction. - """ - parent_id = models.add_directory(db_conn, id) - for directory_entry_dir in vcs_object['entry-dirs']: - _add_directory_entry_dir(db_conn, parent_id, directory_entry_dir) - for directory_entry_file in vcs_object['entry-files']: - _add_directory_entry_file(db_conn, parent_id, directory_entry_file) - for directory_entry_rev in vcs_object['entry-revs']: - _add_directory_entry_rev(db_conn, parent_id, directory_entry_rev) - return id - - -def _add_directory_entry_dir(db_conn, parent_id, vcs_object): - """Add a directory entry dir to storage. - Designed to be wrapped in a db transaction. - Returns: - - the sha1 if everything went alright. - - None if something went wrong - Writing exceptions can also be raised and expected to be handled by the - caller. - """ - name = vcs_object['name'] - models.add_directory_entry_dir(db_conn, - name, - vcs_object['target-sha1'], - vcs_object['perms'], - vcs_object['atime'], - vcs_object['mtime'], - vcs_object['ctime'], - parent_id) - return name, parent_id - - -def _add_directory_entry_file(db_conn, parent_id, vcs_object): - """Add a directory entry file to storage. 
-    Designed to be wrapped in a db transaction.
-    Returns:
-    - the sha1 if everything went alright.
-    - None if something went wrong.
-    Writing exceptions can also be raised and are expected to be handled by
-    the caller.
-    """
-    name = vcs_object['name']
-    models.add_directory_entry_file(db_conn,
-                                    name,
-                                    vcs_object['target-sha1'],
-                                    vcs_object['perms'],
-                                    vcs_object['atime'],
-                                    vcs_object['mtime'],
-                                    vcs_object['ctime'],
-                                    parent_id)
-    return name, parent_id
-
-
-def _add_directory_entry_rev(db_conn, parent_id, vcs_object):
-    """Add a directory entry rev to storage.
-    Designed to be wrapped in a db transaction.
-    Returns:
-    - the sha1 if everything went alright.
-    - None if something went wrong.
-    Writing exceptions can also be raised and are expected to be handled by
-    the caller.
-    """
-    name = vcs_object['name']
-    models.add_directory_entry_rev(db_conn,
-                                   name,
-                                   vcs_object['target-sha1'],
-                                   vcs_object['perms'],
-                                   vcs_object['atime'],
-                                   vcs_object['mtime'],
-                                   vcs_object['ctime'],
-                                   parent_id)
-    return parent_id
-
-
-def _add_revision(db_conn, vcs_object, id):
-    """Add a revision to storage.
-    Designed to be wrapped in a db transaction.
-    Returns:
-    - the sha1 if everything went alright.
-    - None if something went wrong.
-    Writing exceptions can also be raised and are expected to be handled by
-    the caller.
-    """
-    models.add_revision(db_conn,
-                        id,
-                        vcs_object['date'],
-                        vcs_object['committer-date'],
-                        vcs_object['directory'],
-                        vcs_object['message'],
-                        vcs_object['author'],
-                        vcs_object['committer'],
-                        vcs_object['parent-sha1s'])
-    return id
-
-
-def _add_release(db_conn, vcs_object, id):
-    """Add a release.
-    """
-    models.add_release(db_conn,
-                       id,
-                       vcs_object['revision'],
-                       vcs_object['date'],
-                       vcs_object['name'],
-                       vcs_object['comment'],
-                       vcs_object['author'])
-    return id
-
-
-def _add_occurrence(db_conn, vcs_object, id):
-    """Add an occurrence.
-    """
-    models.add_occurrence_history(db_conn,
-                                  vcs_object['url-origin'],
-                                  vcs_object['branch'],
-                                  vcs_object['revision'],
-                                  vcs_object['authority'])
-    return id
-
-
-def add_person(db_conn, vcs_object):
-    """Add an author.
-    """
-    return models.add_person(db_conn,
-                             vcs_object['name'],
-                             vcs_object['email'])
-
-
-_store_fn = {Type.directory: _add_directory,
-             Type.revision: _add_revision,
-             Type.release: _add_release,
-             Type.occurrence: _add_occurrence}
-
-
-def add_origin(db_conn, origin):
-    """Add a new origin and return its id.
-    """
-    return models.add_origin(db_conn, origin['url'], origin['type'])
-
-
-def find_origin(db_conn, origin):
-    """Find an existing origin.
-    """
-    return models.find_origin(db_conn, origin['url'], origin['type'])
-
-
-def find_person(db_conn, person):
-    """Find an existing person.
-    """
-    return models.find_person(db_conn, person['email'], person['name'])
-
-
-def add_with_fs_storage(db_conn, config, id, type, vcs_object):
-    """Add vcs_object in the storage
-    - db_conn is the opened connection to the db
-    - config is the map of configuration needed for the core layer
-    - type is not used here but represents the type of vcs_object
-    - vcs_object is the object meant to be persisted in fs and db
-    """
-    config['objstorage'].add_bytes(vcs_object['content'], id)
-    return _add_content(db_conn, vcs_object, id)
-
-
-def add(db_conn, config, id, type, vcs_object):
-    """Given an id, type and content, store a given object in the store.
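Note the asymmetry this storage module encodes: contents are persisted twice, raw bytes into the object storage and a metadata row into the db, while every other type only touches the db through the `_store_fn` dispatch table. A condensed, illustrative view:

```python
# Condensed view of the deleted storage.add / add_with_fs_storage split
# (illustrative helper, not part of the original API).
def add_any(db_conn, config, id, obj_type, vcs_object):
    if obj_type is Type.content:
        # contents: bytes go to the object storage, metadata to the db
        config['objstorage'].add_bytes(vcs_object['content'], id)
        return _add_content(db_conn, vcs_object, id)
    # everything else: db only, via the per-type dispatch table
    return _store_fn[obj_type](db_conn, vcs_object, id)
```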
- - db_conn is the opened connection to the db - - config is not used here - - type is the object's type - - vcs_object is the object meant to be persisted in db - """ - return _store_fn[type](db_conn, vcs_object, id) - -hex_to_sha1_2 = lambda x: '\\x%s' % hashutil.hash_to_hex(x) - -def add_revision_history(db_conn, tuple_parents): - """Given a list of tuple (sha, parent_sha), store in revision_history. - """ - if len(tuple_parents) > 0: - models.add_revision_history( - db_conn, - map(lambda t: (hex_to_sha1_2(t[0]), hex_to_sha1_2(t[1]), t[2]), - tuple_parents)) diff --git a/swh/loader/git/tests/test_api_content.py b/swh/loader/git/tests/test_api_content.py deleted file mode 100644 index 21aef7b..0000000 --- a/swh/loader/git/tests/test_api_content.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models -from swh.loader.git.protocols import serial -from test_utils import app_client, app_client_teardown -from swh.core import hashutil - -@attr('slow') -class ContentTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, db_url, self.content_storage_dir = app_client() - - with db.connect(db_url) as db_conn: - self.content_sha1_id = '37bfdafafe3b16d970125df29e6ec9a3d2521fd2' - self.content_sha1_id_bin = hashutil.hex_to_hash(self.content_sha1_id) - - content_sha1_id = '47bfdafafe3b16d970125df29e6ec9a3d2521fd2' - content_sha1_id_bin = hashutil.hex_to_hash(content_sha1_id) - - self.content_sha256_bin = hashutil.hashdata(b'something-to-hash', ['sha256'])['sha256'] - models.add_content(db_conn, - self.content_sha1_id_bin, - content_sha1_id_bin, - self.content_sha256_bin, - 10) - - @classmethod - def tearDownClass(self): - app_client_teardown(self.content_storage_dir) - - @istest - def get_content_ok(self): - # when - rv = self.app.get('/vcs/contents/%s' % self.content_sha1_id) - - # then - self.assertEquals(rv.status_code, 200) - data = serial.loads(rv.data) - self.assertEquals(data['id'], self.content_sha1_id) - - @istest - def get_content_not_found(self): - # when - rv = self.app.get('/vcs/contents/222222f9dd5dc46ee476a8be155ab049994f7170') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def get_content_not_found_with_bad_format(self): - # when - rv = self.app.get('/vcs/contents/1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def put_content_create_and_update(self): - content_sha1 = '57bfdafafe3b16d970125df29e6ec9a3d2521fd2' - content_git_sha1 = hashutil.hex_to_hash('57bfdafafe3b16d970125df29e6ec9a3d2521fd2') - content_sha1_bin = hashutil.hex_to_hash(content_sha1) - content_sha256_bin = hashutil.hashdata(b'another-thing-to-hash', ['sha256'])['sha256'] - - # does not exist - rv = self.app.get('/vcs/contents/%s' % content_sha1) - - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - # we create it - body = {'id': content_sha1_bin, - 'git-sha1': content_git_sha1, - 'content-sha256': content_sha256_bin, - 'content': b'bar', - 'size': '3'} - - rv = self.app.put('/vcs/contents/%s' % content_sha1, - data=serial.dumps(body), - headers={'Content-Type': 
serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # now it exists - rv = self.app.get('/vcs/contents/%s' % content_sha1) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], content_sha1) - - # we update it - rv = self.app.put('/vcs/contents/%s' % content_sha1, - data=serial.dumps(body), - headers={'Content-Type': serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # still the same - rv = self.app.get('/vcs/contents/%s' % content_sha1) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], content_sha1) diff --git a/swh/loader/git/tests/test_api_directory.py b/swh/loader/git/tests/test_api_directory.py deleted file mode 100644 index fc6d600..0000000 --- a/swh/loader/git/tests/test_api_directory.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models, storage -from swh.loader.git.protocols import serial -from test_utils import app_client, app_client_teardown -from swh.core import hashutil - -@attr('slow') -class DirectoryTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, db_url, self.content_storage_dir = app_client() - - with db.connect(db_url) as db_conn: - self.content_sha1_id = hashutil.hex_to_hash('e5ba97de299a0e1e26b4a471b3d67c098d178e6e') - content_sha1_bin = hashutil.hex_to_hash('d5ba97de299a0e1e26b4a471b3d67c098d178e6e') - content_sha256_bin = hashutil.hashdata(b'something-to-hash', ['sha256'])['sha256'] - models.add_content(db_conn, - self.content_sha1_id, - content_sha1_bin, - content_sha256_bin, - 10) - - self.directory_sha1_hex = 'b5ba97de299a0e1e26b4a471b3d67c098d178e6e' - directory_sha1_bin = hashutil.hex_to_hash(self.directory_sha1_hex) - models.add_directory(db_conn, directory_sha1_bin) - - self.directory_sha1_put = 'a5ba97de299a0e1e26b4a471b3d67c098d178e6e' - self.directory_sha1_put_bin = hashutil.hex_to_hash(self.directory_sha1_put) - models.add_directory(db_conn, self.directory_sha1_put_bin) - - @classmethod - def tearDownClass(self): - app_client_teardown(self.content_storage_dir) - - @istest - def get_directory_ok(self): - # when - rv = self.app.get('/vcs/directories/%s' % self.directory_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], self.directory_sha1_hex) - - @istest - def get_directory_not_found(self): - # when - rv = self.app.get('/vcs/directories/111111f9dd5dc46ee476a8be155ab049994f7170') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def get_directory_not_found_with_bad_format(self): - # when - rv = self.app.get('/vcs/directories/1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def put_directory_create_and_update(self): - directory_sha1 = '15ba97de299a0e1e26b4a471b3d67c098d178e6e' - - # does not exist - rv = self.app.get('/vcs/directories/%s' % directory_sha1) - - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - # we create it - body = 
serial.dumps({'entry-files': [{'name': 'filename', - 'type': storage.Type.directory_entry, - 'target-sha1': self.content_sha1_id, - 'perms': '000', - 'atime': None, - 'mtime': None, - 'ctime': None}], - 'entry-dirs': [{'name': 'dirname', - 'type': storage.Type.directory_entry, - 'target-sha1': self.directory_sha1_put_bin, - 'perms': '012', - 'atime': None, - 'mtime': None, - 'ctime': None}], - 'entry-revs': [{'name': "rev-name", - 'type': storage.Type.directory_entry, - 'target-sha1': hashutil.hex_to_hash('35ba97de299a0e1e26b4a471b3d67c098d178e6e'), - 'perms': '000', - 'atime': None, - 'mtime': None, - 'ctime': None}] - }) - - rv = self.app.put('/vcs/directories/%s' % directory_sha1, - data=body, - headers={'Content-Type': serial.MIMETYPE}) - - print(rv.status_code) - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # now it exists - rv = self.app.get('/vcs/directories/%s' % directory_sha1) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], directory_sha1) - - # we update it - rv = self.app.put('/vcs/directories/%s' % directory_sha1, - data=serial.dumps({'entry-files': 'directory-bar'}), - headers={'Content-Type': serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # still the same - rv = self.app.get('/vcs/directories/%s' % directory_sha1) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], directory_sha1) diff --git a/swh/loader/git/tests/test_api_home.py b/swh/loader/git/tests/test_api_home.py deleted file mode 100644 index 8829c29..0000000 --- a/swh/loader/git/tests/test_api_home.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from test_utils import app_client - - -@attr('slow') -class HomeTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, _, _ = app_client() - - @istest - def get_slash(self): - # when - rv = self.app.get('/') - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(rv.data, b'Dev SWH API') - - @istest - def get_404(self): - # when - rv = self.app.get('/nowhere') - - # then - self.assertEquals(rv.status_code, 404) - - @istest - def get_bad_request(self): - # when - rv = self.app.get('/vcs/not-a-good-type/1') - - # then - self.assertEquals(rv.status_code, 400) - self.assertEquals(rv.data, b'Bad request!') diff --git a/swh/loader/git/tests/test_api_occurrence.py b/swh/loader/git/tests/test_api_occurrence.py deleted file mode 100644 index 6d7f422..0000000 --- a/swh/loader/git/tests/test_api_occurrence.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models -from swh.loader.git.protocols import serial -from test_utils import now, app_client, app_client_teardown -from swh.core import hashutil - -@attr('slow') -class OccurrenceTestCase(unittest.TestCase): - @classmethod - def 
setUpClass(self): - self.app, db_url, self.content_storage_dir = app_client() - - with db.connect(db_url) as db_conn: - self.directory_sha1_hex = '0876886dc3b49ebe1043e116727ae781be7c8583' - self.directory_sha1_bin = hashutil.hex_to_hash(self.directory_sha1_hex) - models.add_directory(db_conn, self.directory_sha1_bin) - - authorAndCommitter = {'name': 'some-name', 'email': 'some-email'} - models.add_person(db_conn, authorAndCommitter['name'], authorAndCommitter['email']) - - self.revision_sha1_hex = '1876886dc3b49ebe1043e116727ae781be7c8583' - self.revision_sha1_bin = hashutil.hex_to_hash(self.revision_sha1_hex) - models.add_revision(db_conn, - self.revision_sha1_bin, - now(), - now(), - self.directory_sha1_bin, - "revision message", - authorAndCommitter, - authorAndCommitter) - - self.origin_url = "https://github.com/user/repo" - models.add_origin(db_conn, self.origin_url, 'git') - - self.branch_name = 'master' - models.add_occurrence_history(db_conn, - self.origin_url, - self.branch_name, - self.revision_sha1_bin, - 'softwareheritage') - - self.branch_name2 = 'master2' - models.add_occurrence_history(db_conn, - self.origin_url, - self.branch_name2, - self.revision_sha1_bin, - 'softwareheritage') - - self.revision_sha1_hex_2 = '2876886dc3b49ebe1043e116727ae781be7c8583' - self.revision_sha1_bin_2 = hashutil.hex_to_hash(self.revision_sha1_hex_2) - models.add_revision(db_conn, - self.revision_sha1_bin_2, - now(), - now(), - self.directory_sha1_bin, - "revision message 2", - authorAndCommitter, - authorAndCommitter) - - @classmethod - def tearDownClass(self): - app_client_teardown(self.content_storage_dir) - - @istest - def get_occurrence_ok(self): - # when - rv = self.app.get('/vcs/occurrences/%s' % self.revision_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - branches = serial.loads(rv.data) - self.assertEquals(len(branches), 2) - self.assertIn(self.branch_name, branches) - self.assertIn(self.branch_name2, branches) - - @istest - def get_occurrence_not_found(self): - # when - rv = self.app.get('/vcs/occurrences/inexistant-sha1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def get_occurrence_not_found_with_bad_format(self): - # when - rv = self.app.get('/vcs/occurrences/1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def put_occurrence_create_and_update(self): - occ_revision_sha1_hex = self.revision_sha1_hex_2 - - rv = self.app.get('/vcs/occurrences/%s' % occ_revision_sha1_hex) - - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - # we create it - body = serial.dumps({'revision': hashutil.hex_to_hash(occ_revision_sha1_hex), # FIXME: redundant with the one from uri.. - 'branch': 'master', - 'authority': 'softwareheritage', - 'url-origin': self.origin_url}) - - rv = self.app.put('/vcs/occurrences/%s' % occ_revision_sha1_hex, # ... 
here - data=body, - headers={'Content-Type': serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # now it exists - rv = self.app.get('/vcs/occurrences/%s' % occ_revision_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data), ['master']) - - # we update it - # rv = self.app.put('/vcs/occurrences/%s' % occ_revision_sha1_hex, - # data=body, - # headers={'Content-Type': serial.MIMETYPE}) - - #self.assertEquals(rv.status_code, 204) - #self.assertEquals(rv.data, b'') - - # # still the same - # rv = self.app.get('/vcs/occurrences/%s' % occ_revision_sha1_hex) - - # # then - # occs = serial.loads(rv.data) - # self.assertEquals(rv.status_code, 200) - # self.assertEquals(occs, ['master']) diff --git a/swh/loader/git/tests/test_api_origin.py b/swh/loader/git/tests/test_api_origin.py deleted file mode 100644 index 21d3e2d..0000000 --- a/swh/loader/git/tests/test_api_origin.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models -from swh.loader.git.protocols import serial -from test_utils import app_client - - -@attr('slow') -class OriginTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, db_url, _ = app_client() - - with db.connect(db_url) as db_conn: - self.origin_url = 'https://github.com/torvalds/linux.git' - self.origin_type = 'git' - self.origin_id = models.add_origin(db_conn, self.origin_url, self.origin_type) - - @istest - def get_origin_ok(self): - # when - payload = {'url': self.origin_url, - 'type': self.origin_type} - rv = self.app.post('/vcs/origins/', - data=serial.dumps(payload), - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], self.origin_id) - - @istest - def get_origin_not_found(self): - # when - payload = {'url': 'unknown', - 'type': 'blah'} - rv = self.app.post('/vcs/origins/', - data=serial.dumps(payload), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Origin not found!') - - @istest - def get_origin_not_found_with_bad_format(self): - # when - rv = self.app.post('/vcs/origins/', - data=serial.dumps({'url': 'unknown'}), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 400) - - @istest - def put_origin(self): - # when - payload = {'url': 'unknown', - 'type': 'blah'} - rv = self.app.post('/vcs/origins/', - data=serial.dumps(payload), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Origin not found!') - - # when - rv = self.app.put('/vcs/origins/', - data=serial.dumps(payload), - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) # FIXME: 201) - self.assertIsNotNone(serial.loads(rv.data)['id']) - - payload = {'url': 'unknown', - 'type': 'blah'} - rv = self.app.post('/vcs/origins/', - data=serial.dumps(payload), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 200) - origin_id = serial.loads(rv.data)['id'] - 
self.assertIsNotNone(origin_id) - - # when - rv = self.app.put('/vcs/origins/', - data=serial.dumps(payload), - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) # FIXME: 204 - self.assertEquals(serial.loads(rv.data)['id'], origin_id) diff --git a/swh/loader/git/tests/test_api_person.py b/swh/loader/git/tests/test_api_person.py deleted file mode 100644 index f0b2434..0000000 --- a/swh/loader/git/tests/test_api_person.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models -from swh.loader.git.protocols import serial -from test_utils import app_client - - -@attr('slow') -class PersonTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, db_url, _ = app_client() - - with db.connect(db_url) as db_conn: - self.person_name = 'some-name' - self.person_email = 'some@mail.git' - self.person_id = models.add_person(db_conn, self.person_name, self.person_email) - - @istest - def get_person_ok(self): - # when - person = {'name': self.person_name, - 'email': self.person_email} - rv = self.app.post('/vcs/persons/', - data=serial.dumps(person), - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], self.person_id) - - @istest - def get_person_not_found(self): - # when - person = {'name': 'unknown', - 'email': 'blah'} - rv = self.app.post('/vcs/persons/', - data=serial.dumps(person), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Person not found!') - - @istest - def get_person_not_found_with_bad_format(self): - # when - rv = self.app.post('/vcs/persons/', - data=serial.dumps({'name': 'unknown'}), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 400) - - @istest - def put_person(self): - # when - person = {'name': 'unknown', - 'email': 'blah'} - rv = self.app.post('/vcs/persons/', - data=serial.dumps(person), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Person not found!') - - # when - rv = self.app.put('/vcs/persons/', - data=serial.dumps([person]), - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - person = {'name': 'unknown', - 'email': 'blah'} - rv = self.app.post('/vcs/persons/', - data=serial.dumps(person), - headers={'Content-Type': serial.MIMETYPE}) - # then - self.assertEquals(rv.status_code, 200) - person_id = serial.loads(rv.data)['id'] - self.assertIsNotNone(person_id) - - # when - rv = self.app.put('/vcs/persons/', - data=serial.dumps([person, person]), - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') diff --git a/swh/loader/git/tests/test_api_post_per_type.py b/swh/loader/git/tests/test_api_post_per_type.py deleted file mode 100644 index 03988ba..0000000 --- a/swh/loader/git/tests/test_api_post_per_type.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file 
at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models -from swh.loader.git.protocols import serial -from test_utils import now, app_client, app_client_teardown -from swh.core import hashutil - -@attr('slow') -class TestPostObjectsPerTypeCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, self.db_url, self.content_storage_dir = app_client() - - with db.connect(self.db_url) as db_conn: - self.content_sha1_id = '09a8ca4c0f510fda04a4dfe04d842cdfaafa7d8c' - self.content_sha1_id_bin = hashutil.hex_to_hash(self.content_sha1_id) - - self.content_sha256_bin = hashutil.hashdata(b'something-to-hash', ['sha256'])['sha256'] - models.add_content(db_conn, - self.content_sha1_id_bin, - self.content_sha1_id_bin, - self.content_sha256_bin, - 10) - - self.directory_sha1_hex = '19a8ca4c0f510fda04a4dfe04d842cdfaafa7d8c' - self.directory_sha1_bin = hashutil.hex_to_hash(self.directory_sha1_hex) - models.add_directory(db_conn, self.directory_sha1_bin) - - authorAndCommitter = {'name': 'some-name', 'email': 'some-email'} - models.add_person(db_conn, authorAndCommitter['name'], authorAndCommitter['email']) - - authorAndCommitter2 = {'name': 'tony', 'email': 'tony@dude.org'} - models.add_person(db_conn, authorAndCommitter2['name'], authorAndCommitter2['email']) - - self.revision_sha1_hex = '29a8ca4c0f510fda04a4dfe04d842cdfaafa7d8c' - self.revision_sha1_bin = hashutil.hex_to_hash(self.revision_sha1_hex) - models.add_revision(db_conn, - self.revision_sha1_bin, - now(), - now(), - self.directory_sha1_bin, - "revision message", - authorAndCommitter, - authorAndCommitter) - - self.revision_sha1_hex2 = '39a8ca4c0f510fda04a4dfe04d842cdfaafa7d8c' - self.revision_sha1_bin2 = hashutil.hex_to_hash(self.revision_sha1_hex2) - models.add_revision(db_conn, - self.revision_sha1_bin2, - now(), - now(), - self.directory_sha1_bin, - "revision message", - authorAndCommitter2, - authorAndCommitter2, - parent_shas=['revision-sha1-to-test-existence9994f717e']) - - self.release_sha1_hex = '49a8ca4c0f510fda04a4dfe04d842cdfaafa7d8c' - self.release_sha1_bin = hashutil.hex_to_hash(self.release_sha1_hex) - models.add_release(db_conn, - self.release_sha1_bin, - self.revision_sha1_bin, - now(), - "0.0.1", - "Super release tagged by tony", - authorAndCommitter2) - - self.origin_url = "https://github.com/user/repo" - models.add_origin(db_conn, self.origin_url, 'git') - - models.add_occurrence_history(db_conn, - self.origin_url, - 'master', - self.revision_sha1_bin, - 'softwareheritage') - - @classmethod - def tearDownClass(self): - app_client_teardown(self.content_storage_dir) - - @istest - def post_all_non_presents_contents(self): - # given - - # when - payload = [self.content_sha1_id_bin, - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('666777f9dd5dc46ee476a8be155ab049994f717e')] - query_payload = serial.dumps(payload) - - rv = self.app.post('/vcs/contents/', - data=query_payload, - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) - - sha1s = serial.loads(rv.data) - self.assertEquals(len(sha1s), 2) # only 2 sha1s - self.assertIn(hashutil.hex_to_hash("666777f9dd5dc46ee476a8be155ab049994f717e"), sha1s) - 
self.assertIn(hashutil.hex_to_hash("555444f9dd5dc46ee476a8be155ab049994f717e"), sha1s) - - @istest - def post_all_non_presents_directories(self): - # given - - # when - payload = [self.directory_sha1_bin, - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('666777f9dd5dc46ee476a8be155ab049994f717e')] - query_payload = serial.dumps(payload) - - rv = self.app.post('/vcs/directories/', - data=query_payload, - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) - - sha1s = serial.loads(rv.data) - self.assertEquals(len(sha1s), 2) # only 2 sha1s - self.assertIn(hashutil.hex_to_hash("666777f9dd5dc46ee476a8be155ab049994f717e"), sha1s) - self.assertIn(hashutil.hex_to_hash("555444f9dd5dc46ee476a8be155ab049994f717e"), sha1s) - - @istest - def post_all_non_presents_revisions(self): - # given - - # when - payload = [self.revision_sha1_bin, - self.revision_sha1_bin, - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('666777f9dd5dc46ee476a8be155ab049994f717e')] - query_payload = serial.dumps(payload) - - rv = self.app.post('/vcs/revisions/', - data=query_payload, - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) - - sha1s = serial.loads(rv.data) - self.assertEquals(len(sha1s), 2) - self.assertIn(hashutil.hex_to_hash("666777f9dd5dc46ee476a8be155ab049994f717e"), sha1s) - self.assertIn(hashutil.hex_to_hash("555444f9dd5dc46ee476a8be155ab049994f717e"), sha1s) - - @istest - def post_all_non_presents_releases(self): - # given - - # when - payload = [self.release_sha1_bin, - self.release_sha1_bin, - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('666777f9dd5dc46ee476a8be155ab049994f717e')] - query_payload = serial.dumps(payload) - - rv = self.app.post('/vcs/releases/', - data=query_payload, - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 400) - self.assertEquals(rv.data, b'Bad request. Type not supported!') - - @istest - def post_all_non_presents_occurrences_KO(self): - # given - - # when - payload = [self.revision_sha1_bin, - self.revision_sha1_bin, - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('555444f9dd5dc46ee476a8be155ab049994f717e'), - hashutil.hex_to_hash('666777f9dd5dc46ee476a8be155ab049994f717e')] - query_payload = serial.dumps(payload) - - rv = self.app.post('/vcs/occurrences/', - data=query_payload, - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 400) - self.assertEquals(rv.data, b'Bad request. 
Type not supported!') - - @istest - def post_non_presents_objects_empty_payload_so_empty_results(self): - # given - - # when - for api_type in ['contents', 'directories', 'revisions']: - rv = self.app.post('/vcs/%s/' % api_type, - data=serial.dumps({}), - headers={'Content-Type': serial.MIMETYPE}) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data), []) - - @istest - def post_non_presents_objects_bad_requests_format_pickle(self): - # given - - # when - for api_type in ['contents', 'directories', 'revisions']: - rv = self.app.post('/vcs/%s/' % api_type, - data="not pickle -> fail") - - # then - self.assertEquals(rv.status_code, 400) - self.assertEquals(rv.data, b'Bad request. Expected application/octet-stream data!') diff --git a/swh/loader/git/tests/test_api_release.py b/swh/loader/git/tests/test_api_release.py deleted file mode 100644 index 4784409..0000000 --- a/swh/loader/git/tests/test_api_release.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models -from swh.loader.git.protocols import serial -from test_utils import now, app_client, app_client_teardown -from swh.core import hashutil - -@attr('slow') -class ReleaseTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, db_url, self.content_storage_dir = app_client() - - with db.connect(db_url) as db_conn: - self.directory_sha1_hex = 'ebefcd8f9c8a1003ee35f7f953c4c1480986e607' - self.directory_sha1_bin = hashutil.hex_to_hash(self.directory_sha1_hex) - - models.add_directory(db_conn, self.directory_sha1_bin) - - self.tagAuthor = {'name': 'tony', 'email': 'tony@mail.org'} - models.add_person(db_conn, self.tagAuthor['name'], self.tagAuthor['email']) - - self.revision_sha1_hex = 'dbefcd8f9c8a1003ee35f7f953c4c1480986e607' - self.revision_sha1_bin = hashutil.hex_to_hash(self.revision_sha1_hex) - models.add_revision(db_conn, - self.revision_sha1_bin, - now(), - now(), - self.directory_sha1_bin, - "revision message", - self.tagAuthor, - self.tagAuthor) - - self.release_sha1_hex = 'cbefcd8f9c8a1003ee35f7f953c4c1480986e607' - self.release_sha1_bin = hashutil.hex_to_hash(self.release_sha1_hex) - models.add_release(db_conn, - self.release_sha1_bin, - self.revision_sha1_bin, - now(), - "0.0.1", - "Super release tagged by tony", - self.tagAuthor) - @classmethod - def tearDownClass(self): - app_client_teardown(self.content_storage_dir) - - @istest - def get_release_ok(self): - # when - rv = self.app.get('/vcs/releases/%s' % self.release_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], self.release_sha1_hex) - - @istest - def get_release_not_found(self): - # when - rv = self.app.get('/vcs/releases/inexistant-sha1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def get_release_not_found_with_bad_format(self): - # when - rv = self.app.get('/vcs/releases/1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def put_release_create_and_update(self): - release_sha1_hex = 'bbefcd8f9c8a1003ee35f7f953c4c1480986e607' - release_sha1_bin = 
hashutil.hex_to_hash(release_sha1_hex) - - rv = self.app.get('/vcs/releases/%s' % release_sha1_hex) - - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - # we create it - body = serial.dumps({'id': release_sha1_bin, - 'revision': self.revision_sha1_bin, - 'date': now(), - 'name': '0.0.1', - 'comment': 'super release tagged by ardumont', - 'author': self.tagAuthor}) - - rv = self.app.put('/vcs/releases/%s' % release_sha1_hex, - data=body, - headers={'Content-Type': serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # now it exists - rv = self.app.get('/vcs/releases/%s' % release_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], release_sha1_hex) - - # we update it - rv = self.app.put('/vcs/releases/%s' % release_sha1_hex, - data=body, - headers={'Content-Type': serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # still the same - rv = self.app.get('/vcs/releases/%s' % release_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], release_sha1_hex) diff --git a/swh/loader/git/tests/test_api_revision.py b/swh/loader/git/tests/test_api_revision.py deleted file mode 100644 index 18935bf..0000000 --- a/swh/loader/git/tests/test_api_revision.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest -from nose.plugins.attrib import attr - -from swh.loader.git.storage import db, models -from swh.loader.git.protocols import serial -from test_utils import now, app_client, app_client_teardown -from swh.core import hashutil - - -@attr('slow') -class RevisionTestCase(unittest.TestCase): - @classmethod - def setUpClass(self): - self.app, db_url, self.content_storage_dir = app_client() - - with db.connect(db_url) as db_conn: - directory_sha1_hex = '13d2a9739ac02431681c317ce449909a46c59554' - self.directory_sha1_bin = hashutil.hex_to_hash(directory_sha1_hex) - models.add_directory(db_conn, self.directory_sha1_bin) - - self.authorAndCommitter = {'name': 'some-name', 'email': 'some-email'} - models.add_person(db_conn, self.authorAndCommitter['name'], self.authorAndCommitter['email']) - - self.revision_parent_sha1_hex = '23d2a9739ac02431681c317ce449909a46c59554' - self.revision_parent_sha1_bin = hashutil.hex_to_hash(self.revision_parent_sha1_hex) - models.add_revision(db_conn, - self.revision_parent_sha1_bin, - now(), - now(), - self.directory_sha1_bin, - "revision message", - self.authorAndCommitter, - self.authorAndCommitter) - - revision_parent_2_sha1_hex = '33d2a9739ac02431681c317ce449909a46c59554' - self.revision_parent_2_sha1_bin = hashutil.hex_to_hash(revision_parent_2_sha1_hex) - models.add_revision(db_conn, - self.revision_parent_2_sha1_bin, - now(), - now(), - self.directory_sha1_bin, - "revision message 2", - self.authorAndCommitter, - self.authorAndCommitter) - - revision_parent_3_sha1_hex = '43d2a9739ac02431681c317ce449909a46c59554' - self.revision_parent_3_sha1_bin = hashutil.hex_to_hash(revision_parent_3_sha1_hex) - models.add_revision(db_conn, - self.revision_parent_3_sha1_bin, - now(), - now(), - self.directory_sha1_bin, - "revision message 3", - 
self.authorAndCommitter, - self.authorAndCommitter) - - @classmethod - def tearDownClass(self): - app_client_teardown(self.content_storage_dir) - - @istest - def get_revision_ok(self): - # when - rv = self.app.get('/vcs/revisions/%s' % self.revision_parent_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], self.revision_parent_sha1_hex) - - @istest - def get_revision_not_found(self): - # when - rv = self.app.get('/vcs/revisions/inexistant-sha1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def get_revision_not_found_with_bad_format(self): - # when - rv = self.app.get('/vcs/revisions/1') - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - @istest - def put_revision_create_and_update(self): - revision_sha1_hex = '53d2a9739ac02431681c317ce449909a46c59554' - - rv = self.app.get('/vcs/revisions/%s' % revision_sha1_hex) - - # then - self.assertEquals(rv.status_code, 404) - self.assertEquals(rv.data, b'Not found!') - - # we create it - body = serial.dumps({'date': now(), - 'committer-date': now(), - 'directory': self.directory_sha1_bin, - 'message': 'revision message describing it', - 'committer': self.authorAndCommitter, - 'author': self.authorAndCommitter, - 'parent-sha1s': [self.revision_parent_sha1_bin, - self.revision_parent_3_sha1_bin, - self.revision_parent_2_sha1_bin]}) - - rv = self.app.put('/vcs/revisions/%s' % revision_sha1_hex, - data=body, - headers={'Content-Type': serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # now it exists - rv = self.app.get('/vcs/revisions/%s' % revision_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], revision_sha1_hex) - - # we update it - rv = self.app.put('/vcs/revisions/%s' % revision_sha1_hex, - data=body, - headers={'Content-Type': serial.MIMETYPE}) - - self.assertEquals(rv.status_code, 204) - self.assertEquals(rv.data, b'') - - # still the same - rv = self.app.get('/vcs/revisions/%s' % revision_sha1_hex) - - # then - self.assertEquals(rv.status_code, 200) - self.assertEquals(serial.loads(rv.data)['id'], revision_sha1_hex) diff --git a/swh/loader/git/tests/test_date.py b/swh/loader/git/tests/test_date.py deleted file mode 100644 index 052813d..0000000 --- a/swh/loader/git/tests/test_date.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest - -from swh.loader.git import date - - -class DateTestCase(unittest.TestCase): - @istest - def negative_offsets(self): - # when - d0 = date.ts_to_str(1434379797, -120) - - self.assertEquals(d0, '2015-06-15 12:49:57-02:00') - - # when - d1 = date.ts_to_str(1434379797, -60) - - self.assertEquals(d1, '2015-06-15 13:49:57-01:00') - - # when - d2 = date.ts_to_str(1434379797, -30) - - self.assertEquals(d2, '2015-06-15 14:19:57-00:30') - - # when - d3 = date.ts_to_str(1434379797, 30) - - self.assertEquals(d3, '2015-06-15 15:19:57+00:30') - - @istest - def positive_offsets(self): - # when - d0 = date.ts_to_str(1434449254, 120) - - self.assertEquals(d0, '2015-06-16 12:07:34+02:00') - - # when - d1 = date.ts_to_str(1434449254, 60) - - self.assertEquals(d1, '2015-06-16 
11:07:34+01:00')
-
-        # when
-        d2 = date.ts_to_str(1434449254, 0)
-
-        self.assertEquals(d2, '2015-06-16 10:07:34+00:00')
-
-        # when
-        d3 = date.ts_to_str(1434449254, -60)
-
-        self.assertEquals(d3, '2015-06-16 09:07:34-01:00')
diff --git a/swh/loader/git/tests/test_git_utils.py b/swh/loader/git/tests/test_git_utils.py
deleted file mode 100644
index 7347c75..0000000
--- a/swh/loader/git/tests/test_git_utils.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (C) 2015 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import pygit2
-
-
-def create_blob(git_repo, blob_content):
-    """Create a blob with blob_content and returns its oid.
-    """
-    return git_repo.create_blob(blob_content)
-
-
-def create_tree(git_repo, blob_content=None):
-    """Create a tree.
-    If blob_content is specified, create a blob then
-    create a tree which points to this blob.
-    Returns the tree's oid.
-    """
-    treeBuilder = git_repo.TreeBuilder()
-    if blob_content:
-        new_blob = create_blob(git_repo, blob_content)
-        treeBuilder.insert('blob', new_blob,
-                           pygit2.GIT_FILEMODE_BLOB_EXECUTABLE)
-    return treeBuilder.write()
-
-
-def create_author_and_committer():
-    """Create a dummy signature for author and committer.
-    """
-    author = pygit2.Signature('Alice Cooper',
-                              'alice@cooper.tld')
-    committer = pygit2.Signature('Vincent Furnier',
-                                 'vincent@committers.tld')
-    return (author, committer)
-
-def create_tagger():
-    """Create a dummy signature for a tagger.
-    """
-    return pygit2.Signature('ToNyX',
-                            'tony@badass.org')
-
-
-def create_commit_with_content(git_repo,
-                               blob_content,
-                               commit_msg,
-                               commit_parents=None):
-    """Create a commit inside the git repository and return its oid.
-    """
-    author, committer = create_author_and_committer()
-    tree = create_tree(git_repo, blob_content)
-    return git_repo.create_commit(
-        'refs/heads/master',  # the name of the reference to update
-        author, committer, commit_msg,
-        tree,  # binary string representing the tree object ID
-        [] if commit_parents is None else commit_parents  # commit parents
-    )
-
-def create_tag(git_repo, name, commit, message):
-    """Create a dummy tag.
- """ - return git_repo.create_tag(name, - commit.hex, - pygit2.GIT_OBJ_COMMIT, - create_tagger(), - message) diff --git a/swh/loader/git/tests/test_http.py b/swh/loader/git/tests/test_http.py deleted file mode 100644 index c519df1..0000000 --- a/swh/loader/git/tests/test_http.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest - -from swh.loader.git.client import http -from swh.loader.git.storage import storage - - -class TestHttp(unittest.TestCase): - @istest - def url(self): - # when - s = http.compute_simple_url('http://base-url', '/end') - - # then - self.assertEquals(s, 'http://base-url/end') - - @istest - def url_lookup_per_type(self): - # then - self.assertEquals(http.url_lookup_per_type, - {storage.Type.origin: "/vcs/origins/", - storage.Type.content: "/vcs/contents/", - storage.Type.directory: "/vcs/directories/", - storage.Type.revision: "/vcs/revisions/"}) - - @istest - def url_store_per_type(self): - # then - self.assertEquals(http.url_store_per_type, - {storage.Type.origin: "/vcs/origins/", - storage.Type.content: "/vcs/contents/", - storage.Type.directory: "/vcs/directories/", - storage.Type.revision: "/vcs/revisions/", - storage.Type.release: "/vcs/releases/", - storage.Type.occurrence: "/vcs/occurrences/", - storage.Type.person: "/vcs/persons/"}) diff --git a/swh/loader/git/tests/test_initdb.py b/swh/loader/git/tests/test_initdb.py deleted file mode 100644 index eb30c2a..0000000 --- a/swh/loader/git/tests/test_initdb.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from swh.loader.git.manager import manage - - -def prepare_db(db_url): - """DB fresh start. - """ - manage('cleandb', db_url) - manage('initdb', db_url) diff --git a/swh/loader/git/tests/test_local_loader.py b/swh/loader/git/tests/test_local_loader.py deleted file mode 100644 index 6a4c94d..0000000 --- a/swh/loader/git/tests/test_local_loader.py +++ /dev/null @@ -1,243 +0,0 @@ -# coding: utf-8 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import unittest -import pygit2 -import tempfile -import shutil - -from nose.plugins.attrib import attr -from nose.tools import istest - -from swh.core import config -from swh.loader.git.storage import db, models -from swh.loader.git import loader - -import test_initdb -from test_utils import list_files_from -from test_git_utils import create_commit_with_content, create_tag - -@attr('slow') -class TestLocalLoader(unittest.TestCase): - def setUp(self): - """Initialize a git repository for the remaining test to manipulate. 
- """ - tmp_git_folder_path = tempfile.mkdtemp(prefix='test-sgloader.', - dir='/tmp') - self.tmp_git_repo = pygit2.init_repository(tmp_git_folder_path) - - self.conf_back = config.read('./resources/test/back.ini', - {'port': ('int', 9999)}) - - self.db_url = self.conf_back['db_url'] - - self.conf = { - 'action': 'load', - 'repo_path': self.tmp_git_repo.workdir, - 'backend-type': 'local', - 'backend': './resources/test/back.ini' - } - - def init_db_setup(self): - """Initialize a git repository for the remaining test to manipulate. - """ - test_initdb.prepare_db(self.db_url) - - def tearDown(self): - """Destroy the test git repository. - """ - shutil.rmtree(self.tmp_git_repo.workdir) - shutil.rmtree(self.conf_back['content_storage_dir'], ignore_errors=True) - - @istest - def should_fail_on_bad_action(self): - # when - with self.assertRaises(Exception): - loader.load({'action': 'unknown'}) - - @istest - def should_fail_on_inexistant_folder(self): - # when - with self.assertRaises(Exception): - loader.load({'action': 'load', - 'repo_path': 'something-that-definitely-does-not-exist'}) - - @istest - def should_fail_on_inexistant_backend_type(self): - # when - with self.assertRaises(Exception): - loader.load({'action': 'load', - 'repo_path': '.', - 'backend-type': 'unknown'}) # only local or remote supported - - @istest - def local_loader(self): - """Trigger loader and make sure everything is ok. - """ - self.init_db_setup() - - # given - commit0 = create_commit_with_content(self.tmp_git_repo, 'blob 0', - 'commit msg 0') - commit1 = create_commit_with_content(self.tmp_git_repo, 'blob 1', - 'commit msg 1', - [commit0.hex]) - commit2 = create_commit_with_content(self.tmp_git_repo, 'blob 2', - 'commit msg 2', - [commit1.hex]) - commit3 = create_commit_with_content(self.tmp_git_repo, None, - 'commit msg 3', - [commit2.hex]) - commit4 = create_commit_with_content(self.tmp_git_repo, 'blob 4', - 'commit msg 4', - [commit3.hex]) - - # when - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) - self.assertEquals(nb_files, 4, "4 blobs.") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 5, - "Should be 5 commits") - self.assertEquals( - models.count_directories(db_conn), - 5, - "Should be 5 trees") - self.assertEquals( - models.count_contents(db_conn), - 4, - "Should be 4 blobs as we created one commit without data!") - self.assertEquals( - models.count_release(db_conn), - 0, - "No tag created so 0 release.") - self.assertEquals( - models.count_occurrence(db_conn), - 1, - "Should be 1 reference (master) so 1 occurrence.") - - # given - commit5 = create_commit_with_content(self.tmp_git_repo, 'new blob 5', - 'commit msg 5', - [commit4.hex]) - commit6 = create_commit_with_content(self.tmp_git_repo, - 'new blob and last 6', - 'commit msg 6', - [commit5.hex]) - commit7 = create_commit_with_content(self.tmp_git_repo, 'new blob 7', - 'commit msg 7', - [commit6.hex]) - - # when - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) - self.assertEquals(nb_files, 4+3, "3 new blobs.") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 8, - "Should be 5+3 == 8 commits now") - self.assertEquals( - models.count_directories(db_conn), - 8, - "Should be 5+3 == 8 trees") - self.assertEquals( - models.count_contents(db_conn), - 7, - "Should be 4+3 == 7 blobs") - self.assertEquals( - models.count_release(db_conn), 
- 0, - "No tag created so 0 release.") - self.assertEquals( - models.count_occurrence(db_conn), - 2, - "Should be 1 reference which changed twice so 2 occurrences (master changed).") - - # given - create_commit_with_content(self.tmp_git_repo, None, - 'commit 8 with parent 2', - [commit7.hex]) - - # when - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) - self.assertEquals(nb_files, 7, "no new blob.") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 9, - "Should be 8+1 == 9 commits now") - self.assertEquals( - models.count_directories(db_conn), - 8, - "Should be 8 trees (new commit without blob so no new tree)") - self.assertEquals( - models.count_contents(db_conn), - 7, - "Should be 7 blobs (new commit without new blob)") - self.assertEquals( - models.count_release(db_conn), - 0, - "No tag created so 0 release.") - self.assertEquals( - models.count_occurrence(db_conn), - 3, - "Should be 1 reference which changed thrice so 3 occurrences (master changed again).") - self.assertEquals( - models.count_person(db_conn), - 2, - "1 author + 1 committer") - - - # add tag - create_tag(self.tmp_git_repo, '0.0.1', commit5, 'bad ass release 0.0.1, towards infinity...') - create_tag(self.tmp_git_repo, '0.0.2', commit7, 'release 0.0.2... and beyond') - - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf_back['content_storage_dir'])) - self.assertEquals(nb_files, 7, "no new blob.") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 9, - "Should be 8+1 == 9 commits now") - self.assertEquals( - models.count_directories(db_conn), - 8, - "Should be 8 trees (new commit without blob so no new tree)") - self.assertEquals( - models.count_contents(db_conn), - 7, - "Should be 7 blobs (new commit without new blob)") - self.assertEquals( - models.count_release(db_conn), - 2, - "Should be 2 annotated tags so 2 releases") - self.assertEquals( - models.count_occurrence(db_conn), - 3, - "master did not change this time so still 3 occurrences") - self.assertEquals( - models.count_person(db_conn), - 3, - "1 author + 1 committer + 1 tagger") diff --git a/swh/loader/git/tests/test_remote_loader.py b/swh/loader/git/tests/test_remote_loader.py deleted file mode 100644 index e59354f..0000000 --- a/swh/loader/git/tests/test_remote_loader.py +++ /dev/null @@ -1,245 +0,0 @@ -# coding: utf-8 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest -import pygit2 -import tempfile -import shutil -import os - -from nose.plugins.attrib import attr -from nose.tools import istest - -from swh.core import config -from swh.loader.git.storage import db, models -from swh.loader.git import loader - -import test_initdb -from test_git_utils import create_commit_with_content, create_tag -from test_utils import list_files_from - - -@attr('slow') -class TestRemoteLoader(unittest.TestCase): - def setUp(self): - tmp_git_folder_path = tempfile.mkdtemp(prefix='test-sgloader.', - dir='/tmp') - self.tmp_git_repo = pygit2.init_repository(tmp_git_folder_path) - self.conf = config.read('./resources/test/back.ini', - {'port': ('int', 9999)}) - - self.db_url = self.conf['db_url'] - self.conf.update({ - 'action': 'load', - 'repo_path': self.tmp_git_repo.workdir, - 
'backend-type': 'remote', - 'backend': 'http://localhost:%s' % self.conf['port'] - }) - - # Not the remote loader in charge of creating the folder, so we do it - if not os.path.exists(self.conf['content_storage_dir']): - os.mkdir(self.conf['content_storage_dir']) - - def init_db_setup(self): - """Initialize a git repository for the remaining test to manipulate. - """ - test_initdb.prepare_db(self.db_url) - - def tearDown(self): - """Destroy the test git repository. - """ - shutil.rmtree(self.tmp_git_repo.workdir) - shutil.rmtree(self.conf['content_storage_dir']) - - @istest - def should_fail_on_bad_action(self): - # when - with self.assertRaises(Exception): - loader.load({'action': 'unknown'}) - - @istest - def should_fail_on_inexistant_folder(self): - # when - with self.assertRaises(Exception): - loader.load({'action': 'load', - 'repo_path': 'something-that-definitely-does-not-exist'}) - - @istest - def should_fail_on_inexistant_backend_type(self): - # when - with self.assertRaises(Exception): - loader.load({'action': 'load', - 'repo_path': '.', - 'backend-type': 'unknown'}) # only local or remote supported - - @istest - def remote_loader(self): - """Trigger loader and make sure everything is ok. - """ - # given - self.init_db_setup() - - # given - commit0 = create_commit_with_content(self.tmp_git_repo, 'blob 0', - 'commit msg 0') - commit1 = create_commit_with_content(self.tmp_git_repo, 'blob 1', - 'commit msg 1', - [commit0.hex]) - commit2 = create_commit_with_content(self.tmp_git_repo, 'blob 2', - 'commit msg 2', - [commit1.hex]) - commit3 = create_commit_with_content(self.tmp_git_repo, None, - 'commit msg 3', - [commit2.hex]) - commit4 = create_commit_with_content(self.tmp_git_repo, 'blob 4', - 'commit msg 4', - [commit3.hex]) - - # when - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf['content_storage_dir'])) - self.assertEquals(nb_files, 4, "4 blobs") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 5, - "Should be 5 commits") - self.assertEquals( - models.count_directories(db_conn), - 5, - "Should be 5 trees") - self.assertEquals( - models.count_contents(db_conn), - 4, - "Should be 4 blobs as we created one commit without data!") - self.assertEquals( - models.count_release(db_conn), - 0, - "No tag created so 0 release.") - self.assertEquals( - models.count_occurrence(db_conn), - 1, - "Should be 1 reference (master) so 1 occurrence.") - - # given - commit5 = create_commit_with_content(self.tmp_git_repo, 'new blob 5', - 'commit msg 5', - [commit4.hex]) - commit6 = create_commit_with_content(self.tmp_git_repo, - 'new blob and last 6', - 'commit msg 6', - [commit5.hex]) - commit7 = create_commit_with_content(self.tmp_git_repo, 'new blob 7', - 'commit msg 7', - [commit6.hex]) - - # when - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf['content_storage_dir'])) - self.assertEquals(nb_files, 4+3, "3 new blobs") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 8, - "Should be 5+3 == 8 commits now") - self.assertEquals( - models.count_directories(db_conn), - 8, - "Should be 5+3 == 8 trees") - self.assertEquals( - models.count_contents(db_conn), - 7, - "Should be 4+3 == 7 blobs") - self.assertEquals( - models.count_release(db_conn), - 0, - "No tag created so 0 release.") - self.assertEquals( - models.count_occurrence(db_conn), - 2, - "Should be 1 reference which changed twice so 2 occurrences (master changed).") - - # given - 
create_commit_with_content(self.tmp_git_repo, None, - 'commit 8 with parent 2', - [commit7.hex]) - - # when - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf['content_storage_dir'])) - self.assertEquals(nb_files, 7, "no new blob") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 9, - "Should be 8+1 == 9 commits now") - self.assertEquals( - models.count_directories(db_conn), - 8, - "Should be 8 trees (new commit without blob so no new tree)") - self.assertEquals( - models.count_contents(db_conn), - 7, - "Should be 7 blobs (new commit without new blob)") - self.assertEquals( - models.count_release(db_conn), - 0, - "No tag created so 0 release.") - self.assertEquals( - models.count_occurrence(db_conn), - 3, - "Should be 1 reference which changed thrice so 3 occurrences (master changed again).") - self.assertEquals( - models.count_person(db_conn), - 2, - "1 author + 1 committer") - - - # add tag - create_tag(self.tmp_git_repo, '0.0.1', commit5, 'bad ass release 0.0.1, towards infinity...') - create_tag(self.tmp_git_repo, '0.0.2', commit7, 'release 0.0.2... and beyond') - - loader.load(self.conf) - - # then - nb_files = len(list_files_from(self.conf['content_storage_dir'])) - self.assertEquals(nb_files, 7, "no new blob") - - with db.connect(self.db_url) as db_conn: - self.assertEquals( - models.count_revisions(db_conn), - 9, - "Should be 8+1 == 9 commits now") - self.assertEquals( - models.count_directories(db_conn), - 8, - "Should be 8 trees (new commit without blob so no new tree)") - self.assertEquals( - models.count_contents(db_conn), - 7, - "Should be 7 blobs (new commit without new blob)") - self.assertEquals( - models.count_release(db_conn), - 2, - "Should be 2 annotated tags so 2 releases") - self.assertEquals( - models.count_occurrence(db_conn), - 3, - "master did not change this time so still 3 occurrences") - self.assertEquals( - models.count_person(db_conn), - 3, - "1 author + 1 committer + 1 tagger") diff --git a/swh/loader/git/tests/test_swhrepo.py b/swh/loader/git/tests/test_swhrepo.py deleted file mode 100644 index 2564d83..0000000 --- a/swh/loader/git/tests/test_swhrepo.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest - -from swh.loader.git.data import swhrepo - - -class SWHRepoTestCase(unittest.TestCase): - @istest - def new_swhrepo(self): - # when - r = swhrepo.SWHRepo() - - r.add_origin({'url': 'foobar'}) - - r.add_content({'id': 'some-con-sha1'}) - r.add_content({'id': 'some-con-sha1-2','stuff': 'some-stuff'}) - r.add_directory({'id': 'some-dir-sha1'}) - r.add_directory({'id': 'some-dir-sha1-2'}) - r.add_revision({'id': 'some-rev-sha1'}) - r.add_revision({'id': 'some-rev-sha1-2'}) - r.add_person('id0', {'name': 'the one'}) - r.add_person('id1', {'name': 'another one'}) - - r.add_occurrence({'id': 'some-occ-sha1'}) - r.add_release({'id': 'some-rel-sha1'}) - - # then - self.assertEquals(r.get_origin(), {'url': 'foobar'}) - self.assertEquals(r.get_releases(), [{'id': 'some-rel-sha1'}]) - self.assertEquals(r.get_occurrences(), [{'id': 'some-occ-sha1'}]) - - for sha in ['some-con-sha1', 'some-con-sha1-2', - 'some-dir-sha1', 'some-dir-sha1-2', - 'some-rev-sha1', 'some-rev-sha1-2']: - 
self.assertTrue(r.already_visited(sha)) - - self.assertFalse(r.already_visited('some-occ-sha1')) - self.assertFalse(r.already_visited('some-rel-sha1')) - - self.assertEquals(r.get_contents(), {'some-con-sha1': {'id': 'some-con-sha1'}, - 'some-con-sha1-2': {'id': 'some-con-sha1-2','stuff': 'some-stuff'}}) - self.assertEquals(r.get_directories(), {'some-dir-sha1': {'id': 'some-dir-sha1'}, - 'some-dir-sha1-2': {'id': 'some-dir-sha1-2'}}) - self.assertEquals(r.get_revisions(), {'some-rev-sha1': {'id': 'some-rev-sha1'}, - 'some-rev-sha1-2': {'id': 'some-rev-sha1-2'}}) - - self.assertEquals(len(r.get_persons()), 2) diff --git a/swh/loader/git/tests/test_utils.py b/swh/loader/git/tests/test_utils.py deleted file mode 100644 index bee06f3..0000000 --- a/swh/loader/git/tests/test_utils.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import os -import shutil -import tempfile - -from swh.loader.git import date -from swh.loader.git.backend import api -from swh.storage.objstorage import ObjStorage - -import test_initdb - - -now = date.now - -def list_files_from(root_path): - """Compute the list of files from root_path. - - """ - f = [] - for (dirpath, dirnames, filenames) in os.walk(root_path): - f.extend(filenames) - return f - - -def app_client(db_url="dbname=softwareheritage-dev-test"): - """Setup the application ready for testing. - - """ - content_storage_dir = tempfile.mkdtemp(prefix='test-swh-loader-git.', - dir='/tmp') - folder_depth = 2 - api.app.config['conf'] = {'db_url': db_url, - 'content_storage_dir': content_storage_dir, - 'log_dir': '/tmp/swh-loader-git/log', - 'folder_depth': folder_depth, - 'debug': 'true', - 'objstorage': ObjStorage(content_storage_dir, - folder_depth) - } - - api.app.config['TESTING'] = True - app = api.app.test_client() - test_initdb.prepare_db(db_url) - return app, db_url, content_storage_dir - - -def app_client_teardown(content_storage_dir): - """Tear down app client's context. - - """ - shutil.rmtree(content_storage_dir, ignore_errors=True)
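
For reference, the deleted test_api_* modules all exercised the same idempotent upsert protocol on the backend: GET /vcs/<type>/<sha1> answers 404 with b'Not found!' until a PUT stores the object (204, empty body), and replaying the same PUT changes nothing. A minimal client sketch of that protocol, assuming a backend reachable at localhost:5000 (host, port and the helper names here are illustrative; serial.dumps/loads and serial.MIMETYPE are the wire format the tests used):

``` {.python}
import requests

from swh.loader.git.protocols import serial

BASE_URL = 'http://localhost:5000'  # assumed backend address


def fetch_revision(sha1_hex):
    """Return the stored revision, or None while the backend answers 404."""
    rv = requests.get('%s/vcs/revisions/%s' % (BASE_URL, sha1_hex))
    return serial.loads(rv.content) if rv.status_code == 200 else None


def store_revision(sha1_hex, revision):
    """PUT is an idempotent upsert: 204 with an empty body, even on replay."""
    rv = requests.put('%s/vcs/revisions/%s' % (BASE_URL, sha1_hex),
                      data=serial.dumps(revision),
                      headers={'Content-Type': serial.MIMETYPE})
    rv.raise_for_status()  # the backend answers 204 on success
```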
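The deleted test_date.py pinned down the offset handling of swh.loader.git.date.ts_to_str: the second argument is a UTC offset in minutes, and the timestamp is rendered in that zone. A behaviourally equivalent sketch (not the original implementation) using only the standard library:

``` {.python}
from datetime import datetime, timedelta, timezone


def ts_to_str(timestamp, offset_minutes):
    """Render a Unix timestamp in the zone given by offset_minutes."""
    tz = timezone(timedelta(minutes=offset_minutes))
    return datetime.fromtimestamp(timestamp, tz).isoformat(sep=' ')


# Matches the expectations of the deleted tests:
assert ts_to_str(1434379797, -120) == '2015-06-15 12:49:57-02:00'
assert ts_to_str(1434449254, 120) == '2015-06-16 12:07:34+02:00'
```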
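Likewise, the helpers removed with test_git_utils.py were thin wrappers over pygit2, and the whole fixture flow they provided fits in a few lines. A sketch under the same assumptions as those helpers, with placeholder paths, signatures and messages (GIT_OBJ_COMMIT is the constant name used by pygit2 versions of that era):

``` {.python}
import tempfile

import pygit2

# Throw-away repository, as in the loader tests' setUp.
repo = pygit2.init_repository(tempfile.mkdtemp(prefix='test-sgloader.'))

# blob -> tree -> commit on refs/heads/master, mirroring
# create_commit_with_content().
blob_oid = repo.create_blob('blob 0')
builder = repo.TreeBuilder()
builder.insert('blob', blob_oid, pygit2.GIT_FILEMODE_BLOB_EXECUTABLE)
tree_oid = builder.write()

sig = pygit2.Signature('Alice Cooper', 'alice@cooper.tld')
commit_oid = repo.create_commit('refs/heads/master', sig, sig,
                                'commit msg 0', tree_oid, [])

# Annotated tag on that commit, mirroring create_tag().
repo.create_tag('0.0.1', commit_oid, pygit2.GIT_OBJ_COMMIT,
                pygit2.Signature('ToNyX', 'tony@badass.org'),
                'release 0.0.1')
```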