diff --git a/swh/loader/git/storage/models.py b/swh/loader/git/storage/models.py index b319021..ea7a54c 100644 --- a/swh/loader/git/storage/models.py +++ b/swh/loader/git/storage/models.py @@ -1,294 +1,301 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from enum import Enum from . import db class Type(Enum): """Types of git objects. """ occurrence = 'occurrence' # ~git branch release = 'release' # ~git annotated tag revision = 'revision' # ~git commit directory = 'directory' # ~git tree directory_entry = 'directory_entry' # ~git tree_entry content = 'content' # ~git blob origin = 'origin' person = 'person' # committer, tagger, author def initdb(db_conn): """For retrocompatibility. """ pass def cleandb(db_conn): db.queries_execute(db_conn, ['TRUNCATE TABLE content CASCADE;', 'TRUNCATE TABLE organization CASCADE;', 'TRUNCATE TABLE list_history CASCADE;', 'TRUNCATE TABLE origin CASCADE;', 'TRUNCATE TABLE fetch_history CASCADE;', 'TRUNCATE TABLE project CASCADE;', 'TRUNCATE TABLE project_history CASCADE;', 'TRUNCATE TABLE directory CASCADE;', 'TRUNCATE TABLE directory_entry_dir CASCADE;', 'TRUNCATE TABLE directory_list_dir CASCADE;', 'TRUNCATE TABLE directory_entry_file CASCADE;', 'TRUNCATE TABLE directory_list_file CASCADE;', 'TRUNCATE TABLE person CASCADE;', 'TRUNCATE TABLE revision CASCADE;', 'TRUNCATE TABLE revision_history CASCADE;', 'TRUNCATE TABLE occurrence_history CASCADE;', 'TRUNCATE TABLE occurrence CASCADE;', 'TRUNCATE TABLE release CASCADE;', ]) def add_origin(db_conn, url, type, parent=None): """Insert origin and returns the newly inserted id. """ return db.insert(db_conn, ("""INSERT INTO origin (type, url, parent_id) VALUES (%s, %s, %s) RETURNING id""", (type, url, parent))) def add_person(db_conn, name, email): """Insert author and returns the newly inserted id. """ return db.insert(db_conn, ("""INSERT INTO person (name, email) VALUES (%s, %s) RETURNING id""", (name, email))) def add_content(db_conn, sha1, sha1_git, sha256_content, size): """Insert a new content. """ db.query_execute(db_conn, - ("""INSERT INTO content (id, sha1_git, sha256, length) + ("""INSERT INTO content (sha1, sha1_git, sha256, length) VALUES (%s, %s, %s, %s)""", (sha1, sha1_git, sha256_content, size))) def add_directory(db_conn, obj_sha): """Insert a new directory. """ db.query_execute(db_conn, ("""INSERT INTO directory (id) VALUES (%s)""", (obj_sha,))) def add_directory_entry(db_conn, name, sha, type, perms, atime, mtime, ctime, parent): """Insert a new directory. """ db.query_execute(db_conn, ("""INSERT INTO directory_entry (name, id, type, perms, atime, mtime, ctime, directory) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)""", (name, sha, type, perms, atime, mtime, ctime, parent))) def add_revision(db_conn, sha, date, directory, message, author, committer, parent_shas=None): """Insert a revision. """ db.query_execute(db_conn, ("""INSERT INTO revision (id, date, directory, message, author, committer) VALUES (%s, %s, %s, %s, (select id from person where name=%s and email=%s), (select id from person where name=%s and email=%s))""", (sha, date, directory, message, author['name'], author['email'], committer['name'], committer['email']))) def add_revision_history(db_conn, couple_parents): """Store the revision history graph. """ tuples = ','.join(["('%s','%s')" % couple for couple in couple_parents]) query = 'INSERT INTO revision_history (id, parent_id) VALUES ' + tuples db.query_execute(db_conn, query) def add_release(db_conn, obj_sha, revision, date, name, comment, author): """Insert a release. """ db.query_execute(db_conn, ("""INSERT INTO release (id, revision, date, name, comment, author) VALUES (%s, %s, %s, %s, %s, (select id from person where name=%s and email=%s))""", (obj_sha, revision, date, name, comment, author['name'], author['email']))) def add_occurrence(db_conn, url_origin, branch, revision): """Insert an occurrence. Check if occurrence history already present. If present do nothing, otherwise insert """ with db_conn.cursor() as cur: occ = find_occurrence(cur, branch, revision, url_origin) if not occ: db.execute( cur, ("""INSERT INTO occurrence (origin, branch, revision) VALUES ((select id from origin where url=%s), %s, %s)""", (url_origin, branch, revision))) def find_revision(db_conn, obj_sha): """Find a revision by its obj_sha. """ return find_object(db_conn, obj_sha, Type.revision) def find_directory(db_conn, obj_sha): """Find a directory by its obj_sha. """ return find_object(db_conn, obj_sha, Type.directory) def find_content(db_conn, obj_sha): """Find a content by its obj_sha. """ - return find_object(db_conn, obj_sha, Type.content) + return find_object(db_conn, obj_sha, Type.content, column='sha1') def find_occurrences_for_revision(db_conn, revision, type): """Find all occurences for a specific revisions. type is not used (implementation detail). """ return db.query_fetch(db_conn, ("""SELECT * FROM occurrence WHERE revision=%s""", (revision,))) def find_origin(db_conn, origin_url, origin_type): """Find all origins matching an url and an origin type. """ return db.query_fetchone(db_conn, ("""SELECT * FROM origin WHERE url=%s AND type=%s""", (origin_url, origin_type))) def find_person(db_conn, email, name): """Find a person uniquely identified by email and name. """ return db.query_fetchone(db_conn, ("""SELECT id FROM person WHERE email=%s AND name=%s""", (email, name))) def find_occurrence(cur, branch, revision, url_origin): """Find an ocurrence with branch pointing on valid revision for date. """ return db.fetchone( cur, ("""SELECT * FROM occurrence oc WHERE branch=%s AND revision=%s AND origin = (select id from origin where url = %s)""", (branch, revision, url_origin))) +def find_object(db_conn, obj_sha, obj_type, column='id'): """Find an object of obj_type by its obj_sha. """ table = obj_type if isinstance(obj_type, str) else obj_type.value - query = 'select id from ' + table + ' where id=%s' + query = 'select '+ column + ' from ' + table + ' where ' + column + '=%s' return db.query_fetchone(db_conn, (query, (obj_sha,))) -def filter_unknown_objects(db_conn, file_sha1s, table_to_filter, tbl_tmp_name): +def filter_unknown_objects(db_conn, file_sha1s, + table_to_filter, tbl_tmp_name, + column_to_filter='id', nature_column='sha1_git'): """Given a list of sha1s, filter the unknown object between this list and the content of the table table_to_filter. tbl_tmp_name is the temporary table used to filter. """ with db_conn.cursor() as cur: # explicit is better than implicit # simply creating the temporary table seems to be enough db.execute(cur, """CREATE TEMPORARY TABLE IF NOT EXISTS %s( - id sha1_git) - ON COMMIT DELETE ROWS;""" % tbl_tmp_name) + %s %s) + ON COMMIT DELETE ROWS;""" % + (tbl_tmp_name, column_to_filter, nature_column)) db.copy_from(cur, file_sha1s, tbl_tmp_name) - db.execute(cur, '(SELECT id FROM %s) EXCEPT (SELECT id FROM %s);' % - (tbl_tmp_name, table_to_filter)) + db.execute(cur, '(SELECT %s FROM %s) EXCEPT (SELECT %s FROM %s);' % + (column_to_filter, tbl_tmp_name, + column_to_filter, table_to_filter)) return cur.fetchall() def find_unknown_revisions(db_conn, file_sha1s): """Filter unknown revisions from file_sha1s. """ - return filter_unknown_objects(db_conn, file_sha1s, 'revision', - 'filter_sha1_revision') + return filter_unknown_objects(db_conn, file_sha1s, + 'revision', 'filter_sha1_revision') def find_unknown_directories(db_conn, file_sha1s): """Filter unknown directories from file_sha1s. """ - return filter_unknown_objects(db_conn, file_sha1s, 'directory', - 'filter_sha1_directory') + return filter_unknown_objects(db_conn, file_sha1s, + 'directory', 'filter_sha1_directory') def find_unknown_contents(db_conn, file_sha1s): """Filter unknown contents from file_sha1s. """ - return filter_unknown_objects(db_conn, file_sha1s, 'content', - 'filter_sha1_content') + return filter_unknown_objects(db_conn, file_sha1s, + 'content', + 'filter_sha1_content', + 'sha1', 'sha1') def _count_objects(db_conn, type): return db.query_fetchone(db_conn, 'SELECT count(*) FROM ' + type.value)[0] def count_revisions(db_conn): """Count the number of revisions. """ return _count_objects(db_conn, Type.revision) def count_directories(db_conn): """Count the number of directories. """ return _count_objects(db_conn, Type.directory) def count_contents(db_conn): """Count the number of contents. """ return _count_objects(db_conn, Type.content) def count_occurrence(db_conn): """Count the number of occurrence. """ return _count_objects(db_conn, Type.occurrence) def count_release(db_conn): """Count the number of occurrence. """ return _count_objects(db_conn, Type.release) def count_person(db_conn): """Count the number of occurrence. """ return _count_objects(db_conn, Type.person) diff --git a/swh/loader/git/storage/storage.py b/swh/loader/git/storage/storage.py index c3c3c31..8c0e361 100755 --- a/swh/loader/git/storage/storage.py +++ b/swh/loader/git/storage/storage.py @@ -1,201 +1,200 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import StringIO from . import models Type = models.Type -_find_object = {Type.occurrence: models.find_occurrences_for_revision} - +_find_object = {Type.occurrence: models.find_occurrences_for_revision, + Type.content: lambda *args: models.find_object(*args, column='sha1')} def find(db_conn, id, type): """Find an object according to its sha1hex and type. """ - find_fn = _find_object.get(type, models.find_object) - return find_fn(db_conn, id, type) + return _find_object.get(type, models.find_object)(db_conn, id, type) _find_unknown = {Type.revision: models.find_unknown_revisions, Type.content: models.find_unknown_contents, Type.directory: models.find_unknown_directories} def find_unknowns(db_conn, obj_type, sha1s_hex): """Given a list of sha1s, return the non presents one in storage. """ def row_to_sha1(row): """Convert a row (memoryview) to a string sha1. """ return row[0] vals = '\n'.join(sha1s_hex) cpy_data_buffer = StringIO() cpy_data_buffer.write(vals) cpy_data_buffer.seek(0) # move file cursor back at start of file find_unknown_fn = _find_unknown[obj_type] unknowns = find_unknown_fn(db_conn, cpy_data_buffer) cpy_data_buffer.close() return list(map(row_to_sha1, unknowns)) def _add_content(db_conn, vcs_object, sha1hex): """Add a blob to storage. Designed to be wrapped in a db transaction. Returns: - the sha1 if everything went alright. - None if something went wrong Writing exceptions can also be raised and expected to be handled by the caller. """ models.add_content(db_conn, sha1hex, vcs_object['git-sha1'], vcs_object['content-sha256'], vcs_object['size']) return sha1hex def _add_directory(db_conn, vcs_object, sha1hex): """Add a directory to storage. Designed to be wrapped in a db transaction. """ models.add_directory(db_conn, sha1hex) for directory_entry in vcs_object['entries']: _add_directory_entry(db_conn, directory_entry) return sha1hex def _add_directory_entry(db_conn, vcs_object): """Add a directory to storage. Designed to be wrapped in a db transaction. Returns: - the sha1 if everything went alright. - None if something went wrong Writing exceptions can also be raised and expected to be handled by the caller. """ name = vcs_object['name'] parent = vcs_object['parent'] models.add_directory_entry(db_conn, name, vcs_object['target-sha1'], vcs_object['nature'], vcs_object['perms'], vcs_object['atime'], vcs_object['mtime'], vcs_object['ctime'], parent) return name, parent def _add_revision(db_conn, vcs_object, sha1hex): """Add a revision to storage. Designed to be wrapped in a db transaction. Returns: - the sha1 if everything went alright. - None if something went wrong Writing exceptions can also be raised and expected to be handled by the caller. """ models.add_revision(db_conn, sha1hex, vcs_object['date'], vcs_object['directory'], vcs_object['message'], vcs_object['author'], vcs_object['committer'], vcs_object['parent-sha1s']) return sha1hex def _add_release(db_conn, vcs_object, sha1hex): """Add a release. """ models.add_release(db_conn, sha1hex, vcs_object['revision'], vcs_object['date'], vcs_object['name'], vcs_object['comment'], vcs_object['author']) return sha1hex def _add_occurrence(db_conn, vcs_object, sha1hex): """Add an occurrence. """ models.add_occurrence(db_conn, vcs_object['url-origin'], vcs_object['branch'], vcs_object['revision']) return sha1hex def add_person(db_conn, vcs_object): """Add an author. """ return models.add_person(db_conn, vcs_object['name'], vcs_object['email']) _store_fn = {Type.directory: _add_directory, Type.revision: _add_revision, Type.release: _add_release, Type.occurrence: _add_occurrence} def add_origin(db_conn, origin): """A a new origin and returns its id. """ return models.add_origin(db_conn, origin['url'], origin['type']) def find_origin(db_conn, origin): """Find an existing origin. """ return models.find_origin(db_conn, origin['url'], origin['type']) def find_person(db_conn, person): """Find an existing person. """ return models.find_person(db_conn, person['email'], person['name']) def add_with_fs_storage(db_conn, config, id, type, vcs_object): """Add vcs_object in the storage - db_conn is the opened connection to the db - config is the map of configuration needed for core layer - type is not used here but represent the type of vcs_object - vcs_object is the object meant to be persisted in fs and db """ id_ = config['objstorage'].add_bytes(vcs_object['content']) if id_ != id: raise Exception("The ids should have been identical. Corruption.") return _add_content(db_conn, vcs_object, id) def add(db_conn, config, id, type, vcs_object): """Given a sha1hex, type and content, store a given object in the store. - db_conn is the opened connection to the db - config is not used here - type is the object's type - vcs_object is the object meant to be persisted in db """ return _store_fn[type](db_conn, vcs_object, id) def add_revision_history(db_conn, couple_parents): """Given a list of tuple (sha, parent_sha), store in revision_history. """ if len(couple_parents) > 0: models.add_revision_history(db_conn, couple_parents) diff --git a/swh/loader/git/tests/test_api_content.py b/swh/loader/git/tests/test_api_content.py index 3c8eb69..2b310ec 100644 --- a/swh/loader/git/tests/test_api_content.py +++ b/swh/loader/git/tests/test_api_content.py @@ -1,110 +1,110 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from nose.plugins.attrib import attr from swh.loader.git.storage import db, models from swh.loader.git.protocols import serial from test_utils import app_client, app_client_teardown @attr('slow') class ContentTestCase(unittest.TestCase): def setUp(self): self.app, db_url, self.content_storage_dir = app_client() with db.connect(db_url) as db_conn: self.content_sha1_id = '222222f9dd5dc46ee476a8be155ab049994f717e' content_sha1_id = 'blabliblablo' self.content_sha256_hex = '222222f9dd5dc46ee476a8be155ab049994f717e' models.add_content(db_conn, self.content_sha1_id, content_sha1_id, self.content_sha256_hex, 10) def tearDown(self): app_client_teardown(self.content_storage_dir) @istest def get_content_ok(self): # when rv = self.app.get('/vcs/contents/%s' % self.content_sha1_id) # then assert rv.status_code == 200 assert serial.loads(rv.data)['id'] == '222222f9dd5dc46ee476a8be155ab049994f717e' @istest def get_content_not_found(self): # when rv = self.app.get('/vcs/contents/222222f9dd5dc46ee476a8be155ab049994f7170') # then assert rv.status_code == 404 assert rv.data == b'Not found!' @istest def get_content_not_found_with_bad_format(self): # when rv = self.app.get('/vcs/contents/1') # then assert rv.status_code == 404 assert rv.data == b'Not found!' @istest def put_content_create_and_update(self): - content_sha1 = 'sha1-contentc46ee476a8be155ab03333333333' + content_sha1 = '62cdb7020ff920e5aa642c3d4066950dd1f01f4d' # real sha1 of 'bar' # does not exist rv = self.app.get('/vcs/contents/%s' % content_sha1) # then assert rv.status_code == 404 assert rv.data == b'Not found!' # we create it body = {'id': content_sha1, - 'content-sha1': 'content-sha1c46ee476a8be155ab03333333333', + 'git-sha1': 'content-sha1c46ee476a8be155ab03333333333', 'content-sha256': 'content-sha2566ee476a8be155ab03333333333', 'content': b'bar', 'size': '3'} rv = self.app.put('/vcs/contents/%s' % content_sha1, data=serial.dumps(body), headers={'Content-Type': serial.MIMETYPE}) assert rv.status_code == 204 assert rv.data == b'' # now it exists rv = self.app.get('/vcs/contents/%s' % content_sha1) # then assert rv.status_code == 200 - assert serial.loads(rv.data)['id'] == 'sha1-contentc46ee476a8be155ab03333333333' + assert serial.loads(rv.data)['id'] == content_sha1 # # we update it body = {'id': content_sha1, 'content-sha1': 'content-sha1c46ee476a8be155ab03333333333', 'content-sha256': 'content-sha2566ee476a8be155ab03333333333', 'content': b'bar', 'size': '3'} rv = self.app.put('/vcs/contents/%s' % content_sha1, data=serial.dumps(body), headers={'Content-Type': serial.MIMETYPE}) assert rv.status_code == 204 assert rv.data == b'' # still the same rv = self.app.get('/vcs/contents/%s' % content_sha1) # then assert rv.status_code == 200 - assert serial.loads(rv.data)['id'] == 'sha1-contentc46ee476a8be155ab03333333333' + assert serial.loads(rv.data)['id'] == content_sha1