diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -14,7 +14,7 @@
 );
 
 insert into dbversion(version, release, description)
-      values(68, now(), 'Work In Progress');
+      values(69, now(), 'Work In Progress');
 
 -- a SHA1 checksum (not necessarily originating from Git)
 create domain sha1 as bytea check (length(value) = 20);
@@ -426,3 +426,29 @@
 );
 
 create index on release(target, target_type);
+
+
+-- In order to archive the content of the object storage, add
+-- some tables to keep track of what has already been archived.
+
+create domain archive_id as text;
+
+create table archives (
+  id   archive_id primary key,
+  url  text
+);
+
+create type archive_status as enum (
+  'missing',
+  'ongoing',
+  'present'
+);
+
+create table content_archive (
+  content_id  sha1 references content(sha1),
+  archive_id  archive_id references archives(id),
+  status      archive_status,
+  mtime       timestamptz,
+  primary key (content_id, archive_id)
+);
+
diff --git a/sql/upgrades/069.sql b/sql/upgrades/069.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/069.sql
@@ -0,0 +1,28 @@
+-- SWH DB schema upgrade
+-- from_version: 68
+-- to_version: 69
+-- description: add tables for the archiver.
+
+insert into dbversion(version, release, description)
+      values(69, now(), 'Work In Progress');
+
+create domain archive_id as text;
+
+create table archives (
+  id   archive_id primary key,
+  url  text
+);
+
+create type archive_status as enum (
+  'missing',
+  'ongoing',
+  'present'
+);
+
+create table content_archive (
+  content_id  sha1 references content(sha1),
+  archive_id  archive_id references archives(id),
+  status      archive_status,
+  mtime       timestamptz,
+  primary key (content_id, archive_id)
+);
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -624,3 +624,68 @@
         if not data:
             return None
         return line_to_bytes(data)
+
+
+    def archive_ls(self, cur=None):
+        """Get all the archives registered on the server.
+
+        Yields:
+            a tuple (server_id, server_url) for each archive server.
+        """
+        cur = self._cursor(cur)
+        cur.execute("""SELECT id, url
+                       FROM archives
+                    """)
+        yield from cursor_to_bytes(cur)
+
+    def content_archive_ls(self, cur=None):
+        """Get the archival status of the content.
+
+        Get an iterable over all the content that is referenced
+        in a backup server.
+
+        Yields:
+            the sha1 of each content referenced at least once
+            in the archival status database.
+        """
+        cur = self._cursor(cur)
+        cur.execute("""SELECT DISTINCT content_id
+                       FROM content_archive""")
+        yield from cursor_to_bytes(cur)
+
+    def content_archive_get(self, content=None, archive=None, cur=None):
+        """Get the archival status of a content in a specific server.
+
+        Retrieve from the database the archival status of the given content
+        in the given archive server.
+
+        Args:
+            content: the sha1 of the content. May be None for any id.
+            archive: the database id of the server we're looking into;
+                may be None for any server.
+
+        Yields:
+            A tuple (content_id, server_id, archival status, mtime, tzinfo).
+        """
+        query = """SELECT content_id, archive_id, status, mtime
+                   FROM content_archive
+                """
+        # Build the WHERE clause with placeholders only; the values are
+        # bound by the driver so they are properly escaped and encoded.
+        conditions = []
+        params = []
+        if content:
+            conditions.append("content_id = %s")
+            params.append(content)
+        if archive:
+            conditions.append("archive_id = %s")
+            params.append(archive)
+
+        if conditions:
+            query = """%s
+                       WHERE %s
+                   """ % (query, ' and '.join(conditions))
+
+        cur = self._cursor(cur)
+        cur.execute(query, params)
+        yield from cursor_to_bytes(cur)