diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -40,6 +40,7 @@
 Package: python3-swh.storage.archiver
 Architecture: all
 Depends: python3-swh.scheduler,
+ python3-swh.journal,
  python3-swh.storage (= ${binary:Version}),
  ${misc:Depends},
  ${python3:Depends}
diff --git a/swh/storage/archiver/db.py b/swh/storage/archiver/db.py
--- a/swh/storage/archiver/db.py
+++ b/swh/storage/archiver/db.py
@@ -1,9 +1,10 @@
-# Copyright (C) 2015-2016 The Software Heritage developers
+# Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import json
 import time
 
 from swh.core import hashutil
 
@@ -258,3 +259,43 @@
 
         cur = self._cursor(cur)
         cur.execute(query)
+
+    def content_archive_content_add(
+            self, content_id, sources_present, sources_missing, cur=None):
+        """Insert a new content_archive row describing one content.
+
+        Args:
+            content_id (bytes|str): content identifier; bytes are turned
+                into a hex string prefixed with backslash-x
+            sources_present ([str]): sources holding a copy, recorded as
+                'present' with the current time as mtime
+            sources_missing ([str]): sources without a copy, recorded as
+                'absent'
+            cur: cursor to use (resolved through self._cursor)
+
+        """
+        if isinstance(content_id, bytes):
+            content_id = '\\x%s' % hashutil.hash_to_hex(content_id)
+
+        copies = {}
+        num_present = 0
+        for source in sources_present:
+            copies[source] = {
+                "status": "present",
+                "mtime": int(time.time()),
+            }
+            num_present += 1
+
+        for source in sources_missing:
+            copies[source] = {
+                "status": "absent",
+            }
+
+        # Let the driver quote the values instead of interpolating them
+        # into the SQL text: a quote in a source name would otherwise
+        # break (or alter) the statement.
+        query = """INSERT INTO content_archive(content_id, copies, num_present)
+                   VALUES(%s, %s, %s)
+                """
+        cur = self._cursor(cur)
+        cur.execute(query, (content_id, json.dumps(copies), num_present))
diff --git a/swh/storage/archiver/storage.py b/swh/storage/archiver/storage.py
--- a/swh/storage/archiver/storage.py
+++ b/swh/storage/archiver/storage.py
@@ -166,3 +166,22 @@
 
         """
         self.db.content_archive_insert(content_id, source, status, cur)
+
+    @db_transaction
+    def content_archive_content_add(
+            self, content_ids, sources_present, sources_missing, cur=None):
+        """Insert new entries in db about the given contents.
+
+        Args:
+            content_ids ([bytes|str]): content identifiers
+            sources_present ([str]): names of the sources where the
+                contents are present
+            sources_missing ([str]): names of the sources where the
+                contents are missing
+            cur: cursor passed down to each insertion
+
+        """
+        for content_id in content_ids:
+            # hand the cursor over, as content_archive_insert does above
+            self.db.content_archive_content_add(
+                content_id, sources_present, sources_missing, cur)
diff --git a/swh/storage/archiver/updater.py b/swh/storage/archiver/updater.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/archiver/updater.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.journal.client import SWHJournalClient
+
+from .storage import ArchiverStorage
+
+
+class SWHArchiverContentUpdater(SWHJournalClient):
+    """Client in charge of updating new content in the content_archiver db.
+
+    """
+    ADDITIONAL_CONFIG = {
+        'archiver_storage_conn': (
+            'str', 'dbname=softwareheritage-archiver-dev user=guest'),
+        'sources_missing': ('list[str]', ['banco', 'azure']),
+        'sources_present': ('list[str]', ['uffizi']),
+    }
+
+    def __init__(self):
+        # Only interested in content here so override the configuration
+        super().__init__(extra_configuration={'object_types': ['content']})
+
+        self.sources_present = self.config['sources_present']
+        self.sources_missing = self.config['sources_missing']
+
+        self.archiver_storage = ArchiverStorage(
+            self.config['archiver_storage_conn'])
+
+    def process_objects(self, messages):
+        """Add the contents that content_archive_get_unknown reports back.
+
+        Args:
+            messages (dict): received objects; only the 'content' entry
+                is read, each element being indexed by b'sha1'
+
+        """
+        key_id = b'sha1'
+
+        content_ids = [{'content_id': c[key_id]} for c in messages['content']]
+        unknowns = self.archiver_storage.content_archive_get_unknown(
+            content_ids)
+        self.archiver_storage.content_archive_content_add(
+            unknowns, self.sources_present, self.sources_missing)
+
+
+if __name__ == '__main__':
+    content_updater = SWHArchiverContentUpdater()
+    content_updater.process()