diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py index a5f3dfd..0ea142a 100644 --- a/swh/indexer/__init__.py +++ b/swh/indexer/__init__.py @@ -1,29 +1,57 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information INDEXER_CLASSES = { 'mimetype': 'swh.indexer.mimetype.ContentMimetypeIndexer', 'language': 'swh.indexer.language.ContentLanguageIndexer', 'ctags': 'swh.indexer.ctags.CtagsIndexer', 'fossology_license': 'swh.indexer.fossology_license.ContentFossologyLicenseIndexer', } TASK_NAMES = { 'orchestrator_all': 'swh.indexer.tasks.SWHOrchestratorAllContentsTask', 'orchestrator_text': 'swh.indexer.tasks.SWHOrchestratorTextContentsTask', 'mimetype': 'swh.indexer.tasks.SWHContentMimetypeTask', 'language': 'swh.indexer.tasks.SWHContentLanguageTask', 'ctags': 'swh.indexer.tasks.SWHCtagsTask', 'fossology_license': 'swh.indexer.tasks.SWHContentFossologyLicenseTask', 'rehash': 'swh.indexer.tasks.SWHRecomputeChecksumsTask', } __all__ = [ 'INDEXER_CLASSES', 'TASK_NAMES', ] + + +def get_storage(cls, args): + """ + Get a storage object of class `storage_class` with arguments + `storage_args`. + + Args: + storage (dict): dictionary with keys: + - cls (str): storage's class, either 'local' or 'remote' + - args (dict): dictionary with keys + + Returns: + an instance of swh.indexer's storage (either local or remote) + + Raises: + ValueError if passed an unknown storage class. + + """ + + if cls == 'remote': + from .api.client import RemoteStorage as IndexerStorage + elif cls == 'local': + from .storage import IndexerStorage + else: + raise ValueError('Unknown storage class `%s`' % cls) + + return IndexerStorage(**args) diff --git a/swh/indexer/converters.py b/swh/indexer/converters.py new file mode 100644 index 0000000..db7a295 --- /dev/null +++ b/swh/indexer/converters.py @@ -0,0 +1,140 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def ctags_to_db(ctags): + """Convert a ctags entry into a ready ctags entry. + + Args: + ctags (dict): ctags entry with the following keys: + + - id (bytes): content's identifier + - tool_id (int): tool id used to compute ctags + - ctags ([dict]): List of dictionary with the following keys: + + - name (str): symbol's name + - kind (str): symbol's kind + - line (int): symbol's line in the content + - language (str): language + + Returns: + list: list of ctags entries as dicts with the following keys: + + - id (bytes): content's identifier + - name (str): symbol's name + - kind (str): symbol's kind + - language (str): language for that content + - tool_id (int): tool id used to compute ctags + + """ + id = ctags['id'] + tool_id = ctags['indexer_configuration_id'] + for ctag in ctags['ctags']: + yield { + 'id': id, + 'name': ctag['name'], + 'kind': ctag['kind'], + 'line': ctag['line'], + 'lang': ctag['lang'], + 'indexer_configuration_id': tool_id, + } + + +def db_to_ctags(ctag): + """Convert a ctags entry into a ready ctags entry. 
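The converter above fans one indexed content out into per-symbol rows ready for a COPY into the temporary table. A minimal sketch of that flattening, with all identifiers invented:

```python
# Illustrative use of converters.ctags_to_db as defined in this patch;
# the sha1 and tool id below are made-up sample values.
from swh.indexer import converters

entry = {
    'id': b'some-id',
    'indexer_configuration_id': 100,
    'ctags': [
        {'name': 'main', 'kind': 'function', 'line': 12, 'lang': 'C'},
    ],
}

# One flat dict per symbol, keyed the way tmp_content_ctags expects.
rows = list(converters.ctags_to_db(entry))
assert rows[0] == {
    'id': b'some-id', 'name': 'main', 'kind': 'function',
    'line': 12, 'lang': 'C', 'indexer_configuration_id': 100,
}
```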
+ + Args: + ctags (dict): ctags entry with the following keys: + - id (bytes): content's identifier + - ctags ([dict]): List of dictionary with the following keys: + - name (str): symbol's name + - kind (str): symbol's kind + - line (int): symbol's line in the content + - language (str): language + + Returns: + List of ctags ready entry (dict with the following keys): + - id (bytes): content's identifier + - name (str): symbol's name + - kind (str): symbol's kind + - language (str): language for that content + - tool (dict): tool used to compute the ctags + + """ + return { + 'id': ctag['id'], + 'name': ctag['name'], + 'kind': ctag['kind'], + 'line': ctag['line'], + 'lang': ctag['lang'], + 'tool': { + 'id': ctag['tool_id'], + 'name': ctag['tool_name'], + 'version': ctag['tool_version'], + 'configuration': ctag['tool_configuration'] + } + } + + +def db_to_mimetype(mimetype): + """Convert a ctags entry into a ready ctags output. + + """ + return { + 'id': mimetype['id'], + 'encoding': mimetype['encoding'], + 'mimetype': mimetype['mimetype'], + 'tool': { + 'id': mimetype['tool_id'], + 'name': mimetype['tool_name'], + 'version': mimetype['tool_version'], + 'configuration': mimetype['tool_configuration'] + } + } + + +def db_to_language(language): + """Convert a language entry into a ready language output. + + """ + return { + 'id': language['id'], + 'lang': language['lang'], + 'tool': { + 'id': language['tool_id'], + 'name': language['tool_name'], + 'version': language['tool_version'], + 'configuration': language['tool_configuration'] + } + } + + +def db_to_metadata(metadata): + """Convert a metadata entry into a ready metadata output. + + """ + return { + 'id': metadata['id'], + 'translated_metadata': metadata['translated_metadata'], + 'tool': { + 'id': metadata['tool_id'], + 'name': metadata['tool_name'], + 'version': metadata['tool_version'], + 'configuration': metadata['tool_configuration'] + } + } + + +def db_to_fossology_license(license): + return { + 'id': license['id'], + 'licenses': license['licenses'], + 'tool': { + 'id': license['tool_id'], + 'name': license['tool_name'], + 'version': license['tool_version'], + 'configuration': license['tool_configuration'], + } + } diff --git a/swh/indexer/db.py b/swh/indexer/db.py new file mode 100644 index 0000000..b51402e --- /dev/null +++ b/swh/indexer/db.py @@ -0,0 +1,245 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.model import hashutil + +from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes +from swh.storage.db import line_to_bytes + + +class Db(BaseDb): + """Proxy to the SWH Indexer DB, with wrappers around stored procedures + + """ + @stored_procedure('swh_mktemp_bytea') + def mktemp_bytea(self, cur=None): pass + + def store_tmp_bytea(self, ids, cur=None): + """Store the given identifiers in a new tmp_bytea table""" + cur = self._cursor(cur) + + self.mktemp_bytea(cur) + self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', + ['id'], cur) + + content_mimetype_cols = [ + 'id', 'mimetype', 'encoding', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_mimetype_missing') + def mktemp_content_mimetype_missing(self, cur=None): pass + + def content_mimetype_missing_from_temp(self, cur=None): + """List missing mimetypes. 
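These wrappers are meant to be chained: create the temporary table, COPY the candidate rows into it, then call the stored procedure. A sketch of that sequence, assuming a reachable test database (the DSN and sample row are placeholders, not part of this patch):

```python
# Hedged sketch of the mktemp / copy_to / stored-procedure sequence.
from swh.indexer.db import Db

db = Db.connect('dbname=softwareheritage-indexer-test')  # placeholder DSN
with db.transaction() as cur:
    db.mktemp_content_mimetype_missing(cur)
    db.copy_to([{'id': b'some-id', 'indexer_configuration_id': 1}],
               'tmp_content_mimetype_missing',
               ['id', 'indexer_configuration_id'], cur)
    missing = [row[0] for row in
               db.content_mimetype_missing_from_temp(cur)]
```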
+ + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_mimetype_missing()") + yield from cursor_to_bytes(cur) + + @stored_procedure('swh_mktemp_content_mimetype') + def mktemp_content_mimetype(self, cur=None): pass + + def content_mimetype_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", + (conflict_update, )) + + def content_mimetype_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_mimetype_get()" % ( + ','.join(self.content_mimetype_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + content_language_cols = [ + 'id', 'lang', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_language') + def mktemp_content_language(self, cur=None): pass + + @stored_procedure('swh_mktemp_content_language_missing') + def mktemp_content_language_missing(self, cur=None): pass + + def content_language_missing_from_temp(self, cur=None): + """List missing languages. + + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_language_missing()") + yield from cursor_to_bytes(cur) + + def content_language_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_language_add(%s)", + (conflict_update, )) + + def content_language_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_language_get()" % ( + ','.join(self.content_language_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + content_ctags_cols = [ + 'id', 'name', 'kind', 'line', 'lang', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_ctags') + def mktemp_content_ctags(self, cur=None): pass + + @stored_procedure('swh_mktemp_content_ctags_missing') + def mktemp_content_ctags_missing(self, cur=None): pass + + def content_ctags_missing_from_temp(self, cur=None): + """List missing ctags. + + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_ctags_missing()") + yield from cursor_to_bytes(cur) + + def content_ctags_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", + (conflict_update, )) + + def content_ctags_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_ctags_get()" % ( + ','.join(self.content_ctags_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + def content_ctags_search(self, expression, last_sha1, limit, cur=None): + cur = self._cursor(cur) + if not last_sha1: + query = """SELECT %s + FROM swh_content_ctags_search(%%s, %%s)""" % ( + ','.join(self.content_ctags_cols)) + cur.execute(query, (expression, limit)) + else: + if last_sha1 and isinstance(last_sha1, bytes): + last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1) + elif last_sha1: + last_sha1 = '\\x%s' % last_sha1 + + query = """SELECT %s + FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( + ','.join(self.content_ctags_cols)) + cur.execute(query, (expression, limit, last_sha1)) + + yield from cursor_to_bytes(cur) + + content_fossology_license_cols = [ + 'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration', + 'licenses'] + + @stored_procedure('swh_mktemp_content_fossology_license') + def mktemp_content_fossology_license(self, cur=None): pass + + def content_fossology_license_add_from_temp(self, conflict_update, + cur=None): + """Add new licenses per content. 
+ + """ + self._cursor(cur).execute( + "SELECT swh_content_fossology_license_add(%s)", + (conflict_update, )) + + def content_fossology_license_get_from_temp(self, cur=None): + """Retrieve licenses per content. + + """ + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_fossology_license_get()" % ( + ','.join(self.content_fossology_license_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + content_metadata_cols = [ + 'id', 'translated_metadata', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_metadata') + def mktemp_content_metadata(self, cur=None): pass + + @stored_procedure('swh_mktemp_content_metadata_missing') + def mktemp_content_metadata_missing(self, cur=None): pass + + def content_metadata_missing_from_temp(self, cur=None): + """List missing metadatas. + + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_metadata_missing()") + yield from cursor_to_bytes(cur) + + def content_metadata_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", + (conflict_update, )) + + def content_metadata_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_metadata_get()" % ( + ','.join(self.content_metadata_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + revision_metadata_cols = [ + 'id', 'translated_metadata', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_revision_metadata') + def mktemp_revision_metadata(self, cur=None): pass + + @stored_procedure('swh_mktemp_revision_metadata_missing') + def mktemp_revision_metadata_missing(self, cur=None): pass + + def revision_metadata_missing_from_temp(self, cur=None): + """List missing metadatas. 
+ + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_revision_metadata_missing()") + yield from cursor_to_bytes(cur) + + def revision_metadata_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", + (conflict_update, )) + + def revision_metadata_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_revision_metadata_get()" % ( + ','.join(self.revision_metadata_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + indexer_configuration_cols = ['id', 'tool_name', 'tool_version', + 'tool_configuration'] + + @stored_procedure('swh_mktemp_indexer_configuration') + def mktemp_indexer_configuration(self, cur=None): + pass + + def indexer_configuration_add_from_temp(self, cur=None): + cur = self._cursor(cur) + cur.execute("SELECT %s from swh_indexer_configuration_add()" % ( + ','.join(self.indexer_configuration_cols), )) + yield from cursor_to_bytes(cur) + + def indexer_configuration_get(self, tool_name, + tool_version, tool_configuration, cur=None): + cur = self._cursor(cur) + cur.execute('''select %s + from indexer_configuration + where tool_name=%%s and + tool_version=%%s and + tool_configuration=%%s''' % ( + ','.join(self.indexer_configuration_cols)), + (tool_name, tool_version, tool_configuration)) + + data = cur.fetchone() + if not data: + return None + return line_to_bytes(data) diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index 5e2ee14..56a0e54 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,211 +1,210 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json def convert(raw_content): """ convert raw_content recursively: - from bytes to string - from string to dict Args: raw_content (bytes / string / dict) Returns: dict: content (if string was json, otherwise returns string) """ if isinstance(raw_content, bytes): return convert(raw_content.decode()) if isinstance(raw_content, str): try: content = json.loads(raw_content) if content: return content else: return raw_content except json.decoder.JSONDecodeError: return raw_content if isinstance(raw_content, dict): return raw_content class BaseMapping(): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - add a local property self.mapping - override translate function """ def translate(self, content_dict): """ Tranlsates content by parsing content to a json object and translating with the npm mapping (for now hard_coded mapping) Args: context_text (text): should be json Returns: dict: translated metadata in jsonb form needed for the indexer """ translated_metadata = {} default = 'other' translated_metadata['other'] = {} try: for k, v in content_dict.items(): try: term = self.mapping.get(k, default) if term not in translated_metadata: translated_metadata[term] = v continue if isinstance(translated_metadata[term], str): in_value = translated_metadata[term] translated_metadata[term] = [in_value, v] continue if isinstance(translated_metadata[term], list): translated_metadata[term].append(v) continue if isinstance(translated_metadata[term], dict): translated_metadata[term][k] = v continue except KeyError: self.log.exception( "Problem during item mapping") continue except: return None return 
translated_metadata class NpmMapping(BaseMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = { 'repository': 'codeRepository', 'os': 'operatingSystem', 'cpu': 'processorRequirements', 'engines': 'processorRequirements', 'dependencies': 'softwareRequirements', 'bundleDependencies': 'softwareRequirements', 'peerDependencies': 'softwareRequirements', 'author': 'author', 'contributor': 'contributor', 'keywords': 'keywords', 'license': 'license', 'version': 'version', 'description': 'description', 'name': 'name', 'devDependencies': 'softwareSuggestions', 'optionalDependencies': 'softwareSuggestions', 'bugs': 'issueTracker', 'homepage': 'url' } def translate(self, raw_content): content_dict = convert(raw_content) return super().translate(content_dict) class MavenMapping(BaseMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ mapping = { 'license': 'license', 'version': 'version', 'description': 'description', 'name': 'name', 'prerequisites': 'softwareRequirements', 'repositories': 'codeRepository', 'groupId': 'identifier', 'ciManagement': 'contIntegration', 'issuesManagement': 'issueTracker', } def translate(self, raw_content): content = convert(raw_content) # parse content from xml to dict return super().translate(content) class DoapMapping(BaseMapping): mapping = { } def translate(self, raw_content): content = convert(raw_content) # parse content from xml to dict return super().translate(content) def parse_xml(content): """ Parses content from xml to a python dict Args: - content (text): the string form of the raw_content ( in xml) Returns: - parsed_xml (dict): a python dict of the content after parsing """ # check if xml # use xml parser to dict return content mapping_tool_fn = { "npm": NpmMapping(), "maven": MavenMapping(), "doap_xml": DoapMapping() } def compute_metadata(context, raw_content): """ first landing method: a dispatcher that sends content to the right function to carry out the real parsing of syntax and translation of terms Args: context (text): defines to which function/tool the content is sent content (text): the string form of the raw_content Returns: dict: translated metadata jsonb dictionary needed for the indexer to store in storage """ if raw_content is None or raw_content is b"": return None # TODO: keep mapping not in code (maybe fetch crosswalk from storage?) 
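mapping_tool_fn is the whole dispatch table: compute_metadata() only looks the context up there and defers to that mapping's translate(). Mirroring the main() further down (the package.json payload is a sample; the comment shows the expected shape, not a guaranteed exact value):

```python
# Dispatch through mapping_tool_fn, as exercised by main() below.
from swh.indexer.metadata_dictionary import compute_metadata

raw = b'{"name": "test_name", "bugs": "https://example.com/issues"}'
result = compute_metadata('npm', raw)
# expected shape: {'other': {}, 'name': 'test_name',
#                  'issueTracker': 'https://example.com/issues'}
```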
# if fetched from storage should be done once for batch of sha1s dictionary = mapping_tool_fn[context] translated_metadata = dictionary.translate(raw_content) - # print(translated_metadata) return translated_metadata def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" result = compute_metadata("npm", raw_content) result1 = compute_metadata("maven", raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/storage.py b/swh/indexer/storage.py new file mode 100644 index 0000000..8325954 --- /dev/null +++ b/swh/indexer/storage.py @@ -0,0 +1,521 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +import json +import dateutil.parser +import psycopg2 + +from swh.storage.common import db_transaction_generator, db_transaction +from swh.storage.exc import StorageDBError +from .db import Db + +from . import converters + + +class IndexerStorage(): + """SWH Indexer Storage + + """ + def __init__(self, db): + """ + Args: + db_conn: either a libpq connection string, or a psycopg2 connection + obj_root: path to the root of the object storage + + """ + try: + if isinstance(db, psycopg2.extensions.connection): + self.db = Db(db) + else: + self.db = Db.connect(db) + except psycopg2.OperationalError as e: + raise StorageDBError(e) + + def check_config(self, *, check_write): + """Check that the storage is configured and ready to go.""" + # Check permissions on one of the tables + with self.db.transaction() as cur: + if check_write: + check = 'INSERT' + else: + check = 'SELECT' + + cur.execute( + "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa + (check,) + ) + return cur.fetchone()[0] + + return True + + @db_transaction_generator + def content_mimetype_missing(self, mimetypes, cur=None): + """List mimetypes missing from storage. + + Args: + mimetypes (iterable): iterable of dict with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: an iterable of missing id for the triplets id, tool_name, + tool_version + + """ + db = self.db + db.mktemp_content_mimetype_missing(cur) + db.copy_to(mimetypes, 'tmp_content_mimetype_missing', + ['id', 'indexer_configuration_id'], + cur) + for obj in db.content_mimetype_missing_from_temp(cur): + yield obj[0] + + @db_transaction + def content_mimetype_add(self, mimetypes, conflict_update=False, cur=None): + """Add mimetypes not present in storage. 
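Together with get_storage() from __init__.py, the missing/add pair supports the usual indexer loop: filter out what is already indexed, compute, then add. A sketch, where the DSN and tool id are placeholders:

```python
# Typical missing -> add cycle against IndexerStorage; the 'db' value and
# the tool id are placeholders for a real configuration.
from swh.indexer import get_storage

storage = get_storage(cls='local',
                      args={'db': 'dbname=softwareheritage-indexer-dev'})

candidates = [{'id': b'some-id', 'indexer_configuration_id': 1}]
to_index = list(storage.content_mimetype_missing(candidates))
storage.content_mimetype_add([
    {'id': sha1, 'mimetype': b'text/plain', 'encoding': b'utf-8',
     'indexer_configuration_id': 1}
    for sha1 in to_index
])
```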
+ + Args: + mimetypes (iterable): dictionaries with keys: + + - id (bytes): sha1 identifier + - mimetype (bytes): raw content's mimetype + - encoding (bytes): raw content's encoding + - indexer_configuration_id (int): tool's id used to + compute the results + - conflict_update: Flag to determine if we want to + overwrite (true) or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_content_mimetype(cur) + db.copy_to(mimetypes, 'tmp_content_mimetype', + ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], + cur) + db.content_mimetype_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_mimetype_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_mimetype_get_from_temp(): + yield converters.db_to_mimetype( + dict(zip(db.content_mimetype_cols, c))) + + @db_transaction_generator + def content_language_missing(self, languages, cur=None): + """List languages missing from storage. + + Args: + languages (iterable): dictionaries with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: identifiers of missing languages + + """ + db = self.db + db.mktemp_content_language_missing(cur) + db.copy_to(languages, 'tmp_content_language_missing', + ['id', 'indexer_configuration_id'], cur) + for obj in db.content_language_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def content_language_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_language_get_from_temp(): + yield converters.db_to_language( + dict(zip(db.content_language_cols, c))) + + @db_transaction + def content_language_add(self, languages, conflict_update=False, cur=None): + """Add languages not present in storage. + + Args: + languages (iterable): dictionaries with keys: + + - id: sha1 + - lang: bytes + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_content_language(cur) + # empty language is mapped to 'unknown' + db.copy_to( + ({ + 'id': l['id'], + 'lang': 'unknown' if not l['lang'] else l['lang'], + 'indexer_configuration_id': l['indexer_configuration_id'], + } for l in languages), + 'tmp_content_language', + ['id', 'lang', 'indexer_configuration_id'], cur) + + db.content_language_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_ctags_missing(self, ctags, cur=None): + """List ctags missing from storage. + + Args: + ctags (iterable): dicts with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool name used + - tool_version (str): associated version + + Returns: + an iterable of missing id + + """ + db = self.db + + db.mktemp_content_ctags_missing(cur) + db.copy_to(ctags, + tblname='tmp_content_ctags_missing', + columns=['id', 'indexer_configuration_id'], + cur=cur) + for obj in db.content_ctags_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def content_ctags_get(self, ids, cur=None): + """Retrieve ctags per id. 
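Note that content_language_add quietly rewrites falsy languages to 'unknown' before the COPY. The equivalent expansion on its own, with invented rows:

```python
# The normalization content_language_add applies before copy_to().
languages = [
    {'id': b'id-a', 'lang': None, 'indexer_configuration_id': 4},
    {'id': b'id-b', 'lang': 'haskell', 'indexer_configuration_id': 4},
]
normalized = [{
    'id': l['id'],
    'lang': 'unknown' if not l['lang'] else l['lang'],
    'indexer_configuration_id': l['indexer_configuration_id'],
} for l in languages]
# -> 'unknown' for id-a, 'haskell' kept for id-b
```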
+ + Args: + ids (iterable): sha1 checksums + + """ + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_ctags_get_from_temp(): + yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) + + @db_transaction + def content_ctags_add(self, ctags, conflict_update=False, cur=None): + """Add ctags not present in storage + + Args: + ctags (iterable): dictionaries with keys: + + - id (bytes): sha1 + - ctags ([list): List of dictionary with keys: name, kind, + line, language + + """ + db = self.db + + def _convert_ctags(__ctags): + """Convert ctags dict to list of ctags. + + """ + for ctags in __ctags: + yield from converters.ctags_to_db(ctags) + + db.mktemp_content_ctags(cur) + db.copy_to(list(_convert_ctags(ctags)), + tblname='tmp_content_ctags', + columns=['id', 'name', 'kind', 'line', + 'lang', 'indexer_configuration_id'], + cur=cur) + + db.content_ctags_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_ctags_search(self, expression, + limit=10, last_sha1=None, cur=None): + """Search through content's raw ctags symbols. + + Args: + expression (str): Expression to search for + limit (int): Number of rows to return (default to 10). + last_sha1 (str): Offset from which retrieving data (default to ''). + + Yields: + rows of ctags including id, name, lang, kind, line, etc... + + """ + db = self.db + + for obj in db.content_ctags_search(expression, last_sha1, limit, + cur=cur): + yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) + + @db_transaction_generator + def content_fossology_license_get(self, ids, cur=None): + """Retrieve licenses per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + - id (bytes) + - licenses ([str]): associated licenses for that content + + """ + db = self.db + db.store_tmp_bytea(ids, cur) + + for c in db.content_fossology_license_get_from_temp(): + license = dict(zip(db.content_fossology_license_cols, c)) + yield converters.db_to_fossology_license(license) + + @db_transaction + def content_fossology_license_add(self, licenses, + conflict_update=False, cur=None): + """Add licenses not present in storage. + + Args: + licenses (iterable): dictionaries with keys: + + - id: sha1 + - license ([bytes]): List of licenses associated to sha1 + - tool (str): nomossa + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + Returns: + list: content_license entries which failed due to unknown licenses + + """ + db = self.db + + # Then, we add the correct ones + db.mktemp_content_fossology_license(cur) + db.copy_to( + ({ + 'id': sha1['id'], + 'indexer_configuration_id': sha1['indexer_configuration_id'], + 'license': license, + } for sha1 in licenses + for license in sha1['licenses']), + tblname='tmp_content_fossology_license', + columns=['id', 'license', 'indexer_configuration_id'], + cur=cur) + db.content_fossology_license_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_metadata_missing(self, metadatas, cur=None): + """List metadatas missing from storage. 
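content_fossology_license_add stores one row per (content, license) pair, so a multi-license input is fanned out before the COPY. The same expansion in isolation, on sample input:

```python
# The (id, license) fan-out performed inside
# content_fossology_license_add; sample input only.
licenses = [{'id': b'some-id',
             'licenses': ['GPL-2.0+', 'Apache-2.0'],
             'indexer_configuration_id': 2}]
rows = [{'id': e['id'],
         'license': license,
         'indexer_configuration_id': e['indexer_configuration_id']}
        for e in licenses
        for license in e['licenses']]
assert len(rows) == 2  # one row per license for the same sha1
```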
+ + Args: + metadatas (iterable): dictionaries with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: missing ids + + """ + db = self.db + db.mktemp_content_metadata_missing(cur) + db.copy_to(metadatas, 'tmp_content_metadata_missing', + ['id', 'indexer_configuration_id'], cur) + for obj in db.content_metadata_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def content_metadata_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_metadata_get_from_temp(): + yield converters.db_to_metadata( + dict(zip(db.content_metadata_cols, c))) + + @db_transaction + def content_metadata_add(self, metadatas, conflict_update=False, cur=None): + """Add metadatas not present in storage. + + Args: + metadatas (iterable): dictionaries with keys: + + - id: sha1 + - translated_metadata: bytes / jsonb ? + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_content_metadata(cur) + # empty metadata is mapped to 'unknown' + + db.copy_to(metadatas, 'tmp_content_metadata', + ['id', 'translated_metadata', 'indexer_configuration_id'], + cur) + db.content_metadata_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def revision_metadata_missing(self, metadatas, cur=None): + """List metadatas missing from storage. + + Args: + metadatas (iterable): dictionaries with keys: + + - id (bytes): sha1_git revision identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: missing ids + + """ + db = self.db + db.mktemp_revision_metadata_missing(cur) + db.copy_to(metadatas, 'tmp_revision_metadata_missing', + ['id', 'indexer_configuration_id'], cur) + for obj in db.revision_metadata_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def revision_metadata_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.revision_metadata_get_from_temp(): + yield converters.db_to_metadata( + dict(zip(db.revision_metadata_cols, c))) + + @db_transaction + def revision_metadata_add(self, metadatas, + conflict_update=False, cur=None): + """Add metadatas not present in storage. + + Args: + metadatas (iterable): dictionaries with keys: + + - id: sha1_git of revision + - translated_metadata: bytes / jsonb ? + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_revision_metadata(cur) + # empty metadata is mapped to 'unknown' + + db.copy_to(metadatas, 'tmp_revision_metadata', + ['id', 'translated_metadata', 'indexer_configuration_id'], + cur) + db.revision_metadata_add_from_temp(conflict_update, cur) + + @db_transaction + def origin_metadata_add(self, origin_id, ts, provider, tool, metadata, + cur=None): + """ Add an origin_metadata for the origin at ts with provenance and + metadata. 
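The revision-level API mirrors the content-level one, only keyed by sha1_git. A short add/get cycle; the revision id reuses a value from the test fixtures below, while the storage setup and tool id are placeholders:

```python
# Add/get cycle for revision metadata (placeholder setup).
from swh.indexer import get_storage

storage = get_storage(cls='local',
                      args={'db': 'dbname=softwareheritage-indexer-dev'})
rev_id = bytes.fromhex('7026b7c1a2af56521e951c01ed20f255fa054238')

storage.revision_metadata_add([{
    'id': rev_id,
    'translated_metadata': {'name': 'test_metadata', 'version': '0.0.1'},
    'indexer_configuration_id': 7,  # hypothetical tool id
}])
found = list(storage.revision_metadata_get([rev_id]))
```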
+ + Args: + origin_id (int): the origin's id for which the metadata is added + ts (datetime): timestamp of the found metadata + provider (int): the provider of metadata (ex:'hal') + tool (int): tool used to extract metadata + metadata (jsonb): the metadata retrieved at the time and location + + Returns: + id (int): the origin_metadata unique id + """ + if isinstance(ts, str): + ts = dateutil.parser.parse(ts) + + return self.db.origin_metadata_add(origin_id, ts, provider, tool, + metadata, cur) + + @db_transaction_generator + def origin_metadata_get_by(self, origin_id, provider_type=None, cur=None): + """Retrieve list of all origin_metadata entries for the origin_id + + Args: + origin_id (int): the unique origin identifier + provider_type (str): (optional) type of provider + + Returns: + list of dicts: the origin_metadata dictionary with the keys: + + - id (int): origin_metadata's id + - origin_id (int): origin's id + - discovery_date (datetime): timestamp of discovery + - tool_id (int): metadata's extracting tool + - metadata (jsonb) + - provider_id (int): metadata's provider + - provider_name (str) + - provider_type (str) + - provider_url (str) + + """ + db = self.db + for line in db.origin_metadata_get_by(origin_id, provider_type, cur): + yield dict(zip(db.origin_metadata_get_cols, line)) + + @db_transaction_generator + def indexer_configuration_add(self, tools, cur=None): + """Add new tools to the storage. + + Args: + tools ([dict]): List of dictionary representing tool to + insert in the db. Dictionary with the following keys:: + + tool_name (str): tool's name + tool_version (str): tool's version + tool_configuration (dict): tool's configuration (free form + dict) + + Returns: + List of dict inserted in the db (holding the id key as + well). The order of the list is not guaranteed to match + the order of the initial list. + + """ + db = self.db + db.mktemp_indexer_configuration(cur) + db.copy_to(tools, 'tmp_indexer_configuration', + ['tool_name', 'tool_version', 'tool_configuration'], + cur) + + tools = db.indexer_configuration_add_from_temp(cur) + for line in tools: + yield dict(zip(db.indexer_configuration_cols, line)) + + @db_transaction + def indexer_configuration_get(self, tool, cur=None): + """Retrieve tool information. + + Args: + tool (dict): Dictionary representing a tool with the + following keys:: + + tool_name (str): tool's name + tool_version (str): tool's version + tool_configuration (dict): tool's configuration (free form + dict) + + Returns: + The identifier of the tool if it exists, None otherwise. 
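indexer_configuration_add yields the inserted rows with their ids, and indexer_configuration_get finds them again from the same three fields (the configuration dict is serialized to JSON for the lookup). A round trip, with an illustrative tool description:

```python
# Register-then-lookup round trip; the tool description is illustrative
# and `storage` is an IndexerStorage as sketched earlier.
tool = {
    'tool_name': 'nomos',
    'tool_version': '3.1.0rc2-31-ga2cbb8c',
    'tool_configuration': {'command_line': 'nomossa <filepath>'},
}
registered = list(storage.indexer_configuration_add([tool]))
found = storage.indexer_configuration_get(tool)
assert found is not None and found['id'] == registered[0]['id']
```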
+ + """ + db = self.db + tool_conf = tool['tool_configuration'] + if isinstance(tool_conf, dict): + tool_conf = json.dumps(tool_conf) + idx = db.indexer_configuration_get(tool['tool_name'], + tool['tool_version'], + tool_conf) + if not idx: + return None + return dict(zip(self.db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/tests/__init__.py b/swh/indexer/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/indexer/tests/common.py b/swh/indexer/tests/common.py new file mode 100644 index 0000000..e397b48 --- /dev/null +++ b/swh/indexer/tests/common.py @@ -0,0 +1,56 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pathlib + +from swh.indexer import get_storage + + +class StorageTestFixture: + """Mix this in a test subject class to get Storage testing support. + + This fixture requires to come before DbTestFixture in the inheritance list + as it uses its methods to setup its own internal database. + + Usage example: + + class TestStorage(StorageTestFixture, DbTestFixture): + ... + """ + TEST_STORAGE_DB_NAME = 'softwareheritage-test-indexer' + + @classmethod + def setUpClass(cls): + if not hasattr(cls, 'DB_TEST_FIXTURE_IMPORTED'): + raise RuntimeError("StorageTestFixture needs to be followed by " + "DbTestFixture in the inheritance list.") + + test_dir = pathlib.Path(__file__).absolute().parent + test_data_dir = test_dir / '../../../../swh-storage-testdata' + test_db_dump = (test_data_dir / 'dumps/swh.dump').absolute() + cls.add_db(cls.TEST_STORAGE_DB_NAME, str(test_db_dump), 'pg_dump') + super().setUpClass() + + def setUp(self): + super().setUp() + + self.storage_config = { + 'cls': 'local', + 'args': { + 'db': self.test_db[self.TEST_STORAGE_DB_NAME].conn, + }, + } + self.storage = get_storage(**self.storage_config) + + def tearDown(self): + self.objtmp.cleanup() + super().tearDown() + + def reset_storage_tables(self): + excluded = {'indexer_configuration'} + self.reset_db_tables(self.TEST_STORAGE_DB_NAME, excluded=excluded) + + db = self.test_db[self.TEST_STORAGE_DB_NAME] + db.conn.commit() diff --git a/swh/indexer/tests/test_converters.py b/swh/indexer/tests/test_converters.py new file mode 100644 index 0000000..61410eb --- /dev/null +++ b/swh/indexer/tests/test_converters.py @@ -0,0 +1,199 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from nose.tools import istest +from nose.plugins.attrib import attr + +from swh.indexer import converters + + +@attr('!db') +class TestConverters(unittest.TestCase): + def setUp(self): + self.maxDiff = None + + @istest + def ctags_to_db(self): + input_ctag = { + 'id': b'some-id', + 'indexer_configuration_id': 100, + 'ctags': [ + { + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + }, { + 'name': 'main', + 'kind': 'function', + 'line': 12, + 'lang': 'Yaml', + }, + ] + } + + expected_ctags = [ + { + 'id': b'some-id', + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + 'indexer_configuration_id': 100, + }, { + 'id': b'some-id', + 'name': 'main', + 'kind': 'function', + 'line': 12, + 'lang': 'Yaml', + 
'indexer_configuration_id': 100, + }] + + # when + actual_ctags = list(converters.ctags_to_db(input_ctag)) + + # then + self.assertEquals(actual_ctags, expected_ctags) + + @istest + def db_to_ctags(self): + input_ctags = { + 'id': b'some-id', + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + 'tool_id': 200, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {} + } + expected_ctags = { + 'id': b'some-id', + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + 'tool': { + 'id': 200, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + # when + actual_ctags = converters.db_to_ctags(input_ctags) + + # then + self.assertEquals(actual_ctags, expected_ctags) + + @istest + def db_to_mimetype(self): + input_mimetype = { + 'id': b'some-id', + 'tool_id': 10, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {}, + 'encoding': b'ascii', + 'mimetype': b'text/plain', + } + + expected_mimetype = { + 'id': b'some-id', + 'encoding': b'ascii', + 'mimetype': b'text/plain', + 'tool': { + 'id': 10, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + actual_mimetype = converters.db_to_mimetype(input_mimetype) + + self.assertEquals(actual_mimetype, expected_mimetype) + + @istest + def db_to_language(self): + input_language = { + 'id': b'some-id', + 'tool_id': 20, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {}, + 'lang': b'css', + } + + expected_language = { + 'id': b'some-id', + 'lang': b'css', + 'tool': { + 'id': 20, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + actual_language = converters.db_to_language(input_language) + + self.assertEquals(actual_language, expected_language) + + @istest + def db_to_fossology_license(self): + input_license = { + 'id': b'some-id', + 'tool_id': 20, + 'tool_name': 'nomossa', + 'tool_version': '5.22', + 'tool_configuration': {}, + 'licenses': ['GPL2.0'], + } + + expected_license = { + 'id': b'some-id', + 'licenses': ['GPL2.0'], + 'tool': { + 'id': 20, + 'name': 'nomossa', + 'version': '5.22', + 'configuration': {}, + } + } + + actual_license = converters.db_to_fossology_license(input_license) + + self.assertEquals(actual_license, expected_license) + + @istest + def db_to_metadata(self): + input_metadata = { + 'id': b'some-id', + 'tool_id': 20, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {}, + 'translated_metadata': b'translated_metadata', + } + + expected_metadata = { + 'id': b'some-id', + 'translated_metadata': b'translated_metadata', + 'tool': { + 'id': 20, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + actual_metadata = converters.db_to_metadata(input_metadata) + + self.assertEquals(actual_metadata, expected_metadata) diff --git a/swh/indexer/tests/test_storage.py b/swh/indexer/tests/test_storage.py new file mode 100644 index 0000000..37172ff --- /dev/null +++ b/swh/indexer/tests/test_storage.py @@ -0,0 +1,1439 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from nose.tools import istest +from nose.plugins.attrib import attr +from 
swh.model.hashutil import hash_to_bytes + +from swh.core.tests.db_testing import DbTestFixture +from .test_utils import StorageTestFixture + + +@attr('db') +class IndexerTestStorage(StorageTestFixture, DbTestFixture, unittest.TestCase): + """Base class for Indexer Storage testing. + + """ + def setUp(self): + super().setUp() + + db = self.test_db[self.TEST_STORAGE_DB_NAME] + self.conn = db.conn + self.cursor = db.cursor + + self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689') + self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7') + self.revision_id_1 = hash_to_bytes( + '7026b7c1a2af56521e951c01ed20f255fa054238') + self.revision_id_2 = hash_to_bytes( + '7026b7c1a2af56521e9587659012345678904321') + + def tearDown(self): + self.reset_storage_tables() + super().tearDown() + + def fetch_tools(self): + tools = {} + self.cursor.execute(''' + select tool_name, id, tool_version, tool_configuration + from indexer_configuration + order by id''') + for row in self.cursor.fetchall(): + key = row[0] + while key in tools: + key = '_' + key + tools[key] = { + 'id': row[1], + 'name': row[0], + 'version': row[2], + 'configuration': row[3] + } + + return tools + + @istest + def check_config(self): + self.assertTrue(self.storage.check_config(check_write=True)) + self.assertTrue(self.storage.check_config(check_write=False)) + + @istest + def content_mimetype_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetypes = [ + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }] + + # when + actual_missing = self.storage.content_mimetype_missing(mimetypes) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_1, + self.sha1_2, + ]) + + # given + self.storage.content_mimetype_add([{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + }]) + + # when + actual_missing = self.storage.content_mimetype_missing(mimetypes) + + # then + self.assertEqual(list(actual_missing), [self.sha1_1]) + + @istest + def content_mimetype_add__drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetype_v1 = { + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_mimetype_add([mimetype_v1]) + + # when + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + # then + expected_mimetypes_v1 = [{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'tool': tools['file'], + }] + self.assertEqual(actual_mimetypes, expected_mimetypes_v1) + + # given + mimetype_v2 = mimetype_v1.copy() + mimetype_v2.update({ + 'mimetype': b'text/html', + 'encoding': b'us-ascii', + }) + + self.storage.content_mimetype_add([mimetype_v2]) + + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + # mimetype did not change as the v2 was dropped. 
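The drop-duplicate and update-in-place tests in this file pin down the conflict_update contract; condensed, with the fixture sha1 and tool id reused and `storage` standing for the fixture's IndexerStorage:

```python
# conflict_update contract exercised by this pair of tests.
sha1 = bytes.fromhex('61c2b3a30496d329e21af70dd2d7e097046d07b7')
v1 = {'id': sha1, 'mimetype': b'text/plain', 'encoding': b'utf-8',
      'indexer_configuration_id': 2}
v2 = dict(v1, mimetype=b'text/html', encoding=b'us-ascii')

storage.content_mimetype_add([v1])
storage.content_mimetype_add([v2])                        # duplicate dropped
storage.content_mimetype_add([v2], conflict_update=True)  # v1 overwritten
```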
+ self.assertEqual(actual_mimetypes, expected_mimetypes_v1) + + @istest + def content_mimetype_add__update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetype_v1 = { + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_mimetype_add([mimetype_v1]) + + # when + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + expected_mimetypes_v1 = [{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'tool': tools['file'], + }] + + # then + self.assertEqual(actual_mimetypes, expected_mimetypes_v1) + + # given + mimetype_v2 = mimetype_v1.copy() + mimetype_v2.update({ + 'mimetype': b'text/html', + 'encoding': b'us-ascii', + }) + + self.storage.content_mimetype_add([mimetype_v2], conflict_update=True) + + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + expected_mimetypes_v2 = [{ + 'id': self.sha1_2, + 'mimetype': b'text/html', + 'encoding': b'us-ascii', + 'tool': { + 'id': 2, + 'name': 'file', + 'version': '5.22', + 'configuration': {'command_line': 'file --mime '} + } + }] + + # mimetype did change as the v2 was used to overwrite v1 + self.assertEqual(actual_mimetypes, expected_mimetypes_v2) + + @istest + def content_mimetype_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetypes = [self.sha1_2, self.sha1_1] + + mimetype1 = { + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_mimetype_add([mimetype1]) + + # then + actual_mimetypes = list(self.storage.content_mimetype_get(mimetypes)) + + # then + expected_mimetypes = [{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'tool': tools['file'] + }] + + self.assertEqual(actual_mimetypes, expected_mimetypes) + + @istest + def content_language_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + languages = [ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = list(self.storage.content_language_missing(languages)) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_2, + self.sha1_1, + ]) + + # given + self.storage.content_language_add([{ + 'id': self.sha1_2, + 'lang': 'haskell', + 'indexer_configuration_id': tool_id, + }]) + + # when + actual_missing = list(self.storage.content_language_missing(languages)) + + # then + self.assertEqual(actual_missing, [self.sha1_1]) + + @istest + def content_language_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + language1 = { + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_language_add([language1]) + + # then + actual_languages = list(self.storage.content_language_get( + [self.sha1_2, self.sha1_1])) + + # then + expected_languages = [{ + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'tool': tools['pygments'] + }] + + self.assertEqual(actual_languages, expected_languages) + + @istest + def content_language_add__drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + language_v1 = { + 'id': self.sha1_2, + 'lang': 'emacslisp', + 'indexer_configuration_id': tool_id, + } + + # given + 
self.storage.content_language_add([language_v1]) + + # when + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # then + expected_languages_v1 = [{ + 'id': self.sha1_2, + 'lang': 'emacslisp', + 'tool': tools['pygments'] + }] + self.assertEqual(actual_languages, expected_languages_v1) + + # given + language_v2 = language_v1.copy() + language_v2.update({ + 'lang': 'common-lisp', + }) + + self.storage.content_language_add([language_v2]) + + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # language did not change as the v2 was dropped. + self.assertEqual(actual_languages, expected_languages_v1) + + @istest + def content_language_add__update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + language_v1 = { + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_language_add([language_v1]) + + # when + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # then + expected_languages_v1 = [{ + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'tool': tools['pygments'] + }] + self.assertEqual(actual_languages, expected_languages_v1) + + # given + language_v2 = language_v1.copy() + language_v2.update({ + 'lang': 'emacslisp', + }) + + self.storage.content_language_add([language_v2], conflict_update=True) + + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # language did not change as the v2 was dropped. + expected_languages_v2 = [{ + 'id': self.sha1_2, + 'lang': 'emacslisp', + 'tool': tools['pygments'] + }] + + # language did change as the v2 was used to overwrite v1 + self.assertEqual(actual_languages, expected_languages_v2) + + @istest + def content_ctags_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['universal-ctags']['id'] + + ctags = [ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = self.storage.content_ctags_missing(ctags) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_2, + self.sha1_1 + ]) + + # given + self.storage.content_ctags_add([ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [{ + 'name': 'done', + 'kind': 'variable', + 'line': 119, + 'lang': 'OCaml', + }] + }, + ]) + + # when + actual_missing = self.storage.content_ctags_missing(ctags) + + # then + self.assertEqual(list(actual_missing), [self.sha1_1]) + + @istest + def content_ctags_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['universal-ctags']['id'] + + ctags = [self.sha1_2, self.sha1_1] + + ctag1 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [ + { + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Python', + }, + { + 'name': 'main', + 'kind': 'function', + 'line': 119, + 'lang': 'Python', + }] + } + + # when + self.storage.content_ctags_add([ctag1]) + + # then + actual_ctags = list(self.storage.content_ctags_get(ctags)) + + # then + + expected_ctags = [ + { + 'id': self.sha1_2, + 'tool': tools['universal-ctags'], + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Python', + }, + { + 'id': self.sha1_2, + 'tool': tools['universal-ctags'], + 'name': 'main', + 'kind': 'function', + 'line': 119, + 'lang': 'Python', + } + ] + + self.assertEqual(actual_ctags, expected_ctags) + + @istest + def content_ctags_search(self): + # 1. 
given + tools = self.fetch_tools() + tool = tools['universal-ctags'] + tool_id = tool['id'] + + ctag1 = { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + 'ctags': [ + { + 'name': 'hello', + 'kind': 'function', + 'line': 133, + 'lang': 'Python', + }, + { + 'name': 'counter', + 'kind': 'variable', + 'line': 119, + 'lang': 'Python', + }, + ] + } + + ctag2 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [ + { + 'name': 'hello', + 'kind': 'variable', + 'line': 100, + 'lang': 'C', + }, + ] + } + + self.storage.content_ctags_add([ctag1, ctag2]) + + # 1. when + actual_ctags = list(self.storage.content_ctags_search('hello', + limit=1)) + + # 1. then + self.assertEqual(actual_ctags, [ + { + 'id': ctag1['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'function', + 'line': 133, + 'lang': 'Python', + } + ]) + + # 2. when + actual_ctags = list(self.storage.content_ctags_search( + 'hello', + limit=1, + last_sha1=ctag1['id'])) + + # 2. then + self.assertEqual(actual_ctags, [ + { + 'id': ctag2['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'variable', + 'line': 100, + 'lang': 'C', + } + ]) + + # 3. when + actual_ctags = list(self.storage.content_ctags_search('hello')) + + # 3. then + self.assertEqual(actual_ctags, [ + { + 'id': ctag1['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'function', + 'line': 133, + 'lang': 'Python', + }, + { + 'id': ctag2['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'variable', + 'line': 100, + 'lang': 'C', + }, + ]) + + # 4. when + actual_ctags = list(self.storage.content_ctags_search('counter')) + + # then + self.assertEqual(actual_ctags, [{ + 'id': ctag1['id'], + 'tool': tool, + 'name': 'counter', + 'kind': 'variable', + 'line': 119, + 'lang': 'Python', + }]) + + @istest + def content_ctags_search_no_result(self): + actual_ctags = list(self.storage.content_ctags_search('counter')) + + self.assertEquals(actual_ctags, []) + + @istest + def content_ctags_add__add_new_ctags_added(self): + # given + tools = self.fetch_tools() + tool = tools['universal-ctags'] + tool_id = tool['id'] + + ctag_v1 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [{ + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + }] + } + + # given + self.storage.content_ctags_add([ctag_v1]) + self.storage.content_ctags_add([ctag_v1]) # conflict does nothing + + # when + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + # then + expected_ctags = [{ + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool, + }] + + self.assertEqual(actual_ctags, expected_ctags) + + # given + ctag_v2 = ctag_v1.copy() + ctag_v2.update({ + 'ctags': [ + { + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + } + ] + }) + + self.storage.content_ctags_add([ctag_v2]) + + expected_ctags = [ + { + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool, + }, { + 'id': self.sha1_2, + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + 'tool': tool, + } + ] + + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + self.assertEqual(actual_ctags, expected_ctags) + + @istest + def content_ctags_add__update_in_place(self): + # given + tools = self.fetch_tools() + tool = tools['universal-ctags'] + tool_id = tool['id'] + + ctag_v1 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [{ + 'name': 'done', + 'kind': 'variable', 
+ 'line': 100, + 'lang': 'Scheme', + }] + } + + # given + self.storage.content_ctags_add([ctag_v1]) + + # when + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + # then + expected_ctags = [ + { + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool + } + ] + self.assertEqual(actual_ctags, expected_ctags) + + # given + ctag_v2 = ctag_v1.copy() + ctag_v2.update({ + 'ctags': [ + { + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + }, + { + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + } + ] + }) + + self.storage.content_ctags_add([ctag_v2], conflict_update=True) + + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + # ctag did change as the v2 was used to overwrite v1 + expected_ctags = [ + { + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool, + }, + { + 'id': self.sha1_2, + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + 'tool': tool, + } + ] + self.assertEqual(actual_ctags, expected_ctags) + + @istest + def content_fossology_license_get(self): + # given + tools = self.fetch_tools() + tool = tools['nomos'] + tool_id = tool['id'] + + license1 = { + 'id': self.sha1_1, + 'licenses': ['GPL-2.0+'], + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_fossology_license_add([license1]) + + # then + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_2, self.sha1_1])) + + expected_license = { + 'id': self.sha1_1, + 'licenses': ['GPL-2.0+'], + 'tool': tool, + } + + # then + self.assertEqual(actual_licenses, [expected_license]) + + @istest + def content_fossology_license_add__new_license_added(self): + # given + tools = self.fetch_tools() + tool = tools['nomos'] + tool_id = tool['id'] + + license_v1 = { + 'id': self.sha1_1, + 'licenses': ['Apache-2.0'], + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_fossology_license_add([license_v1]) + # conflict does nothing + self.storage.content_fossology_license_add([license_v1]) + + # when + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + # then + expected_license = { + 'id': self.sha1_1, + 'licenses': ['Apache-2.0'], + 'tool': tool, + } + self.assertEqual(actual_licenses, [expected_license]) + + # given + license_v2 = license_v1.copy() + license_v2.update({ + 'licenses': ['BSD-2-Clause'], + }) + + self.storage.content_fossology_license_add([license_v2]) + + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + expected_license.update({ + 'licenses': ['Apache-2.0', 'BSD-2-Clause'], + }) + + # license did not change as the v2 was dropped. 
+ self.assertEqual(actual_licenses, [expected_license]) + + @istest + def content_fossology_license_add__update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool = tools['nomos'] + tool_id = tool['id'] + + license_v1 = { + 'id': self.sha1_1, + 'licenses': ['CECILL'], + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_fossology_license_add([license_v1]) + # conflict does nothing + self.storage.content_fossology_license_add([license_v1]) + + # when + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + # then + expected_license = { + 'id': self.sha1_1, + 'licenses': ['CECILL'], + 'tool': tool, + } + self.assertEqual(actual_licenses, [expected_license]) + + # given + license_v2 = license_v1.copy() + license_v2.update({ + 'licenses': ['CECILL-2.0'] + }) + + self.storage.content_fossology_license_add([license_v2], + conflict_update=True) + + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + # license did change as the v2 was used to overwrite v1 + expected_license.update({ + 'licenses': ['CECILL-2.0'] + }) + self.assertEqual(actual_licenses, [expected_license]) + + @istest + def content_metadata_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadatas = [ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = list(self.storage.content_metadata_missing(metadatas)) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_2, + self.sha1_1, + ]) + + # given + self.storage.content_metadata_add([{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'codeRepository': { + 'type': 'git', + 'url': 'https://github.com/moranegg/metadata_test' + }, + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id + }]) + + # when + actual_missing = list(self.storage.content_metadata_missing(metadatas)) + + # then + self.assertEqual(actual_missing, [self.sha1_1]) + + @istest + def content_metadata_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadata1 = { + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'codeRepository': { + 'type': 'git', + 'url': 'https://github.com/moranegg/metadata_test' + }, + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_metadata_add([metadata1]) + + # then + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2, self.sha1_1])) + + expected_metadatas = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'codeRepository': { + 'type': 'git', + 'url': 'https://github.com/moranegg/metadata_test' + }, + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas) + + @istest + def content_metadata_add_drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadata_v1 = { + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id, + } + + # 
given + self.storage.content_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) + + expected_metadatas_v1 = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'translated_metadata': { + 'other': {}, + 'name': 'test_drop_duplicated_metadata', + 'version': '0.0.1' + }, + }) + + self.storage.content_metadata_add([metadata_v2]) + + # then + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) + + # metadata did not change as the v2 was dropped. + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + @istest + def content_metadata_add_update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadata_v1 = { + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) + + # then + expected_metadatas_v1 = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'translated_metadata': { + 'other': {}, + 'name': 'test_update_duplicated_metadata', + 'version': '0.0.1' + }, + }) + self.storage.content_metadata_add([metadata_v2], conflict_update=True) + + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) +
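+        # Summary of the two conflict policies this pair of tests exercises
+        # (assumed semantics, inferred from the expectations around them):
+        #   content_metadata_add([v2])                       -> v2 dropped, v1 kept
+        #   content_metadata_add([v2], conflict_update=True) -> v1 overwritten by v2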
+ expected_metadatas_v2 = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_update_duplicated_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + + # metadata did change as the v2 was used to overwrite v1 + self.assertEqual(actual_metadatas, expected_metadatas_v2) + + @istest + def revision_metadata_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadatas = [ + { + 'id': self.revision_id_1, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.revision_id_2, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = list(self.storage.revision_metadata_missing( + metadatas)) + + # then + self.assertEqual(list(actual_missing), [ + self.revision_id_1, + self.revision_id_2, + ]) + + # given + self.storage.revision_metadata_add([{ + 'id': self.revision_id_1, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id + }]) + + # when + actual_missing = list(self.storage.revision_metadata_missing( + metadatas)) + + # then + self.assertEqual(actual_missing, [self.revision_id_2]) + + @istest + def revision_metadata_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadata_rev = { + 'id': self.revision_id_2, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id + } + + # when + self.storage.revision_metadata_add([metadata_rev]) + + # then + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_2, self.revision_id_1])) + + expected_metadatas = [{ + 'id': self.revision_id_2, + 'translated_metadata': metadata_rev['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas) + + @istest + def revision_metadata_add_drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadata_v1 = { + 'id': self.revision_id_1, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.revision_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_1])) + + expected_metadatas_v1 = [{ + 'id': self.revision_id_1, + 'translated_metadata': metadata_v1['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + 
metadata_v2.update({ + 'translated_metadata': { + 'name': 'test_metadata', + 'author': 'MG', + }, + }) + + self.storage.revision_metadata_add([metadata_v2]) + + # then + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_1])) + + # metadata did not change as the v2 was dropped. + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + @istest + def revision_metadata_add_update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadata_v1 = { + 'id': self.revision_id_2, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.revision_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_2])) + + # then + expected_metadatas_v1 = [{ + 'id': self.revision_id_2, + 'translated_metadata': metadata_v1['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'translated_metadata': { + 'name': 'test_update_duplicated_metadata', + 'author': 'MG' + }, + }) + self.storage.revision_metadata_add([metadata_v2], conflict_update=True) + + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_2])) + + expected_metadatas_v2 = [{ + 'id': self.revision_id_2, + 'translated_metadata': metadata_v2['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + + # metadata did change as the v2 was used to overwrite v1 + self.assertEqual(actual_metadatas, expected_metadatas_v2) + + @istest + def indexer_configuration_add(self): + tool = { + 'tool_name': 'some-unknown-tool', + 'tool_version': 'some-version', + 'tool_configuration': {"debian-package": "some-package"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + self.assertIsNone(actual_tool) # does not exist + + # add it + actual_tools = list(self.storage.indexer_configuration_add([tool])) + + self.assertEqual(len(actual_tools), 1) + actual_tool = actual_tools[0] + self.assertIsNotNone(actual_tool) # now it exists + new_id = actual_tool.pop('id') + self.assertEqual(actual_tool, tool) + + actual_tools2 = list(self.storage.indexer_configuration_add([tool])) + actual_tool2 = actual_tools2[0] + self.assertIsNotNone(actual_tool2) # now it exists + new_id2 = actual_tool2.pop('id') + + self.assertEqual(new_id, new_id2) + self.assertEqual(actual_tool, actual_tool2) + + @istest + def indexer_configuration_add_multiple(self): + tool = { + 'tool_name': 'some-unknown-tool', + 'tool_version': 'some-version', + 'tool_configuration': {"debian-package": "some-package"}, + } + + actual_tools = list(self.storage.indexer_configuration_add([tool])) + self.assertEqual(len(actual_tools), 1) + + new_tools = [tool, { + 'tool_name': 'yet-another-tool', + 'tool_version': 'version', + 'tool_configuration': {}, + }] + + actual_tools = list(self.storage.indexer_configuration_add(new_tools)) + self.assertEqual(len(actual_tools), 2) + + # order not guaranteed, so we iterate over results to check
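+        # (each returned tool should carry a freshly assigned, non-None id
+        # and, once that id is popped, equal one of the submitted dicts; an
+        # assertCountEqual on id-stripped results would be an equivalent check)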
for tool in actual_tools: + _id = tool.pop('id') + self.assertIsNotNone(_id) + self.assertIn(tool, new_tools) + + @istest + def indexer_configuration_get_missing(self): + tool = { + 'tool_name': 'unknown-tool', + 'tool_version': '3.1.0rc2-31-ga2cbb8c', + 'tool_configuration': {"command_line": "nomossa <filepath>"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + self.assertIsNone(actual_tool) + + @istest + def indexer_configuration_get(self): + tool = { + 'tool_name': 'nomos', + 'tool_version': '3.1.0rc2-31-ga2cbb8c', + 'tool_configuration': {"command_line": "nomossa <filepath>"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + expected_tool = tool.copy() + expected_tool['id'] = 1 + + self.assertEqual(expected_tool, actual_tool) + + @istest + def indexer_configuration_metadata_get_missing_context(self): + tool = { + 'tool_name': 'swh-metadata-translator', + 'tool_version': '0.0.1', + 'tool_configuration': {"context": "unknown-context"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + self.assertIsNone(actual_tool) + + @istest + def indexer_configuration_metadata_get(self): + tool = { + 'tool_name': 'swh-metadata-translator', + 'tool_version': '0.0.1', + 'tool_configuration': {"type": "local", "context": "npm"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + expected_tool = tool.copy() + expected_tool['id'] = actual_tool['id'] + + self.assertEqual(expected_tool, actual_tool) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index 3626af8..f7599de 100644 --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,253 +1,303 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import pathlib + from swh.objstorage.exc import ObjNotFoundError +from swh.indexer import get_storage + + +class StorageTestFixture: + """Mix this in a test subject class to get Storage testing support. + + This fixture must come before DbTestFixture in the inheritance list, + as it uses DbTestFixture's methods to set up its own internal database. + + Usage example: + + class TestStorage(StorageTestFixture, DbTestFixture): + ... + """ + TEST_STORAGE_DB_NAME = 'softwareheritage-test-indexer' + + @classmethod + def setUpClass(cls): + if not hasattr(cls, 'DB_TEST_FIXTURE_IMPORTED'): + raise RuntimeError("StorageTestFixture needs to be followed by " + "DbTestFixture in the inheritance list.") + + test_dir = pathlib.Path(__file__).absolute().parent + test_data_dir = test_dir / '../../../../swh-storage-testdata' + test_db_dump = (test_data_dir / 'dumps/swh-indexer.dump').absolute() + cls.add_db(cls.TEST_STORAGE_DB_NAME, str(test_db_dump), 'pg_dump') + super().setUpClass() + + def setUp(self): + super().setUp() + + self.storage_config = { + 'cls': 'local', + 'args': { + 'db': self.test_db[self.TEST_STORAGE_DB_NAME].conn, + }, + } + self.storage = get_storage(**self.storage_config) + + def tearDown(self): + super().tearDown() + + def reset_storage_tables(self): + excluded = {'indexer_configuration'} + self.reset_db_tables(self.TEST_STORAGE_DB_NAME, excluded=excluded) + + db = self.test_db[self.TEST_STORAGE_DB_NAME] + db.conn.commit() class MockObjStorage: """Mock objstorage with predefined contents.
""" data = {} def __init__(self): self.data = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from nose.tools import istest from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', } def __iter__(self): yield from self.data.keys() def __contains__(self, sha1): return self.data.get(sha1) is not None def get(self, sha1): raw_content = self.data.get(sha1) if raw_content is None: raise ObjNotFoundError(sha1) return raw_content class MockStorage(): """Mock storage to simplify reading indexers' outputs. 
""" def content_metadata_missing(self, sha1s): yield from [] def content_metadata_add(self, metadata, conflict_update=None): self.state = metadata self.conflict_update = conflict_update def revision_metadata_add(self, metadata, conflict_update=None): self.state = metadata self.conflict_update = conflict_update def indexer_configuration_add(self, tools): tool = tools[0] if tool['tool_name'] == 'swh-metadata-translator': return [{ 'id': 30, 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'npm' }, }] elif tool['tool_name'] == 'swh-metadata-detector': return [{ 'id': 7, 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'npm' }, }] def revision_get(self, revisions): return [{ 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] def directory_ls(self, directory, recursive=False, cur=None): # with directory: b'\x9d', return [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'10', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'10', 'sha1': b'cde' }, { 'dir_id': b'10', 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None }] def content_metadata_get(self, sha1s): return [{ 'tool': { 'configuration': { 'type': 'local', 'context': 'npm' }, 'version': '0.0.1', 'id': 6, 'name': 'swh-metadata-translator' }, 'id': b'cde', 'translated_metadata': { 'issueTracker': { 'url': 'https://github.com/librariesio/yarn-parser/issues' }, 'version': '1.0.0', 'name': 'yarn-parser', 'author': 'Andrew Nesbitt', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'other': { 'scripts': { 'start': 'node index.js' }, 'main': 'index.js' }, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'codeRepository': { 'type': 'git', 'url': 'git+https://github.com/librariesio/yarn-parser.git' }, 'description': 'Tiny web service for parsing yarn.lock files', 'softwareRequirements': { 'yarn': '^0.21.0', 'express': '^4.14.0', 'body-parser': '^1.15.2'} } }]