diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
index 1755953..040b57b 100644
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,427 +1,484 @@
 # Copyright (C) 2016-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import abc
 import os
 import logging
 import shutil
 import tempfile

 from swh.storage import get_storage
 from swh.core.config import SWHConfig
 from swh.objstorage import get_objstorage
 from swh.objstorage.exc import ObjNotFoundError
 from swh.model import hashutil
 from swh.scheduler.utils import get_task

 from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY


 class DiskIndexer:
     """Mixin intended to be used with other SomethingIndexer classes.

     Indexers inheriting from this class are a category of indexers
     which need the disk for their computations.

     Note:
         This expects the `self.working_directory` attribute to be
         defined at runtime.

     """
     def write_to_temp(self, filename, data):
         """Write the content's data in a temporary file.

         Args:
             filename (str): one of the content's many filenames
             data (bytes): the content's data to write in a temporary
                           file

         Returns:
             The path to the temporary file created. That file is
             filled in with the raw content's data.

         """
         os.makedirs(self.working_directory, exist_ok=True)
         temp_dir = tempfile.mkdtemp(dir=self.working_directory)
         content_path = os.path.join(temp_dir, filename)

         with open(content_path, 'wb') as f:
             f.write(data)

         return content_path

     def cleanup(self, content_path):
         """Remove content_path from working directory.

         Args:
             content_path (str): the file to remove

         """
         temp_dir = os.path.dirname(content_path)
         shutil.rmtree(temp_dir)


 class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta):
     """Base class for indexers to inherit from.

     The main entry point is the :func:`run` function which is in
     charge of triggering the computations on the batch of ids
     received.

     Indexers can:

     - filter out ids whose data has already been indexed
     - retrieve the data for those ids from the storage or the objstorage
     - index this data depending on the object and store the result in
       the storage

     To implement a new object type indexer, inherit from the
-    BaseIndexer and implement the process of indexation:
+    BaseIndexer and implement indexing:

     :func:`run`:
       object_ids are different depending on object. For example: sha1 for
       content, sha1_git for revision, directory, release, and id for origin

     To implement a new concrete indexer, inherit from the object level
-    classes: :class:`ContentIndexer`, :class:`RevisionIndexer` (later
-    on :class:`OriginIndexer` will also be available)
+    classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
+    :class:`OriginIndexer`.

     Then you need to implement the following functions:

     :func:`filter`:
       filter out data already indexed (in storage). This function is used by
       the orchestrator and not directly by the indexer
       (cf. swh.indexer.orchestrator.BaseOrchestratorIndexer).

     :func:`index`:
       compute the index for an id with its data (retrieved from the storage
       or the objstorage by the id key) and return the resulting index
       computation.

     :func:`persist_index_computations`:
       persist the results of multiple index computations in the storage.

     The new indexer implementation can also override the following functions:

     :func:`prepare`:
       Configuration preparation for the indexer. When overriding, this must
       call `super().prepare()`.

     :func:`check`:
       Configuration check for the indexer. When overriding, this must call
       `super().check()`.

     :func:`register_tools`:
       This should return a dict of the tool(s) to use when indexing or
       filtering.

     """
     CONFIG = 'indexer/base'

     DEFAULT_CONFIG = {
         INDEXER_CFG_KEY: ('dict', {
             'cls': 'remote',
             'args': {
                 'url': 'http://localhost:5007/'
             }
         }),
         # queue to reschedule if problem (None for no rescheduling,
         # the default)
         'rescheduling_task': ('str', None),
         'storage': ('dict', {
             'cls': 'remote',
             'args': {
                 'url': 'http://localhost:5002/',
             }
         }),
         'objstorage': ('dict', {
             'cls': 'multiplexer',
             'args': {
                 'objstorages': [{
                     'cls': 'filtered',
                     'args': {
                         'storage_conf': {
                             'cls': 'azure',
                             'args': {
                                 'account_name': '0euwestswh',
                                 'api_secret_key': 'secret',
                                 'container_name': 'contents'
                             }
                         },
                         'filters_conf': [
                             {'type': 'readonly'},
                             {'type': 'prefix', 'prefix': '0'}
                         ]
                     }
                 }, {
                     'cls': 'filtered',
                     'args': {
                         'storage_conf': {
                             'cls': 'azure',
                             'args': {
                                 'account_name': '1euwestswh',
                                 'api_secret_key': 'secret',
                                 'container_name': 'contents'
                             }
                         },
                         'filters_conf': [
                             {'type': 'readonly'},
                             {'type': 'prefix', 'prefix': '1'}
                         ]
                     }
                 }]
             },
         }),
     }

     ADDITIONAL_CONFIG = {}

     def __init__(self):
         """Prepare and check that the indexer is ready to run.

         """
         super().__init__()
         self.prepare()
         self.check()

     def prepare(self):
         """Prepare the indexer's needed runtime configuration.
         Without this step, the indexer cannot possibly run.

         """
         self.config = self.parse_config_file(
             additional_configs=[self.ADDITIONAL_CONFIG])
         if self.config['storage']:
             self.storage = get_storage(**self.config['storage'])
         objstorage = self.config['objstorage']
         self.objstorage = get_objstorage(objstorage['cls'],
                                          objstorage['args'])
         idx_storage = self.config[INDEXER_CFG_KEY]
         self.idx_storage = get_indexer_storage(**idx_storage)

         rescheduling_task = self.config['rescheduling_task']
         if rescheduling_task:
             self.rescheduling_task = get_task(rescheduling_task)
         else:
             self.rescheduling_task = None

         _log = logging.getLogger('requests.packages.urllib3.connectionpool')
         _log.setLevel(logging.WARN)
         self.log = logging.getLogger('swh.indexer')
         self.tools = list(self.register_tools(self.config['tools']))

     def check(self):
         """Check that the indexer's configuration is ok before proceeding.
         If ok, does nothing; otherwise, raises an error.

         """
         if not self.tools:
             raise ValueError('Tools %s are unknown, cannot continue' %
                              self.tools)

     def _prepare_tool(self, tool):
         """Prepare the tool dict to be compliant with the storage api.

         """
         return {'tool_%s' % key: value for key, value in tool.items()}

     def register_tools(self, tools):
         """Register tools in the (indexer) storage.

         Provides a sensible default which can be overridden if not
         sufficient. (For now, all indexers use only one tool.)

         Expects the self.config['tools'] property to be set with
         one or more tools.

         Args:
             tools (dict/[dict]): Either a dict or a list of dict.

         Returns:
             List of dict with an additional id key.

         Raises:
             ValueError: if the tools are neither a dict nor a list of
                 dicts.

         """
         if isinstance(tools, list):
             tools = map(self._prepare_tool, tools)
         elif isinstance(tools, dict):
             tools = [self._prepare_tool(tools)]
         else:
             raise ValueError('Configuration tool(s) must be a dict or list!')

         return self.idx_storage.indexer_configuration_add(tools)

     @abc.abstractmethod
     def filter(self, ids):
         """Filter missing ids for that particular indexer.

         Args:
             ids ([bytes]): list of ids

         Yields:
             iterator of missing ids

         """
         pass

     @abc.abstractmethod
     def index(self, id, data):
         """Index computation for the id and associated raw data.

         Args:
             id (bytes): identifier
             data (bytes): id's data from storage or objstorage depending on
                           object type

         Returns:
             a dict that makes sense for the persist_index_computations
             function.

         """
         pass

     @abc.abstractmethod
     def persist_index_computations(self, results, policy_update):
         """Persist the computations resulting from the index.

         Args:
             results ([result]): List of results. One result is the
                                 result of the index function.
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                                    respectively update duplicates or ignore
                                    them

         Returns:
             None

         """
         pass

     def next_step(self, results):
         """Do something else with the computations' results (e.g. send to
         another queue, ...).

         (This is not an abstractmethod since it is optional).

         Args:
             results ([result]): List of results (dict) as returned
                                 by the index function.

         Returns:
             None

         """
         pass

     @abc.abstractmethod
     def run(self, ids, policy_update):
         """Given a list of ids:

         - retrieves the data from the storage
         - executes the indexing computations
         - stores the results (according to policy_update)

         Args:
             ids ([bytes]): list of object identifiers
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                                    respectively update duplicates or ignore
                                    them

         """
         pass


 class ContentIndexer(BaseIndexer):
     """An object type indexer, inherits from the :class:`BaseIndexer` and
-    implements the process of indexation for Contents using the run
-    method
+    implements Content indexing using the run method

     Note: the :class:`ContentIndexer` is not an instantiable
     object. To use it in another context, one should inherit from this
     class and override the methods mentioned in the
     :class:`BaseIndexer` class.

     """
     def run(self, ids, policy_update):
         """Given a list of ids:

         - retrieve the content from the storage
         - execute the indexing computations
         - store the results (according to policy_update)

         Args:
             ids ([bytes]): sha1's identifier list
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                                    respectively update duplicates or ignore
                                    them

         """
         results = []
         try:
             for sha1 in ids:
                 try:
                     raw_content = self.objstorage.get(sha1)
                 except ObjNotFoundError:
                     self.log.warn('Content %s not found in objstorage' %
                                   hashutil.hash_to_hex(sha1))
                     continue
                 res = self.index(sha1, raw_content)
                 if res:  # If no results, skip it
                     results.append(res)

             self.persist_index_computations(results, policy_update)
             self.next_step(results)
         except Exception:
             self.log.exception(
                 'Problem when reading contents metadata.')
             if self.rescheduling_task:
                 self.log.warn('Rescheduling batch')
                 self.rescheduling_task.delay(ids, policy_update)


+class OriginIndexer(BaseIndexer):
+    """An object type indexer, inherits from the :class:`BaseIndexer` and
+    implements Origin indexing using the run method
+
+    Note: the :class:`OriginIndexer` is not an instantiable object.
+    To use it in another context one should inherit from this class
+    and override the methods mentioned in the :class:`BaseIndexer`
+    class.
+
+    """
+    def run(self, ids, policy_update, parse_ids=False):
+        """Given a list of origin ids:
+
+        - retrieve origins from storage
+        - execute the indexing computations
+        - store the results (according to policy_update)
+
+        Args:
+            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
+                (type, url) tuples.
+            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+                respectively update duplicates or ignore them
+            parse_ids (bool): if `True`, will try to convert `ids` from
+                human-readable strings ('type+url' or a numeric origin
+                id) to the proper type
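+                For example, with `parse_ids=True`, the input
+                `['git+https://example.com/repo', '42']` is parsed into
+                `[['git', 'https://example.com/repo'], 42]`.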
+ + """ + if parse_ids: + ids = [ + o.split('+', 1) if ':' in o else int(o) # type+url or id + for o in ids] + + results = [] + + for id_ in ids: + if isinstance(id_, (tuple, list)): + if len(id_) != 2: + raise TypeError('Expected a (type, url) tuple.') + (type_, url) = id_ + params = {'type': type_, 'url': url} + elif isinstance(id_, int): + params = {'id': id_} + else: + raise TypeError('Invalid value for "ids": %r' % id_) + origin = self.storage.origin_get(params) + if not origin: + self.log.warn('Origins %s not found in storage' % + list(ids)) + continue + try: + res = self.index(origin) + if origin: # If no results, skip it + results.append(res) + except Exception: + self.log.exception( + 'Problem when processing origin %s' % id_) + self.persist_index_computations(results, policy_update) + + class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and - implements the process of indexation for Revisions using the run - method + implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes]): sha1_git's identifier list policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warn('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 8adff82..f2121b0 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,283 +1,280 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import logging from swh.indexer.indexer import ContentIndexer, RevisionIndexer from swh.indexer.metadata_dictionary import compute_metadata from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ CONFIG_BASE_FILENAME = 'indexer/metadata' def __init__(self, tool, config): # twisted way to use the exact same config of RevisionMetadataIndexer # object that uses internally ContentMetadataIndexer self.config = config self.config['tools'] = tool super().__init__() def filter(self, ids): """Filter out known sha1s and return only missing ones. 
""" yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the translated_metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: context = self.tool['tool_configuration']['context'] result['translated_metadata'] = compute_metadata(context, data) # a twisted way to keep result with indexer object for get_results self.results.append(result) except Exception: self.log.exception( "Problem during tool retrieval of metadata translation") return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def get_results(self): """can be called only if run method was called before Returns: list: list of content_metadata entries calculated by current indexer """ return self.results class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ CONFIG_BASE_FILENAME = 'indexer/metadata' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': ['npm', 'codemeta'] }, }), } ContentMetadataIndexer = ContentMetadataIndexer def prepare(self): super().prepare() self.tool = self.tools[0] def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. 
use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (bytes): revision artifact from storage Returns: dict: dictionary representing a revision_metadata, with keys: - id (bytes): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - translated_metadata (bytes): dict of retrieved metadata """ try: result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } root_dir = rev['directory'] dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = (entry for entry in dir_ls if entry['type'] == 'file') detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev') return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in revision_metadata self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_metadata(self, detected_files): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: dict: dict with translated metadata according to the CodeMeta vocabulary """ translated_metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': None }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { INDEXER_CFG_KEY: self.idx_storage, 'objstorage': self.objstorage } for context in detected_files.keys(): tool['configuration']['context'] = context c_metadata_indexer = self.ContentMetadataIndexer(tool, config) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting translated_metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: translated_metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # schedule indexation of content try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups') # on the fly possibility: results = c_metadata_indexer.get_results() for result in results: local_metadata = result['translated_metadata'] translated_metadata.append(local_metadata) except Exception as e: self.log.warn("""Exception while indexing content""", e) # transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) return min_metadata @click.command() @click.option('--revs', '-i', - default=['8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', - '026040ea79dec1b49b4e3e7beda9132b6b26b51b', - '9699072e21eded4be8d45e3b8d543952533fa190'], help='Default sha1_git to lookup', 
multiple=True) def main(revs): _git_sha1s = list(map(hashutil.hash_to_bytes, revs)) rev_metadata_indexer = RevisionMetadataIndexer() rev_metadata_indexer.run(_git_sha1s, 'update-dups') if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py new file mode 100644 index 0000000..86c622d --- /dev/null +++ b/swh/indexer/origin_head.py @@ -0,0 +1,163 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import re +import click +import logging + +from swh.indexer.indexer import OriginIndexer + + +class OriginHeadIndexer(OriginIndexer): + """Origin-level indexer. + + This indexer is in charge of looking up the revision that acts as the + "head" of an origin. + + In git, this is usually the commit pointed to by the 'master' branch.""" + + ADDITIONAL_CONFIG = { + 'tools': ('dict', { + 'name': 'origin-metadata', + 'version': '0.0.1', + 'configuration': {}, + }), + } + + def filter(self, ids): + yield from ids + + def persist_index_computations(self, results, policy_update): + """Do nothing. The indexer's results are not persistant, they + should only be piped to another indexer via the orchestrator.""" + pass + + # Dispatch + + def index(self, origin): + origin_id = origin['id'] + latest_snapshot = self.storage.snapshot_get_latest(origin_id) + method = getattr(self, '_try_get_%s_head' % origin['type'], None) + if method is None: + method = self._try_get_head_generic + rev_id = method(latest_snapshot) + if rev_id is None: + return None + result = { + 'origin_id': origin_id, + 'revision_id': rev_id, + } + return result + + # VCSs + + def _try_get_vcs_head(self, snapshot): + try: + if isinstance(snapshot, dict): + branches = snapshot['branches'] + if branches[b'HEAD']['target_type'] == 'revision': + return branches[b'HEAD']['target'] + except KeyError: + return None + + _try_get_hg_head = _try_get_git_head = _try_get_vcs_head + + # Tarballs + + _archive_filename_re = re.compile( + b'^' + b'(?P.*)[-_]' + b'(?P[0-9]+(\.[0-9])*)' + b'(?P[-+][a-zA-Z0-9.~]+?)?' 
+ b'(?P(\.[a-zA-Z0-9]+)+)' + b'$') + + @classmethod + def _parse_version(cls, filename): + """Extracts the release version from an archive filename, + to get an ordering whose maximum is likely to be the last + version of the software + + >>> OriginHeadIndexer._parse_version(b'foo') + (-inf,) + >>> OriginHeadIndexer._parse_version(b'foo.tar.gz') + (-inf,) + >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz') + (0, 0, 1, 0) + >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') + (0, 0, 1, -1, 'beta2') + >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') + (0, 0, 1, 1, 'foobar') + """ + res = cls._archive_filename_re.match(filename) + if res is None: + return (float('-infinity'),) + version = [int(n) for n in res.group('version').decode().split('.')] + if res.group('preversion') is None: + version.append(0) + else: + preversion = res.group('preversion').decode() + if preversion.startswith('-'): + version.append(-1) + version.append(preversion[1:]) + elif preversion.startswith('+'): + version.append(1) + version.append(preversion[1:]) + else: + assert False, res.group('preversion') + return tuple(version) + + def _try_get_ftp_head(self, snapshot): + archive_names = list(snapshot['branches']) + max_archive_name = max(archive_names, key=self._parse_version) + r = self._try_resolve_target(snapshot['branches'], max_archive_name) + return r + + # Generic + + def _try_get_head_generic(self, snapshot): + # Works on 'deposit', 'svn', and 'pypi'. + try: + if isinstance(snapshot, dict): + branches = snapshot['branches'] + except KeyError: + return None + else: + return ( + self._try_resolve_target(branches, b'HEAD') or + self._try_resolve_target(branches, b'master') + ) + + def _try_resolve_target(self, branches, target_name): + try: + target = branches[target_name] + while target['target_type'] == 'alias': + target = branches[target['target']] + if target['target_type'] == 'revision': + return target['target'] + elif target['target_type'] == 'content': + return None # TODO + elif target['target_type'] == 'directory': + return None # TODO + elif target['target_type'] == 'release': + return None # TODO + else: + assert False + except KeyError: + return None + + +@click.command() +@click.option('--origins', '-i', + help='Origins to lookup, in the "type+url" format', + multiple=True) +def main(origins): + rev_metadata_indexer = OriginHeadIndexer() + rev_metadata_indexer.run(origins, 'update-dups', parse_ids=True) + + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + main() diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py new file mode 100644 index 0000000..62912ab --- /dev/null +++ b/swh/indexer/tests/test_origin_head.py @@ -0,0 +1,213 @@ +# Copyright (C) 2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest +import logging +from nose.tools import istest + +from swh.indexer.origin_head import OriginHeadIndexer +from swh.indexer.tests.test_utils import MockIndexerStorage + +ORIGINS = [ + { + 'id': 52189575, + 'lister': None, + 'project': None, + 'type': 'git', + 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, + { + 'id': 4423668, + 'lister': None, + 'project': None, + 'type': 'ftp', + 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, + { + 'id': 77775770, + 'lister': None, + 'project': None, + 
'type': 'deposit', + 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, + { + 'id': 85072327, + 'lister': None, + 'project': None, + 'type': 'pypi', + 'url': 'https://pypi.org/project/limnoria/'}, + { + 'id': 49908349, + 'lister': None, + 'project': None, + 'type': 'svn', + 'url': 'http://0-512-md.googlecode.com/svn/'}, + ] + +SNAPSHOTS = { + 52189575: { + 'branches': { + b'refs/heads/add-revision-origin-cache': { + 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' + b's\xe7/\xe9l\x1e', + 'target_type': 'revision'}, + b'HEAD': { + 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' + b'\xac\xefrm', + 'target_type': 'revision'}, + b'refs/tags/v0.0.103': { + 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' + b'\x0f\xdd', + 'target_type': 'release'}, + }}, + 4423668: { + 'branches': { + b'3DLDF-1.1.4.tar.gz': { + 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' + b'"G\x99\x11', + 'target_type': 'revision'}, + b'3DLDF-2.0.2.tar.gz': { + 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' + b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', + 'target_type': 'revision'}, + b'3DLDF-2.0.3-examples.tar.gz': { + 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' + b'\xfe\xadZ\x80\x80\xc1\x83\xff', + 'target_type': 'revision'}, + b'3DLDF-2.0.3.tar.gz': { + 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' + b'\xcc\x1a\xb4`\x8c\x8by', + 'target_type': 'revision'}, + b'3DLDF-2.0.tar.gz': { + 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' + b'\xd3\xd1m', + b'target_type': 'revision'} + }}, + 77775770: { + 'branches': { + b'master': { + 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' + b'\xa6\xe9\x99\xb1\x9e]q\xeb', + 'target_type': 'revision'} + }, + 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" + b"\x1d\r "}, + 85072327: { + 'branches': { + b'HEAD': { + 'target': b'releases/2018.09.09', + 'target_type': 'alias'}, + b'releases/2018.09.01': { + 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' + b'\xbb\xdfF\xfdw\xcf', + 'target_type': 'revision'}, + b'releases/2018.09.09': { + 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' + b'A\x10\x9d\xc5\xfa2\xf8t', + 'target_type': 'revision'}}, + 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' + b'\x12\x9e\xd6\xb3'}, + 49908349: { + 'branches': { + b'master': { + 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' + b'\xc9\xad#.\x1bw=\x18', + 'target_type': 'revision'}}, + 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' + b'\x05\xea\xb8\x1f\xc4H\xf4s'}, + } + + +class MockStorage: + def origin_get(self, id_): + for origin in ORIGINS: + if origin['type'] == id_['type'] and origin['url'] == id_['url']: + return origin + assert False, id_ + + def snapshot_get_latest(self, origin_id): + if origin_id in SNAPSHOTS: + return SNAPSHOTS[origin_id] + else: + assert False, origin_id + + +class TestOriginHeadIndexer(OriginHeadIndexer): + """Specific indexer whose configuration is enough to satisfy the + indexing tests. 
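+
+    prepare() is overridden so that no configuration file is read; the
+    mock storages defined above are plugged in instead.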
+ """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'origin-metadata', + 'version': '0.0.1', + 'configuration': {}, + }, + } + self.storage = MockStorage() + self.idx_storage = MockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + self.objstorage = None + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + self.results = None + + def persist_index_computations(self, results, policy_update): + self.results = results + + +class OriginHead(unittest.TestCase): + @istest + def test_git(self): + indexer = TestOriginHeadIndexer() + indexer.run( + ['git+https://github.com/SoftwareHeritage/swh-storage'], + 'update-dups', parse_ids=True) + self.assertEqual(indexer.results, [{ + 'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{' + b'\xd7}\xac\xefrm', + 'origin_id': 52189575}]) + + @istest + def test_ftp(self): + indexer = TestOriginHeadIndexer() + indexer.run( + ['ftp+rsync://ftp.gnu.org/gnu/3dldf'], + 'update-dups', parse_ids=True) + self.assertEqual(indexer.results, [{ + 'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' + b'\xcc\x1a\xb4`\x8c\x8by', + 'origin_id': 4423668}]) + + @istest + def test_deposit(self): + indexer = TestOriginHeadIndexer() + indexer.run( + ['deposit+https://forge.softwareheritage.org/source/' + 'jesuisgpl/'], + 'update-dups', parse_ids=True) + self.assertEqual(indexer.results, [{ + 'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' + b'\xa6\xe9\x99\xb1\x9e]q\xeb', + 'origin_id': 77775770}]) + + @istest + def test_pypi(self): + indexer = TestOriginHeadIndexer() + indexer.run( + ['pypi+https://pypi.org/project/limnoria/'], + 'update-dups', parse_ids=True) + self.assertEqual(indexer.results, [{ + 'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' + b'A\x10\x9d\xc5\xfa2\xf8t', + 'origin_id': 85072327}]) + + @istest + def test_svn(self): + indexer = TestOriginHeadIndexer() + indexer.run( + ['svn+http://0-512-md.googlecode.com/svn/'], + 'update-dups', parse_ids=True) + self.assertEqual(indexer.results, [{ + 'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' + b'\xc9\xad#.\x1bw=\x18', + 'origin_id': 49908349}]) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index a1090d2..4a0f1c5 100644 --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,260 +1,269 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.objstorage.exc import ObjNotFoundError class MockObjStorage: """Mock an swh-objstorage objstorage with predefined contents. 
""" data = {} def __init__(self): self.data = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', } def __iter__(self): yield from self.data.keys() def __contains__(self, sha1): return self.data.get(sha1) is not None def get(self, sha1): raw_content = self.data.get(sha1) if raw_content is None: raise ObjNotFoundError(sha1) return raw_content class MockIndexerStorage(): """Mock an swh-indexer storage. 
""" def indexer_configuration_add(self, tools): tool = tools[0] if tool['tool_name'] == 'swh-metadata-translator': return [{ 'id': 30, 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'npm' }, }] elif tool['tool_name'] == 'swh-metadata-detector': return [{ 'id': 7, 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'npm' }, }] + elif tool['tool_name'] == 'origin-metadata': + return [{ + 'id': 8, + 'tool_name': 'origin-metadata', + 'tool_version': '0.0.1', + 'tool_configuration': {}, + }] + else: + assert False, 'Unknown tool {tool_name}'.format(**tool) def content_metadata_missing(self, sha1s): yield from [] def content_metadata_add(self, metadata, conflict_update=None): self.state = metadata self.conflict_update = conflict_update def revision_metadata_add(self, metadata, conflict_update=None): self.state = metadata self.conflict_update = conflict_update def content_metadata_get(self, sha1s): return [{ 'tool': { 'configuration': { 'type': 'local', 'context': 'npm' }, 'version': '0.0.1', 'id': 6, 'name': 'swh-metadata-translator' }, 'id': b'cde', 'translated_metadata': { 'issueTracker': { 'url': 'https://github.com/librariesio/yarn-parser/issues' }, 'version': '1.0.0', 'name': 'yarn-parser', 'author': 'Andrew Nesbitt', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'other': { 'scripts': { 'start': 'node index.js' }, 'main': 'index.js' }, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'codeRepository': { 'type': 'git', 'url': 'git+https://github.com/librariesio/yarn-parser.git' }, 'description': 'Tiny web service for parsing yarn.lock files', 'softwareRequirements': { 'yarn': '^0.21.0', 'express': '^4.14.0', 'body-parser': '^1.15.2'} } }] class MockStorage(): """Mock a real swh-storage storage to simplify reading indexers' outputs. """ def revision_get(self, revisions): return [{ 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] def directory_ls(self, directory, recursive=False, cur=None): # with directory: b'\x9d', return [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'10', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'10', 'sha1': b'cde' }, { 'dir_id': b'10', 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None }]