Page Menu · Home · Software Heritage

D215.id716.diff
No One · Temporary

D215.id716.diff

diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -17,6 +17,53 @@
from swh.scheduler.utils import get_task
+class DiskIndexer:
+    """Mixin intended to be used with other *Indexer classes.
+
+    Indexer* inheriting from this class are a category of indexers
+    which needs the disk for their computations.
+
+    Expects:
+        self.working_directory variable defined at runtime.
+
+    """
+    def __init__(self):
+        super().__init__()
+
+    def write_to_temp(self, filename, data):
+        """Write the sha1's content in a temporary file.
+
+        Args:
+            filename (str): one of sha1's many filenames
+            data (bytes): the sha1's content to write in temporary
+            file
+
+        Returns:
+            The path to the temporary file created. That file is
+            filled in with the raw content's data.
+
+        """
+        # each content gets its own fresh directory under
+        # working_directory, so concurrent writes cannot collide
+        os.makedirs(self.working_directory, exist_ok=True)
+        temp_dir = tempfile.mkdtemp(dir=self.working_directory)
+        content_path = os.path.join(temp_dir, filename)
+
+        with open(content_path, 'wb') as f:
+            f.write(data)
+
+        return content_path
+
+    def cleanup(self, content_path):
+        """Remove content_path from working directory.
+
+        Args:
+            content_path (str): the file to remove
+
+        """
+        # the file lives in its own mkdtemp directory (see
+        # write_to_temp); remove that whole directory, not just the file
+        temp_dir = os.path.dirname(content_path)
+        shutil.rmtree(temp_dir)
+
+
class BaseIndexer(SWHConfig,
metaclass=abc.ABCMeta):
"""Base class for indexers to inherit from.
@@ -257,50 +304,3 @@
if self.rescheduling_task:
self.log.warn('Rescheduling batch')
self.rescheduling_task.delay(sha1s, policy_update)
-
-
-class DiskIndexer:
- """Mixin intended to be used with other *Indexer classes.
-
- Indexer* inheriting from this class are a category of indexers
- which needs the disk for their computations.
-
- Expects:
- self.working_directory variable defined at runtime.
-
- """
- def __init__(self):
- super().__init__()
-
- def write_to_temp(self, filename, data):
- """Write the sha1's content in a temporary file.
-
- Args:
- sha1 (str): the sha1 name
- filename (str): one of sha1's many filenames
- data (bytes): the sha1's content to write in temporary
- file
-
- Returns:
- The path to the temporary file created. That file is
- filled in with the raw content's data.
-
- """
- os.makedirs(self.working_directory, exist_ok=True)
- temp_dir = tempfile.mkdtemp(dir=self.working_directory)
- content_path = os.path.join(temp_dir, filename)
-
- with open(content_path, 'wb') as f:
- f.write(data)
-
- return content_path
-
- def cleanup(self, content_path):
- """Remove content_path from working directory.
-
- Args:
- content_path (str): the file to remove
-
- """
- temp_dir = os.path.dirname(content_path)
- shutil.rmtree(temp_dir)
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata.py
@@ -0,0 +1,84 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from .indexer import BaseIndexer
+from swh.indexer.metadata_dictionary import compute_metadata
+
+
+class ContentMetadataIndexer(BaseIndexer):
+    """Indexer in charge of:
+    - filtering out content already indexed
+    - reading content from objstorage with the content's id sha1
+    - computing translated_metadata by given context
+    - using the MetadataDict and a tool for each context
+    - store result in storage
+    """
+    CONFIG_BASE_FILENAME = 'indexer/metadata'
+
+    ADDITIONAL_CONFIG = {
+        'tools': ('dict', {
+            'name': 'hard_mapping_npm',
+            'version': '0.0.1',
+            'configuration': {
+                'type': 'test',
+                'debian-package': ''
+            },
+        }),
+    }
+
+    def prepare(self):
+        super().prepare()
+
+    def filter_contents(self, sha1s):
+        """Filter out known sha1s and return only missing ones.
+
+        Args:
+            sha1s ([bytes]): candidate content identifiers
+
+        Yields:
+            identifiers missing from the content_metadata storage
+
+        """
+        yield from self.storage.content_metadata_missing((
+            {
+                'id': sha1,
+                'indexer_configuration_id': self.tools['id'],
+            } for sha1 in sha1s
+        ))
+
+    def index_content(self, sha1, raw_content):
+        """Index sha1s' content and store result.
+
+        Args:
+            sha1 (bytes): content's identifier
+            raw_content (bytes): raw content in bytes
+
+        Returns:
+            result (dict): representing a content_metadata
+            if translation wasn't successful the translated_metadata keys
+            will be kept as None
+
+        """
+        result = {
+            'id': sha1,
+            'indexer_configuration_id': self.tools['id'],
+            'translated_metadata': None
+        }
+        try:
+            # the configured tool name doubles as the translation context
+            context = self.tools['name']
+            result['translated_metadata'] = compute_metadata(
+                context, raw_content)
+        except Exception:
+            # never let one content break the batch: log and keep
+            # translated_metadata as None for this sha1.  (A bare
+            # `except:` would also swallow SystemExit/KeyboardInterrupt.)
+            self.log.exception(
+                "Problem during tool retrieval of metadata translation")
+        return result
+
+    def persist_index_computations(self, results, policy_update):
+        """Persist the results in storage.
+
+        Args:
+            results ([dict]): list of content_metadata, dict with the
+            following keys:
+            - id (bytes): content's identifier (sha1)
+            - translated_metadata (jsonb): detected metadata
+            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+            respectively update duplicates or ignore them
+
+        """
+        self.storage.content_metadata_add(
+            results, conflict_update=(policy_update == 'update-dups'))
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata_dictionary.py
@@ -0,0 +1,216 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import json
+
+
+def convert(raw_content):
+    """Normalise raw_content into a dict when possible.
+
+    Recursively:
+    - decodes bytes into str
+    - parses str as JSON
+
+    Args:
+        - raw_content (bytes / string / dict)
+    Returns:
+        - Dict of content (if string was json, otherwise returns string;
+          any other input type yields None)
+    """
+    if isinstance(raw_content, dict):
+        return raw_content
+    if isinstance(raw_content, bytes):
+        return convert(raw_content.decode())
+    if isinstance(raw_content, str):
+        try:
+            parsed = json.loads(raw_content)
+        except json.decoder.JSONDecodeError:
+            return raw_content
+        # keep the original string when JSON parsed to a falsy value
+        return parsed if parsed else raw_content
+    return None
+
+
+class BaseMapping():
+    """Base class for mappings to inherit from
+
+    To implement a new mapping:
+    - inherit this class
+    - add a local property self.mapping
+    - override translate function
+    """
+
+    def __init__(self):
+        pass
+
+    def translate(self, content_dict):
+        """Translate content by mapping its keys through self.mapping.
+
+        Keys absent from the mapping land under the 'other' term;
+        when a target term repeats, values accumulate (str becomes a
+        two-element list, lists append, dicts merge under the source
+        key).
+
+        Args:
+            - content_dict (dict): parsed content to translate
+
+        Returns:
+            - translated_metadata (dict): jsonb form needed for the
+              indexer, or None when content_dict is not translatable
+              (e.g. a plain string from unparseable JSON)
+        """
+        translated_metadata = {'other': {}}
+        default = 'other'
+        try:
+            for k, v in content_dict.items():
+                # .get never raises, so the former inner KeyError
+                # handler (which referenced an undefined self.log) was
+                # dead code and has been dropped
+                term = self.mapping.get(k, default)
+                if term not in translated_metadata:
+                    translated_metadata[term] = v
+                elif isinstance(translated_metadata[term], str):
+                    in_value = translated_metadata[term]
+                    translated_metadata[term] = [in_value, v]
+                elif isinstance(translated_metadata[term], list):
+                    translated_metadata[term].append(v)
+                elif isinstance(translated_metadata[term], dict):
+                    translated_metadata[term][k] = v
+        except Exception:
+            # content_dict may not be a dict at all; signal "no
+            # translation" with None rather than crash the caller
+            # (narrowed from a bare `except:`)
+            return None
+        return translated_metadata
+
+
+class NpmMapping(BaseMapping):
+    """Mapping and translation dedicated to NPM (package.json)."""
+
+    npm_mapping = {
+        'repository': 'codeRepository',
+        'os': 'operatingSystem',
+        'cpu': 'processorRequirements',
+        'engines': 'processorRequirements',
+        'dependencies': 'softwareRequirements',
+        'bundleDependencies': 'softwareRequirements',
+        'peerDependencies': 'softwareRequirements',
+        'author': 'author',
+        'contributor': 'contributor',
+        'keywords': 'keywords',
+        'license': 'license',
+        'version': 'version',
+        'description': 'description',
+        'name': 'name',
+        'devDependencies': 'softwareSuggestions',
+        'optionalDependencies': 'softwareSuggestions',
+        'bugs': 'issueTracker',
+        'homepage': 'url'
+    }
+
+    def __init__(self):
+        self.mapping = self.npm_mapping
+
+    def translate(self, raw_content):
+        """Parse raw_content (bytes/str/dict) then translate its terms."""
+        return super().translate(convert(raw_content))
+
+
+class MavenMapping(BaseMapping):
+    """Mapping and translation dedicated to Maven (pom.xml)."""
+
+    maven_mapping = {
+        'license': 'license',
+        'version': 'version',
+        'description': 'description',
+        'name': 'name',
+        'prerequisites': 'softwareRequirements',
+        'repositories': 'codeRepository',
+        'groupId': 'identifier',
+        'ciManagement': 'contIntegration',
+        'issuesManagement': 'issueTracker',
+    }
+
+    def __init__(self):
+        self.mapping = self.maven_mapping
+
+    def translate(self, raw_content):
+        """Parse raw_content then translate its terms.
+
+        NOTE: no xml-to-dict parsing happens yet (see parse_xml).
+        """
+        return super().translate(convert(raw_content))
+
+
+class DoapMapping(BaseMapping):
+    """Mapping and translation dedicated to DOAP (xml)."""
+    # mapping is still empty: every DOAP term falls into 'other'
+    doap_mapping = {
+
+    }
+
+    def __init__(self):
+        self.mapping = self.doap_mapping
+
+    def translate(self, raw_content):
+        content = convert(raw_content)
+        # parse content from xml to dict
+        return super().translate(content)
+
+
+def parse_xml(content):
+    """
+    Parses content from xml to a python dict
+    Args:
+        - content (text): the string form of the raw_content ( in xml)
+
+    Returns:
+        - parsed_xml (dict): a python dict of the content after parsing
+    """
+    # TODO: not implemented yet -- currently returns the input
+    # unchanged; should check if xml and use an xml parser to build
+    # the dict
+    return content
+
+
+# registry: context name -> mapping instance, used by compute_metadata
+# to dispatch raw content to the right translator
+mapping_tool_fn = {
+    "hard_mapping_npm": NpmMapping(),
+    "pom_xml": MavenMapping(),
+    "doap_xml": DoapMapping()
+}
+
+
+def compute_metadata(context, raw_content):
+    """
+    first landing method: a dispatcher that sends content
+    to the right function to carry out the real parsing of syntax
+    and translation of terms
+    Args:
+        - context (text) : defines to which function/tool
+          the content is sent; must be a key of mapping_tool_fn
+        - raw_content (bytes/text): the string form of the raw_content
+
+    Returns:
+        - translated_metadata (dict): jsonb form needed for the indexer
+          to store in storage, or None for missing/empty content
+
+    """
+    # `is b""` compared object identity with a literal (implementation
+    # defined); use equality instead
+    if raw_content is None or raw_content == b"":
+        return None
+
+    # TODO: keep mapping not in code (maybe fetch crosswalk from storage?)
+    # if fetched from storage should be done once for batch of sha1s
+    dictionary = mapping_tool_fn[context]
+    return dictionary.translate(raw_content)
+
+
+def main():
+    """Manual smoke test: translate two sample contents and print them."""
+    npm_sample = """{"name": "test_name", "unknown_term": "ut"}"""
+    maven_sample = b"""{"name": "test_name",
+        "unknown_term": "ut",
+        "prerequisites" :"packageXYZ"}"""
+
+    print(compute_metadata("hard_mapping_npm", npm_sample))
+    print(compute_metadata("pom_xml", maven_sample))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py
--- a/swh/indexer/tests/test_language.py
+++ b/swh/indexer/tests/test_language.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_metadata.py
@@ -0,0 +1,314 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+from nose.tools import istest
+
+from swh.indexer.metadata_dictionary import compute_metadata
+from swh.indexer.metadata import ContentMetadataIndexer
+from swh.indexer.tests.test_utils import MockObjStorage
+
+
+def compare_results(expected, captured):
+    """ The metadata translation doesn't always keep the
+    result in the same order, for this a dedicated function to compare
+    results is needed while checking nested lists and dicts
+    Args:
+        - expected (dict)
+        - captured (dict)
+    Returns:
+        True if results are the same
+    """
+    # fast path: structurally equal already
+    if expected == captured:
+        return True
+    if isinstance(expected, dict) and isinstance(captured, dict):
+        # extra/missing keys on either side is a mismatch (the old
+        # version raised KeyError on extra captured keys)
+        if set(expected) != set(captured):
+            return False
+        return all(compare_results(expected[k], v)
+                   for k, v in captured.items())
+    if isinstance(expected, list) and isinstance(captured, list):
+        # order-insensitive multiset comparison; the old version only
+        # checked a single element and mutated its inputs
+        if len(expected) != len(captured):
+            return False
+        remaining = list(expected)
+        for elem in captured:
+            for i, cand in enumerate(remaining):
+                if compare_results(cand, elem):
+                    del remaining[i]
+                    break
+            else:
+                return False
+        return True
+    return False
+
+
+class MockStorage():
+    """Mock storage to simplify reading indexers' outputs.
+    """
+    def content_metadata_add(self, metadata, conflict_update=None):
+        # record the last persisted batch so tests can read it back
+        # through .state / .conflict_update
+        self.state = metadata
+        self.conflict_update = conflict_update
+
+    def indexer_configuration_get(self, tool):
+        # always resolve to the same fixed tool row, whatever is asked
+        return {
+            'id': 30,
+            'name': 'hard_mapping_npm',
+            'version': '0.1'
+        }
+
+
+class TestMetadataIndexer(ContentMetadataIndexer):
+    """Specific Metadata whose configuration is enough to satisfy the
+    indexing tests.
+    """
+    def prepare(self):
+        # deliberately does NOT call super().prepare(): wire mocks in
+        # place of real storage/objstorage so run() needs no services
+        self.config = {
+            'rescheduling_task': None,
+            'tools': {
+                'name': 'hard_mapping_npm',
+                'version': '0.1',
+                'configuration': {
+                    'type': 'local',
+                    'debian-package': ''
+                }
+            }
+        }
+        self.storage = MockStorage()
+        self.log = logging.getLogger('swh.indexer')
+        self.objstorage = MockObjStorage()
+        self.task_destination = None
+        self.rescheduling_task = self.config['rescheduling_task']
+        # resolves to MockStorage.indexer_configuration_get's fixed row
+        self.tools = self.retrieve_tools_information()
+
+
+class Metadata(unittest.TestCase):
+    """
+    Tests metadata_mock_tool tool for Metadata detection
+    """
+    # NOTE(review): @istest is redundant on methods already named
+    # test_* -- unittest discovers them by prefix; confirm before
+    # removing the decorator
+    def setUp(self):
+        """
+        shows the entire diff in the results
+        """
+        self.maxDiff = None
+
+    @istest
+    def test_compute_metadata_none(self):
+        """
+        testing content empty content is empty
+        should return None
+        """
+        # given
+        content = b""
+        tool = "hard_mapping_npm"
+
+        # None if no metadata was found or an error occurred
+        declared_metadata = None
+        # when
+        result = compute_metadata(tool, content)
+        # then
+        self.assertEqual(declared_metadata, result)
+
+    @istest
+    def test_compute_metadata_npm(self):
+        """
+        testing only computation of metadata with hard_mapping_npm
+        """
+        # given
+        content = b"""
+            {
+                "name": "test_metadata",
+                "version": "0.0.1",
+                "description": "Simple package.json test for indexer",
+                "repository": {
+                    "type": "git",
+                    "url": "https://github.com/moranegg/metadata_test"
+                }
+            }
+        """
+        declared_metadata = {
+            'name': 'test_metadata',
+            'version': '0.0.1',
+            'description': 'Simple package.json test for indexer',
+            'codeRepository': {
+                'type': 'git',
+                'url': 'https://github.com/moranegg/metadata_test'
+            },
+            'other': {}
+        }
+
+        # when
+        result = compute_metadata("hard_mapping_npm", content)
+        # then
+        self.assertEqual(declared_metadata, result)
+        self.assertTrue(compare_results(declared_metadata, result))
+
+    @istest
+    def test_index_content_metadata_npm(self):
+        """
+        testing NPM with package.json
+        - one sha1 uses a file that can't be translated to metadata and
+          should return None in the translated metadata
+        """
+        # given
+        sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
+                 'd4c647f0fc257591cc9ba1722484229780d1c607',
+                 '02fb2c89e14f7fab46701478c83779c7beb7b069']
+        # this metadata indexer computes only metadata for package.json
+        # in npm context with a hard mapping
+        metadata_indexer = TestMetadataIndexer()
+
+        # when
+        metadata_indexer.run(sha1s, policy_update='ignore-dups')
+        results = metadata_indexer.storage.state
+
+        expected_results = [{
+            'indexer_configuration_id': 30,
+            'translated_metadata': {
+                'other': {},
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/moranegg/metadata_test'
+                },
+                'description': 'Simple package.json test for indexer',
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
+        }, {
+            'indexer_configuration_id': 30,
+            'translated_metadata': {
+                'softwareRequirements': [
+                    'abbrev',
+                    'ansi-regex',
+                    {
+                        'JSONStream': '~1.3.1',
+                        'abbrev': '~1.1.0',
+                        'ansi-regex': '~2.1.1',
+                        'ansicolors': '~0.3.2',
+                        'ansistyles': '~0.1.3'
+                    }
+                ],
+                'issueTracker': {
+                    'url': 'https://github.com/npm/npm/issues'
+                },
+                'author':
+                    'Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)',
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/npm/npm'
+                },
+                'description': 'a package manager for JavaScript',
+                'softwareSuggestions': {
+                    'tacks': '~1.2.6',
+                    'tap': '~10.3.2'
+                },
+                'license': 'Artistic-2.0',
+                'version': '5.0.3',
+                'other': {
+                    'preferGlobal': True,
+                    'config': {
+                        'publishtest': False
+                    }
+                },
+                'name': 'npm',
+                'keywords': [
+                    'install',
+                    'modules',
+                    'package manager',
+                    'package.json'
+                ],
+                'url': 'https://docs.npmjs.com/'
+            },
+            'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
+        }, {
+            'indexer_configuration_id': 30,
+            'translated_metadata': None,
+            'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
+        }]
+        self.assertTrue(compare_results(expected_results, results))
+        # The assertion below returns False sometimes because of nested lists
+        # self.assertEqual(expected_results, results)
+
+    @istest
+    def test_compare_method(self):
+        """
+        testing compare method to view problems for nested lists and dicts
+        """
+        a = {
+            'indexer_configuration_id': 30,
+            'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+            'translated_metadata': {
+                'other': {
+                    'preferGlobal': True,
+                    'config': {
+                        'publishtest': False
+                    }
+                },
+                'name': 'npm',
+                'keywords': [
+                    'install',
+                    'modules',
+                    'package manager',
+                    'package.json'
+                ],
+            }
+        }
+        # same data as `a`, with keys and list items given in another order
+        b = {
+            'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+            'indexer_configuration_id': 30,
+            'translated_metadata':
+            {
+                'other': {
+                    'config': {
+                        'publishtest': False
+                    },
+                    'preferGlobal': True
+                },
+                'keywords': [
+                    'install',
+                    'modules',
+                    'package manager',
+                    'package.json',
+
+                ],
+                'name': 'npm',
+            }
+        }
+        self.assertTrue(compare_results(a, b))
+
+    @istest
+    def test_index_without_compare_method(self):
+        """
+        testing without compare method to check integrity
+        """
+        # given
+        sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
+                 '02fb2c89e14f7fab46701478c83779c7beb7b069']
+        # this metadata indexer computes only metadata for package.json
+        # in npm context with a hard mapping
+        metadata_indexer = TestMetadataIndexer()
+
+        # when
+        metadata_indexer.run(sha1s, policy_update='ignore-dups')
+        results = metadata_indexer.storage.state
+
+        expected_results = [{
+            'indexer_configuration_id': 30,
+            'translated_metadata': {
+                'other': {},
+                'codeRepository': {
+                    'type': 'git',
+                    'url': 'https://github.com/moranegg/metadata_test'
+                },
+                'description': 'Simple package.json test for indexer',
+                'name': 'test_metadata',
+                'version': '0.0.1'
+            },
+            'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
+        }, {
+            'indexer_configuration_id': 30,
+            'translated_metadata': None,
+            'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
+        }]
+        self.assertEqual(expected_results, results)
+        self.assertTrue(compare_results(expected_results, results))
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -51,6 +51,60 @@
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
+ """,
+ '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
+ {
+ "name": "test_metadata",
+ "version": "0.0.1",
+ "description": "Simple package.json test for indexer",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/moranegg/metadata_test"
+ }
+ }
+ """,
+ 'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
+ {
+ "version": "5.0.3",
+ "name": "npm",
+ "description": "a package manager for JavaScript",
+ "keywords": [
+ "install",
+ "modules",
+ "package manager",
+ "package.json"
+ ],
+ "preferGlobal": true,
+ "config": {
+ "publishtest": false
+ },
+ "homepage": "https://docs.npmjs.com/",
+ "author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/npm/npm"
+ },
+ "bugs": {
+ "url": "https://github.com/npm/npm/issues"
+ },
+ "dependencies": {
+ "JSONStream": "~1.3.1",
+ "abbrev": "~1.1.0",
+ "ansi-regex": "~2.1.1",
+ "ansicolors": "~0.3.2",
+ "ansistyles": "~0.1.3"
+ },
+ "bundleDependencies": [
+ "abbrev",
+ "ansi-regex"
+ ],
+ "devDependencies": {
+ "tacks": "~1.2.6",
+ "tap": "~10.3.2"
+ },
+ "license": "Artistic-2.0"
+ }
+
"""
}

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 6:57 AM (20 h, 21 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227123

Event Timeline