Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123259
D215.id716.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
26 KB
Subscribers
None
D215.id716.diff
View Options
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -17,6 +17,53 @@
from swh.scheduler.utils import get_task
+class DiskIndexer:
+ """Mixin intended to be used with other *Indexer classes.
+
+ Indexers inheriting from this class are a category of indexers
+ which need the disk for their computations.
+
+ Expects:
+ self.working_directory variable defined at runtime.
+
+ """
+ def __init__(self):
+ super().__init__()
+
+ def write_to_temp(self, filename, data):
+ """Write the sha1's content in a temporary file.
+
+ Args:
+ filename (str): one of the content's many
+ filenames
+ data (bytes): the content's data to write in a
+ temporary file
+
+ Returns:
+ The path to the temporary file created. That file is
+ filled in with the raw content's data.
+
+ """
+ os.makedirs(self.working_directory, exist_ok=True)
+ temp_dir = tempfile.mkdtemp(dir=self.working_directory)
+ content_path = os.path.join(temp_dir, filename)
+
+ with open(content_path, 'wb') as f:
+ f.write(data)
+
+ return content_path
+
+ def cleanup(self, content_path):
+ """Remove content_path from working directory.
+
+ Args:
+ content_path (str): the file to remove
+
+ """
+ temp_dir = os.path.dirname(content_path)
+ shutil.rmtree(temp_dir)
+
+
class BaseIndexer(SWHConfig,
metaclass=abc.ABCMeta):
"""Base class for indexers to inherit from.
@@ -257,50 +304,3 @@
if self.rescheduling_task:
self.log.warn('Rescheduling batch')
self.rescheduling_task.delay(sha1s, policy_update)
-
-
-class DiskIndexer:
- """Mixin intended to be used with other *Indexer classes.
-
- Indexer* inheriting from this class are a category of indexers
- which needs the disk for their computations.
-
- Expects:
- self.working_directory variable defined at runtime.
-
- """
- def __init__(self):
- super().__init__()
-
- def write_to_temp(self, filename, data):
- """Write the sha1's content in a temporary file.
-
- Args:
- sha1 (str): the sha1 name
- filename (str): one of sha1's many filenames
- data (bytes): the sha1's content to write in temporary
- file
-
- Returns:
- The path to the temporary file created. That file is
- filled in with the raw content's data.
-
- """
- os.makedirs(self.working_directory, exist_ok=True)
- temp_dir = tempfile.mkdtemp(dir=self.working_directory)
- content_path = os.path.join(temp_dir, filename)
-
- with open(content_path, 'wb') as f:
- f.write(data)
-
- return content_path
-
- def cleanup(self, content_path):
- """Remove content_path from working directory.
-
- Args:
- content_path (str): the file to remove
-
- """
- temp_dir = os.path.dirname(content_path)
- shutil.rmtree(temp_dir)
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata.py
@@ -0,0 +1,84 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from .indexer import BaseIndexer
+from swh.indexer.metadata_dictionary import compute_metadata
+
+
+class ContentMetadataIndexer(BaseIndexer):
+ """Indexer in charge of:
+ - filtering out content already indexed
+ - reading content from objstorage with the content's id sha1
+ - computing translated_metadata by given context
+ - using the MetadataDict and a tool for each context
+ - store result in storage
+ """
+ CONFIG_BASE_FILENAME = 'indexer/metadata'
+
+ ADDITIONAL_CONFIG = {
+ 'tools': ('dict', {
+ 'name': 'hard_mapping_npm',
+ 'version': '0.0.1',
+ 'configuration': {
+ 'type': 'test',
+ 'debian-package': ''
+ },
+ }),
+ }
+
+ def prepare(self):
+ super().prepare()
+
+ def filter_contents(self, sha1s):
+ """Filter out known sha1s and return only missing ones.
+ """
+ yield from self.storage.content_metadata_missing((
+ {
+ 'id': sha1,
+ 'indexer_configuration_id': self.tools['id'],
+ } for sha1 in sha1s
+ ))
+
+ def index_content(self, sha1, raw_content):
+ """Index sha1s' content and store result.
+
+ Args:
+ sha1 (bytes): content's identifier
+ raw_content (bytes): raw content in bytes
+
+ Returns:
+ result (dict): representing a content_metadata
+ if the translation wasn't successful, the translated_metadata
+ value will be kept as None
+
+ """
+ result = {
+ 'id': sha1,
+ 'indexer_configuration_id': self.tools['id'],
+ 'translated_metadata': None
+ }
+ try:
+ context = self.tools['name']
+ result['translated_metadata'] = compute_metadata(
+ context, raw_content)
+ except:
+ self.log.exception(
+ "Problem during tool retrieval of metadata translation")
+ return result
+
+ def persist_index_computations(self, results, policy_update):
+ """Persist the results in storage.
+
+ Args:
+ results ([dict]): list of content_metadata, dict with the
+ following keys:
+ - id (bytes): content's identifier (sha1)
+ - translated_metadata (jsonb): detected metadata
+ policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+ respectively update duplicates or ignore them
+
+ """
+ self.storage.content_metadata_add(
+ results, conflict_update=(policy_update == 'update-dups'))
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata_dictionary.py
@@ -0,0 +1,216 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import json
+
+
+def convert(raw_content):
+ """
+ convert raw_content recursively:
+ - from bytes to string
+ - from string to dict
+ Args:
+ - raw_content (bytes / string / dict)
+ Returns:
+ - Dict of content (if string was json, otherwise returns string)
+ """
+ if isinstance(raw_content, bytes):
+ return convert(raw_content.decode())
+ if isinstance(raw_content, str):
+ try:
+ content = json.loads(raw_content)
+ if content:
+ return content
+ else:
+ return raw_content
+ except json.decoder.JSONDecodeError:
+ return raw_content
+ if isinstance(raw_content, dict):
+ return raw_content
+
+
+class BaseMapping():
+ """Base class for mappings to inherit from
+
+ To implement a new mapping:
+ - inherit this class
+ - add a local property self.mapping
+ - override translate function
+ """
+
+ def __init__(self):
+ pass
+
+ def translate(self, content_dict):
+ """
+ Translates content by mapping the keys of content_dict
+ with the local self.mapping (for now a hard-coded mapping)
+ Args:
+ - content_dict (dict): content to translate, as a dict
+
+ Returns:
+ - translated_metadata (dict): jsonb form needed for the indexer
+ """
+ translated_metadata = {}
+ default = 'other'
+ translated_metadata['other'] = {}
+ try:
+ for k, v in content_dict.items():
+ try:
+ term = self.mapping.get(k, default)
+ if term not in translated_metadata:
+ translated_metadata[term] = v
+ continue
+ if isinstance(translated_metadata[term], str):
+ in_value = translated_metadata[term]
+ translated_metadata[term] = [in_value, v]
+ continue
+ if isinstance(translated_metadata[term], list):
+ translated_metadata[term].append(v)
+ continue
+ if isinstance(translated_metadata[term], dict):
+ translated_metadata[term][k] = v
+ continue
+ except KeyError:
+ self.log.exception(
+ "Problem during item mapping")
+ continue
+ except:
+ return None
+ return translated_metadata
+
+
+class NpmMapping(BaseMapping):
+ """
+ dedicated class for NPM (package.json) mapping and translation
+ """
+ npm_mapping = {
+ 'repository': 'codeRepository',
+ 'os': 'operatingSystem',
+ 'cpu': 'processorRequirements',
+ 'engines': 'processorRequirements',
+ 'dependencies': 'softwareRequirements',
+ 'bundleDependencies': 'softwareRequirements',
+ 'peerDependencies': 'softwareRequirements',
+ 'author': 'author',
+ 'contributor': 'contributor',
+ 'keywords': 'keywords',
+ 'license': 'license',
+ 'version': 'version',
+ 'description': 'description',
+ 'name': 'name',
+ 'devDependencies': 'softwareSuggestions',
+ 'optionalDependencies': 'softwareSuggestions',
+ 'bugs': 'issueTracker',
+ 'homepage': 'url'
+ }
+
+ def __init__(self):
+ self.mapping = self.npm_mapping
+
+ def translate(self, raw_content):
+ content_dict = convert(raw_content)
+ return super().translate(content_dict)
+
+
+class MavenMapping(BaseMapping):
+ """
+ dedicated class for Maven (pom.xml) mapping and translation
+ """
+ maven_mapping = {
+ 'license': 'license',
+ 'version': 'version',
+ 'description': 'description',
+ 'name': 'name',
+ 'prerequisites': 'softwareRequirements',
+ 'repositories': 'codeRepository',
+ 'groupId': 'identifier',
+ 'ciManagement': 'contIntegration',
+ 'issuesManagement': 'issueTracker',
+ }
+
+ def __init__(self):
+ self.mapping = self.maven_mapping
+
+ def translate(self, raw_content):
+ content = convert(raw_content)
+ # parse content from xml to dict
+ return super().translate(content)
+
+
+class DoapMapping(BaseMapping):
+ doap_mapping = {
+
+ }
+
+ def __init__(self):
+ self.mapping = self.doap_mapping
+
+ def translate(self, raw_content):
+ content = convert(raw_content)
+ # parse content from xml to dict
+ return super().translate(content)
+
+
+def parse_xml(content):
+ """
+ Parses content from xml to a python dict
+ Args:
+ - content (text): the string form of the raw_content ( in xml)
+
+ Returns:
+ - parsed_xml (dict): a python dict of the content after parsing
+ """
+ # check if xml
+ # use xml parser to dict
+ return content
+
+
+mapping_tool_fn = {
+ "hard_mapping_npm": NpmMapping(),
+ "pom_xml": MavenMapping(),
+ "doap_xml": DoapMapping()
+}
+
+
+def compute_metadata(context, raw_content):
+ """
+ first landing method: a dispatcher that sends content
+ to the right function to carry out the real parsing of syntax
+ and translation of terms
+ Args:
+ - context (text) : defines to which function/tool
+ the content is sent
+ - content (text): the string form of the raw_content
+
+ Returns:
+ - translated_metadata (dict): jsonb form needed for the indexer
+ to store in storage
+
+ """
+ if raw_content is None or raw_content is b"":
+ return None
+
+ # TODO: keep mapping not in code (maybe fetch crosswalk from storage?)
+ # if fetched from storage should be done once for batch of sha1s
+ dictionary = mapping_tool_fn[context]
+ translated_metadata = dictionary.translate(raw_content)
+ # print(translated_metadata)
+ return translated_metadata
+
+
+def main():
+ raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
+ raw_content1 = b"""{"name": "test_name",
+ "unknown_term": "ut",
+ "prerequisites" :"packageXYZ"}"""
+ result = compute_metadata("hard_mapping_npm", raw_content)
+ result1 = compute_metadata("pom_xml", raw_content1)
+
+ print(result)
+ print(result1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py
--- a/swh/indexer/tests/test_language.py
+++ b/swh/indexer/tests/test_language.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_metadata.py
@@ -0,0 +1,314 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+from nose.tools import istest
+
+from swh.indexer.metadata_dictionary import compute_metadata
+from swh.indexer.metadata import ContentMetadataIndexer
+from swh.indexer.tests.test_utils import MockObjStorage
+
+
+def compare_results(expected, captured):
+ """ The metadata translation doesn't always keep the
+ results in the same order, so a dedicated comparison function
+ is needed to check results with nested lists and dicts
+ Args:
+ - expected (dict)
+ - captured (dict)
+ Returns:
+ True if results are the same
+ """
+ if expected == captured:
+ return True
+
+ if isinstance(captured, dict):
+ while captured:
+ k, v = captured.popitem()
+ if not compare_results(expected[k], v):
+ return False
+
+ return True
+ if isinstance(captured, list):
+ while captured:
+ elem = captured.pop()
+ if elem in expected:
+ return True
+ else:
+ return False
+ return False
+
+
+class MockStorage():
+ """Mock storage to simplify reading indexers' outputs.
+ """
+ def content_metadata_add(self, metadata, conflict_update=None):
+ self.state = metadata
+ self.conflict_update = conflict_update
+
+ def indexer_configuration_get(self, tool):
+ return {
+ 'id': 30,
+ 'name': 'hard_mapping_npm',
+ 'version': '0.1'
+ }
+
+
+class TestMetadataIndexer(ContentMetadataIndexer):
+ """Specific Metadata whose configuration is enough to satisfy the
+ indexing tests.
+ """
+ def prepare(self):
+ self.config = {
+ 'rescheduling_task': None,
+ 'tools': {
+ 'name': 'hard_mapping_npm',
+ 'version': '0.1',
+ 'configuration': {
+ 'type': 'local',
+ 'debian-package': ''
+ }
+ }
+ }
+ self.storage = MockStorage()
+ self.log = logging.getLogger('swh.indexer')
+ self.objstorage = MockObjStorage()
+ self.task_destination = None
+ self.rescheduling_task = self.config['rescheduling_task']
+ self.tools = self.retrieve_tools_information()
+
+
+class Metadata(unittest.TestCase):
+ """
+ Tests metadata_mock_tool tool for Metadata detection
+ """
+ def setUp(self):
+ """
+ shows the entire diff in the results
+ """
+ self.maxDiff = None
+
+ @istest
+ def test_compute_metadata_none(self):
+ """
+ testing that metadata computation on empty content
+ should return None
+ """
+ # given
+ content = b""
+ tool = "hard_mapping_npm"
+
+ # None if no metadata was found or an error occurred
+ declared_metadata = None
+ # when
+ result = compute_metadata(tool, content)
+ # then
+ self.assertEqual(declared_metadata, result)
+
+ @istest
+ def test_compute_metadata_npm(self):
+ """
+ testing only computation of metadata with hard_mapping_npm
+ """
+ # given
+ content = b"""
+ {
+ "name": "test_metadata",
+ "version": "0.0.1",
+ "description": "Simple package.json test for indexer",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/moranegg/metadata_test"
+ }
+ }
+ """
+ declared_metadata = {
+ 'name': 'test_metadata',
+ 'version': '0.0.1',
+ 'description': 'Simple package.json test for indexer',
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'other': {}
+ }
+
+ # when
+ result = compute_metadata("hard_mapping_npm", content)
+ # then
+ self.assertEqual(declared_metadata, result)
+ self.assertTrue(compare_results(declared_metadata, result))
+
+ @istest
+ def test_index_content_metadata_npm(self):
+ """
+ testing NPM with package.json
+ - one sha1 uses a file that can't be translated to metadata and
+ should return None in the translated metadata
+ """
+ # given
+ sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
+ 'd4c647f0fc257591cc9ba1722484229780d1c607',
+ '02fb2c89e14f7fab46701478c83779c7beb7b069']
+ # this metadata indexer computes only metadata for package.json
+ # in npm context with a hard mapping
+ metadata_indexer = TestMetadataIndexer()
+
+ # when
+ metadata_indexer.run(sha1s, policy_update='ignore-dups')
+ results = metadata_indexer.storage.state
+
+ expected_results = [{
+ 'indexer_configuration_id': 30,
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
+ }, {
+ 'indexer_configuration_id': 30,
+ 'translated_metadata': {
+ 'softwareRequirements': [
+ 'abbrev',
+ 'ansi-regex',
+ {
+ 'JSONStream': '~1.3.1',
+ 'abbrev': '~1.1.0',
+ 'ansi-regex': '~2.1.1',
+ 'ansicolors': '~0.3.2',
+ 'ansistyles': '~0.1.3'
+ }
+ ],
+ 'issueTracker': {
+ 'url': 'https://github.com/npm/npm/issues'
+ },
+ 'author':
+ 'Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)',
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/npm/npm'
+ },
+ 'description': 'a package manager for JavaScript',
+ 'softwareSuggestions': {
+ 'tacks': '~1.2.6',
+ 'tap': '~10.3.2'
+ },
+ 'license': 'Artistic-2.0',
+ 'version': '5.0.3',
+ 'other': {
+ 'preferGlobal': True,
+ 'config': {
+ 'publishtest': False
+ }
+ },
+ 'name': 'npm',
+ 'keywords': [
+ 'install',
+ 'modules',
+ 'package manager',
+ 'package.json'
+ ],
+ 'url': 'https://docs.npmjs.com/'
+ },
+ 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
+ }, {
+ 'indexer_configuration_id': 30,
+ 'translated_metadata': None,
+ 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ }]
+ self.assertTrue(compare_results(expected_results, results))
+ # The assertion below returns False sometimes because of nested lists
+ # self.assertEqual(expected_results, results)
+
+ @istest
+ def test_compare_method(self):
+ """
+ testing compare method to view problems for nested lists and dicts
+ """
+ a = {
+ 'indexer_configuration_id': 30,
+ 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+ 'translated_metadata': {
+ 'other': {
+ 'preferGlobal': True,
+ 'config': {
+ 'publishtest': False
+ }
+ },
+ 'name': 'npm',
+ 'keywords': [
+ 'install',
+ 'modules',
+ 'package manager',
+ 'package.json'
+ ],
+ }
+ }
+ b = {
+ 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+ 'indexer_configuration_id': 30,
+ 'translated_metadata':
+ {
+ 'other': {
+ 'config': {
+ 'publishtest': False
+ },
+ 'preferGlobal': True
+ },
+ 'keywords': [
+ 'install',
+ 'modules',
+ 'package manager',
+ 'package.json',
+
+ ],
+ 'name': 'npm',
+ }
+ }
+ self.assertTrue(compare_results(a, b))
+
+ @istest
+ def test_index_without_compare_method(self):
+ """
+ testing without compare method to check integrity
+ """
+ # given
+ sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
+ '02fb2c89e14f7fab46701478c83779c7beb7b069']
+ # this metadata indexer computes only metadata for package.json
+ # in npm context with a hard mapping
+ metadata_indexer = TestMetadataIndexer()
+
+ # when
+ metadata_indexer.run(sha1s, policy_update='ignore-dups')
+ results = metadata_indexer.storage.state
+
+ expected_results = [{
+ 'indexer_configuration_id': 30,
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
+ }, {
+ 'indexer_configuration_id': 30,
+ 'translated_metadata': None,
+ 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ }]
+ self.assertEqual(expected_results, results)
+ self.assertTrue(compare_results(expected_results, results))
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -51,6 +51,60 @@
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
+ """,
+ '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
+ {
+ "name": "test_metadata",
+ "version": "0.0.1",
+ "description": "Simple package.json test for indexer",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/moranegg/metadata_test"
+ }
+ }
+ """,
+ 'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
+ {
+ "version": "5.0.3",
+ "name": "npm",
+ "description": "a package manager for JavaScript",
+ "keywords": [
+ "install",
+ "modules",
+ "package manager",
+ "package.json"
+ ],
+ "preferGlobal": true,
+ "config": {
+ "publishtest": false
+ },
+ "homepage": "https://docs.npmjs.com/",
+ "author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/npm/npm"
+ },
+ "bugs": {
+ "url": "https://github.com/npm/npm/issues"
+ },
+ "dependencies": {
+ "JSONStream": "~1.3.1",
+ "abbrev": "~1.1.0",
+ "ansi-regex": "~2.1.1",
+ "ansicolors": "~0.3.2",
+ "ansistyles": "~0.1.3"
+ },
+ "bundleDependencies": [
+ "abbrev",
+ "ansi-regex"
+ ],
+ "devDependencies": {
+ "tacks": "~1.2.6",
+ "tap": "~10.3.2"
+ },
+ "license": "Artistic-2.0"
+ }
+
"""
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Dec 18, 6:57 AM (20 h, 21 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227123
Attached To
D215: First draft of the metadata content indexer for npm (package.json) T715
Event Timeline
Log In to Comment