
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 77fdfb9..8adff82 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,294 +1,283 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import logging
from swh.indexer.indexer import ContentIndexer, RevisionIndexer
from swh.indexer.metadata_dictionary import compute_metadata
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
class ContentMetadataIndexer(ContentIndexer):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from the objstorage with the content's sha1 id
- computing translated_metadata for the given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- storing the results in the content_metadata table
"""
CONFIG_BASE_FILENAME = 'indexer/metadata'
def __init__(self, tool, config):
# reuse the exact same configuration as the RevisionMetadataIndexer
# object, which internally instantiates ContentMetadataIndexer
self.config = config
self.config['tools'] = tool
super().__init__()
- def prepare(self):
- self.results = []
- if self.config[INDEXER_CFG_KEY]:
- self.idx_storage = self.config[INDEXER_CFG_KEY]
- if self.config['objstorage']:
- self.objstorage = self.config['objstorage']
- _log = logging.getLogger('requests.packages.urllib3.connectionpool')
- _log.setLevel(logging.WARN)
- self.log = logging.getLogger('swh.indexer')
- self.tools = self.register_tools(self.config['tools'])
- # NOTE: only one tool so far, change when no longer true
- self.tool = self.tools[0]
-
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
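# Example (hypothetical ids): list(indexer.filter([known_sha1, new_sha1]))
# yields only the sha1s that are not yet present in content_metadata.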
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful, the translated_metadata key
will be set to None
"""
result = {
'id': id,
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
context = self.tool['tool_configuration']['context']
result['translated_metadata'] = compute_metadata(context, data)
# keep the result on the indexer object so get_results() can return it
self.results.append(result)
except Exception:
self.log.exception(
"Problem during tool retrieval of metadata translation")
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- translated_metadata (jsonb): detected metadata
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def get_results(self):
"""can be called only if run method was called before
Returns:
list: list of content_metadata entries calculated by
current indexer
"""
return self.results
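# Illustrative usage sketch (not part of this change; the tool/config values
# below are hypothetical). Real callers pass live idx_storage and objstorage
# instances, as RevisionMetadataIndexer.translate_revision_metadata does:
#
#   tool = {
#       'name': 'swh-metadata-translator',
#       'version': '0.0.1',
#       'configuration': {'type': 'local', 'context': 'npm'},
#   }
#   config = {INDEXER_CFG_KEY: idx_storage, 'objstorage': objstorage}
#   indexer = ContentMetadataIndexer(tool, config)
#   indexer.run(sha1s, policy_update='ignore-dups')
#   results = indexer.get_results()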
class RevisionMetadataIndexer(RevisionIndexer):
"""Revision-level indexer
This indexer is in charge of:
- filtering out revisions already indexed in the revision_metadata table
with the defined computation tool
- retrieving all entry_files in the root directory
- using metadata_detector on the file_names that contain metadata
- computing the metadata translation if necessary and possible
(depends on the tool)
- sending sha1s to content indexing if possible
- storing the results for the revision
"""
CONFIG_BASE_FILENAME = 'indexer/metadata'
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'swh-metadata-detector',
'version': '0.0.1',
'configuration': {
'type': 'local',
'context': ['npm', 'codemeta']
},
}),
}
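+ # Expose the content indexer as a class attribute so subclasses (see the
+ # test indexers) can substitute their own implementation;
+ # translate_revision_metadata instantiates self.ContentMetadataIndexer.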
+ ContentMetadataIndexer = ContentMetadataIndexer
+
def prepare(self):
super().prepare()
self.tool = self.tools[0]
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_metadata_missing((
{
'id': sha1_git,
'indexer_configuration_id': self.tool['id'],
} for sha1_git in sha1_gits
))
def index(self, rev):
"""Index rev by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- if multiple file detected -> translation needed at revision level
Args:
rev (bytes): revision artifact from storage
Returns:
dict: dictionary representing a revision_metadata, with keys:
- id (bytes): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- translated_metadata (bytes): dict of retrieved metadata
"""
# build the result first so it is always defined, even when indexing fails
result = {
'id': rev['id'],
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
root_dir = rev['directory']
dir_ls = self.storage.directory_ls(root_dir, recursive=False)
files = (entry for entry in dir_ls if entry['type'] == 'file')
detected_files = detect_metadata(files)
result['translated_metadata'] = self.translate_revision_metadata(
detected_files)
except Exception:
self.log.exception(
'Problem when indexing rev')
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of revision_metadata, dict with the
following keys:
- id (bytes): revision's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- translated_metadata (jsonb): detected metadata
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in revision_metadata
self.idx_storage.revision_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def translate_revision_metadata(self, detected_files):
"""
Determine the plan of action for translating metadata, given one
or multiple detected files:
Args:
detected_files (dict): dictionary mapping context names (e.g.,
"npm", "authors") to lists of sha1s
Returns:
dict: dict with translated metadata according to the CodeMeta
vocabulary
"""
translated_metadata = []
tool = {
'name': 'swh-metadata-translator',
'version': '0.0.1',
'configuration': {
'type': 'local',
'context': None
},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {
INDEXER_CFG_KEY: self.idx_storage,
'objstorage': self.objstorage
}
for context in detected_files:
tool['configuration']['context'] = context
- c_metadata_indexer = ContentMetadataIndexer(tool, config)
+ c_metadata_indexer = self.ContentMetadataIndexer(tool, config)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context])
for c in metadata_generator:
# extracting translated_metadata
sha1 = c['id']
sha1s_in_storage.append(sha1)
local_metadata = c['translated_metadata']
# local metadata is aggregated
if local_metadata:
translated_metadata.append(local_metadata)
sha1s_filtered = [item for item in detected_files[context]
if item not in sha1s_in_storage]
if sha1s_filtered:
# schedule indexing of the content
try:
c_metadata_indexer.run(sha1s_filtered,
policy_update='ignore-dups')
# results computed on the fly are also available immediately:
results = c_metadata_indexer.get_results()
for result in results:
local_metadata = result['translated_metadata']
translated_metadata.append(local_metadata)
except Exception:
self.log.exception("Exception while indexing content")
# reduce translated_metadata to a minimal set with swh-metadata-detector
min_metadata = extract_minimal_metadata_dict(translated_metadata)
return min_metadata
@click.command()
@click.option('--revs', '-i',
default=['8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
'026040ea79dec1b49b4e3e7beda9132b6b26b51b',
'9699072e21eded4be8d45e3b8d543952533fa190'],
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
rev_metadata_indexer = RevisionMetadataIndexer()
rev_metadata_indexer.run(_git_sha1s, 'update-dups')
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 2423891..51913c7 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,363 +1,366 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
import logging
from nose.tools import istest
from swh.indexer.metadata_dictionary import compute_metadata
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.metadata import ContentMetadataIndexer
from swh.indexer.metadata import RevisionMetadataIndexer
from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
from swh.indexer.tests.test_utils import MockIndexerStorage
class TestContentMetadataIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def prepare(self):
self.config.update({
'rescheduling_task': None,
})
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.task_destination = None
self.rescheduling_task = self.config['rescheduling_task']
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
self.results = []
class TestRevisionMetadataIndexer(RevisionMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
+
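+ # Route the revision indexer to the mocked content indexer through the
+ # class-attribute seam introduced in RevisionMetadataIndexer above.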
+ ContentMetadataIndexer = TestContentMetadataIndexer
+
def prepare(self):
self.config = {
'rescheduling_task': None,
'storage': {
'cls': 'remote',
'args': {
'url': 'http://localhost:9999',
}
},
'tools': {
'name': 'swh-metadata-detector',
'version': '0.0.1',
'configuration': {
'type': 'local',
'context': 'npm'
}
}
}
self.storage = MockStorage()
self.idx_storage = MockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.task_destination = None
self.rescheduling_task = self.config['rescheduling_task']
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
self.results = []
class Metadata(unittest.TestCase):
"""
Tests the metadata_mock_tool for metadata detection
"""
def setUp(self):
"""
Show the entire diff in assertion failure messages
"""
self.maxDiff = None
self.content_tool = {
'name': 'swh-metadata-translator',
'version': '0.0.1',
'configuration': {
'type': 'local',
'context': 'npm'
}
}
@istest
def test_compute_metadata_none(self):
"""
Test that computing metadata on empty content
returns None
"""
# given
content = b""
context = "npm"
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = compute_metadata(context, content)
# then
self.assertEqual(declared_metadata, result)
@istest
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
"""
declared_metadata = {
'name': 'test_metadata',
'version': '0.0.1',
'description': 'Simple package.json test for indexer',
'codeRepository': {
'type': 'git',
'url': 'https://github.com/moranegg/metadata_test'
},
'other': {}
}
# when
result = compute_metadata("npm", content)
# then
self.assertEqual(declared_metadata, result)
@istest
def test_extract_minimal_metadata_dict(self):
"""
Test the creation of a coherent minimal metadata set
"""
# given
metadata_list = [{
'name': 'test_1',
'version': '0.0.1',
'description': 'Simple package.json test for indexer',
'codeRepository': {
'type': 'git',
'url': 'https://github.com/moranegg/metadata_test'
},
'other': {}
}, {
'name': 'test_0_1',
'version': '0.0.1',
'description': 'Simple package.json test for indexer',
'codeRepository': {
'type': 'git',
'url': 'https://github.com/moranegg/metadata_test'
},
'other': {}
}, {
'name': 'test_metadata',
'version': '0.0.1',
'author': 'moranegg',
'other': {}
}]
# when
results = extract_minimal_metadata_dict(metadata_list)
# then
expected_results = {
"developmentStatus": None,
"version": ['0.0.1'],
"operatingSystem": None,
"description": ['Simple package.json test for indexer'],
"keywords": None,
"issueTracker": None,
"name": ['test_1', 'test_0_1', 'test_metadata'],
"author": ['moranegg'],
"relatedLink": None,
"url": None,
"license": None,
"maintainer": None,
"email": None,
"softwareRequirements": None,
"identifier": None,
"codeRepository": [{
'type': 'git',
'url': 'https://github.com/moranegg/metadata_test'
}]
}
self.assertEqual(expected_results, results)
@istest
def test_index_content_metadata_npm(self):
"""
Test the npm mapping on package.json content:
- one sha1 uses a file that can't be translated to metadata and
should have None as its translated_metadata
"""
# given
sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
'd4c647f0fc257591cc9ba1722484229780d1c607',
'02fb2c89e14f7fab46701478c83779c7beb7b069']
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
metadata_indexer = TestContentMetadataIndexer(
tool=self.content_tool, config={})
# when
metadata_indexer.run(sha1s, policy_update='ignore-dups')
results = metadata_indexer.idx_storage.state
expected_results = [{
'indexer_configuration_id': 30,
'translated_metadata': {
'other': {},
'codeRepository': {
'type': 'git',
'url': 'https://github.com/moranegg/metadata_test'
},
'description': 'Simple package.json test for indexer',
'name': 'test_metadata',
'version': '0.0.1'
},
'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
}, {
'indexer_configuration_id': 30,
'translated_metadata': {
'softwareRequirements': {
'JSONStream': '~1.3.1',
'abbrev': '~1.1.0',
'ansi-regex': '~2.1.1',
'ansicolors': '~0.3.2',
'ansistyles': '~0.1.3'
},
'issueTracker': {
'url': 'https://github.com/npm/npm/issues'
},
'author':
'Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)',
'codeRepository': {
'type': 'git',
'url': 'https://github.com/npm/npm'
},
'description': 'a package manager for JavaScript',
'softwareSuggestions': {
'tacks': '~1.2.6',
'tap': '~10.3.2'
},
'license': 'Artistic-2.0',
'version': '5.0.3',
'other': {
'preferGlobal': True,
'config': {
'publishtest': False
}
},
'name': 'npm',
'keywords': [
'install',
'modules',
'package manager',
'package.json'
],
'url': 'https://docs.npmjs.com/'
},
'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
}, {
'indexer_configuration_id': 30,
'translated_metadata': None,
'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
}]
# The assertion below sometimes fails because of the ordering of nested lists
self.assertEqual(expected_results, results)
@istest
def test_detect_metadata_package_json(self):
# given
df = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'cde'
}]
# when
results = detect_metadata(df)
expected_results = {
'npm': [
b'cde'
]
}
# then
self.assertEqual(expected_results, results)
@istest
def test_revision_metadata_indexer(self):
metadata_indexer = TestRevisionMetadataIndexer()
sha1_gits = [
b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
]
metadata_indexer.run(sha1_gits, 'update-dups')
results = metadata_indexer.idx_storage.state
expected_results = [{
'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
'translated_metadata': {
'identifier': None,
'maintainer': None,
'url': [
'https://github.com/librariesio/yarn-parser#readme'
],
'codeRepository': [{
'type': 'git',
'url': 'git+https://github.com/librariesio/yarn-parser.git'
}],
'author': ['Andrew Nesbitt'],
'license': ['AGPL-3.0'],
'version': ['1.0.0'],
'description': [
'Tiny web service for parsing yarn.lock files'
],
'relatedLink': None,
'developmentStatus': None,
'operatingSystem': None,
'issueTracker': [{
'url': 'https://github.com/librariesio/yarn-parser/issues'
}],
'softwareRequirements': [{
'express': '^4.14.0',
'yarn': '^0.21.0',
'body-parser': '^1.15.2'
}],
'name': ['yarn-parser'],
'keywords': [['yarn', 'parse', 'lock', 'dependencies']],
'email': None
},
'indexer_configuration_id': 7
}]
# then
self.assertEqual(expected_results, results)
