diff --git a/swh/indexer/data/package-json/CITATION b/swh/indexer/data/package-json/CITATION
new file mode 100644
index 0000000..52a13c0
--- /dev/null
+++ b/swh/indexer/data/package-json/CITATION
@@ -0,0 +1 @@
+swh:1:dir:49dd6f75450a37243dfcc4b418ca5bf5e0010748;origin=https://github.com/Bartvds/package.json-schema
diff --git a/swh/indexer/data/package-json/LICENSE b/swh/indexer/data/package-json/LICENSE
new file mode 100644
index 0000000..3651abe
--- /dev/null
+++ b/swh/indexer/data/package-json/LICENSE
@@ -0,0 +1,22 @@
+Copyright (c) 2014 Bart van der Schoor
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/swh/indexer/data/package-json/schema.json b/swh/indexer/data/package-json/schema.json
new file mode 100644
index 0000000..e5f799f
--- /dev/null
+++ b/swh/indexer/data/package-json/schema.json
@@ -0,0 +1,377 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema",
+ "id": "lib://package.json",
+ "title": "package.json-schema",
+ "description": "JSON Schema for node/npm package.json",
+ "$ref": "lib://package.json#/definitions/standard",
+ "definitions": {
+ "minimal": {
+ "allOf": [
+ {
+ "$ref": "lib://package.json#/definitions/structure"
+ },
+ {
+ "required": [
+ "name",
+ "version"
+ ]
+ }
+ ]
+ },
+ "standard": {
+ "allOf": [
+ {
+ "$ref": "lib://package.json#/definitions/structure"
+ },
+ {
+ "required": [
+ "name",
+ "version",
+ "description",
+ "keywords",
+ "author",
+ "homepage",
+ "repository",
+ "bugs",
+ "licenses",
+ "engines",
+ "main",
+ "scripts",
+ "dependencies",
+ "devDependencies"
+ ],
+ "properties": {
+ "scripts": {
+ "type": "object",
+ "properties": {
+ "test": {
+ "type" : "string",
+ "pattern": "[a-zA-Z]"
+ }
+ }
+ },
+ "author": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ },
+ "contributors": {
+ "type": "array",
+ "items": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ },
+ "maintainers": {
+ "type": "array",
+ "items": {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ }
+ }
+ }
+ ]
+ },
+ "structure": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "$ref": "lib://package.json#/definitions/name"
+ },
+ "version": {
+ "$ref": "lib://package.json#/definitions/semver"
+ },
+ "description": {
+ "type": "string",
+ "minLength": 1
+ },
+ "keywords": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/name"
+ }
+ },
+ "author": {
+ "$ref": "lib://package.json#/definitions/person"
+ },
+ "contributors": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/person"
+ }
+ },
+ "maintainers": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/person"
+ }
+ },
+ "homepage": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ "repository": {
+ "$ref": "lib://package.json#/definitions/repository"
+ },
+ "man": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ }
+ ]
+ },
+ "bugs": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ {
+ "type": "object",
+ "required": [
+ "url"
+ ],
+ "properties": {
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ },
+ "email": {
+ "$ref": "lib://package.json#/definitions/email"
+ }
+ }
+ }
+ ]
+ },
+ "license": {
+ "$ref": "lib://package.json#/definitions/licence"
+ },
+ "licenses": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/licence"
+ }
+ },
+ "private": {
+ "type": "boolean"
+ },
+ "preferGlobal": {
+ "type": "boolean"
+ },
+ "engines": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ },
+ "engineStrict": {
+ "type": "boolean"
+ },
+ "main": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "bin": {
+ "oneOf": [
+ {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ {
+
+ "$ref": "lib://package.json#/definitions/path-map"
+ }
+ ]
+ },
+ "files": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ },
+ "os": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/identifier"
+ }
+ },
+ "cpu": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "$ref": "lib://package.json#/definitions/identifier"
+ }
+ },
+ "config": {
+ "type": "object"
+ },
+ "publishConfig": {
+ "type": "object"
+ },
+ "directories": {
+ "type": "object",
+ "properties": {
+ "lib": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "bin": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "man": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "doc": {
+ "$ref": "lib://package.json#/definitions/path"
+ },
+ "example": {
+ "$ref": "lib://package.json#/definitions/path"
+ }
+ }
+ },
+ "scripts": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ },
+ "dependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "devDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "bundledDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "bundleDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "optionalDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ },
+ "peerDependencies": {
+ "$ref": "lib://package.json#/definitions/dependency-map"
+ }
+ }
+ },
+ "uri-http": {
+ "type": "string",
+ "pattern": "^https?:\/\/"
+ },
+ "email": {
+ "type": "string",
+ "pattern": "^([0-9a-zA-Z]([-\\.\\w]*[0-9a-zA-Z])*@([0-9a-zA-Z][-\\w]*[0-9a-zA-Z]\\.)+[a-zA-Z]{2,9})$"
+ },
+ "path": {
+ "type": "string",
+ "minLength": 1
+ },
+ "name": {
+ "type": "string",
+ "pattern": "^[A-Za-z](?:[_\\.-]?[A-Za-z0-9]+)*$"
+ },
+ "identifier": {
+ "type": "string",
+ "pattern": "^[A-Za-z](?:[_-]?[A-Za-z0-9]+)*$"
+ },
+ "semver": {
+ "type": "string",
+ "pattern": "^\\d+\\.\\d+\\.\\d+(?:-[a-z]+(?:[_\\.-]*[a-z0-9]+)*)*$"
+ },
+ "type-url": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "url"
+ ],
+ "properties": {
+ "type": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+
+ },
+ "repository": {
+ "$ref": "lib://package.json#/definitions/type-url"
+ },
+ "licence": {
+ "oneOf": [
+ {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ {
+ "$ref": "lib://package.json#/definitions/licence-object"
+ }
+ ]
+ },
+ "licence-object": {
+ "type": "object",
+ "additionalProperties": false,
+ "properties": {
+ "type": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+ },
+ "person": {
+ "oneOf": [
+ {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ {
+ "$ref": "lib://package.json#/definitions/person-object"
+ }
+ ]
+ },
+ "person-object": {
+ "type": "object",
+ "required": [
+ "name"
+ ],
+ "properties": {
+ "name": {
+ "type": "string",
+ "pattern": "[a-zA-Z]"
+ },
+ "email": {
+ "$ref": "lib://package.json#/definitions/email"
+ },
+ "url": {
+ "$ref": "lib://package.json#/definitions/uri-http"
+ }
+ }
+ },
+ "string-map": {
+ "type": "object",
+ "additionalProperties": false,
+ "patternProperties": {
+ ".+": {
+ "type": "string"
+ }
+ }
+ },
+ "path-map": {
+ "type": "object",
+ "additionalProperties": false,
+ "patternProperties": {
+ ".+": {
+ "$ref": "lib://package.json#/definitions/path",
+ "pattern": "[a-zA-Z]"
+ }
+ }
+ },
+ "dependency-map": {
+ "$ref": "lib://package.json#/definitions/string-map"
+ }
+ }
+}
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 836d77d..8b2df2d 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,335 +1,335 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import logging
from copy import deepcopy
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
class ContentMetadataIndexer(ContentIndexer):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing translated_metadata by given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- store result in content_metadata table
"""
# Note: This is used when the content metadata indexer is used alone
# (not the case for example in the case of the RevisionMetadataIndexer)
CONFIG_BASE_FILENAME = 'indexer/content_metadata'
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
def index(self, id, data, log_suffix='unknown revision'):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful, the translated_metadata value will
be returned as None
"""
result = {
'id': id,
'indexer_configuration_id': self.tool['id'],
'translated_metadata': None
}
try:
mapping_name = self.tool['tool_configuration']['context']
log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
result['translated_metadata'] = \
MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
"for content %s" % hashutil.hash_to_hex(id))
if result['translated_metadata'] is None:
return None
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
class RevisionMetadataIndexer(RevisionIndexer):
"""Revision-level indexer
This indexer is in charge of:
- filtering revisions already indexed in revision_metadata table with
defined computation tool
- retrieve all entry_files in root directory
- use metadata_detector for file_names containing metadata
- compute metadata translation if necessary and possible (depends on tool)
- send sha1s to content indexing if possible
- store the results for revision
"""
CONFIG_BASE_FILENAME = 'indexer/revision_metadata'
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': list(MAPPINGS),
},
}),
}
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_metadata_missing((
{
'id': sha1_git,
'indexer_configuration_id': self.tool['id'],
} for sha1_git in sha1_gits
))
def index(self, rev):
"""Index rev by processing it and organizing result.
use metadata_detector to iterate on filenames
- if one filename detected -> sends file to content indexer
- if multiple files detected -> translation needed at revision level
Args:
rev (dict): revision artifact from storage
Returns:
dict: dictionary representing a revision_metadata, with keys:
- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- translated_metadata: dict of retrieved metadata
"""
result = {
'id': rev['id'],
'indexer_configuration_id': self.tool['id'],
'mappings': None,
'translated_metadata': None
}
try:
root_dir = rev['directory']
dir_ls = self.storage.directory_ls(root_dir, recursive=False)
files = [entry for entry in dir_ls if entry['type'] == 'file']
detected_files = detect_metadata(files)
(mappings, metadata) = self.translate_revision_metadata(
detected_files,
log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
result['mappings'] = mappings
result['translated_metadata'] = metadata
except Exception as e:
self.log.exception(
'Problem when indexing rev: %r', e)
return result
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of revision_metadata, dict with the
following keys:
- id (bytes): revision's identifier (sha1_git)
- mappings ([str]): list of mappings used
- translated_metadata (jsonb): detected metadata
policy_update ([str]): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in revision_metadata
self.idx_storage.revision_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
def translate_revision_metadata(self, detected_files, log_suffix):
"""
Determine the plan of action for translating metadata when one
or multiple metadata files are detected:
Args:
detected_files (dict): dictionary mapping context names (e.g.,
"npm", "authors") to list of sha1
Returns:
(List[str], dict): list of mappings used and dict with
translated metadata according to the CodeMeta vocabulary
"""
used_mappings = [MAPPINGS[context].name for context in detected_files]
translated_metadata = []
tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': None
},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {
k: self.config[k]
for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
}
config['tools'] = [tool]
for context in detected_files.keys():
cfg = deepcopy(config)
cfg['tools'][0]['configuration']['context'] = context
c_metadata_indexer = ContentMetadataIndexer(config=cfg)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context])
for c in metadata_generator:
# extracting translated_metadata
sha1 = c['id']
sha1s_in_storage.append(sha1)
local_metadata = c['translated_metadata']
# local metadata is aggregated
if local_metadata:
translated_metadata.append(local_metadata)
sha1s_filtered = [item for item in detected_files[context]
if item not in sha1s_in_storage]
if sha1s_filtered:
# content indexing
try:
c_metadata_indexer.run(sha1s_filtered,
policy_update='ignore-dups',
log_suffix=log_suffix)
# on the fly possibility:
for result in c_metadata_indexer.results:
local_metadata = result['translated_metadata']
translated_metadata.append(local_metadata)
except Exception:
self.log.exception(
"Exception while indexing metadata on contents")
# transform translated_metadata into min set with swh-metadata-detector
min_metadata = extract_minimal_metadata_dict(translated_metadata)
return (used_mappings, min_metadata)
class OriginMetadataIndexer(OriginIndexer):
CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'
ADDITIONAL_CONFIG = {
'tools': ('list', [])
}
USE_TOOLS = False
def __init__(self):
super().__init__()
self.origin_head_indexer = OriginHeadIndexer()
self.revision_metadata_indexer = RevisionMetadataIndexer()
def index_list(self, origins):
head_rev_ids = []
for origin in origins:
head_result = self.origin_head_indexer.index(origin)
if not head_result:
continue
head_rev_ids.append(head_result['revision_id'])
head_revs = list(self.storage.revision_get(head_rev_ids))
assert len(head_revs) == len(head_rev_ids)
results = []
- for (orig, rev) in zip(origins, head_revs):
+ for (origin, rev) in zip(origins, head_revs):
if not rev:
self.warning('Missing head revision %s of origin %r',
(hashutil.hash_to_bytes(rev['id']), origin))
continue
rev_metadata = self.revision_metadata_indexer.index(rev)
orig_metadata = {
'from_revision': rev_metadata['id'],
'origin_id': origin['id'],
'metadata': rev_metadata['translated_metadata'],
'mappings': rev_metadata['mappings'],
'indexer_configuration_id':
rev_metadata['indexer_configuration_id'],
}
results.append((orig_metadata, rev_metadata))
return results
def persist_index_computations(self, results, policy_update):
conflict_update = (policy_update == 'update-dups')
# Deduplicate revisions
rev_metadata = []
orig_metadata = []
for (orig_item, rev_item) in results:
if rev_item not in rev_metadata:
rev_metadata.append(rev_item)
if rev_item not in orig_metadata:
orig_metadata.append(orig_item)
self.idx_storage.revision_metadata_add(
rev_metadata, conflict_update=conflict_update)
self.idx_storage.origin_intrinsic_metadata_add(
orig_metadata, conflict_update=conflict_update)
@click.command()
@click.option('--revs', '-i',
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
rev_metadata_indexer = RevisionMetadataIndexer()
rev_metadata_indexer.run(_git_sha1s, 'update-dups')
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
main()
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 2d5e653..fb72a5a 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,163 +1,188 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from unittest.mock import patch
from swh.model.hashutil import hash_to_bytes
from swh.indexer.metadata import OriginMetadataIndexer
from .utils import BASE_TEST_CONFIG, YARN_PARSER_METADATA
from .test_metadata import REVISION_METADATA_CONFIG
ORIGIN_HEAD_CONFIG = {
**BASE_TEST_CONFIG,
'tools': {
'name': 'origin-metadata',
'version': '0.0.1',
'configuration': {},
},
'tasks': {
'revision_metadata': 'revision_metadata',
'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
}
}
@pytest.fixture
def origin_metadata_indexer():
prefix = 'swh.indexer.'
suffix = '.parse_config_file'
with patch(prefix + 'metadata.OriginMetadataIndexer' + suffix) as omi, \
patch(prefix + 'origin_head.OriginHeadIndexer' + suffix) as ohi, \
patch(prefix + 'metadata.RevisionMetadataIndexer' + suffix) as rmi:
omi.return_value = BASE_TEST_CONFIG
ohi.return_value = ORIGIN_HEAD_CONFIG
rmi.return_value = REVISION_METADATA_CONFIG
yield OriginMetadataIndexer()
def test_origin_metadata_indexer(
idx_storage, storage, obj_storage, origin_metadata_indexer):
indexer = OriginMetadataIndexer()
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
origin = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
'id': rev_id,
'translated_metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
origin_metadata = {
'origin_id': origin['id'],
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
for result in results:
del result['tool']
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
for result in results:
del result['tool']
assert results == [origin_metadata]
-def test_origin_metadata_indexer_duplicates(
+def test_origin_metadata_indexer_duplicate_origin(
idx_storage, storage, obj_storage, origin_metadata_indexer):
indexer = OriginMetadataIndexer()
indexer.storage = storage
indexer.idx_storage = idx_storage
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2)
origin = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
assert len(results) == 1
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
assert len(results) == 1
def test_origin_metadata_indexer_missing_head(
idx_storage, storage, obj_storage, origin_metadata_indexer):
storage.origin_add([{
'type': 'git',
'url': 'https://example.com'
}])
indexer = OriginMetadataIndexer()
indexer.run(["git+https://example.com"])
origin = storage.origin_get({
'type': 'git',
'url': 'https://example.com'})
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
assert results == []
def test_origin_metadata_indexer_partial_missing_head(
idx_storage, storage, obj_storage, origin_metadata_indexer):
storage.origin_add([{
'type': 'git',
'url': 'https://example.com'
}])
indexer = OriginMetadataIndexer()
indexer.run(["git+https://example.com",
"git+https://github.com/librariesio/yarn-parser"])
origin1 = storage.origin_get({
'type': 'git',
'url': 'https://example.com'})
origin2 = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
'id': rev_id,
'translated_metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
origin_metadata = {
'origin_id': origin2['id'],
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
for result in results:
del result['tool']
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin1['id'], origin2['id']]))
for result in results:
del result['tool']
assert results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_duplicate_revision(
+ idx_storage, storage, obj_storage, origin_metadata_indexer):
+ indexer = OriginMetadataIndexer()
+ indexer.storage = storage
+ indexer.idx_storage = idx_storage
+ indexer.run(["git+https://github.com/librariesio/yarn-parser",
+ "git+https://github.com/librariesio/yarn-parser.git"])
+
+ origin1 = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin2 = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser.git'})
+ assert origin1['id'] != origin2['id']
+ rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+ results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+ assert len(results) == 1
+
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+ origin1['id'], origin2['id']]))
+ assert len(results) == 2
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index c19bb7e..2cfc437 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -1,666 +1,678 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import datetime
import hashlib
import random
from hypothesis import strategies
from swh.model import hashutil
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
BASE_TEST_CONFIG = {
'storage': {
'cls': 'memory',
'args': {
},
},
'objstorage': {
'cls': 'memory',
'args': {
},
},
INDEXER_CFG_KEY: {
'cls': 'memory',
'args': {
},
},
}
ORIGINS = [
{
'id': 52189575,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/SoftwareHeritage/swh-storage'},
{
'id': 4423668,
'lister': None,
'project': None,
'type': 'ftp',
'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
{
'id': 77775770,
'lister': None,
'project': None,
'type': 'deposit',
'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
{
'id': 85072327,
'lister': None,
'project': None,
'type': 'pypi',
'url': 'https://pypi.org/project/limnoria/'},
{
'id': 49908349,
'lister': None,
'project': None,
'type': 'svn',
'url': 'http://0-512-md.googlecode.com/svn/'},
{
'id': 54974445,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'},
+ {
+ 'id': 54974446,
+ 'lister': None,
+ 'project': None,
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser.git'},
]
SNAPSHOTS = {
52189575: {
'branches': {
b'refs/heads/add-revision-origin-cache': {
'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
b's\xe7/\xe9l\x1e',
'target_type': 'revision'},
b'HEAD': {
'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
b'\xac\xefrm',
'target_type': 'revision'},
b'refs/tags/v0.0.103': {
'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
b'\x0f\xdd',
'target_type': 'release'},
}},
4423668: {
'branches': {
b'3DLDF-1.1.4.tar.gz': {
'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
b'"G\x99\x11',
'target_type': 'revision'},
b'3DLDF-2.0.2.tar.gz': {
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
'target_type': 'revision'},
b'3DLDF-2.0.3-examples.tar.gz': {
'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
b'\xfe\xadZ\x80\x80\xc1\x83\xff',
'target_type': 'revision'},
b'3DLDF-2.0.3.tar.gz': {
'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
b'\xcc\x1a\xb4`\x8c\x8by',
'target_type': 'revision'},
b'3DLDF-2.0.tar.gz': {
'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
b'\xd3\xd1m',
'target_type': 'revision'}
}},
77775770: {
'branches': {
b'master': {
'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
b'\xa6\xe9\x99\xb1\x9e]q\xeb',
'target_type': 'revision'}
},
'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
b"\x1d\r "},
85072327: {
'branches': {
b'HEAD': {
'target': b'releases/2018.09.09',
'target_type': 'alias'},
b'releases/2018.09.01': {
'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
b'\xbb\xdfF\xfdw\xcf',
'target_type': 'revision'},
b'releases/2018.09.09': {
'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
b'A\x10\x9d\xc5\xfa2\xf8t',
'target_type': 'revision'}},
'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
b'\x12\x9e\xd6\xb3'},
49908349: {
'branches': {
b'master': {
'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
b'\xc9\xad#.\x1bw=\x18',
'target_type': 'revision'}},
'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
b'\x05\xea\xb8\x1f\xc4H\xf4s'},
54974445: {
'branches': {
b'HEAD': {
'target': hash_to_bytes(
'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
- 'target_type': 'revision'}}}
+ 'target_type': 'revision'}}},
+ 54974446: {
+ 'branches': {
+ b'HEAD': {
+ 'target': hash_to_bytes(
+ '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
+ 'target_type': 'revision'}}},
}
REVISIONS = [{
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'author': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'committer': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'synthetic': False,
'date': {
'negative_utc': False,
'timestamp': {
'seconds': 1487596456,
'microseconds': 0
},
'offset': 0
},
'directory': b'10'
}]
DIRECTORY_ID = b'10'
DIRECTORY = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'sha1': b'cde'
},
{
'target': b'11',
'type': 'dir',
'length': None,
'name': b'.github',
'sha1': None,
'perms': 16384,
'sha1_git': None,
'status': None,
'sha256': None
}
]
SHA1_TO_LICENSES = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
'02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
'103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
'688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
}
SHA1_TO_CTAGS = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{
'name': 'foo',
'kind': 'str',
'line': 10,
'lang': 'bar',
}],
'd4c647f0fc257591cc9ba1722484229780d1c607': [{
'name': 'let',
'kind': 'int',
'line': 100,
'lang': 'haskell',
}],
'688a5ef812c53907562fe379d4b3851e69c7cb15': [{
'name': 'symbol',
'kind': 'float',
'line': 99,
'lang': 'python',
}],
}
OBJ_STORAGE_DATA = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
'688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
'8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
'02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
import unittest
import logging
from swh.indexer.mimetype import MimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
'103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
""",
'26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
""",
'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
'636465': b"""
{
"name": "yarn-parser",
"version": "1.0.0",
"description": "Tiny web service for parsing yarn.lock files",
"main": "index.js",
"scripts": {
"start": "node index.js",
"test": "mocha"
},
"engines": {
"node": "9.8.0"
},
"repository": {
"type": "git",
"url": "git+https://github.com/librariesio/yarn-parser.git"
},
"keywords": [
"yarn",
"parse",
"lock",
"dependencies"
],
"author": "Andrew Nesbitt",
"license": "AGPL-3.0",
"bugs": {
"url": "https://github.com/librariesio/yarn-parser/issues"
},
"homepage": "https://github.com/librariesio/yarn-parser#readme",
"dependencies": {
"@yarnpkg/lockfile": "^1.0.0",
"body-parser": "^1.15.2",
"express": "^4.14.0"
},
"devDependencies": {
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
"test": "^0.6.0"
}
}
"""
}
YARN_PARSER_METADATA = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'codeRepository':
'git+git+https://github.com/librariesio/yarn-parser.git',
'author': [{
'type': 'Person',
'name': 'Andrew Nesbitt'
}],
'license': 'https://spdx.org/licenses/AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
}
json_dict_keys = strategies.one_of(
strategies.characters(),
*map(strategies.just, ['type', 'url', 'name', 'email', '@id',
'@context', 'repository', 'license',
]),
)
"""Hypothesis strategy that generates strings, with an emphasis on those
that are often used as dictionary keys in metadata files."""
generic_json_document = strategies.recursive(
strategies.none() | strategies.booleans() | strategies.floats() |
strategies.characters(),
lambda children: (
strategies.lists(children, 1) |
strategies.dictionaries(json_dict_keys, children, min_size=1)
)
)
"""Hypothesis strategy that generates possible values for values of JSON
metadata files."""
def json_document_strategy(keys=None):
"""Generates an hypothesis strategy that generates metadata files
for a format that uses the given keys."""
if keys is None:
keys = strategies.characters()
else:
keys = strategies.one_of(map(strategies.just, keys))
return strategies.dictionaries(keys, generic_json_document, min_size=1)
def filter_dict(d, keys):
'return a copy of the dict with keys deleted'
if not isinstance(keys, (list, tuple)):
keys = (keys, )
return dict((k, v) for (k, v) in d.items() if k not in keys)
def fill_obj_storage(obj_storage):
"""Add some content in an object storage."""
for (obj_id, content) in OBJ_STORAGE_DATA.items():
obj_storage.add(content, obj_id=hash_to_bytes(obj_id))
def fill_storage(storage):
for origin in ORIGINS:
origin = origin.copy()
del origin['id']
storage.origin_add_one(origin)
for (orig_pseudo_id, snap) in SNAPSHOTS.items():
for orig in ORIGINS:
if orig_pseudo_id == orig['id']:
origin_id = storage.origin_get(
{'type': orig['type'], 'url': orig['url']})['id']
break
else:
assert False
visit = storage.origin_visit_add(origin_id, datetime.datetime.now())
snap_id = snap.get('id') or \
bytes([random.randint(0, 255) for _ in range(32)])
storage.snapshot_add(origin_id, visit['visit'], {
'id': snap_id,
'branches': snap['branches']
})
storage.revision_add(REVISIONS)
storage.directory_add([{
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
for (obj_id, content) in OBJ_STORAGE_DATA.items():
# TODO: use MultiHash
if hasattr(hashlib, 'blake2s'):
blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
else:
# fallback for Python <3.6
blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
storage.content_add([{
'data': content,
'length': len(content),
'status': 'visible',
'sha1': hash_to_bytes(obj_id),
'sha1_git': hash_to_bytes(obj_id),
'sha256': hashlib.sha256(content).digest(),
'blake2s256': blake2s256
}])
class CommonContentIndexerTest(metaclass=abc.ABCMeta):
legacy_get_format = False
"""True if and only if the tested indexer uses the legacy format.
see: https://forge.softwareheritage.org/T1433
"""
def get_indexer_results(self, ids):
"""Override this for indexers that don't have a mock storage."""
return self.indexer.idx_storage.state
def assert_legacy_results_ok(self, sha1s, expected_results=None):
# XXX old format, remove this when all endpoints are
# updated to the new one
# see: https://forge.softwareheritage.org/T1433
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
for sha1 in sha1s]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(len(expected_results), len(actual_results),
(expected_results, actual_results))
for indexed_data in actual_results:
_id = indexed_data['id']
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data['id'] = _id
self.assertEqual(indexed_data, expected_data)
def assert_results_ok(self, sha1s, expected_results=None):
if self.legacy_get_format:
self.assert_legacy_results_ok(sha1s, expected_results)
return
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
for sha1 in sha1s]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(len(expected_results), len(actual_results),
(expected_results, actual_results))
for indexed_data in actual_results:
(_id, indexed_data) = list(indexed_data.items())[0]
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data = [expected_data]
self.assertEqual(indexed_data, expected_data)
def test_index(self):
"""Known sha1 have their data indexed
"""
sha1s = [self.id0, self.id1, self.id2]
# when
self.indexer.run(sha1s, policy_update='update-dups')
self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
sha1s = [self.id1,
'799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
'800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
# when
self.indexer.run(sha1s, policy_update='update-dups')
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def setUp(self):
self.contents = sorted(OBJ_STORAGE_DATA)
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
assert isinstance(_id, bytes)
indexed_data = indexed_data.copy()
indexed_data['id'] = hash_to_hex(indexed_data['id'])
self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(
start, end, actual_results, expected_results)
def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
# then
self.assertTrue(actual_results)
def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Inputs are in bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run( # checks the bytes input this time
start, end, skip_existing=False)
# no already indexed data so same result as prior test
# then
self.assertTrue(actual_results)
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
_start, _end = ['0000000000000000000000000000000000000000',
'0000000000000000000000000000000000000001']
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)
# then
self.assertFalse(actual_results)
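
The schema shipped above is a plain JSON Schema draft-04 document, so it can be exercised with any generic draft-04 validator. A minimal sketch using the python-jsonschema package (the validator choice and the example manifest are assumptions made for illustration; only the schema path comes from this diff):

import json

from jsonschema import Draft4Validator  # schema.json declares draft-04

with open('swh/indexer/data/package-json/schema.json') as f:
    schema = json.load(f)

# Check the schema document itself against the draft-04 metaschema.
Draft4Validator.check_schema(schema)

# A deliberately incomplete manifest: #/definitions/standard requires many
# more fields, so iter_errors() reports everything that is missing.
manifest = {
    "name": "yarn-parser",
    "version": "1.0.0",
    "description": "Tiny web service for parsing yarn.lock files",
}

# The internal lib://package.json#/definitions/... refs resolve against the
# schema's own "id", so validation needs no network access.
validator = Draft4Validator(schema)
for error in validator.iter_errors(manifest):
    print(error.message)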