Page Menu | Home | Software Heritage

D1010.id3243.diff
No One | Temporary

D1010.id3243.diff

diff --git a/sql/upgrades/119.sql b/sql/upgrades/119.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/119.sql
@@ -0,0 +1,23 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 118
+-- to_version: 119
+-- description: metadata tables: add 'mappings' column
+
+insert into dbversion(version, release, description)
+values(119, now(), 'Work In Progress');
+
+-- Existing rows are backfilled with an empty array through a temporary
+-- default; the default is dropped afterwards so that new rows must set
+-- 'mappings' explicitly, as in the base schema.
+alter table revision_metadata
+ add column mappings text array not null default '{}';
+alter table revision_metadata
+ alter column mappings
+ drop default;
+
+-- Same two-step migration for origin_intrinsic_metadata.
+alter table origin_intrinsic_metadata
+ add column mappings text array not null default '{}';
+alter table origin_intrinsic_metadata
+ alter column mappings
+ drop default;
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -150,6 +150,7 @@
result = {
'id': rev['id'],
'indexer_configuration_id': self.tool['id'],
+ 'mappings': None,
'translated_metadata': None
}
@@ -158,10 +159,11 @@
dir_ls = self.storage.directory_ls(root_dir, recursive=False)
files = [entry for entry in dir_ls if entry['type'] == 'file']
detected_files = detect_metadata(files)
- result['translated_metadata'] = self.translate_revision_metadata(
+ (mappings, metadata) = self.translate_revision_metadata(
detected_files,
- log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])
- )
+ log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
+ result['mappings'] = mappings
+ result['translated_metadata'] = metadata
except Exception as e:
self.log.exception(
'Problem when indexing rev: %r', e)
@@ -194,10 +196,11 @@
"npm", "authors") to list of sha1
Returns:
- dict: dict with translated metadata according to the CodeMeta
- vocabulary
+ (List[str], dict): list of mappings used and dict with
+ translated metadata according to the CodeMeta vocabulary
"""
+ used_mappings = [MAPPINGS[context].name for context in detected_files]
translated_metadata = []
tool = {
'name': 'swh-metadata-translator',
@@ -252,7 +255,7 @@
# transform translated_metadata into min set with swh-metadata-detector
min_metadata = extract_minimal_metadata_dict(translated_metadata)
- return min_metadata
+ return (used_mappings, min_metadata)
class OriginMetadataIndexer(OriginIndexer):
@@ -287,6 +290,7 @@
'from_revision': rev_metadata['id'],
'origin_id': origin['id'],
'metadata': rev_metadata['translated_metadata'],
+ 'mappings': rev_metadata['mappings'],
'indexer_configuration_id':
rev_metadata['indexer_configuration_id'],
}
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -76,6 +76,13 @@
self.__class__.__module__,
self.__class__.__name__))
+ @property
+ @abc.abstractmethod
+ def name(self):
+ """A name of this mapping, used as an identifier in the
+ indexer storage."""
+ pass
+
@classmethod
@abc.abstractmethod
def detect_metadata_files(cls, files):
@@ -203,6 +210,7 @@
"""
dedicated class for NPM (package.json) mapping and translation
"""
+ name = 'npm'
mapping = CROSSWALK_TABLE['NodeJS']
filename = b'package.json'
@@ -341,6 +349,7 @@
"""
dedicated class for CodeMeta (codemeta.json) mapping and translation
"""
+ name = 'codemeta'
filename = b'codemeta.json'
def translate(self, content):
@@ -352,6 +361,7 @@
"""
dedicated class for Maven (pom.xml) mapping and translation
"""
+ name = 'maven'
filename = b'pom.xml'
mapping = CROSSWALK_TABLE['Java (Maven)']
@@ -484,6 +494,7 @@
"""Dedicated class for Python's PKG-INFO mapping and translation.
https://www.python.org/dev/peps/pep-0314/"""
+ name = 'pkg-info'
filename = b'PKG-INFO'
mapping = {_normalize_pkginfo_key(k): v
for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()}
@@ -520,11 +531,12 @@
@register_mapping
class GemspecMapping(DictMapping):
+ name = 'gemspec'
+ mapping = CROSSWALK_TABLE['Ruby Gem']
+
_re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
_re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')
- mapping = CROSSWALK_TABLE['Ruby Gem']
-
@classmethod
def detect_metadata_files(cls, file_entries):
for entry in file_entries:
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(118, now(), 'Work In Progress');
+ values(119, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
@@ -118,7 +118,8 @@
create table revision_metadata(
id sha1_git not null,
translated_metadata jsonb not null,
- indexer_configuration_id bigint not null
+ indexer_configuration_id bigint not null,
+ mappings text array not null
);
comment on table revision_metadata is 'metadata semantically detected and translated in a revision';
@@ -131,7 +132,8 @@
metadata jsonb,
indexer_configuration_id bigint not null,
from_revision sha1_git not null,
- metadata_tsvector tsvector
+ metadata_tsvector tsvector,
+ mappings text array not null
);
comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql
--- a/swh/indexer/sql/40-swh-func.sql
+++ b/swh/indexer/sql/40-swh-func.sql
@@ -315,15 +315,15 @@
as $$
begin
if conflict_update then
- insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
- select id, translated_metadata, indexer_configuration_id
+ insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id)
+ select id, translated_metadata, mappings, indexer_configuration_id
from tmp_revision_metadata tcm
on conflict(id, indexer_configuration_id)
do update set translated_metadata = excluded.translated_metadata;
else
- insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
- select id, translated_metadata, indexer_configuration_id
+ insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id)
+ select id, translated_metadata, mappings, indexer_configuration_id
from tmp_revision_metadata tcm
on conflict(id, indexer_configuration_id)
do nothing;
@@ -410,17 +410,17 @@
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
if conflict_update then
- insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector)
+ insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
select origin_id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector
+ metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(origin_id, indexer_configuration_id)
do update set metadata = excluded.metadata;
else
- insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector)
+ insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
select origin_id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector
+ metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(origin_id, indexer_configuration_id)
do nothing;
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -586,6 +586,8 @@
- **id** (bytes)
- **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
"""
for c in db.revision_metadata_get_from_list(ids, cur):
@@ -604,6 +606,8 @@
- **id**: sha1_git of revision
- **translated_metadata**: arbitrary dict
- **indexer_configuration_id**: tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)
@@ -612,7 +616,8 @@
db.mktemp_revision_metadata(cur)
db.copy_to(metadata, 'tmp_revision_metadata',
- ['id', 'translated_metadata', 'indexer_configuration_id'],
+ ['id', 'translated_metadata', 'mappings',
+ 'indexer_configuration_id'],
cur)
db.revision_metadata_add_from_temp(conflict_update, cur)
@@ -630,6 +635,8 @@
- **origin_id** (int)
- **metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
"""
for c in db.origin_intrinsic_metadata_get_from_list(ids, cur):
@@ -651,6 +658,8 @@
these metadata.
- **metadata**: arbitrary dict
- **indexer_configuration_id**: tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)
@@ -660,7 +669,7 @@
db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
['origin_id', 'metadata', 'indexer_configuration_id',
- 'from_revision'],
+ 'from_revision', 'mappings'],
cur)
db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
@@ -680,6 +689,8 @@
- **id** (int)
- **metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
"""
for c in db.origin_intrinsic_metadata_search_fulltext(
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -295,7 +295,8 @@
yield from self._get_from_list(
'content_metadata', ids, self.content_metadata_cols, cur=cur)
- revision_metadata_hash_keys = ['id', 'indexer_configuration_id']
+ revision_metadata_hash_keys = [
+ 'id', 'indexer_configuration_id']
def revision_metadata_missing_from_list(self, metadata, cur=None):
"""List missing metadata.
@@ -306,7 +307,7 @@
cur=cur)
revision_metadata_cols = [
- 'id', 'translated_metadata',
+ 'id', 'translated_metadata', 'mappings',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_revision_metadata')
@@ -321,7 +322,7 @@
'revision_metadata', ids, self.revision_metadata_cols, cur=cur)
origin_intrinsic_metadata_cols = [
- 'origin_id', 'metadata', 'from_revision',
+ 'origin_id', 'metadata', 'from_revision', 'mappings',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
origin_intrinsic_metadata_regconfig = 'pg_catalog.simple'
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1217,6 +1217,7 @@
'softwareRequirements': None,
'identifier': None
},
+ 'mappings': [],
'indexer_configuration_id': tool_id
}])
@@ -1250,6 +1251,7 @@
'softwareRequirements': None,
'identifier': None
},
+ 'mappings': ['mapping1', 'mapping2'],
'indexer_configuration_id': tool_id
}
@@ -1263,9 +1265,11 @@
expected_metadata = [{
'id': self.revision_id_2,
'translated_metadata': metadata_rev['translated_metadata'],
+ 'mappings': ['mapping1', 'mapping2'],
'tool': self.tools['swh-metadata-detector']
}]
+ self.maxDiff = None
self.assertEqual(actual_metadata, expected_metadata)
def test_revision_metadata_add_drop_duplicate(self):
@@ -1291,6 +1295,7 @@
'softwareRequirements': None,
'identifier': None
},
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
@@ -1304,6 +1309,7 @@
expected_metadata_v1 = [{
'id': self.revision_id_1,
'translated_metadata': metadata_v1['translated_metadata'],
+ 'mappings': [],
'tool': self.tools['swh-metadata-detector']
}]
@@ -1350,6 +1356,7 @@
'softwareRequirements': None,
'identifier': None
},
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
@@ -1364,6 +1371,7 @@
expected_metadata_v1 = [{
'id': self.revision_id_2,
'translated_metadata': metadata_v1['translated_metadata'],
+ 'mappings': [],
'tool': self.tools['swh-metadata-detector']
}]
self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1384,6 +1392,7 @@
expected_metadata_v2 = [{
'id': self.revision_id_2,
'translated_metadata': metadata_v2['translated_metadata'],
+ 'mappings': [],
'tool': self.tools['swh-metadata-detector']
}]
@@ -1414,12 +1423,14 @@
metadata_rev = {
'id': self.revision_id_2,
'translated_metadata': metadata,
+ 'mappings': ['mapping1'],
'indexer_configuration_id': tool_id,
}
metadata_origin = {
'origin_id': self.origin_id_1,
'metadata': metadata,
'indexer_configuration_id': tool_id,
+ 'mappings': ['mapping1'],
'from_revision': self.revision_id_2,
}
@@ -1436,6 +1447,7 @@
'metadata': metadata,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_2,
+ 'mappings': ['mapping1'],
}]
self.assertEqual(actual_metadata, expected_metadata)
@@ -1464,12 +1476,14 @@
metadata_rev_v1 = {
'id': self.revision_id_1,
'translated_metadata': metadata_v1.copy(),
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata_origin_v1 = {
'origin_id': self.origin_id_1,
'metadata': metadata_v1.copy(),
'indexer_configuration_id': tool_id,
+ 'mappings': [],
'from_revision': self.revision_id_1,
}
@@ -1486,6 +1500,7 @@
'metadata': metadata_v1,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_1,
+ 'mappings': [],
}]
self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1535,12 +1550,14 @@
metadata_rev_v1 = {
'id': self.revision_id_2,
'translated_metadata': metadata_v1,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata_origin_v1 = {
'origin_id': self.origin_id_1,
'metadata': metadata_v1.copy(),
'indexer_configuration_id': tool_id,
+ 'mappings': [],
'from_revision': self.revision_id_2,
}
@@ -1558,6 +1575,7 @@
'metadata': metadata_v1,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_2,
+ 'mappings': [],
}]
self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1585,6 +1603,7 @@
'metadata': metadata_v2,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_2,
+ 'mappings': [],
}]
# metadata did change as the v2 was used to overwrite v1
@@ -1600,11 +1619,13 @@
metadata1_rev = {
'id': self.revision_id_1,
'translated_metadata': metadata1,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata1_origin = {
'origin_id': self.origin_id_1,
'metadata': metadata1,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
'from_revision': self.revision_id_1,
}
@@ -1614,11 +1635,13 @@
metadata2_rev = {
'id': self.revision_id_2,
'translated_metadata': metadata2,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata2_origin = {
'origin_id': self.origin_id_2,
'metadata': metadata2,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
'from_revision': self.revision_id_2,
}
@@ -1662,11 +1685,13 @@
metadata1_rev = {
'id': self.revision_id_1,
'translated_metadata': metadata1,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata1_origin = {
'origin_id': self.origin_id_1,
'metadata': metadata1,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
'from_revision': self.revision_id_1,
}
@@ -1679,11 +1704,13 @@
metadata2_rev = {
'id': self.revision_id_2,
'translated_metadata': metadata2,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata2_origin = {
'origin_id': self.origin_id_2,
'metadata': metadata2,
+ 'mappings': [],
'indexer_configuration_id': tool_id,
'from_revision': self.revision_id_2,
}
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -254,7 +254,7 @@
'name': 'test_metadata',
'version': '0.0.1'
},
- 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5')
+ 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'),
}, {
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
@@ -927,6 +927,7 @@
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
},
+ 'mappings': ['npm'],
}]
for result in results:
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -94,11 +94,13 @@
rev_metadata = {
'id': rev_id,
'translated_metadata': metadata,
+ 'mappings': ['npm'],
}
origin_metadata = {
'origin_id': origin['id'],
'from_revision': rev_id,
'metadata': metadata,
+ 'mappings': ['npm'],
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 5:04 PM (3 h, 8 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226749

Event Timeline