Page MenuHomeSoftware Heritage

D8130.diff
No OneTemporary

D8130.diff

diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -45,7 +45,7 @@
# All fields stored as string in the metadata
# even the booleans
"match_mapping_type": "boolean",
- "path_match": "intrinsic_metadata.*",
+ "path_match": "jsonld.*",
"mapping": {"type": "keyword"},
}
},
@@ -54,7 +54,7 @@
# All fields stored as string in the metadata
# even the floats
"match_mapping_type": "double",
- "path_match": "intrinsic_metadata.*",
+ "path_match": "jsonld.*",
"mapping": {"type": "text"},
}
},
@@ -63,7 +63,7 @@
# All fields stored as string in the metadata
# even the longs
"match_mapping_type": "long",
- "path_match": "intrinsic_metadata.*",
+ "path_match": "jsonld.*",
"mapping": {"type": "text"},
}
},
@@ -102,7 +102,7 @@
"last_eventful_visit_date": {"type": "date"},
"last_release_date": {"type": "date"},
"last_revision_date": {"type": "date"},
- "intrinsic_metadata": {
+ "jsonld": {
"type": "nested",
"properties": {
"@context": {
@@ -246,7 +246,7 @@
for field_name in (
"blocklisted",
"has_visits",
- "intrinsic_metadata",
+ "jsonld",
"visit_types",
"nb_visits",
"snapshot_id",
@@ -270,21 +270,21 @@
# * {"author": {"@value": "Jane Doe"}}
# * {"author": [{"@value": "Jane Doe"}]}
# and JSON-LD expansion will convert them all to the last one.
- if "intrinsic_metadata" in res:
- intrinsic_metadata = res["intrinsic_metadata"]
+ if "jsonld" in res:
+ jsonld = res["jsonld"]
for date_field in ["dateCreated", "dateModified", "datePublished"]:
- if date_field in intrinsic_metadata:
- date = intrinsic_metadata[date_field]
+ if date_field in jsonld:
+ date = jsonld[date_field]
# If date{Created,Modified,Published} value isn't parsable
# It gets rejected and isn't stored (unlike other fields)
formatted_date = parse_and_format_date(date)
if formatted_date is None:
- intrinsic_metadata.pop(date_field)
+ jsonld.pop(date_field)
else:
- intrinsic_metadata[date_field] = formatted_date
+ jsonld[date_field] = formatted_date
- res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata)
+ res["jsonld"] = codemeta.expand(jsonld)
return res
@@ -513,7 +513,7 @@
sorting_params.append(
{
get_expansion(field, "."): {
- "nested_path": "intrinsic_metadata",
+ "nested_path": "jsonld",
"order": order,
}
}
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -245,27 +245,27 @@
.replace("Z", "+00:00")
),
).isoformat()
- if "intrinsic_metadata" in document:
- intrinsic_metadata = document["intrinsic_metadata"]
+ if "jsonld" in document:
+ jsonld = document["jsonld"]
for date_field in ["dateCreated", "dateModified", "datePublished"]:
- if date_field in intrinsic_metadata:
- date = intrinsic_metadata[date_field]
+ if date_field in jsonld:
+ date = jsonld[date_field]
# If date{Created,Modified,Published} value isn't parsable
# It gets rejected and isn't stored (unlike other fields)
formatted_date = parse_and_format_date(date)
if formatted_date is None:
- intrinsic_metadata.pop(date_field)
+ jsonld.pop(date_field)
else:
- intrinsic_metadata[date_field] = formatted_date
+ jsonld[date_field] = formatted_date
- document["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata)
+ document["jsonld"] = codemeta.expand(jsonld)
- if len(document["intrinsic_metadata"]) != 1:
+ if len(document["jsonld"]) != 1:
continue
- metadata = document["intrinsic_metadata"][0]
+ metadata = document["jsonld"][0]
if "http://schema.org/license" in metadata:
metadata["http://schema.org/license"] = [
{"@id": license["@id"].lower()}
@@ -332,12 +332,10 @@
)
def predicate(match):
- if "intrinsic_metadata" not in match:
+ if "jsonld" not in match:
return False
- return metadata_pattern_words.issubset(
- _dict_words_set(match["intrinsic_metadata"])
- )
+ return metadata_pattern_words.issubset(_dict_words_set(match["jsonld"]))
hits = filter(predicate, hits)
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -90,7 +90,7 @@
url_pattern: Part of the URL to search for, if empty and no filter
parameters used return all origins
metadata_pattern: Keywords to look for (across all the fields of
- intrinsic_metadata)
+ ``jsonld``)
with_visit: Whether origins with no visits are to be filtered out
visit_types: Only origins having any of the provided visit types
(e.g. git, svn, pypi) will be returned
@@ -106,11 +106,11 @@
min_last_release_date: Filter origins that have
last_release_date on or after the provided date(ISO format)
min_date_created: Filter origins that have date_created
- from intrinsic_metadata on or after the provided date
+ from ``jsonld`` on or after the provided date
min_date_modified: Filter origins that have date_modified
- from intrinsic_metadata on or after the provided date
+ from ``jsonld`` on or after the provided date
min_date_published: Filter origins that have date_published
- from intrinsic_metadata on or after the provided date
+ from ``jsonld`` on or after the provided date
programming_languages: Filter origins with programming languages
present in the given list (based on instrinsic_metadata)
licenses: Filter origins with licenses present in the given list
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -126,7 +126,7 @@
origin_metadata = [
{
"url": item["id"],
- "intrinsic_metadata": item["metadata"],
+ "jsonld": item["metadata"],
}
for item in origin_metadata
]
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -303,7 +303,7 @@
[
{
"url": "http://foobar.baz",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
"programmingLanguage": "python",
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -430,7 +430,7 @@
ORIGINS = [
{
"url": "http://foobar.0.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": DATE_0,
"dateModified": DATE_1,
@@ -439,7 +439,7 @@
},
{
"url": "http://foobar.1.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": DATE_1,
"dateModified": DATE_2,
@@ -448,7 +448,7 @@
},
{
"url": "http://foobar.2.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": DATE_2,
"dateModified": DATE_2,
@@ -498,7 +498,7 @@
ORIGINS = [
{
"url": "http://foobar.0.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": DATE_0,
"dateModified": DATE_1,
@@ -507,7 +507,7 @@
},
{
"url": "http://foobar.1.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": DATE_1,
"dateModified": DATE_2,
@@ -516,7 +516,7 @@
},
{
"url": "http://foobar.2.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": DATE_2,
"dateModified": DATE_2,
@@ -539,7 +539,7 @@
ORIGINS = [
{
"url": "http://foobar.1.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "Django is a backend framework for applications",
"keywords": "django,backend,server,web,framework",
@@ -547,7 +547,7 @@
},
{
"url": "http://foobar.2.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "Native Android applications are fast",
"keywords": "android,mobile,ui",
@@ -555,7 +555,7 @@
},
{
"url": "http://foobar.3.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "React framework helps you build web applications",
"keywords": "react,web,ui",
@@ -648,7 +648,7 @@
ORIGINS = [
{
"url": "http://foobar.1.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
"license": "https://spdx.org/licenses/MIT",
@@ -656,7 +656,7 @@
},
{
"url": "http://foobar.2.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
"license": "BSD-3-Clause",
@@ -681,7 +681,7 @@
ORIGINS = [
{
"url": "http://foobar.1.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
"programmingLanguage": "python",
@@ -689,7 +689,7 @@
},
{
"url": "http://foobar.2.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
"programmingLanguage": "javascript",
@@ -716,7 +716,7 @@
ORIGINS = [
{
"url": "http://foobar.1.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar 1",
"programmingLanguage": "python",
@@ -725,7 +725,7 @@
},
{
"url": "http://foobar.2.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar 2",
"programmingLanguage": ["javascript", "html", "css"],
@@ -737,7 +737,7 @@
},
{
"url": "http://foobar.3.com",
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar 3",
"programmingLanguage": ["Cpp", "c"],
@@ -785,7 +785,7 @@
expected_results = [origin_url]
assert results == expected_results
- def test_origin_intrinsic_metadata_description(self):
+ def test_origin_jsonld_description(self):
origin1_nothin = {"url": "http://origin1"}
origin2_foobar = {"url": "http://origin2"}
origin3_barbaz = {"url": "http://origin3"}
@@ -794,18 +794,18 @@
[
{
**origin1_nothin,
- "intrinsic_metadata": {},
+ "jsonld": {},
},
{
**origin2_foobar,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
},
},
{
**origin3_barbaz,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "bar baz",
},
@@ -826,7 +826,7 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin3_barbaz]
- def test_origin_intrinsic_metadata_all_terms(self):
+ def test_origin_jsonld_all_terms(self):
origin1_foobarfoobar = {"url": "http://origin1"}
origin3_foobarbaz = {"url": "http://origin2"}
@@ -834,14 +834,14 @@
[
{
**origin1_foobarfoobar,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar foo bar",
},
},
{
**origin3_foobarbaz,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar baz",
},
@@ -854,11 +854,11 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin3_foobarbaz]
- def test_origin_intrinsic_metadata_long_description(self):
+ def test_origin_jsonld_long_description(self):
"""Checks ElasticSearch does not try to store large values untokenize,
which would be inefficient and crash it with:
- Document contains at least one immense term in field="intrinsic_metadata.http://schema.org/description.@value" (whose UTF8 encoding is longer than the max length 32766), all of which were skipped.
+ Document contains at least one immense term in field="jsonld.http://schema.org/description.@value" (whose UTF8 encoding is longer than the max length 32766), all of which were skipped.
""" # noqa
origin1 = {"url": "http://origin1"}
@@ -866,7 +866,7 @@
[
{
**origin1,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": " ".join(f"foo{i}" for i in range(100000)),
},
@@ -879,7 +879,7 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin1]
- def test_origin_intrinsic_metadata_matches_cross_fields(self):
+ def test_origin_jsonld_matches_cross_fields(self):
"""Checks the backend finds results even if the two words in the query are
each in a different field."""
origin1 = {"url": "http://origin1"}
@@ -888,7 +888,7 @@
[
{
**origin1,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
"author": "John Doe",
@@ -902,7 +902,7 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin1]
- def test_origin_intrinsic_metadata_nested(self):
+ def test_origin_jsonld_nested(self):
origin1_nothin = {"url": "http://origin1"}
origin2_foobar = {"url": "http://origin2"}
origin3_barbaz = {"url": "http://origin3"}
@@ -911,18 +911,18 @@
[
{
**origin1_nothin,
- "intrinsic_metadata": {},
+ "jsonld": {},
},
{
**origin2_foobar,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar"],
},
},
{
**origin3_barbaz,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["bar", "baz"],
},
@@ -943,7 +943,7 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin3_barbaz]
- def test_origin_intrinsic_metadata_inconsistent_type(self):
+ def test_origin_jsonld_inconsistent_type(self):
"""Checks the same field can have a concrete value, an object, or an array
in different documents."""
origin1_foobar = {"url": "http://origin1"}
@@ -954,7 +954,7 @@
[
{
**origin1_foobar,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": {
"familyName": "Foo",
@@ -969,14 +969,14 @@
[
{
**origin2_barbaz,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": "Bar Baz",
},
},
{
**origin3_bazqux,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": ["Baz", "Qux"],
},
@@ -1015,7 +1015,7 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin1_foobar]
- def test_origin_intrinsic_metadata_string_mapping(self):
+ def test_origin_jsonld_string_mapping(self):
"""Checks inserting a date-like in a field does not update the mapping to
require every document uses a date in that field; or that search queries
use a date either.
@@ -1027,7 +1027,7 @@
[
{
**origin1,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": "2021-02-18T10:16:52",
"version": 1.0,
@@ -1043,7 +1043,7 @@
[
{
**origin2,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"dateCreated": "a long time ago",
"address": "in a galaxy far, far away",
@@ -1075,11 +1075,11 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin2]
- def test_origin_intrinsic_metadata_update(self):
+ def test_origin_jsonld_update(self):
origin = {"url": "http://origin1"}
origin_data = {
**origin,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"author": "John Doe",
},
@@ -1092,7 +1092,7 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin]
- origin_data["intrinsic_metadata"]["author"] = "Jane Doe"
+ origin_data["jsonld"]["author"] = "Jane Doe"
self.search.origin_update([origin_data])
self.search.flush()
@@ -1142,7 +1142,7 @@
@settings(deadline=None)
@given(strategies.integers(min_value=1, max_value=4))
- def test_origin_intrinsic_metadata_paging(self, limit):
+ def test_origin_jsonld_paging(self, limit):
# TODO: no hypothesis
origin1_foo = {"url": "http://origin1"}
origin2_foobar = {"url": "http://origin2"}
@@ -1153,21 +1153,21 @@
[
{
**origin1_foo,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo"],
},
},
{
**origin2_foobar,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar"],
},
},
{
**origin3_foobarbaz,
- "intrinsic_metadata": {
+ "jsonld": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar", "baz"],
},
diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py
--- a/swh/search/tests/test_translator.py
+++ b/swh/search/tests/test_translator.py
@@ -132,13 +132,13 @@
},
{
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"multi_match": {
"query": "framework and web",
"type": "cross_fields",
"operator": "and",
- "fields": ["intrinsic_metadata.*"],
+ "fields": ["jsonld.*"],
"lenient": True,
}
},
@@ -179,7 +179,7 @@
expected = {
"filters": {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"multi_match": {
"query": r"""word1 word2 " ' word3""",
@@ -201,7 +201,7 @@
expected = {
"filters": {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"bool": {
"should": [
@@ -239,7 +239,7 @@
expected = {
"filters": {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"bool": {
"should": [
@@ -261,7 +261,7 @@
expected = {
"filters": {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"bool": {
"must_not": [
@@ -288,7 +288,7 @@
expected = {
"filters": {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"bool": {
"must": [
@@ -380,7 +380,7 @@
expected = {
"filters": {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"multi_match": {
"query": r"""foo '" bar""",
@@ -401,7 +401,7 @@
expected = {
"filters": {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"multi_match": {
"query": r"""café""",
@@ -425,7 +425,7 @@
"must": [
{
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"multi_match": {
"query": r"""🐍""",
diff --git a/swh/search/translator.py b/swh/search/translator.py
--- a/swh/search/translator.py
+++ b/swh/search/translator.py
@@ -152,7 +152,7 @@
elif name == "metadata":
return {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"multi_match": {
"query": value,
@@ -164,9 +164,9 @@
# be considered a match.
# TODO: allow missing keywords?
"operator": "and",
- # Searches on all fields of the intrinsic_metadata dict,
+ # Searches on all fields of the JSON-LD dict,
# recursively.
- "fields": ["intrinsic_metadata.*"],
+ "fields": ["jsonld.*"],
# date{Created,Modified,Published} are of type date
"lenient": True,
}
@@ -219,7 +219,7 @@
if name == "keyword":
return {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"multi_match": {
"query": " ".join(value_array),
@@ -228,7 +228,7 @@
get_expansion("descriptions", "."),
# "^2" boosts an origin's score by 2x
# if it the queried keywords are
- # found in its intrinsic_metadata.keywords
+ # found in its jsonld.keywords
],
}
},
@@ -243,7 +243,7 @@
return {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"bool": {
"should": [
@@ -261,7 +261,7 @@
if op in ["=", "!="]:
return {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"bool": {
("must" if op == "=" else "must_not"): [
@@ -281,7 +281,7 @@
return {
"nested": {
- "path": "intrinsic_metadata",
+ "path": "jsonld",
"query": {
"bool": {
"must": [
diff --git a/swh/search/utils.py b/swh/search/utils.py
--- a/swh/search/utils.py
+++ b/swh/search/utils.py
@@ -12,34 +12,34 @@
def get_expansion(field, sep=None):
METADATA_FIELDS = {
- "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"],
+ "licenses": ["jsonld", "http://schema.org/license", "@id"],
"programming_languages": [
- "intrinsic_metadata",
+ "jsonld",
"http://schema.org/programmingLanguage",
"@value",
],
"keywords": [
- "intrinsic_metadata",
+ "jsonld",
"http://schema.org/keywords",
"@value",
],
"descriptions": [
- "intrinsic_metadata",
+ "jsonld",
"http://schema.org/description",
"@value",
],
"date_created": [
- "intrinsic_metadata",
+ "jsonld",
"http://schema.org/dateCreated",
"@value",
],
"date_modified": [
- "intrinsic_metadata",
+ "jsonld",
"http://schema.org/dateModified",
"@value",
],
"date_published": [
- "intrinsic_metadata",
+ "jsonld",
"http://schema.org/datePublished",
"@value",
],

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:42 PM (1 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218480

Event Timeline