diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -45,7 +45,7 @@ # All fields stored as string in the metadata # even the booleans "match_mapping_type": "boolean", - "path_match": "intrinsic_metadata.*", + "path_match": "jsonld.*", "mapping": {"type": "keyword"}, } }, @@ -54,7 +54,7 @@ # All fields stored as string in the metadata # even the floats "match_mapping_type": "double", - "path_match": "intrinsic_metadata.*", + "path_match": "jsonld.*", "mapping": {"type": "text"}, } }, @@ -63,7 +63,7 @@ # All fields stored as string in the metadata # even the longs "match_mapping_type": "long", - "path_match": "intrinsic_metadata.*", + "path_match": "jsonld.*", "mapping": {"type": "text"}, } }, @@ -102,7 +102,7 @@ "last_eventful_visit_date": {"type": "date"}, "last_release_date": {"type": "date"}, "last_revision_date": {"type": "date"}, - "intrinsic_metadata": { + "jsonld": { "type": "nested", "properties": { "@context": { @@ -246,7 +246,7 @@ for field_name in ( "blocklisted", "has_visits", - "intrinsic_metadata", + "jsonld", "visit_types", "nb_visits", "snapshot_id", @@ -270,21 +270,21 @@ # * {"author": {"@value": "Jane Doe"}} # * {"author": [{"@value": "Jane Doe"}]} # and JSON-LD expansion will convert them all to the last one. - if "intrinsic_metadata" in res: - intrinsic_metadata = res["intrinsic_metadata"] + if "jsonld" in res: + jsonld = res["jsonld"] for date_field in ["dateCreated", "dateModified", "datePublished"]: - if date_field in intrinsic_metadata: - date = intrinsic_metadata[date_field] + if date_field in jsonld: + date = jsonld[date_field] # If date{Created,Modified,Published} value isn't parsable # It gets rejected and isn't stored (unlike other fields) formatted_date = parse_and_format_date(date) if formatted_date is None: - intrinsic_metadata.pop(date_field) + jsonld.pop(date_field) else: - intrinsic_metadata[date_field] = formatted_date + jsonld[date_field] = formatted_date - res["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) + res["jsonld"] = codemeta.expand(jsonld) return res @@ -513,7 +513,7 @@ sorting_params.append( { get_expansion(field, "."): { - "nested_path": "intrinsic_metadata", + "nested_path": "jsonld", "order": order, } } diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -245,27 +245,27 @@ .replace("Z", "+00:00") ), ).isoformat() - if "intrinsic_metadata" in document: - intrinsic_metadata = document["intrinsic_metadata"] + if "jsonld" in document: + jsonld = document["jsonld"] for date_field in ["dateCreated", "dateModified", "datePublished"]: - if date_field in intrinsic_metadata: - date = intrinsic_metadata[date_field] + if date_field in jsonld: + date = jsonld[date_field] # If date{Created,Modified,Published} value isn't parsable # It gets rejected and isn't stored (unlike other fields) formatted_date = parse_and_format_date(date) if formatted_date is None: - intrinsic_metadata.pop(date_field) + jsonld.pop(date_field) else: - intrinsic_metadata[date_field] = formatted_date + jsonld[date_field] = formatted_date - document["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) + document["jsonld"] = codemeta.expand(jsonld) - if len(document["intrinsic_metadata"]) != 1: + if len(document["jsonld"]) != 1: continue - metadata = document["intrinsic_metadata"][0] + metadata = document["jsonld"][0] if "http://schema.org/license" in metadata: metadata["http://schema.org/license"] = [ {"@id": license["@id"].lower()} @@ -332,12 +332,10 @@ ) def predicate(match): - if "intrinsic_metadata" not in match: + if "jsonld" not in match: return False - return metadata_pattern_words.issubset( - _dict_words_set(match["intrinsic_metadata"]) - ) + return metadata_pattern_words.issubset(_dict_words_set(match["jsonld"])) hits = filter(predicate, hits) diff --git a/swh/search/interface.py b/swh/search/interface.py --- a/swh/search/interface.py +++ b/swh/search/interface.py @@ -90,7 +90,7 @@ url_pattern: Part of the URL to search for, if empty and no filter parameters used return all origins metadata_pattern: Keywords to look for (across all the fields of - intrinsic_metadata) + "jsonld") with_visit: Whether origins with no visits are to be filtered out visit_types: Only origins having any of the provided visit types (e.g. git, svn, pypi) will be returned @@ -106,11 +106,11 @@ min_last_release_date: Filter origins that have last_release_date on or after the provided date(ISO format) min_date_created: Filter origins that have date_created - from intrinsic_metadata on or after the provided date + from ``jsonld`` on or after the provided date min_date_modified: Filter origins that have date_modified - from intrinsic_metadata on or after the provided date + from ``jsonld`` on or after the provided date min_date_published: Filter origins that have date_published - from intrinsic_metadata on or after the provided date + from ``jsonld`` on or after the provided date programming_languages: Filter origins with programming languages present in the given list (based on instrinsic_metadata) licenses: Filter origins with licenses present in the given list diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -126,7 +126,7 @@ origin_metadata = [ { "url": item["id"], - "intrinsic_metadata": item["metadata"], + "jsonld": item["metadata"], } for item in origin_metadata ] diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -303,7 +303,7 @@ [ { "url": "http://foobar.baz", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "python", diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -430,7 +430,7 @@ ORIGINS = [ { "url": "http://foobar.0.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_0, "dateModified": DATE_1, @@ -439,7 +439,7 @@ }, { "url": "http://foobar.1.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_1, "dateModified": DATE_2, @@ -448,7 +448,7 @@ }, { "url": "http://foobar.2.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_2, "dateModified": DATE_2, @@ -498,7 +498,7 @@ ORIGINS = [ { "url": "http://foobar.0.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_0, "dateModified": DATE_1, @@ -507,7 +507,7 @@ }, { "url": "http://foobar.1.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_1, "dateModified": DATE_2, @@ -516,7 +516,7 @@ }, { "url": "http://foobar.2.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": DATE_2, "dateModified": DATE_2, @@ -539,7 +539,7 @@ ORIGINS = [ { "url": "http://foobar.1.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "Django is a backend framework for applications", "keywords": "django,backend,server,web,framework", @@ -547,7 +547,7 @@ }, { "url": "http://foobar.2.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "Native Android applications are fast", "keywords": "android,mobile,ui", @@ -555,7 +555,7 @@ }, { "url": "http://foobar.3.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "React framework helps you build web applications", "keywords": "react,web,ui", @@ -648,7 +648,7 @@ ORIGINS = [ { "url": "http://foobar.1.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "license": "https://spdx.org/licenses/MIT", @@ -656,7 +656,7 @@ }, { "url": "http://foobar.2.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "license": "BSD-3-Clause", @@ -681,7 +681,7 @@ ORIGINS = [ { "url": "http://foobar.1.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "python", @@ -689,7 +689,7 @@ }, { "url": "http://foobar.2.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "programmingLanguage": "javascript", @@ -716,7 +716,7 @@ ORIGINS = [ { "url": "http://foobar.1.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar 1", "programmingLanguage": "python", @@ -725,7 +725,7 @@ }, { "url": "http://foobar.2.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar 2", "programmingLanguage": ["javascript", "html", "css"], @@ -737,7 +737,7 @@ }, { "url": "http://foobar.3.com", - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar 3", "programmingLanguage": ["Cpp", "c"], @@ -785,7 +785,7 @@ expected_results = [origin_url] assert results == expected_results - def test_origin_intrinsic_metadata_description(self): + def test_origin_jsonld_description(self): origin1_nothin = {"url": "http://origin1"} origin2_foobar = {"url": "http://origin2"} origin3_barbaz = {"url": "http://origin3"} @@ -794,18 +794,18 @@ [ { **origin1_nothin, - "intrinsic_metadata": {}, + "jsonld": {}, }, { **origin2_foobar, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", }, }, { **origin3_barbaz, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "bar baz", }, @@ -826,7 +826,7 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin3_barbaz] - def test_origin_intrinsic_metadata_all_terms(self): + def test_origin_jsonld_all_terms(self): origin1_foobarfoobar = {"url": "http://origin1"} origin3_foobarbaz = {"url": "http://origin2"} @@ -834,14 +834,14 @@ [ { **origin1_foobarfoobar, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar foo bar", }, }, { **origin3_foobarbaz, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar baz", }, @@ -854,11 +854,11 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin3_foobarbaz] - def test_origin_intrinsic_metadata_long_description(self): + def test_origin_jsonld_long_description(self): """Checks ElasticSearch does not try to store large values untokenize, which would be inefficient and crash it with: - Document contains at least one immense term in field="intrinsic_metadata.http://schema.org/description.@value" (whose UTF8 encoding is longer than the max length 32766), all of which were skipped. + Document contains at least one immense term in field="jsonld.http://schema.org/description.@value" (whose UTF8 encoding is longer than the max length 32766), all of which were skipped. """ # noqa origin1 = {"url": "http://origin1"} @@ -866,7 +866,7 @@ [ { **origin1, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": " ".join(f"foo{i}" for i in range(100000)), }, @@ -879,7 +879,7 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin1] - def test_origin_intrinsic_metadata_matches_cross_fields(self): + def test_origin_jsonld_matches_cross_fields(self): """Checks the backend finds results even if the two words in the query are each in a different field.""" origin1 = {"url": "http://origin1"} @@ -888,7 +888,7 @@ [ { **origin1, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", "author": "John Doe", @@ -902,7 +902,7 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin1] - def test_origin_intrinsic_metadata_nested(self): + def test_origin_jsonld_nested(self): origin1_nothin = {"url": "http://origin1"} origin2_foobar = {"url": "http://origin2"} origin3_barbaz = {"url": "http://origin3"} @@ -911,18 +911,18 @@ [ { **origin1_nothin, - "intrinsic_metadata": {}, + "jsonld": {}, }, { **origin2_foobar, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo", "bar"], }, }, { **origin3_barbaz, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["bar", "baz"], }, @@ -943,7 +943,7 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin3_barbaz] - def test_origin_intrinsic_metadata_inconsistent_type(self): + def test_origin_jsonld_inconsistent_type(self): """Checks the same field can have a concrete value, an object, or an array in different documents.""" origin1_foobar = {"url": "http://origin1"} @@ -954,7 +954,7 @@ [ { **origin1_foobar, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": { "familyName": "Foo", @@ -969,14 +969,14 @@ [ { **origin2_barbaz, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": "Bar Baz", }, }, { **origin3_bazqux, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": ["Baz", "Qux"], }, @@ -1015,7 +1015,7 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin1_foobar] - def test_origin_intrinsic_metadata_string_mapping(self): + def test_origin_jsonld_string_mapping(self): """Checks inserting a date-like in a field does not update the mapping to require every document uses a date in that field; or that search queries use a date either. @@ -1027,7 +1027,7 @@ [ { **origin1, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": "2021-02-18T10:16:52", "version": 1.0, @@ -1043,7 +1043,7 @@ [ { **origin2, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "dateCreated": "a long time ago", "address": "in a galaxy far, far away", @@ -1075,11 +1075,11 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin2] - def test_origin_intrinsic_metadata_update(self): + def test_origin_jsonld_update(self): origin = {"url": "http://origin1"} origin_data = { **origin, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": "John Doe", }, @@ -1092,7 +1092,7 @@ assert actual_page.next_page_token is None assert actual_page.results == [origin] - origin_data["intrinsic_metadata"]["author"] = "Jane Doe" + origin_data["jsonld"]["author"] = "Jane Doe" self.search.origin_update([origin_data]) self.search.flush() @@ -1142,7 +1142,7 @@ @settings(deadline=None) @given(strategies.integers(min_value=1, max_value=4)) - def test_origin_intrinsic_metadata_paging(self, limit): + def test_origin_jsonld_paging(self, limit): # TODO: no hypothesis origin1_foo = {"url": "http://origin1"} origin2_foobar = {"url": "http://origin2"} @@ -1153,21 +1153,21 @@ [ { **origin1_foo, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo"], }, }, { **origin2_foobar, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo", "bar"], }, }, { **origin3_foobarbaz, - "intrinsic_metadata": { + "jsonld": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo", "bar", "baz"], }, diff --git a/swh/search/tests/test_translator.py b/swh/search/tests/test_translator.py --- a/swh/search/tests/test_translator.py +++ b/swh/search/tests/test_translator.py @@ -132,13 +132,13 @@ }, { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "multi_match": { "query": "framework and web", "type": "cross_fields", "operator": "and", - "fields": ["intrinsic_metadata.*"], + "fields": ["jsonld.*"], "lenient": True, } }, @@ -179,7 +179,7 @@ expected = { "filters": { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "multi_match": { "query": r"""word1 word2 " ' word3""", @@ -201,7 +201,7 @@ expected = { "filters": { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "bool": { "should": [ @@ -239,7 +239,7 @@ expected = { "filters": { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "bool": { "should": [ @@ -261,7 +261,7 @@ expected = { "filters": { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "bool": { "must_not": [ @@ -288,7 +288,7 @@ expected = { "filters": { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "bool": { "must": [ @@ -380,7 +380,7 @@ expected = { "filters": { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "multi_match": { "query": r"""foo '" bar""", @@ -401,7 +401,7 @@ expected = { "filters": { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "multi_match": { "query": r"""café""", @@ -425,7 +425,7 @@ "must": [ { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "multi_match": { "query": r"""🐍""", diff --git a/swh/search/translator.py b/swh/search/translator.py --- a/swh/search/translator.py +++ b/swh/search/translator.py @@ -152,7 +152,7 @@ elif name == "metadata": return { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "multi_match": { "query": value, @@ -164,9 +164,9 @@ # be considered a match. # TODO: allow missing keywords? "operator": "and", - # Searches on all fields of the intrinsic_metadata dict, + # Searches on all fields of the JSON-LD dict, # recursively. - "fields": ["intrinsic_metadata.*"], + "fields": ["jsonld.*"], # date{Created,Modified,Published} are of type date "lenient": True, } @@ -219,7 +219,7 @@ if name == "keyword": return { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "multi_match": { "query": " ".join(value_array), @@ -228,7 +228,7 @@ get_expansion("descriptions", "."), # "^2" boosts an origin's score by 2x # if it the queried keywords are - # found in its intrinsic_metadata.keywords + # found in its jsonld.keywords ], } }, @@ -243,7 +243,7 @@ return { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "bool": { "should": [ @@ -261,7 +261,7 @@ if op in ["=", "!="]: return { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "bool": { ("must" if op == "=" else "must_not"): [ @@ -281,7 +281,7 @@ return { "nested": { - "path": "intrinsic_metadata", + "path": "jsonld", "query": { "bool": { "must": [ diff --git a/swh/search/utils.py b/swh/search/utils.py --- a/swh/search/utils.py +++ b/swh/search/utils.py @@ -12,34 +12,34 @@ def get_expansion(field, sep=None): METADATA_FIELDS = { - "licenses": ["intrinsic_metadata", "http://schema.org/license", "@id"], + "licenses": ["jsonld", "http://schema.org/license", "@id"], "programming_languages": [ - "intrinsic_metadata", + "jsonld", "http://schema.org/programmingLanguage", "@value", ], "keywords": [ - "intrinsic_metadata", + "jsonld", "http://schema.org/keywords", "@value", ], "descriptions": [ - "intrinsic_metadata", + "jsonld", "http://schema.org/description", "@value", ], "date_created": [ - "intrinsic_metadata", + "jsonld", "http://schema.org/dateCreated", "@value", ], "date_modified": [ - "intrinsic_metadata", + "jsonld", "http://schema.org/dateModified", "@value", ], "date_published": [ - "intrinsic_metadata", + "jsonld", "http://schema.org/datePublished", "@value", ],