Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
Show All 9 Lines | |||||
from swh.indexer import codemeta | from swh.indexer import codemeta | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
OriginDict, | OriginDict, | ||||
PagedResult, | PagedResult, | ||||
get_expansion, | |||||
) | ) | ||||
from swh.search.utils import get_expansion, is_date_parsable | |||||
_words_regexp = re.compile(r"\w+") | _words_regexp = re.compile(r"\w+") | ||||
def _dict_words_set(d): | def _dict_words_set(d): | ||||
"""Recursively extract set of words from dict content.""" | """Recursively extract set of words from dict content.""" | ||||
values = set() | values = set() | ||||
def extract(obj, words): | def extract(obj, words): | ||||
if isinstance(obj, dict): | if isinstance(obj, dict): | ||||
for k, v in obj.items(): | for k, v in obj.items(): | ||||
extract(v, words) | extract(v, words) | ||||
elif isinstance(obj, list): | elif isinstance(obj, list): | ||||
for item in obj: | for item in obj: | ||||
extract(item, words) | extract(item, words) | ||||
else: | else: | ||||
words.update(_words_regexp.findall(str(obj).lower())) | words.update(_words_regexp.findall(str(obj).lower())) | ||||
return words | return words | ||||
return extract(d, values) | return extract(d, values) | ||||
def _nested_get(nested_dict, nested_keys): | def _nested_get(nested_dict, nested_keys, default=""): | ||||
"""Extracts values from deeply nested dictionary nested_dict | """Extracts values from deeply nested dictionary nested_dict | ||||
using the nested_keys and returns a list of all of the values | using the nested_keys and returns a list of all of the values | ||||
discovered in the process. | discovered in the process. | ||||
>>> nested_dict = [ | >>> nested_dict = [ | ||||
... {"name": [{"@value": {"first": "f1", "last": "l1"}}], "address": "XYZ"}, | ... {"name": [{"@value": {"first": "f1", "last": "l1"}}], "address": "XYZ"}, | ||||
... {"name": [{"@value": {"first": "f2", "last": "l2"}}], "address": "ABC"}, | ... {"name": [{"@value": {"first": "f2", "last": "l2"}}], "address": "ABC"}, | ||||
Show All 19 Lines | def _nested_get_recursive(nested_dict, nested_keys): | ||||
else: | else: | ||||
if type_curr_obj == list: | if type_curr_obj == list: | ||||
curr_obj = [ | curr_obj = [ | ||||
_nested_get_recursive(obj, nested_keys[i:]) | _nested_get_recursive(obj, nested_keys[i:]) | ||||
for obj in curr_obj | for obj in curr_obj | ||||
] | ] | ||||
# If value isn't a list or string or integer | # If value isn't a list or string or integer | ||||
elif type_curr_obj != str and type_curr_obj != int: | elif type_curr_obj != str and type_curr_obj != int: | ||||
return "" | return default | ||||
# If only one element is present in the list, take it out | # If only one element is present in the list, take it out | ||||
# This ensures a flat array every time | # This ensures a flat array every time | ||||
if type_curr_obj == list and len(curr_obj) == 1: | if type_curr_obj == list and len(curr_obj) == 1: | ||||
curr_obj = curr_obj[0] | curr_obj = curr_obj[0] | ||||
return curr_obj | return curr_obj | ||||
except Exception: | except Exception: | ||||
return [] | return default | ||||
res = _nested_get_recursive(nested_dict, nested_keys) | res = _nested_get_recursive(nested_dict, nested_keys) | ||||
if type(res) != list: | if type(res) != list: | ||||
return [res] | return [res] | ||||
return res | return res | ||||
def _tokenize(x): | def _tokenize(x): | ||||
return x.lower().replace(",", " ").split() | return x.lower().replace(",", " ").split() | ||||
def _get_sorting_key(origin, field): | def _get_sorting_key(origin, field): | ||||
"""Get value of the field from an origin for sorting origins. | """Get value of the field from an origin for sorting origins. | ||||
Here field should be a member of SORT_BY_OPTIONS. | Here field should be a member of SORT_BY_OPTIONS. | ||||
If "-" is present at the start of field then invert the value | If "-" is present at the start of field then invert the value | ||||
in a way that it reverses the sorting order. | in a way that it reverses the sorting order. | ||||
""" | """ | ||||
reversed = False | reversed = False | ||||
if field[0] == "-": | if field[0] == "-": | ||||
field = field[1:] | field = field[1:] | ||||
reversed = True | reversed = True | ||||
DATETIME_OBJ_MAX = datetime.max.replace(tzinfo=timezone.utc) | |||||
DATETIME_MIN = "0001-01-01T00:00:00Z" | |||||
DATE_OBJ_MAX = datetime.max | |||||
DATE_MIN = "0001-01-01" | |||||
if field == "score": | if field == "score": | ||||
if reversed: | if reversed: | ||||
return -origin.get(field, 0) | return -origin.get(field, 0) | ||||
else: | else: | ||||
return origin.get(field, 0) | return origin.get(field, 0) | ||||
datetime_max = datetime.max.replace(tzinfo=timezone.utc) | if field in ["date_created", "date_modified", "date_published"]: | ||||
date = datetime.strptime( | |||||
_nested_get(origin, get_expansion(field), DATE_MIN)[0], "%Y-%m-%d" | |||||
) | |||||
if reversed: | |||||
return DATE_OBJ_MAX - date | |||||
else: | |||||
return date | |||||
vlorentz: It looks like you don't have a test for this; can you add one?
(also, could you deduplicate it? | |||||
if field in ["nb_visits"]: # unlike other options, nb_visits is of type integer | elif field in ["nb_visits"]: # unlike other options, nb_visits is of type integer | ||||
if reversed: | if reversed: | ||||
return -origin.get(field, 0) | return -origin.get(field, 0) | ||||
else: | else: | ||||
return origin.get(field, 0) | return origin.get(field, 0) | ||||
elif field in SORT_BY_OPTIONS: | elif field in SORT_BY_OPTIONS: | ||||
if reversed: | date = datetime.fromisoformat( | ||||
return datetime_max - datetime.fromisoformat( | origin.get(field, DATETIME_MIN).replace("Z", "+00:00") | ||||
origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00") | |||||
) | ) | ||||
if reversed: | |||||
return DATETIME_OBJ_MAX - date | |||||
else: | else: | ||||
return datetime.fromisoformat( | return date | ||||
origin.get(field, "0001-01-01T00:00:00Z").replace("Z", "+00:00") | |||||
) | |||||
class InMemorySearch: | class InMemorySearch: | ||||
def __init__(self):
    """The in-memory backend takes no configuration; nothing to set up."""
    pass
def check(self):
    """Health check: the in-memory search backend is always available."""
    return True
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines | def origin_update(self, documents: Iterable[OriginDict]) -> None: | ||||
datetime.fromisoformat(document["last_release_date"]), | datetime.fromisoformat(document["last_release_date"]), | ||||
datetime.fromisoformat( | datetime.fromisoformat( | ||||
self._origins[id_] | self._origins[id_] | ||||
.get("last_release_date", "0001-01-01T00:00:00Z",) | .get("last_release_date", "0001-01-01T00:00:00Z",) | ||||
.replace("Z", "+00:00") | .replace("Z", "+00:00") | ||||
), | ), | ||||
).isoformat() | ).isoformat() | ||||
if "intrinsic_metadata" in document: | if "intrinsic_metadata" in document: | ||||
document["intrinsic_metadata"] = codemeta.expand( | intrinsic_metadata = document["intrinsic_metadata"] | ||||
document["intrinsic_metadata"] | |||||
) | for date_field in ["dateCreated", "dateModified", "datePublished"]: | ||||
if date_field in intrinsic_metadata: | |||||
date = intrinsic_metadata[date_field] | |||||
# If date{Created,Modified,Published} value isn't parsable | |||||
# It gets rejected and isn't stored (unlike other fields) | |||||
if not is_date_parsable(date): | |||||
intrinsic_metadata.pop(date_field) | |||||
document["intrinsic_metadata"] = codemeta.expand(intrinsic_metadata) | |||||
if len(document["intrinsic_metadata"]) != 1: | if len(document["intrinsic_metadata"]) != 1: | ||||
continue | continue | ||||
metadata = document["intrinsic_metadata"][0] | metadata = document["intrinsic_metadata"][0] | ||||
if "http://schema.org/license" in metadata: | if "http://schema.org/license" in metadata: | ||||
metadata["http://schema.org/license"] = [ | metadata["http://schema.org/license"] = [ | ||||
{"@id": license["@id"].lower()} | {"@id": license["@id"].lower()} | ||||
Show All 17 Lines | def origin_search( | ||||
metadata_pattern: Optional[str] = None, | metadata_pattern: Optional[str] = None, | ||||
with_visit: bool = False, | with_visit: bool = False, | ||||
visit_types: Optional[List[str]] = None, | visit_types: Optional[List[str]] = None, | ||||
min_nb_visits: int = 0, | min_nb_visits: int = 0, | ||||
min_last_visit_date: str = "", | min_last_visit_date: str = "", | ||||
min_last_eventful_visit_date: str = "", | min_last_eventful_visit_date: str = "", | ||||
min_last_revision_date: str = "", | min_last_revision_date: str = "", | ||||
min_last_release_date: str = "", | min_last_release_date: str = "", | ||||
min_date_created: str = "", | |||||
min_date_modified: str = "", | |||||
min_date_published: str = "", | |||||
programming_languages: Optional[List[str]] = None, | programming_languages: Optional[List[str]] = None, | ||||
licenses: Optional[List[str]] = None, | licenses: Optional[List[str]] = None, | ||||
keywords: Optional[List[str]] = None, | keywords: Optional[List[str]] = None, | ||||
sort_by: Optional[List[str]] = None, | sort_by: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
hits: Iterator[Dict[str, Any]] = ( | hits: Iterator[Dict[str, Any]] = ( | ||||
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
o.get("last_release_date", "0001-01-01T00:00:00Z").replace( | o.get("last_release_date", "0001-01-01T00:00:00Z").replace( | ||||
"Z", "+00:00" | "Z", "+00:00" | ||||
) | ) | ||||
) | ) | ||||
>= datetime.fromisoformat(min_last_release_date), | >= datetime.fromisoformat(min_last_release_date), | ||||
hits, | hits, | ||||
) | ) | ||||
if min_date_created: | |||||
min_date_created_obj = datetime.strptime(min_date_created, "%Y-%m-%d") | |||||
hits = filter( | |||||
lambda o: datetime.strptime( | |||||
Done Inline ActionsWhy is this one different from the others? vlorentz: Why is this one different from the others? | |||||
Done Inline ActionsFixed KShivendu: Fixed | |||||
_nested_get(o, get_expansion("date_created"))[0], "%Y-%m-%d" | |||||
) | |||||
>= min_date_created_obj, | |||||
hits, | |||||
) | |||||
if min_date_modified: | |||||
min_date_modified_obj = datetime.strptime(min_date_modified, "%Y-%m-%d") | |||||
hits = filter( | |||||
lambda o: datetime.strptime( | |||||
_nested_get(o, get_expansion("date_modified"))[0], "%Y-%m-%d" | |||||
) | |||||
>= min_date_modified_obj, | |||||
hits, | |||||
) | |||||
if min_date_published: | |||||
min_date_published_obj = datetime.strptime(min_date_published, "%Y-%m-%d") | |||||
hits = filter( | |||||
lambda o: datetime.strptime( | |||||
_nested_get(o, get_expansion("date_published"))[0], "%Y-%m-%d" | |||||
) | |||||
>= min_date_published_obj, | |||||
hits, | |||||
) | |||||
if licenses: | if licenses: | ||||
queried_licenses = [license_keyword.lower() for license_keyword in licenses] | queried_licenses = [license_keyword.lower() for license_keyword in licenses] | ||||
hits = filter( | hits = filter( | ||||
lambda o: any( | lambda o: any( | ||||
# If any of the queried licenses are found, include the origin | # If any of the queried licenses are found, include the origin | ||||
any( | any( | ||||
# returns True if queried_license_keyword is found | # returns True if queried_license_keyword is found | ||||
# in any of the licenses of the origin | # in any of the licenses of the origin | ||||
▲ Show 20 Lines • Show All 88 Lines • Show Last 20 Lines |
It looks like you don't have a test for this; can you add one?
(also, could you deduplicate it?)