Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
Show First 20 Lines • Show All 165 Lines • ▼ Show 20 Lines | class InMemorySearch: | ||||
def flush(self) -> None: | def flush(self) -> None: | ||||
pass | pass | ||||
_url_splitter = re.compile(r"\W") | _url_splitter = re.compile(r"\W") | ||||
def origin_update(self, documents: Iterable[OriginDict]) -> None: | def origin_update(self, documents: Iterable[OriginDict]) -> None: | ||||
for source_document in documents: | for source_document in documents: | ||||
document: Dict[str, Any] = dict(source_document) | id_ = hash_to_hex(model.Origin(url=source_document["url"]).id) | ||||
id_ = hash_to_hex(model.Origin(url=document["url"]).id) | document: Dict[str, Any] = { | ||||
**source_document, | |||||
"sha1": id_, | |||||
} | |||||
if "url" in document: | if "url" in document: | ||||
document["_url_tokens"] = set( | document["_url_tokens"] = set( | ||||
self._url_splitter.split(source_document["url"]) | self._url_splitter.split(source_document["url"]) | ||||
) | ) | ||||
if "visit_types" in document: | if "visit_types" in document: | ||||
document["visit_types"] = set(source_document["visit_types"]) | document["visit_types"] = set(source_document["visit_types"]) | ||||
if "visit_types" in self._origins[id_]: | if "visit_types" in self._origins[id_]: | ||||
document["visit_types"].update(self._origins[id_]["visit_types"]) | document["visit_types"].update(self._origins[id_]["visit_types"]) | ||||
▲ Show 20 Lines • Show All 332 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
assert len(origins) <= limit | assert len(origins) <= limit | ||||
return PagedResult( | return PagedResult( | ||||
results=origins, | results=origins, | ||||
next_page_token=next_page_token, | next_page_token=next_page_token, | ||||
) | ) | ||||
def origin_get(self, url: str) -> Optional[Dict[str, Any]]: | |||||
origin_id = hash_to_hex(model.Origin(url=url).id) | |||||
document = self._origins.get(origin_id) | |||||
if document is None: | |||||
return None | |||||
else: | |||||
return {k: v for (k, v) in document.items() if k != "_url_tokens"} | |||||
def visit_types_count(self) -> Counter: | def visit_types_count(self) -> Counter: | ||||
hits = self._get_hits() | hits = self._get_hits() | ||||
return Counter(chain(*[hit.get("visit_types", []) for hit in hits])) | return Counter(chain(*[hit.get("visit_types", []) for hit in hits])) | ||||
def _get_hits(self) -> Iterator[Dict[str, Any]]: | def _get_hits(self) -> Iterator[Dict[str, Any]]: | ||||
return ( | return ( | ||||
self._origins[id_] | self._origins[id_] | ||||
for id_ in self._origin_ids | for id_ in self._origin_ids | ||||
if not self._origins[id_].get("blocklisted") | if not self._origins[id_].get("blocklisted") | ||||
) | ) |