Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import Counter, defaultdict | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
from itertools import chain | |||||
import re | import re | ||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | ||||
from swh.indexer import codemeta | from swh.indexer import codemeta | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import ( | from swh.search.interface import ( | ||||
SORT_BY_OPTIONS, | SORT_BY_OPTIONS, | ||||
MinimalOriginDict, | MinimalOriginDict, | ||||
▲ Show 20 Lines • Show All 267 Lines • ▼ Show 20 Lines | def origin_search( | ||||
min_date_published: str = "", | min_date_published: str = "", | ||||
programming_languages: Optional[List[str]] = None, | programming_languages: Optional[List[str]] = None, | ||||
licenses: Optional[List[str]] = None, | licenses: Optional[List[str]] = None, | ||||
keywords: Optional[List[str]] = None, | keywords: Optional[List[str]] = None, | ||||
sort_by: Optional[List[str]] = None, | sort_by: Optional[List[str]] = None, | ||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
limit: int = 50, | limit: int = 50, | ||||
) -> PagedResult[MinimalOriginDict]: | ) -> PagedResult[MinimalOriginDict]: | ||||
hits: Iterator[Dict[str, Any]] = ( | hits = self._get_hits() | ||||
self._origins[id_] | |||||
for id_ in self._origin_ids | |||||
if not self._origins[id_].get("blocklisted") | |||||
) | |||||
if url_pattern: | if url_pattern: | ||||
tokens = set(self._url_splitter.split(url_pattern)) | tokens = set(self._url_splitter.split(url_pattern)) | ||||
def predicate(match): | def predicate(match): | ||||
missing_tokens = tokens - match["_url_tokens"] | missing_tokens = tokens - match["_url_tokens"] | ||||
if len(missing_tokens) == 0: | if len(missing_tokens) == 0: | ||||
return True | return True | ||||
▲ Show 20 Lines • Show All 197 Lines • ▼ Show 20 Lines | ) -> PagedResult[MinimalOriginDict]: | ||||
] | ] | ||||
if len(origins) == limit: | if len(origins) == limit: | ||||
next_page_token = str(start_at_index + limit) | next_page_token = str(start_at_index + limit) | ||||
assert len(origins) <= limit | assert len(origins) <= limit | ||||
return PagedResult(results=origins, next_page_token=next_page_token,) | return PagedResult(results=origins, next_page_token=next_page_token,) | ||||
def visit_types_count(self) -> Counter:
    """Count the visit types found across all searchable origins.

    Walks the documents yielded by ``self._get_hits()`` and tallies every
    entry of each document's ``"visit_types"`` value. Documents that have
    no ``"visit_types"`` key contribute nothing to the tally.

    Returns:
        A ``Counter`` mapping each visit type to its number of occurrences.
    """
    hits = self._get_hits()
    # chain.from_iterable consumes the hits lazily; no intermediate list
    # of per-origin collections is built and unpacked as call arguments.
    return Counter(
        chain.from_iterable(hit.get("visit_types", []) for hit in hits)
    )
def _get_hits(self) -> Iterator[Dict[str, Any]]:
    """Lazily yield every stored origin document that is not blocklisted.

    Documents are looked up in ``self._origins`` following the order of
    ``self._origin_ids``. A document is skipped when its ``"blocklisted"``
    entry is truthy; a missing entry counts as not blocklisted.

    Returns:
        An iterator over the remaining origin documents.
    """
    # Fetch each document once, then filter, instead of indexing
    # self._origins twice per id (once for the test, once for the value).
    documents = (self._origins[id_] for id_ in self._origin_ids)
    return (
        document for document in documents if not document.get("blocklisted")
    )