Changeset View
Changeset View
Standalone View
Standalone View
swh/web/common/service.py
# Copyright (C) 2015-2019 The Software Heritage developers | # Copyright (C) 2015-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import itertools | |||||
import os | import os | ||||
import re | |||||
from collections import defaultdict | from collections import defaultdict | ||||
from typing import Any, Dict | from typing import Any, Dict | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.storage.algos import diff, revisions_walker | from swh.storage.algos import diff, revisions_walker | ||||
from swh.model.identifiers import ( | from swh.model.identifiers import ( | ||||
CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT | CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT | ||||
) | ) | ||||
from swh.web.common import converters | from swh.web.common import converters | ||||
from swh.web.common import query | from swh.web.common import query | ||||
from swh.web.common.exc import BadInputExc, NotFoundExc | from swh.web.common.exc import BadInputExc, NotFoundExc | ||||
from swh.web.common.origin_visits import get_origin_visit | from swh.web.common.origin_visits import get_origin_visit | ||||
from swh.web import config | from swh.web import config | ||||
search = config.search() | |||||
storage = config.storage() | storage = config.storage() | ||||
vault = config.vault() | vault = config.vault() | ||||
idx_storage = config.indexer_storage() | idx_storage = config.indexer_storage() | ||||
MAX_LIMIT = 50 # Top limit the users can ask for | MAX_LIMIT = 50 # Top limit the users can ask for | ||||
▲ Show 20 Lines • Show All 208 Lines • ▼ Show 20 Lines | def lookup_origins(origin_from=1, origin_count=100): | ||||
Yields: | Yields: | ||||
origins information as dicts | origins information as dicts | ||||
""" | """ | ||||
origins = storage.origin_get_range(origin_from, origin_count) | origins = storage.origin_get_range(origin_from, origin_count) | ||||
return map(converters.from_origin, origins) | return map(converters.from_origin, origins) | ||||
def search_origin(url_pattern, offset=0, limit=50, regexp=False, | def search_origin(url_pattern, limit=50, page_token=None, with_visit=False): | ||||
with_visit=False): | |||||
"""Search for origins whose urls contain a provided string pattern | """Search for origins whose urls contain a provided string pattern | ||||
or match a provided regular expression. | or match a provided regular expression. | ||||
Args: | Args: | ||||
url_pattern: the string pattern to search for in origin urls | url_pattern: the string pattern to search for in origin urls | ||||
offset: number of found origins to skip before returning results | offset: number of found origins to skip before returning results | ||||
limit: the maximum number of found origins to return | limit: the maximum number of found origins to return | ||||
Returns: | Returns: | ||||
list of origin information as dict. | list of origin information as dict. | ||||
""" | """ | ||||
if not regexp: | results = search.origin_search(url_pattern=url_pattern, count=limit, | ||||
# If the query is not a regexp, rewrite it as a regexp. | page_token=page_token, | ||||
regexp = True | with_visit=with_visit) | ||||
search_words = [re.escape(word) for word in url_pattern.split()] | origins = map(converters.from_origin, results['results']) | ||||
if len(search_words) >= 7: | return (origins, results['next_page_token']) | ||||
url_pattern = '.*'.join(search_words) | |||||
else: | |||||
pattern_parts = [] | |||||
for permut in itertools.permutations(search_words): | |||||
pattern_parts.append('.*'.join(permut)) | |||||
url_pattern = '|'.join(pattern_parts) | |||||
origins = storage.origin_search(url_pattern, offset, limit, regexp, | |||||
with_visit) | |||||
return map(converters.from_origin, origins) | |||||
def search_origin_metadata(fulltext, limit=50): | def search_origin_metadata(fulltext, limit=50): | ||||
"""Search for origins whose metadata match a provided string pattern. | """Search for origins whose metadata match a provided string pattern. | ||||
Args: | Args: | ||||
fulltext: the string pattern to search for in origin metadata | fulltext: the string pattern to search for in origin metadata | ||||
offset: number of found origins to skip before returning results | offset: number of found origins to skip before returning results | ||||
▲ Show 20 Lines • Show All 875 Lines • Show Last 20 Lines |