Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/storage.py
# Copyright (C) 2015-2019 The Software Heritage developers | # Copyright (C) 2015-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import copy | import copy | ||||
import datetime | import datetime | ||||
import itertools | import itertools | ||||
import json | import json | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from concurrent.futures import ThreadPoolExecutor | from concurrent.futures import ThreadPoolExecutor | ||||
from contextlib import contextmanager | from contextlib import contextmanager | ||||
from typing import Any, Dict, Mapping | from typing import Any, Dict, Mapping, Optional | ||||
import dateutil.parser | import dateutil.parser | ||||
import psycopg2 | import psycopg2 | ||||
import psycopg2.pool | import psycopg2.pool | ||||
from swh.core.api import remote_api_endpoint | from swh.core.api import remote_api_endpoint | ||||
from swh.model.hashutil import ALGORITHMS, hash_to_bytes | from swh.model.hashutil import ALGORITHMS, hash_to_bytes | ||||
from swh.objstorage import get_objstorage | from swh.objstorage import get_objstorage | ||||
▲ Show 20 Lines • Show All 1,662 Lines • ▼ Show 20 Lines | def origin_get_range(self, origin_from=1, origin_count=100, | ||||
Yields: | Yields: | ||||
dicts containing origin information as returned | dicts containing origin information as returned | ||||
by :meth:`swh.storage.storage.Storage.origin_get`. | by :meth:`swh.storage.storage.Storage.origin_get`. | ||||
""" | """ | ||||
for origin in db.origin_get_range(origin_from, origin_count, cur): | for origin in db.origin_get_range(origin_from, origin_count, cur): | ||||
yield dict(zip(db.origin_get_range_cols, origin)) | yield dict(zip(db.origin_get_range_cols, origin)) | ||||
@remote_api_endpoint('origin/list')
@timed
@db_transaction()
def origin_list(self, page_token: Optional[str] = None, limit: int = 100,
                *, db=None, cur=None) -> dict:
    """Return one page of the list of origins.

    Args:
        page_token: opaque token used for pagination. Pass the
            `next_page_token` of the previous page to get the following
            one; a missing or empty token starts from the beginning.
        limit: the maximum number of results to return; must be positive.

    Returns:
        dict: dict with the following keys:
          - **next_page_token** (str, optional): opaque token to be used as
            `page_token` for retrieving the next page. If absent, there
            are no more pages to gather.
          - **origins** (List[dict]): list of origins, as returned by
            `origin_get`.

    Raises:
        TypeError: if `page_token` is truthy but not a string.
        ValueError: if `limit` is not positive, or if `page_token` is not
            a token this backend can decode.

    """
    # A page must hold at least one row: the next token is derived from
    # the last row fetched, so an empty "full" page cannot be continued.
    if limit < 1:
        raise ValueError('limit must be a positive integer.')

    # The token is deliberately an opaque string: clients must not rely on
    # its content, so that its encoding can change at any time.
    page_token = page_token or '0'
    if not isinstance(page_token, str):
        raise TypeError('page_token must be a string.')
    origin_from = int(page_token)

    rows = [
        dict(zip(db.origin_get_range_cols, origin))
        for origin in db.origin_get_range(origin_from, limit, cur)
    ]
    assert len(rows) <= limit

    # The id column is only needed to compute the next token; it is
    # stripped from the origins handed back to the caller.
    result: Dict[str, Any] = {
        'origins': [
            {key: value for key, value in row.items() if key != 'id'}
            for row in rows
        ],
    }

    # A full page means there may be more origins after the last one seen.
    if len(rows) == limit:
        result['next_page_token'] = str(rows[-1]['id'] + 1)

    return result
@remote_api_endpoint('origin/search') | @remote_api_endpoint('origin/search') | ||||
@timed | @timed | ||||
@db_transaction_generator() | @db_transaction_generator() | ||||
def origin_search(self, url_pattern, offset=0, limit=50, | def origin_search(self, url_pattern, offset=0, limit=50, | ||||
regexp=False, with_visit=False, db=None, cur=None): | regexp=False, with_visit=False, db=None, cur=None): | ||||
"""Search for origins whose urls contain a provided string pattern | """Search for origins whose urls contain a provided string pattern | ||||
or match a provided regular expression. | or match a provided regular expression. | ||||
The search is performed in a case insensitive way. | The search is performed in a case insensitive way. | ||||
▲ Show 20 Lines • Show All 347 Lines • Show Last 20 Lines |
Same remark as for the in-memory implementation (type, name, typo).