Changeset View
Standalone View
swh/storage/cassandra/storage.py
Show First 20 Lines • Show All 171 Lines • ▼ Show 20 Lines | ) -> Iterable[Optional[Dict[str, bytes]]]: | ||||
f"Send at maximum {BULK_BLOCK_CONTENT_LEN_MAX} contents." | f"Send at maximum {BULK_BLOCK_CONTENT_LEN_MAX} contents." | ||||
) | ) | ||||
yield from self.objstorage.content_get(contents) | yield from self.objstorage.content_get(contents) | ||||
def content_get_partition( | def content_get_partition( | ||||
self, | self, | ||||
partition_id: int, | partition_id: int, | ||||
nb_partitions: int, | nb_partitions: int, | ||||
limit: int = 1000, | |||||
page_token: Optional[str] = None, | page_token: Optional[str] = None, | ||||
) -> Dict[str, Any]: | limit: int = 1000, | ||||
) -> PagedResult[Content]: | |||||
if limit is None: | if limit is None: | ||||
raise StorageArgumentException("limit should not be None") | raise StorageArgumentException("limit should not be None") | ||||
# Compute start and end of the range of tokens covered by the | # Compute start and end of the range of tokens covered by the | ||||
# requested partition | # requested partition | ||||
partition_size = (TOKEN_END - TOKEN_BEGIN) // nb_partitions | partition_size = (TOKEN_END - TOKEN_BEGIN) // nb_partitions | ||||
range_start = TOKEN_BEGIN + partition_id * partition_size | range_start = TOKEN_BEGIN + partition_id * partition_size | ||||
range_end = TOKEN_BEGIN + (partition_id + 1) * partition_size | range_end = TOKEN_BEGIN + (partition_id + 1) * partition_size | ||||
# offset the range start according to the `page_token`. | # offset the range start according to the `page_token`. | ||||
if page_token is not None: | if page_token is not None: | ||||
if not (range_start <= int(page_token) <= range_end): | if not (range_start <= int(page_token) <= range_end): | ||||
raise StorageArgumentException("Invalid page_token.") | raise StorageArgumentException("Invalid page_token.") | ||||
range_start = int(page_token) | range_start = int(page_token) | ||||
# Get the first rows of the range | next_page_token: Optional[str] = None | ||||
rows = self._cql_runner.content_get_token_range(range_start, range_end, limit) | rows = self._cql_runner.content_get_token_range(range_start, range_end, limit) | ||||
rows = list(rows) | contents = [] | ||||
last_id: Optional[int] = None | |||||
for row in rows: | |||||
if row.status == "absent": | |||||
continue | |||||
row_d = row._asdict() | |||||
last_id = row_d.pop("tok") | |||||
contents.append(Content(**row_d)) | |||||
if len(rows) == limit: | if len(contents) == limit: | ||||
next_page_token: Optional[str] = str(rows[-1].tok + 1) | assert last_id is not None | ||||
ardumont: I found this not good.
I highly prefer taking one more object from the backend and take its… | |||||
Done Inline Actions
was talking about this: next_page_token = str(last_id + 1) ardumont: > I found this not good.
was talking about this:
```
next_page_token = str(last_id + 1)
``` | |||||
Not Done Inline ActionsI am not a cassandra expert so I do not know which approach is better for getting the next token. At least no elements will be missed with the current one. anlambert: I am not a cassandra expert so I do not know which approach is better for getting the next… | |||||
Done Inline Actionsoh well, apparently, tests do no like this version so i'll revert. ardumont: oh well, apparently, tests do no like this version so i'll revert. | |||||
Not Done Inline Actionstok+1 is not good, it would skip some contents in case of collision on tok. (And collisions do happen, it's a noncryptographic hash on 32 bits.) You should instead use the token of the next token. It will return the same contents twice though; but that's less of an issue. Regardless, that's something to be fixed in another diff, so keep it like this for now and open a task vlorentz: `tok+1` is not good, it would skip some contents in case of collision on `tok`. (And collisions… | |||||
Done Inline Actionsardumont: T2518 | |||||
else: | next_page_token = str(last_id + 1) | ||||
next_page_token = None | |||||
return { | assert len(contents) <= limit | ||||
"contents": [row._asdict() for row in rows if row.status != "absent"], | return PagedResult(results=contents, next_page_token=next_page_token,) | ||||
"next_page_token": next_page_token, | |||||
} | |||||
def content_get_metadata(self, contents: List[bytes]) -> Dict[bytes, List[Dict]]: | def content_get_metadata(self, contents: List[bytes]) -> Dict[bytes, List[Dict]]: | ||||
result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents} | result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents} | ||||
for sha1 in contents: | for sha1 in contents: | ||||
Not Done Inline ActionsYou should pop the tok value in the if body only anlambert: You should pop the `tok` value in the if body only | |||||
Done Inline ActionsI can't have for the Content creation line 212 so i'm popping it all the time. ardumont: I can't have for the Content creation line 212 so i'm popping it all the time.
We really need… | |||||
Not Done Inline ActionsOh right, did not notice anlambert: Oh right, did not notice | |||||
Done Inline Actions/me mutters something about reading himself prior to submit "I can't put the pop tok instruction in the body..." ardumont: /me mutters something about reading himself prior to submit
"I can't put the pop tok… | |||||
Done Inline Actionsheads up also, it should be: next_page_token = str(last_id) (why doesn't mypy say anything...?) ardumont: heads up also, it should be:
```
next_page_token = str(last_id)
```
(why doesn't mypy say… | |||||
# Get all (sha1, sha1_git, sha256, blake2s256) whose sha1 | # Get all (sha1, sha1_git, sha256, blake2s256) whose sha1 | ||||
# matches the argument, from the index table ('content_by_sha1') | # matches the argument, from the index table ('content_by_sha1') | ||||
for row in self._content_get_from_hash("sha1", sha1): | for row in self._content_get_from_hash("sha1", sha1): | ||||
content_metadata = row._asdict() | content_metadata = row._asdict() | ||||
content_metadata.pop("ctime") | content_metadata.pop("ctime") | ||||
result[content_metadata["sha1"]].append(content_metadata) | result[content_metadata["sha1"]].append(content_metadata) | ||||
return result | return result | ||||
▲ Show 20 Lines • Show All 1,045 Lines • Show Last 20 Lines |
I found this not good.
I highly prefer taking one more object from the backend and take its "tok" id instead.
We assume too much by doing this...