Differential D3713 Diff 13090 swh/storage/cassandra/storage.py

Changeset View

Standalone View

swh/storage/cassandra/storage.py

Show First 20 Lines • Show All 171 Lines • ▼ Show 20 Lines	) -> Iterable[Optional[Dict[str, bytes]]]:
f"Send at maximum {BULK_BLOCK_CONTENT_LEN_MAX} contents."		f"Send at maximum {BULK_BLOCK_CONTENT_LEN_MAX} contents."
)		)
yield from self.objstorage.content_get(contents)		yield from self.objstorage.content_get(contents)

def content_get_partition(		def content_get_partition(
self,		self,
partition_id: int,		partition_id: int,
nb_partitions: int,		nb_partitions: int,
limit: int = 1000,
page_token: Optional[str] = None,		page_token: Optional[str] = None,
) -> Dict[str, Any]:		limit: int = 1000,
		) -> PagedResult[Content]:
if limit is None:		if limit is None:
raise StorageArgumentException("limit should not be None")		raise StorageArgumentException("limit should not be None")

# Compute start and end of the range of tokens covered by the		# Compute start and end of the range of tokens covered by the
# requested partition		# requested partition
partition_size = (TOKEN_END - TOKEN_BEGIN) // nb_partitions		partition_size = (TOKEN_END - TOKEN_BEGIN) // nb_partitions
range_start = TOKEN_BEGIN + partition_id * partition_size		range_start = TOKEN_BEGIN + partition_id * partition_size
range_end = TOKEN_BEGIN + (partition_id + 1) * partition_size		range_end = TOKEN_BEGIN + (partition_id + 1) * partition_size

# offset the range start according to the `page_token`.		# offset the range start according to the `page_token`.
if page_token is not None:		if page_token is not None:
if not (range_start <= int(page_token) <= range_end):		if not (range_start <= int(page_token) <= range_end):
raise StorageArgumentException("Invalid page_token.")		raise StorageArgumentException("Invalid page_token.")
range_start = int(page_token)		range_start = int(page_token)

# Get the first rows of the range		next_page_token: Optional[str] = None

rows = self._cql_runner.content_get_token_range(range_start, range_end, limit)		rows = self._cql_runner.content_get_token_range(range_start, range_end, limit)
rows = list(rows)		contents = []
		last_id: Optional[int] = None
		for row in rows:
		if row.status == "absent":
		continue
		row_d = row._asdict()
		last_id = row_d.pop("tok")
		contents.append(Content(**row_d))

if len(rows) == limit:		if len(contents) == limit:
next_page_token: Optional[str] = str(rows[-1].tok + 1)		assert last_id is not None
		ardumontAuthorUnsubmitted Done Inline Actions I found this not good. I highly prefer taking one more object from the backend and take its "tok" id instead. We assume too much by doing this... ardumont: I found this not good. I highly prefer taking one more object from the backend and take its…
		ardumontAuthorUnsubmitted Done Inline Actions I found this not good. was talking about this: next_page_token = str(last_id + 1) ardumont: > I found this not good. was talking about this: ``` next_page_token = str(last_id + 1) ```
		anlambertUnsubmitted Not Done Inline Actions I am not a cassandra expert so I do not know which approach is better for getting the next token. At least no elements will be missed with the current one. anlambert: I am not a cassandra expert so I do not know which approach is better for getting the next…
		ardumontAuthorUnsubmitted Done Inline Actions oh well, apparently, tests do no like this version so i'll revert. ardumont: oh well, apparently, tests do no like this version so i'll revert.
		vlorentzUnsubmitted Not Done Inline Actions `tok+1` is not good, it would skip some contents in case of collision on `tok`. (And collisions do happen, it's a noncryptographic hash on 32 bits.) You should instead use the token of the next token. It will return the same contents twice though; but that's less of an issue. Regardless, that's something to be fixed in another diff, so keep it like this for now and open a task vlorentz: `tok+1` is not good, it would skip some contents in case of collision on `tok`. (And collisions…
		ardumontAuthorUnsubmitted Done Inline Actions T2518 ardumont: T2518
else:		next_page_token = str(last_id + 1)
next_page_token = None

return {		assert len(contents) <= limit
"contents": [row._asdict() for row in rows if row.status != "absent"],		return PagedResult(results=contents, next_page_token=next_page_token,)
"next_page_token": next_page_token,
}

def content_get_metadata(self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:		def content_get_metadata(self, contents: List[bytes]) -> Dict[bytes, List[Dict]]:
result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents}		result: Dict[bytes, List[Dict]] = {sha1: [] for sha1 in contents}
for sha1 in contents:		for sha1 in contents:
		anlambertUnsubmitted Not Done Inline Actions You should pop the `tok` value in the if body only anlambert: You should pop the `tok` value in the if body only
		ardumontAuthorUnsubmitted Done Inline Actions I can't have for the Content creation line 212 so i'm popping it all the time. We really need its value once though (for the pagination token). ardumont: I can't have for the Content creation line 212 so i'm popping it all the time. We really need…
		anlambertUnsubmitted Not Done Inline Actions Oh right, did not notice anlambert: Oh right, did not notice
		ardumontAuthorUnsubmitted Done Inline Actions /me mutters something about reading himself prior to submit "I can't put the pop tok instruction in the body..." ardumont: /me mutters something about reading himself prior to submit "I can't put the pop tok…
		ardumontAuthorUnsubmitted Done Inline Actions heads up also, it should be: next_page_token = str(last_id) (why doesn't mypy say anything...?) ardumont: heads up also, it should be: ``` next_page_token = str(last_id) ``` (why doesn't mypy say…
# Get all (sha1, sha1_git, sha256, blake2s256) whose sha1		# Get all (sha1, sha1_git, sha256, blake2s256) whose sha1
# matches the argument, from the index table ('content_by_sha1')		# matches the argument, from the index table ('content_by_sha1')
for row in self._content_get_from_hash("sha1", sha1):		for row in self._content_get_from_hash("sha1", sha1):
content_metadata = row._asdict()		content_metadata = row._asdict()
content_metadata.pop("ctime")		content_metadata.pop("ctime")
result[content_metadata["sha1"]].append(content_metadata)		result[content_metadata["sha1"]].append(content_metadata)
return result		return result

▲ Show 20 Lines • Show All 1,045 Lines • Show Last 20 Lines