D4615.id18738.diff
No OneTemporary
Actions

Size

16 KB

Subscribers

None

D4615.id18738.diff
View Options

	diff --git a/sql/upgrades/168.sql b/sql/upgrades/168.sql
	new file mode 100644
	--- /dev/null
	+++ b/sql/upgrades/168.sql
	@@ -0,0 +1,39 @@
	+-- SWH DB schema upgrade
	+-- from_version: 167
	+-- to_version: 168
	+-- description: Make origin_visit_status.type not null
	+
	+insert into dbversion(version, release, description)
	+ values(168, now(), 'Work In Progress');
	+
	+create or replace function swh_snapshot_get_by_id(id sha1_git,
	+ branches_from bytea default '', branches_count bigint default null,
	+ target_types snapshot_target[] default NULL,
	+ branch_name_include_pattern bytea default NULL,
	+ branch_name_exclude_prefix bytea default NULL)
	+ returns setof snapshot_result
	+ language sql
	+ stable
	+as $$
	+ -- with small limits, the "naive" version of this query can degenerate into
	+ -- using the deduplication index on snapshot_branch (name, target,
	+ -- target_type); The planner happily scans several hundred million rows.
	+
	+ -- Do the query in two steps: first pull the relevant branches for the given
	+ -- snapshot (filtering them by type), then do the limiting. This two-step
	+ -- process guides the planner into using the proper index.
	+ with filtered_snapshot_branches as (
	+ select swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
	+ from snapshot_branches
	+ inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
	+ where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
	+ and (target_types is null or target_type = any(target_types))
	+ order by name
	+ )
	+ select snapshot_id, name, target, target_type
	+ from filtered_snapshot_branches
	+ where name >= branches_from
	+ and (branch_name_include_pattern is null or name like '%'\|\|branch_name_include_pattern\|\|'%')
	+ and (branch_name_exclude_prefix is null or name not like branch_name_exclude_prefix\|\|'%')
	+ order by name limit branches_count;
	+$$;
	diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
	--- a/swh/storage/cassandra/cql.py
	+++ b/swh/storage/cassandra/cql.py
	@@ -626,14 +626,14 @@
	return {None: nb_none, **counts}

	@_prepared_select_statement(
	- SnapshotBranchRow, "WHERE snapshot_id = ? AND name >= ? LIMIT ?"
	+ SnapshotBranchRow, "WHERE snapshot_id = ? AND name >= ? AND name < ? LIMIT ?"
	)
	def snapshot_branch_get(
	- self, snapshot_id: Sha1Git, from_: bytes, limit: int, *, statement
	+ self, snapshot_id: Sha1Git, from_: bytes, to: bytes, limit: int, *, statement
	) -> Iterable[SnapshotBranchRow]:
	return map(
	SnapshotBranchRow.from_dict,
	- self._execute_with_retries(statement, [snapshot_id, from_, limit]),
	+ self._execute_with_retries(statement, [snapshot_id, from_, to, limit]),
	)

	##########################
	diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
	--- a/swh/storage/cassandra/storage.py
	+++ b/swh/storage/cassandra/storage.py
	@@ -678,6 +678,8 @@
	branches_from: bytes = b"",
	branches_count: int = 1000,
	target_types: Optional[List[str]] = None,
	+ branch_name_include_pattern: Optional[bytes] = None,
	+ branch_name_exclude_prefix: Optional[bytes] = None,
	) -> Optional[PartialBranches]:
	if self._cql_runner.snapshot_missing([snapshot_id]):
	# Makes sure we don't fetch branches for a snapshot that is
	@@ -685,18 +687,18 @@
	return None

	branches: List = []
	+ from_branch = branches_from
	+ if branch_name_exclude_prefix is not None:
	+ to_branch = branch_name_exclude_prefix
	+ else:
	+ to_branch = b"\xff"
	while len(branches) < branches_count + 1:
	new_branches = list(
	self._cql_runner.snapshot_branch_get(
	- snapshot_id, branches_from, branches_count + 1
	+ snapshot_id, from_branch, to_branch, branches_count + 1
	)
	)

	- if not new_branches:
	- break
	-
	- branches_from = new_branches[-1].name
	-
	new_branches_filtered = new_branches

	# Filter by target_type
	@@ -707,11 +709,36 @@
	if branch.target is not None and branch.target_type in target_types
	]

	+ # Filter by branches_name_pattern
	+ if branch_name_include_pattern:
	+ new_branches_filtered = [
	+ branch
	+ for branch in new_branches_filtered
	+ if branch.name is not None
	+ and (
	+ branch_name_include_pattern is None
	+ or branch_name_include_pattern in branch.name
	+ )
	+ ]
	+
	branches.extend(new_branches_filtered)

	- if len(new_branches) < branches_count + 1:
	+ if len(branches) > branches_count or (
	+ len(new_branches) < branches_count + 1
	+ and to_branch != branch_name_exclude_prefix
	+ ):
	break

	+ if (
	+ len(new_branches) < branches_count + 1
	+ and to_branch == branch_name_exclude_prefix
	+ ):
	+ prefix = branch_name_exclude_prefix
	+ from_branch = prefix[:-1] + bytes([prefix[-1] + 1])
	+ to_branch = b"\xff"
	+ else:
	+ from_branch = new_branches[-1].name
	+
	if len(branches) > branches_count:
	last_branch = branches.pop(-1).name
	else:
	diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
	--- a/swh/storage/in_memory.py
	+++ b/swh/storage/in_memory.py
	@@ -399,11 +399,11 @@
	return counts

	def snapshot_branch_get(
	- self, snapshot_id: Sha1Git, from_: bytes, limit: int
	+ self, snapshot_id: Sha1Git, from_: bytes, to: bytes, limit: int
	) -> Iterable[SnapshotBranchRow]:
	count = 0
	for branch in self._snapshot_branches.get_from_partition_key((snapshot_id,)):
	- if branch.name >= from_:
	+ if branch.name >= from_ and branch.name < to:
	count += 1
	yield branch
	if count >= limit:
	diff --git a/swh/storage/interface.py b/swh/storage/interface.py
	--- a/swh/storage/interface.py
	+++ b/swh/storage/interface.py
	@@ -701,6 +701,8 @@
	branches_from: bytes = b"",
	branches_count: int = 1000,
	target_types: Optional[List[str]] = None,
	+ branch_name_include_pattern: Optional[bytes] = None,
	+ branch_name_exclude_prefix: Optional[bytes] = None,
	) -> Optional[PartialBranches]:
	"""Get the content, possibly partial, of a snapshot with the given id

	@@ -717,6 +719,10 @@
	target types of branch to return (possible values that can be
	contained in that list are `'content', 'directory',
	'revision', 'release', 'snapshot', 'alias'`)
	+ branch_name_include_pattern: if provided, only return branches whose name
	+ contains given pattern
	+ branch_name_exclude_prefix: if provided, do not return branches whose name
	+ contains given prefix

	Returns:
	dict: None if the snapshot does not exist;
	diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
	--- a/swh/storage/postgresql/db.py
	+++ b/swh/storage/postgresql/db.py
	@@ -28,7 +28,7 @@

	"""

	- current_version = 167
	+ current_version = 168

	def mktemp_dir_entry(self, entry_type, cur=None):
	self._cursor(cur).execute(
	@@ -254,17 +254,29 @@
	branches_from=b"",
	branches_count=None,
	target_types=None,
	+ branch_name_include_pattern=None,
	+ branch_name_exclude_prefix=None,
	cur=None,
	):
	cur = self._cursor(cur)
	query = """\
	- SELECT %s
	- FROM swh_snapshot_get_by_id(%%s, %%s, %%s, %%s :: snapshot_target[])
	+ SELECT %s
	+ FROM swh_snapshot_get_by_id(%%s, %%s, %%s, %%s :: snapshot_target[], %%s, %%s)
	""" % ", ".join(
	self.snapshot_get_cols
	)

	- cur.execute(query, (snapshot_id, branches_from, branches_count, target_types))
	+ cur.execute(
	+ query,
	+ (
	+ snapshot_id,
	+ branches_from,
	+ branches_count,
	+ target_types,
	+ branch_name_include_pattern,
	+ branch_name_exclude_prefix,
	+ ),
	+ )

	yield from cur

	diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py
	--- a/swh/storage/postgresql/storage.py
	+++ b/swh/storage/postgresql/storage.py
	@@ -766,6 +766,8 @@
	branches_from: bytes = b"",
	branches_count: int = 1000,
	target_types: Optional[List[str]] = None,
	+ branch_name_include_pattern: Optional[bytes] = None,
	+ branch_name_exclude_prefix: Optional[bytes] = None,
	db=None,
	cur=None,
	) -> Optional[PartialBranches]:
	@@ -784,6 +786,8 @@
	# optimal performances
	branches_count=max(branches_count + 1, 10),
	target_types=target_types,
	+ branch_name_include_pattern=branch_name_include_pattern,
	+ branch_name_exclude_prefix=branch_name_exclude_prefix,
	cur=cur,
	)
	)
	diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql
	--- a/swh/storage/sql/30-schema.sql
	+++ b/swh/storage/sql/30-schema.sql
	@@ -17,7 +17,7 @@

	-- latest schema version
	insert into dbversion(version, release, description)
	- values(167, now(), 'Work In Progress');
	+ values(168, now(), 'Work In Progress');

	-- a SHA1 checksum
	create domain sha1 as bytea check (length(value) = 20);
	diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql
	--- a/swh/storage/sql/40-funcs.sql
	+++ b/swh/storage/sql/40-funcs.sql
	@@ -657,7 +657,9 @@

	create or replace function swh_snapshot_get_by_id(id sha1_git,
	branches_from bytea default '', branches_count bigint default null,
	- target_types snapshot_target[] default NULL)
	+ target_types snapshot_target[] default NULL,
	+ branch_name_include_pattern bytea default NULL,
	+ branch_name_exclude_prefix bytea default NULL)
	returns setof snapshot_result
	language sql
	stable
	@@ -680,6 +682,8 @@
	select snapshot_id, name, target, target_type
	from filtered_snapshot_branches
	where name >= branches_from
	+ and (branch_name_include_pattern is null or name like '%'\|\|branch_name_include_pattern\|\|'%')
	+ and (branch_name_exclude_prefix is null or name not like branch_name_exclude_prefix\|\|'%')
	order by name limit branches_count;
	$$;

	diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
	--- a/swh/storage/tests/storage_tests.py
	+++ b/swh/storage/tests/storage_tests.py
	@@ -10,6 +10,7 @@
	import itertools
	import math
	import random
	+import re
	from typing import Any, ClassVar, Dict, Iterator, Optional

	import attr
	@@ -30,6 +31,7 @@
	Revision,
	SkippedContent,
	Snapshot,
	+ SnapshotBranch,
	TargetType,
	)
	from swh.storage import get_storage
	@@ -2992,6 +2994,111 @@
	assert len(branches) == 1
	assert alias1 in branches

	+ def test_snapshot_add_get_by_branches_name_pattern(self, swh_storage, sample_data):
	+ snapshot = Snapshot(
	+ branches={
	+ b"refs/heads/master": SnapshotBranch(
	+ target=sample_data.revision.id, target_type=TargetType.REVISION,
	+ ),
	+ b"refs/heads/incoming": SnapshotBranch(
	+ target=sample_data.revision.id, target_type=TargetType.REVISION,
	+ ),
	+ b"refs/pull/1": SnapshotBranch(
	+ target=sample_data.revision.id, target_type=TargetType.REVISION,
	+ ),
	+ b"refs/pull/2": SnapshotBranch(
	+ target=sample_data.revision.id, target_type=TargetType.REVISION,
	+ ),
	+ b"dangling": None,
	+ },
	+ )
	+ swh_storage.snapshot_add([snapshot])
	+
	+ for include_pattern, exclude_prefix, nb_results in (
	+ (b"pull", None, 2),
	+ (b"incoming", None, 1),
	+ (b"dangling", None, 1),
	+ (None, b"refs/heads/", 3),
	+ (b"refs", b"refs/heads/master", 3),
	+ ):
	+ branches = swh_storage.snapshot_get_branches(
	+ snapshot.id,
	+ branch_name_include_pattern=include_pattern,
	+ branch_name_exclude_prefix=exclude_prefix,
	+ )["branches"]
	+ assert len(branches) == nb_results
	+ for branch_name in branches:
	+ if include_pattern:
	+ assert include_pattern in branch_name
	+ if exclude_prefix:
	+ assert not branch_name.startswith(exclude_prefix)
	+
	+ def test_snapshot_add_get_by_branches_name_pattern_filtered_paginated(
	+ self, swh_storage, sample_data
	+ ):
	+ pattern = "foo"
	+ nb_branches_by_target_type = 10
	+ branches = {}
	+ for i in range(nb_branches_by_target_type):
	+ branches[f"branch/directory/bar{i}".encode()] = SnapshotBranch(
	+ target=sample_data.directory.id, target_type=TargetType.DIRECTORY,
	+ )
	+ branches[f"branch/revision/bar{i}".encode()] = SnapshotBranch(
	+ target=sample_data.revision.id, target_type=TargetType.REVISION,
	+ )
	+ branches[f"branch/directory/{pattern}{i}".encode()] = SnapshotBranch(
	+ target=sample_data.directory.id, target_type=TargetType.DIRECTORY,
	+ )
	+ branches[f"branch/revision/{pattern}{i}".encode()] = SnapshotBranch(
	+ target=sample_data.revision.id, target_type=TargetType.REVISION,
	+ )
	+
	+ snapshot = Snapshot(branches=branches)
	+ swh_storage.snapshot_add([snapshot])
	+
	+ regexp = re.compile(pattern)
	+ branches_count = nb_branches_by_target_type // 2
	+
	+ for target_type in (
	+ TargetType.DIRECTORY,
	+ TargetType.REVISION,
	+ ):
	+ target_type_str = target_type.value
	+ partial_branches = swh_storage.snapshot_get_branches(
	+ snapshot.id,
	+ branch_name_include_pattern=pattern.encode(),
	+ target_types=[target_type_str],
	+ branches_count=branches_count,
	+ )
	+ branches = partial_branches["branches"]
	+
	+ assert len(branches) == branches_count
	+ for branch_name, branch_data in branches.items():
	+ assert regexp.search(branch_name.decode("utf-8"))
	+ assert branch_data.target_type == target_type
	+ for i in range(branches_count):
	+ assert f"branch/{target_type_str}/{pattern}{i}".encode() in branches
	+ assert (
	+ partial_branches["next_branch"]
	+ == f"branch/{target_type_str}/{pattern}{branches_count}".encode()
	+ )
	+
	+ partial_branches = swh_storage.snapshot_get_branches(
	+ snapshot.id,
	+ branch_name_include_pattern=pattern.encode(),
	+ target_types=[target_type_str],
	+ branches_from=partial_branches["next_branch"],
	+ )
	+ branches = partial_branches["branches"]
	+
	+ assert len(branches) == branches_count
	+ for branch_name, branch_data in branches.items():
	+ assert regexp.search(branch_name.decode("utf-8"))
	+ assert branch_data.target_type == target_type
	+ for i in range(branches_count, 2 * branches_count):
	+ assert f"branch/{target_type_str}/{pattern}{i}".encode() in branches
	+ assert partial_branches["next_branch"] is None
	+
	def test_snapshot_add_get(self, swh_storage, sample_data):
	snapshot = sample_data.snapshot
	origin = sample_data.origin

File Metadata

Mime Type: text/plain
Expires: Wed, Dec 18, 8:09 AM (1 d, 23 h ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3230579

D4615.id18738.diffNo OneTemporaryActions

D4615.id18738.diffView Options

File Metadata

Event Timeline

D4615.id18738.diff
No OneTemporary
Actions

D4615.id18738.diff
View Options