D4397.id15580.diff

diff --git a/sql/upgrades/133.sql b/sql/upgrades/133.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/133.sql
@@ -0,0 +1,177 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 132
+-- to_version: 133
+-- description: remove 'conflict_update' argument
+
+insert into dbversion(version, release, description)
+ values(133, now(), 'Work In Progress');
+
+drop function swh_content_mimetype_add(conflict_update boolean);
+create or replace function swh_content_mimetype_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
+ select id, mimetype, encoding, indexer_configuration_id
+ from tmp_content_mimetype tcm
+ on conflict(id, indexer_configuration_id)
+ do update set mimetype = excluded.mimetype,
+ encoding = excluded.encoding;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_mimetype_add() IS 'Add new content mimetypes';
+
+
+
+drop function swh_content_language_add(conflict_update boolean);
+create or replace function swh_content_language_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
+ from tmp_content_language tcl
+ on conflict(id, indexer_configuration_id)
+ do update set lang = excluded.lang;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_language_add() IS 'Add new content languages';
+
+
+
+drop function swh_content_ctags_add(conflict_update boolean);
+create or replace function swh_content_ctags_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id)
+ select id, name, kind, line, lang, indexer_configuration_id
+ from tmp_content_ctags tct
+ on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id)
+ do nothing;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content';
+
+
+
+drop function swh_content_fossology_license_add(conflict_update boolean);
+create or replace function swh_content_fossology_license_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ -- insert unknown licenses first
+ insert into fossology_license (name)
+ select distinct license from tmp_content_fossology_license tmp
+ where not exists (select 1 from fossology_license where name=tmp.license)
+ on conflict(name) do nothing;
+
+ insert into content_fossology_license (id, license_id, indexer_configuration_id)
+ select tcl.id,
+ (select id from fossology_license where name = tcl.license) as license,
+ indexer_configuration_id
+ from tmp_content_fossology_license tcl
+ on conflict(id, license_id, indexer_configuration_id)
+ do update set license_id = excluded.license_id;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_fossology_license_add() IS 'Add new content licenses';
+
+
+
+drop function swh_content_metadata_add(conflict_update boolean);
+create or replace function swh_content_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_metadata (id, metadata, indexer_configuration_id)
+ select id, metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set metadata = excluded.metadata;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_metadata_add() IS 'Add new content metadata';
+
+
+
+drop function swh_revision_intrinsic_metadata_add(conflict_update boolean);
+create or replace function swh_revision_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_revision_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
+
+
+
+drop function swh_origin_intrinsic_metadata_add(conflict_update boolean);
+create or replace function swh_origin_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_intrinsic_metadata_compute_tsvector();
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -205,7 +205,7 @@
origins = list_origins_by_producer(idx_storage, mappings, tool_ids)
- kwargs = {"policy_update": "update-dups", "retries_left": 1}
+ kwargs = {"retries_left": 1}
schedule_origin_batches(scheduler, task_type, origins, origin_batch_size, kwargs)
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -136,16 +136,12 @@
return ctags
def persist_index_computations(
- self, results: List[ContentCtagsRow], policy_update: str
+ self, results: List[ContentCtagsRow]
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of ctags returned by index()
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
- return self.idx_storage.content_ctags_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_ctags_add(results)
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -114,7 +114,7 @@
]
def persist_index_computations(
- self, results: List[ContentLicenseRow], policy_update: str
+ self, results: List[ContentLicenseRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -126,13 +126,8 @@
- license (bytes): license in bytes
- path (bytes): path
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
-
"""
- return self.idx_storage.content_fossology_license_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_fossology_license_add(results)
class FossologyLicenseIndexer(
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -10,6 +10,7 @@
import shutil
import tempfile
from typing import Any, Dict, Generic, Iterator, List, Optional, Set, TypeVar, Union
+import warnings
from swh.core import utils
from swh.core.config import load_from_envvar, merge_configs
@@ -250,17 +251,13 @@
yield from ids
@abc.abstractmethod
- def persist_index_computations(
- self, results: List[TResult], policy_update: str
- ) -> Dict[str, int]:
+ def persist_index_computations(self, results: List[TResult]) -> Dict[str, int]:
"""Persist the computation resulting from the index.
Args:
results: List of results. One result is the
result of the index function.
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
Returns:
a summary dict of what has been inserted in the storage
@@ -281,24 +278,28 @@
"""
- def run(self, ids: List[Sha1], policy_update: str, **kwargs) -> Dict:
+ def run(self, ids: List[Sha1], **kwargs) -> Dict:
"""Given a list of ids:
- retrieve the content from the storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
ids (Iterable[Union[bytes, str]]): sha1's identifier list
- policy_update (str): either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore
- them
**kwargs: passed to the `index` method
Returns:
A summary Dict of the task's status
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
+
sha1s = [
hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
]
@@ -318,7 +319,7 @@
if res: # If no results, skip it
results.extend(res)
summary["status"] = "eventful"
- summary = self.persist_index_computations(results, policy_update)
+ summary = self.persist_index_computations(results)
self.results = results
except Exception:
if not self.catch_exceptions:
@@ -478,9 +479,7 @@
count_object_added_key: Optional[str] = None
for contents in utils.grouper(gen, n=self.config["write_batch_size"]):
- res = self.persist_index_computations(
- list(contents), policy_update="update-dups"
- )
+ res = self.persist_index_computations(list(contents))
if not count_object_added_key:
count_object_added_key = list(res.keys())[0]
count += res[count_object_added_key]
@@ -508,22 +507,24 @@
"""
- def run(
- self, origin_urls: List[str], policy_update: str = "update-dups", **kwargs
- ) -> Dict:
+ def run(self, origin_urls: List[str], **kwargs) -> Dict:
"""Given a list of origin urls:
- retrieve origins from storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
origin_urls: list of origin urls.
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates (default) or ignore them
**kwargs: passed to the `index` method
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
summary: Dict[str, Any] = {"status": "uneventful"}
try:
results = self.index_list(origin_urls, **kwargs)
@@ -533,7 +534,7 @@
summary["status"] = "failed"
return summary
- summary_persist = self.persist_index_computations(results, policy_update)
+ summary_persist = self.persist_index_computations(results)
self.results = results
if summary_persist:
for value in summary_persist.values():
@@ -564,19 +565,23 @@
"""
- def run(self, ids: List[Sha1Git], policy_update: str) -> Dict:
+ def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
"""Given a list of sha1_gits:
- retrieve revisions from storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
ids: sha1_git's identifier list
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
summary: Dict[str, Any] = {"status": "uneventful"}
results = []
@@ -599,7 +604,7 @@
summary["status"] = "failed"
return summary
- summary_persist = self.persist_index_computations(results, policy_update)
+ summary_persist = self.persist_index_computations(results)
if summary_persist:
for value in summary_persist.values():
if value > 0:
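
For callers migrating to this API, the shim above is worth illustrating: a leftover `policy_update` keyword no longer changes behavior, it only emits a DeprecationWarning before being dropped. A minimal sketch, assuming a concrete indexer (MimetypeIndexer here) that can be instantiated with a valid configuration:

    import warnings

    from swh.indexer.mimetype import MimetypeIndexer

    indexer = MimetypeIndexer()  # assumes configuration is available (e.g. via envvar)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Deprecated keyword: accepted, warned about, then ignored by run().
        indexer.run(ids=[b"\x00" * 20], policy_update="update-dups")

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
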
diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -36,7 +36,6 @@
task_names["origin_metadata"],
"oneshot",
visit_urls,
- policy_update="update-dups",
retries_left=1,
)
)
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -109,7 +109,7 @@
]
def persist_index_computations(
- self, results: List[ContentMetadataRow], policy_update: str
+ self, results: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -118,13 +118,9 @@
following keys:
- id (bytes): content's identifier (sha1)
- metadata (jsonb): detected metadata
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
- return self.idx_storage.content_metadata_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_metadata_add(results)
DEFAULT_CONFIG: Dict[str, Any] = {
@@ -217,7 +213,7 @@
]
def persist_index_computations(
- self, results: List[RevisionIntrinsicMetadataRow], policy_update: str
+ self, results: List[RevisionIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -227,15 +223,11 @@
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in
# revision_intrinsic_metadata
- return self.idx_storage.revision_intrinsic_metadata_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.revision_intrinsic_metadata_add(results)
def translate_revision_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
@@ -291,9 +283,7 @@
# content indexing
try:
c_metadata_indexer.run(
- sha1s_filtered,
- policy_update="ignore-dups",
- log_suffix=log_suffix,
+ sha1s_filtered, log_suffix=log_suffix,
)
# on the fly possibility:
for result in c_metadata_indexer.results:
@@ -364,10 +354,7 @@
def persist_index_computations(
self,
results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
- policy_update: str,
) -> Dict[str, int]:
- conflict_update = policy_update == "update-dups"
-
# Deduplicate revisions
rev_metadata: List[RevisionIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
@@ -382,14 +369,10 @@
orig_metadata.append(orig_item)
if rev_metadata:
- summary_rev = self.idx_storage.revision_intrinsic_metadata_add(
- rev_metadata, conflict_update=conflict_update
- )
+ summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
summary.update(summary_rev)
if orig_metadata:
- summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
- orig_metadata, conflict_update=conflict_update
- )
+ summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
return summary
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -95,7 +95,7 @@
]
def persist_index_computations(
- self, results: List[ContentMimetypeRow], policy_update: str
+ self, results: List[ContentMimetypeRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -103,13 +103,8 @@
results: list of content's mimetype dicts
(see :meth:`.index`)
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
-
"""
- return self.idx_storage.content_mimetype_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_mimetype_add(results)
class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer[ContentMimetypeRow]):
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -25,9 +25,7 @@
USE_TOOLS = False
- def persist_index_computations(
- self, results: Any, policy_update: str
- ) -> Dict[str, int]:
+ def persist_index_computations(self, results: Any) -> Dict[str, int]:
"""Do nothing. The indexer's results are not persistent, they
should only be piped to another indexer."""
return {}
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(132, now(), 'Work In Progress');
+ values(133, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -41,7 +41,7 @@
comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information';
-- add tmp_content_mimetype entries to content_mimetype, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_mimetype_missing must take place before calling this
@@ -49,36 +49,28 @@
--
-- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype,
-- 2. call this function
-create or replace function swh_content_mimetype_add(conflict_update boolean)
+create or replace function swh_content_mimetype_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
- select id, mimetype, encoding, indexer_configuration_id
- from tmp_content_mimetype tcm
- on conflict(id, indexer_configuration_id)
- do update set mimetype = excluded.mimetype,
- encoding = excluded.encoding;
- else
- insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
- select id, mimetype, encoding, indexer_configuration_id
- from tmp_content_mimetype tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
+ select id, mimetype, encoding, indexer_configuration_id
+ from tmp_content_mimetype tcm
+ on conflict(id, indexer_configuration_id)
+ do update set mimetype = excluded.mimetype,
+ encoding = excluded.encoding;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes';
+comment on function swh_content_mimetype_add() IS 'Add new content mimetypes';
--- add tmp_content_language entries to content_language, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_language entries to content_language, overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_language_missing must take place before calling this
@@ -86,32 +78,25 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_content_language, 2. call this function
-create or replace function swh_content_language_add(conflict_update boolean)
+create or replace function swh_content_language_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
- from tmp_content_language tcl
- on conflict(id, indexer_configuration_id)
- do update set lang = excluded.lang;
- else
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
- from tmp_content_language tcl
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
+ from tmp_content_language tcl
+ on conflict(id, indexer_configuration_id)
+ do update set lang = excluded.lang;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_language_add(boolean) IS 'Add new content languages';
+comment on function swh_content_language_add() IS 'Add new content languages';
-- create a temporary table for retrieving content_language
create or replace function swh_mktemp_content_language()
@@ -139,36 +124,29 @@
comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags';
--- add tmp_content_ctags entries to content_ctags, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_ctags entries to content_ctags, skipping duplicates.
--
-- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags,
-- 2. call this function
-create or replace function swh_content_ctags_add(conflict_update boolean)
+create or replace function swh_content_ctags_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- delete from content_ctags
- where id in (select tmp.id
- from tmp_content_ctags tmp
- inner join indexer_configuration i on i.id=tmp.indexer_configuration_id);
- end if;
-
insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id)
select id, name, kind, line, lang, indexer_configuration_id
from tmp_content_ctags tct
on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id)
do nothing;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content';
+comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content';
create type content_ctags_signature as (
id sha1,
@@ -218,12 +196,12 @@
comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license';
--- add tmp_content_fossology_license entries to content_fossology_license, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_fossology_license entries to content_fossology_license,
+-- overwriting duplicates.
--
-- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to
-- tmp_content_fossology_license, 2. call this function
-create or replace function swh_content_fossology_license_add(conflict_update boolean)
+create or replace function swh_content_fossology_license_add()
returns bigint
language plpgsql
as $$
@@ -236,36 +214,25 @@
where not exists (select 1 from fossology_license where name=tmp.license)
on conflict(name) do nothing;
- if conflict_update then
- insert into content_fossology_license (id, license_id, indexer_configuration_id)
- select tcl.id,
- (select id from fossology_license where name = tcl.license) as license,
- indexer_configuration_id
- from tmp_content_fossology_license tcl
- on conflict(id, license_id, indexer_configuration_id)
- do update set license_id = excluded.license_id;
- else
- insert into content_fossology_license (id, license_id, indexer_configuration_id)
- select tcl.id,
- (select id from fossology_license where name = tcl.license) as license,
- indexer_configuration_id
- from tmp_content_fossology_license tcl
- on conflict(id, license_id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_fossology_license (id, license_id, indexer_configuration_id)
+ select tcl.id,
+ (select id from fossology_license where name = tcl.license) as license,
+ indexer_configuration_id
+ from tmp_content_fossology_license tcl
+ on conflict(id, license_id, indexer_configuration_id)
+ do update set license_id = excluded.license_id;
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';
+comment on function swh_content_fossology_license_add() IS 'Add new content licenses';
-- content_metadata functions
--- add tmp_content_metadata entries to content_metadata, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_metadata entries to content_metadata, overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_metadata_missing must take place before calling this
@@ -273,32 +240,25 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_content_metadata, 2. call this function
-create or replace function swh_content_metadata_add(conflict_update boolean)
+create or replace function swh_content_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_metadata (id, metadata, indexer_configuration_id)
- select id, metadata, indexer_configuration_id
- from tmp_content_metadata tcm
- on conflict(id, indexer_configuration_id)
- do update set metadata = excluded.metadata;
- else
- insert into content_metadata (id, metadata, indexer_configuration_id)
- select id, metadata, indexer_configuration_id
- from tmp_content_metadata tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_metadata (id, metadata, indexer_configuration_id)
+ select id, metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set metadata = excluded.metadata;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata';
+comment on function swh_content_metadata_add() IS 'Add new content metadata';
-- create a temporary table for retrieving content_metadata
create or replace function swh_mktemp_content_metadata()
@@ -315,8 +275,7 @@
-- end content_metadata functions
-- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
--- overwriting duplicates if conflict_update is true, skipping duplicates
--- otherwise.
+-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_revision_intrinsic_metadata_missing must take place before calling this
@@ -324,34 +283,27 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_revision_intrinsic_metadata, 2. call this function
-create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean)
+create or replace function swh_revision_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
- select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
- on conflict(id, indexer_configuration_id)
- do update set
- metadata = excluded.metadata,
- mappings = excluded.mappings;
- else
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
- select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_revision_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_revision_intrinsic_metadata_add(boolean) IS 'Add new revision intrinsic metadata';
+comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
-- create a temporary table for retrieving revision_intrinsic_metadata
create or replace function swh_mktemp_revision_intrinsic_metadata()
@@ -389,7 +341,7 @@
-- add tmp_indexer_configuration entries to indexer_configuration,
--- skipping duplicates if any.
+-- overwriting duplicates if any.
--
-- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to
-- it, 2. call this function to insert and filtering out duplicates
@@ -412,8 +364,7 @@
$$;
-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata,
--- overwriting duplicates if conflict_update is true, skipping duplicates
--- otherwise.
+-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_origin_intrinsic_metadata_missing must take place before calling this
@@ -421,8 +372,7 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_origin_intrinsic_metadata, 2. call this function
-create or replace function swh_origin_intrinsic_metadata_add(
- conflict_update boolean)
+create or replace function swh_origin_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
@@ -430,31 +380,24 @@
res bigint;
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
- if conflict_update then
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector, mappings
- from tmp_origin_intrinsic_metadata
- on conflict(id, indexer_configuration_id)
- do update set
- metadata = excluded.metadata,
- metadata_tsvector = excluded.metadata_tsvector,
- mappings = excluded.mappings,
- from_revision = excluded.from_revision;
- else
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector, mappings
- from tmp_origin_intrinsic_metadata
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata';
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
-- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata.
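
The comments in 50-func.sql spell out the bulk protocol each of these functions expects: 0. create the matching temporary table, 1. COPY rows into it, 2. call the add function, which now always upserts and returns the affected row count. A hedged sketch of that flow for mimetypes, driving PostgreSQL directly with psycopg2 (the DSN and CSV payload are illustrative; production code goes through swh.indexer.storage.db.Db below):

    import io

    import psycopg2  # illustrative; swh.indexer wraps this in its Db class

    conn = psycopg2.connect("dbname=softwareheritage-indexer")  # hypothetical DSN
    with conn, conn.cursor() as cur:
        # 0. temporary table mirroring content_mimetype
        cur.execute("select swh_mktemp_content_mimetype()")
        # 1. COPY rows in; column order matches the INSERT in swh_content_mimetype_add
        rows = io.StringIO("...")  # elided: one CSV line per mimetype row
        cur.copy_expert(
            "copy tmp_content_mimetype"
            " (id, mimetype, encoding, indexer_configuration_id) from stdin csv",
            rows,
        )
        # 2. argument-less upsert; returns the number of rows inserted/updated
        cur.execute("select swh_content_mimetype_add()")
        (count,) = cur.fetchone()
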
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -267,11 +267,7 @@
@process_metrics
@db_transaction()
def content_mimetype_add(
- self,
- mimetypes: List[ContentMimetypeRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, mimetypes: List[ContentMimetypeRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(mimetypes)
mimetypes.sort(key=lambda m: m.id)
@@ -282,7 +278,7 @@
["id", "mimetype", "encoding", "indexer_configuration_id"],
cur,
)
- count = db.content_mimetype_add_from_temp(conflict_update, cur)
+ count = db.content_mimetype_add_from_temp(cur)
return {"content_mimetype:add": count}
@timed
@@ -320,11 +316,7 @@
@process_metrics
@db_transaction()
def content_language_add(
- self,
- languages: List[ContentLanguageRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, languages: List[ContentLanguageRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(languages)
languages.sort(key=lambda m: m.id)
@@ -344,7 +336,7 @@
cur,
)
- count = db.content_language_add_from_temp(conflict_update, cur)
+ count = db.content_language_add_from_temp(cur)
return {"content_language:add": count}
@timed
@@ -370,11 +362,7 @@
@process_metrics
@db_transaction()
def content_ctags_add(
- self,
- ctags: List[ContentCtagsRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, ctags: List[ContentCtagsRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(ctags)
ctags.sort(key=lambda m: m.id)
@@ -387,7 +375,7 @@
cur=cur,
)
- count = db.content_ctags_add_from_temp(conflict_update, cur)
+ count = db.content_ctags_add_from_temp(cur)
return {"content_ctags:add": count}
@timed
@@ -425,11 +413,7 @@
@process_metrics
@db_transaction()
def content_fossology_license_add(
- self,
- licenses: List[ContentLicenseRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, licenses: List[ContentLicenseRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(licenses)
licenses.sort(key=lambda m: m.id)
@@ -440,7 +424,7 @@
columns=["id", "license", "indexer_configuration_id"],
cur=cur,
)
- count = db.content_fossology_license_add_from_temp(conflict_update, cur)
+ count = db.content_fossology_license_add_from_temp(cur)
return {"content_fossology_license:add": count}
@timed
@@ -490,11 +474,7 @@
@process_metrics
@db_transaction()
def content_metadata_add(
- self,
- metadata: List[ContentMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[ContentMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -507,7 +487,7 @@
["id", "metadata", "indexer_configuration_id"],
cur,
)
- count = db.content_metadata_add_from_temp(conflict_update, cur)
+ count = db.content_metadata_add_from_temp(cur)
return {
"content_metadata:add": count,
}
@@ -540,11 +520,7 @@
@process_metrics
@db_transaction()
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[RevisionIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -557,7 +533,7 @@
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
- count = db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ count = db.revision_intrinsic_metadata_add_from_temp(cur)
return {
"revision_intrinsic_metadata:add": count,
}
@@ -580,11 +556,7 @@
@process_metrics
@db_transaction()
def origin_intrinsic_metadata_add(
- self,
- metadata: List[OriginIntrinsicMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[OriginIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -597,7 +569,7 @@
["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
cur,
)
- count = db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ count = db.origin_intrinsic_metadata_add_from_temp(cur)
return {
"origin_intrinsic_metadata:add": count,
}
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -74,9 +74,9 @@
def mktemp_content_mimetype(self, cur=None):
pass
- def content_mimetype_add_from_temp(self, conflict_update, cur=None):
+ def content_mimetype_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_mimetype_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_mimetype_add()")
return cur.fetchone()[0]
def _convert_key(self, key, main_table="c"):
@@ -209,9 +209,9 @@
def mktemp_content_language(self, cur=None):
pass
- def content_language_add_from_temp(self, conflict_update, cur=None):
+ def content_language_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_language_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_language_add()")
return cur.fetchone()[0]
def content_language_get_from_list(self, ids, cur=None):
@@ -245,9 +245,9 @@
def mktemp_content_ctags(self, cur=None):
pass
- def content_ctags_add_from_temp(self, conflict_update, cur=None):
+ def content_ctags_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_ctags_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_ctags_add()")
return cur.fetchone()[0]
def content_ctags_get_from_list(self, ids, cur=None):
@@ -303,14 +303,12 @@
def mktemp_content_fossology_license(self, cur=None):
pass
- def content_fossology_license_add_from_temp(self, conflict_update, cur=None):
+ def content_fossology_license_add_from_temp(self, cur=None):
"""Add new licenses per content.
"""
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_content_fossology_license_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_content_fossology_license_add()")
return cur.fetchone()[0]
def content_fossology_license_get_from_list(self, ids, cur=None):
@@ -355,9 +353,9 @@
def mktemp_content_metadata(self, cur=None):
pass
- def content_metadata_add_from_temp(self, conflict_update, cur=None):
+ def content_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_metadata_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_metadata_add()")
return cur.fetchone()[0]
def content_metadata_get_from_list(self, ids, cur=None):
@@ -392,11 +390,9 @@
def mktemp_revision_intrinsic_metadata(self, cur=None):
pass
- def revision_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None):
+ def revision_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_revision_intrinsic_metadata_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_revision_intrinsic_metadata_add()")
return cur.fetchone()[0]
def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
@@ -429,11 +425,9 @@
def mktemp_origin_intrinsic_metadata(self, cur=None):
pass
- def origin_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None):
+ def origin_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_origin_intrinsic_metadata_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_origin_intrinsic_metadata_add()")
return cur.fetchone()[0]
def origin_intrinsic_metadata_get_from_list(self, ids, cur=None):
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -193,7 +193,7 @@
assert len(ids) <= limit
return PagedResult(results=ids, next_page_token=next_page_token)
- def add(self, data: Iterable[TValue], conflict_update: bool) -> int:
+ def add(self, data: Iterable[TValue]) -> int:
"""Add data not present in storage.
Args:
@@ -204,9 +204,6 @@
results
- arbitrary data
- conflict_update (bool): Flag to determine if we want to overwrite
- (true) or skip duplicates (false)
-
"""
data = list(data)
check_id_duplicates(data)
@@ -216,9 +213,6 @@
id_ = item.pop("id")
tool_id = item["indexer_configuration_id"]
key = _key_from_dict(obj.unique_key())
- if not conflict_update and key in self._data[id_]:
- # Duplicate, should not be updated
- continue
self._data[id_][key] = item
self._tools_per_id[id_].add(tool_id)
count += 1
@@ -265,9 +259,9 @@
)
def content_mimetype_add(
- self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False
+ self, mimetypes: List[ContentMimetypeRow]
) -> Dict[str, int]:
- added = self._mimetypes.add(mimetypes, conflict_update)
+ added = self._mimetypes.add(mimetypes)
return {"content_mimetype:add": added}
def content_mimetype_get(self, ids: Iterable[Sha1]) -> List[ContentMimetypeRow]:
@@ -282,9 +276,9 @@
return self._languages.get(ids)
def content_language_add(
- self, languages: List[ContentLanguageRow], conflict_update: bool = False
+ self, languages: List[ContentLanguageRow]
) -> Dict[str, int]:
- added = self._languages.add(languages, conflict_update)
+ added = self._languages.add(languages)
return {"content_language:add": added}
def content_ctags_missing(self, ctags: Iterable[Dict]) -> List[Tuple[Sha1, int]]:
@@ -293,10 +287,8 @@
def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]:
return self._content_ctags.get(ids)
- def content_ctags_add(
- self, ctags: List[ContentCtagsRow], conflict_update: bool = False
- ) -> Dict[str, int]:
- added = self._content_ctags.add(ctags, conflict_update,)
+ def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]:
+ added = self._content_ctags.add(ctags)
return {"content_ctags:add": added}
def content_ctags_search(
@@ -329,9 +321,9 @@
return self._licenses.get(ids)
def content_fossology_license_add(
- self, licenses: List[ContentLicenseRow], conflict_update: bool = False
+ self, licenses: List[ContentLicenseRow]
) -> Dict[str, int]:
- added = self._licenses.add(licenses, conflict_update)
+ added = self._licenses.add(licenses)
return {"content_fossology_license:add": added}
def content_fossology_license_get_partition(
@@ -355,9 +347,9 @@
return self._content_metadata.get(ids)
def content_metadata_add(
- self, metadata: List[ContentMetadataRow], conflict_update: bool = False
+ self, metadata: List[ContentMetadataRow]
) -> Dict[str, int]:
- added = self._content_metadata.add(metadata, conflict_update)
+ added = self._content_metadata.add(metadata)
return {"content_metadata:add": added}
def revision_intrinsic_metadata_missing(
@@ -371,11 +363,9 @@
return self._revision_intrinsic_metadata.get(ids)
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
+ self, metadata: List[RevisionIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._revision_intrinsic_metadata.add(metadata, conflict_update)
+ added = self._revision_intrinsic_metadata.add(metadata)
return {"revision_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_get(
@@ -384,9 +374,9 @@
return self._origin_intrinsic_metadata.get(urls)
def origin_intrinsic_metadata_add(
- self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: bool = False
+ self, metadata: List[OriginIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._origin_intrinsic_metadata.add(metadata, conflict_update)
+ added = self._origin_intrinsic_metadata.add(metadata)
return {"origin_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_search_fulltext(
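
With `conflict_update` gone, every `*_add` endpoint upserts unconditionally; the interface and test changes below codify that. A minimal sketch against the in-memory backend just modified (the model import path and the no-argument constructor are assumptions, not shown in this diff):

    from swh.indexer.storage.in_memory import IndexerStorage
    from swh.indexer.storage.model import ContentMimetypeRow  # assumed module path

    storage = IndexerStorage()  # assumed no-argument constructor
    sha1 = b"\x01" * 20

    v1 = ContentMimetypeRow(
        id=sha1, mimetype="text/plain", encoding="utf-8",
        indexer_configuration_id=1,  # assumes this tool id is registered
    )
    storage.content_mimetype_add([v1])

    # Same (id, indexer_configuration_id) key, new payload: this used to need
    # conflict_update=True; now the second call simply overwrites the first.
    v2 = ContentMimetypeRow(
        id=sha1, mimetype="text/x-python", encoding="us-ascii",
        indexer_configuration_id=1,
    )
    storage.content_mimetype_add([v2])

    assert storage.content_mimetype_get([sha1])[0].mimetype == "text/x-python"
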
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -84,14 +84,11 @@
@remote_api_endpoint("content_mimetype/add")
def content_mimetype_add(
- self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False
+ self, mimetypes: List[ContentMimetypeRow]
) -> Dict[str, int]:
"""Add mimetypes not present in storage.
Args:
mimetypes: mimetype rows to be added, with their `tool` attribute set to
None.
- conflict_update: Flag to determine if we want to
- overwrite (``True``) or skip duplicates (``False``, the
- default)
@@ -148,17 +147,13 @@
@remote_api_endpoint("content_language/add")
def content_language_add(
- self, languages: List[ContentLanguageRow], conflict_update: bool = False
+ self, languages: List[ContentLanguageRow]
) -> Dict[str, int]:
"""Add languages not present in storage.
Args:
languages: language row objects
- conflict_update (bool): Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the
- default)
-
Returns:
Dict summary of number of rows added
@@ -198,9 +193,7 @@
...
@remote_api_endpoint("content/ctags/add")
- def content_ctags_add(
- self, ctags: List[ContentCtagsRow], conflict_update: bool = False
- ) -> Dict[str, int]:
+ def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]:
"""Add ctags not present in storage
Args:
@@ -251,7 +244,7 @@
@remote_api_endpoint("content/fossology_license/add")
def content_fossology_license_add(
- self, licenses: List[ContentLicenseRow], conflict_update: bool = False
+ self, licenses: List[ContentLicenseRow]
) -> Dict[str, int]:
"""Add licenses not present in storage.
@@ -259,9 +252,6 @@
license: license rows to be added, with their `tool` attribute set to
None.
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -335,7 +325,7 @@
@remote_api_endpoint("content_metadata/add")
def content_metadata_add(
- self, metadata: List[ContentMetadataRow], conflict_update: bool = False
+ self, metadata: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Add metadata not present in storage.
@@ -345,9 +335,6 @@
- **id**: sha1
- **metadata**: arbitrary dict
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -390,18 +377,13 @@
@remote_api_endpoint("revision_intrinsic_metadata/add")
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
+ self, metadata: List[RevisionIntrinsicMetadataRow],
) -> Dict[str, int]:
"""Add metadata not present in storage.
Args:
metadata: ContentMetadataRow objects
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -423,16 +405,13 @@
@remote_api_endpoint("origin_intrinsic_metadata/add")
def origin_intrinsic_metadata_add(
- self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: bool = False
+ self, metadata: List[OriginIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Add origin metadata not present in storage.
Args:
metadata: list of OriginIntrinsicMetadataRow objects
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -137,45 +137,6 @@
actual_missing = endpoint(storage, etype, "missing")(query)
assert list(actual_missing) == [data.sha1_1]
- def test_add__drop_duplicate(
- self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
- ) -> None:
- storage, data = swh_indexer_storage_with_data
- etype = self.endpoint_type
- tool_id = data.tools[self.tool_name]["id"]
-
- # add the first object
- data_v1 = {
- "id": data.sha1_2,
- **self.example_data[0],
- "indexer_configuration_id": tool_id,
- }
- summary = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v1)])
- assert summary == expected_summary(1, etype)
-
- # should be able to retrieve it
- actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
- expected_data_v1 = [
- self.row_class.from_dict(
- {
- "id": data.sha1_2,
- **self.example_data[0],
- "tool": data.tools[self.tool_name],
- }
- )
- ]
- assert actual_data == expected_data_v1
-
- # now if we add a modified version of the same object (same id)
- data_v2 = data_v1.copy()
- data_v2.update(self.example_data[1])
- summary2 = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
- assert summary2 == expected_summary(0, etype) # not added
-
- # we expect to retrieve the original data, not the modified one
- actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
- assert actual_data == expected_data_v1
-
def test_add__update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
@@ -209,9 +170,7 @@
data_v2 = data_v1.copy()
data_v2.update(self.example_data[1])
- endpoint(storage, etype, "add")(
- [self.row_class.from_dict(data_v2)], conflict_update=True
- )
+ endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
assert summary == expected_summary(1, etype) # modified so counted
actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
@@ -225,7 +184,7 @@
# data did change as the v2 was used to overwrite v1
assert actual_data == expected_data_v2
- def test_add__update_in_place_deadlock(
+ def test_add_deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
@@ -267,7 +226,9 @@
endpoint(storage, etype, "add")(data_v1)
# when
- actual_data = list(endpoint(storage, etype, "get")(hashes))
+ actual_data = sorted(
+ endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
+ )
expected_data_v1 = [
self.row_class.from_dict(
@@ -281,10 +242,10 @@
# given
def f1() -> None:
- endpoint(storage, etype, "add")(data_v2a, conflict_update=True)
+ endpoint(storage, etype, "add")(data_v2a)
def f2() -> None:
- endpoint(storage, etype, "add")(data_v2b, conflict_update=True)
+ endpoint(storage, etype, "add")(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
@@ -295,15 +256,21 @@
t2.join()
actual_data = sorted(
- (row.to_dict() for row in endpoint(storage, etype, "get")(hashes)),
- key=lambda x: x["id"],
+ endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
)
expected_data_v2 = [
- {"id": hash_, **self.example_data[1], "tool": tool} for hash_ in hashes
+ self.row_class.from_dict(
+ {"id": hash_, **self.example_data[1], "tool": tool}
+ )
+ for hash_ in hashes
]
- assert actual_data == expected_data_v2
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
def test_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
@@ -333,9 +300,7 @@
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
- endpoint(storage, etype, "add")(
- [data_rev2, data_rev2], conflict_update=True
- )
+ endpoint(storage, etype, "add")([data_rev2, data_rev2])
# then
actual_data = list(
@@ -545,16 +510,12 @@
row_class = ContentCtagsRow
# the following tests are disabled because CTAGS behaves differently
- @pytest.mark.skip
- def test_add__drop_duplicate(self):
- pass
-
@pytest.mark.skip
def test_add__update_in_place_duplicate(self):
pass
@pytest.mark.skip
- def test_add__update_in_place_deadlock(self):
+ def test_add_deadlock(self):
pass
@pytest.mark.skip
@@ -736,7 +697,7 @@
)
ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool)
- storage.content_ctags_add([ctag1, ctag2], conflict_update=True)
+ storage.content_ctags_add([ctag1, ctag2])
actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
@@ -889,7 +850,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -925,7 +886,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -951,7 +912,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -993,7 +954,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -1093,71 +1054,6 @@
assert actual_metadata == expected_metadata
- def test_origin_intrinsic_metadata_add_drop_duplicate(
- self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
- ) -> None:
- storage, data = swh_indexer_storage_with_data
- # given
- tool_id = data.tools["swh-metadata-detector"]["id"]
-
- metadata_v1: Dict[str, Any] = {
- "version": None,
- "name": None,
- }
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
- metadata=metadata_v1.copy(),
- mappings=[],
- indexer_configuration_id=tool_id,
- )
- metadata_origin_v1 = OriginIntrinsicMetadataRow(
- id=data.origin_url_1,
- metadata=metadata_v1.copy(),
- indexer_configuration_id=tool_id,
- mappings=[],
- from_revision=data.revision_id_1,
- )
-
- # given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
- storage.origin_intrinsic_metadata_add([metadata_origin_v1])
-
- # when
- actual_metadata = list(
- storage.origin_intrinsic_metadata_get([data.origin_url_1, "no://where"])
- )
-
- expected_metadata_v1 = [
- OriginIntrinsicMetadataRow(
- id=data.origin_url_1,
- metadata=metadata_v1,
- tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_1,
- mappings=[],
- )
- ]
-
- assert actual_metadata == expected_metadata_v1
-
- # given
- metadata_v2 = metadata_v1.copy()
- metadata_v2.update(
- {"name": "test_metadata", "author": "MG",}
- )
- metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
- metadata_origin_v2 = attr.evolve(metadata_origin_v1, metadata=metadata_v2)
-
- storage.revision_intrinsic_metadata_add([metadata_rev_v2])
- storage.origin_intrinsic_metadata_add([metadata_origin_v2])
-
- # then
- actual_metadata = list(
- storage.origin_intrinsic_metadata_get([data.origin_url_1])
- )
-
- # metadata did not change as the v2 was dropped.
- assert actual_metadata == expected_metadata_v1
-
def test_origin_intrinsic_metadata_add_update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
@@ -1218,10 +1114,8 @@
from_revision=data.revision_id_1,
)
- storage.revision_intrinsic_metadata_add([metadata_rev_v2], conflict_update=True)
- storage.origin_intrinsic_metadata_add(
- [metadata_origin_v2], conflict_update=True
- )
+ storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+ storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
storage.origin_intrinsic_metadata_get([data.origin_url_1])
@@ -1240,7 +1134,7 @@
# metadata did change as the v2 was used to overwrite v1
assert actual_metadata == expected_metadata_v2
- def test_origin_intrinsic_metadata_add__update_in_place_deadlock(
+ def test_origin_intrinsic_metadata_add__deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
@@ -1312,10 +1206,10 @@
# given
def f1() -> None:
- storage.origin_intrinsic_metadata_add(data_v2a, conflict_update=True)
+ storage.origin_intrinsic_metadata_add(data_v2a)
def f2() -> None:
- storage.origin_intrinsic_metadata_add(data_v2b, conflict_update=True)
+ storage.origin_intrinsic_metadata_add(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
@@ -1337,8 +1231,11 @@
for id_ in ids
]
- assert len(actual_data) == len(expected_data_v2)
- assert sorted(actual_data, key=lambda x: x.id) == expected_data_v2
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
def test_origin_intrinsic_metadata_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -74,7 +74,7 @@
def _assert_tasks_for_origins(tasks, origins):
- expected_kwargs = {"policy_update": "update-dups"}
+ expected_kwargs = {}
assert {task["type"] for task in tasks} == {"index-origin-metadata"}
assert all(len(task["arguments"]["args"]) == 1 for task in tasks)
for task in tasks:
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -31,7 +31,7 @@
) -> List[Dict[str, Any]]:
raise _TestException()
- def persist_index_computations(self, results, policy_update) -> Dict[str, int]:
+ def persist_index_computations(self, results) -> Dict[str, int]:
return {}
def indexed_contents_in_partition(
@@ -61,12 +61,12 @@
indexer.objstorage = Mock()
indexer.objstorage.get.return_value = b"content"
- assert indexer.run([b"foo"], policy_update=True) == {"status": "failed"}
+ assert indexer.run([b"foo"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run([b"foo"], policy_update=True)
+ indexer.run([b"foo"])
def test_revision_indexer_catch_exceptions():
@@ -74,25 +74,23 @@
indexer.storage = Mock()
indexer.storage.revision_get.return_value = ["rev"]
- assert indexer.run([b"foo"], policy_update=True) == {"status": "failed"}
+ assert indexer.run([b"foo"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run([b"foo"], policy_update=True)
+ indexer.run([b"foo"])
def test_origin_indexer_catch_exceptions():
indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG)
- assert indexer.run(["http://example.org"], policy_update=True) == {
- "status": "failed"
- }
+ assert indexer.run(["http://example.org"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run(["http://example.org"], policy_update=True)
+ indexer.run(["http://example.org"])
def test_content_partition_indexer_catch_exceptions():
@@ -100,9 +98,9 @@
config={**BASE_TEST_CONFIG, "write_batch_size": 42}
)
- assert indexer.run(0, 42, policy_update=True) == {"status": "failed"}
+ assert indexer.run(0, 42) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run(0, 42, policy_update=True)
+ indexer.run(0, 42)
diff --git a/swh/indexer/tests/test_journal_client.py b/swh/indexer/tests/test_journal_client.py
--- a/swh/indexer/tests/test_journal_client.py
+++ b/swh/indexer/tests/test_journal_client.py
@@ -30,10 +30,7 @@
(
[
{
- "arguments": {
- "kwargs": {"policy_update": "update-dups"},
- "args": (["file:///dev/zero"],),
- },
+ "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),},
"policy": "oneshot",
"type": "task-name",
"retries_left": 1,
@@ -64,10 +61,7 @@
(
[
{
- "arguments": {
- "kwargs": {"policy_update": "update-dups"},
- "args": (["file:///dev/zero"],),
- },
+ "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),},
"policy": "oneshot",
"type": "task-name",
"retries_left": 1,
@@ -100,7 +94,7 @@
[
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///dev/zero", "file:///tmp/foobar"],),
},
"policy": "oneshot",
@@ -138,7 +132,7 @@
[
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///dev/zero", "file:///tmp/foobar"],),
},
"policy": "oneshot",
@@ -147,7 +141,7 @@
},
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///tmp/spamegg"],),
},
"policy": "oneshot",
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -140,7 +140,7 @@
fill_storage(metadata_indexer.storage)
# when
- metadata_indexer.run(sha1s, policy_update="ignore-dups")
+ metadata_indexer.run(sha1s)
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
@@ -1122,7 +1122,7 @@
]
)
- metadata_indexer.run([rev.id], "update-dups")
+ metadata_indexer.run([rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
@@ -1183,7 +1183,7 @@
]
)
- metadata_indexer.run([new_rev.id], "update-dups")
+ metadata_indexer.run([new_rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -46,7 +46,7 @@
indexing tests.
"""
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(self, results):
self.results = results
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -262,7 +262,6 @@
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
- self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
@@ -606,12 +605,12 @@
sha1s = [self.id0, self.id1, self.id2]
# when
- self.indexer.run(sha1s, policy_update="update-dups")
+ self.indexer.run(sha1s)
self.assert_results_ok(sha1s)
# 2nd pass
- self.indexer.run(sha1s, policy_update="ignore-dups")
+ self.indexer.run(sha1s)
self.assert_results_ok(sha1s)
@@ -624,7 +623,7 @@
] # unknown
# when
- self.indexer.run(sha1s, policy_update="update-dups")
+ self.indexer.run(sha1s)
# then
expected_results = [
