D4397.id15580.diff

diff --git a/sql/upgrades/133.sql b/sql/upgrades/133.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/133.sql
@@ -0,0 +1,177 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 132
+-- to_version: 133
+-- description: remove 'conflict_update' argument
+
+insert into dbversion(version, release, description)
+ values(133, now(), 'Work In Progress');
+
+drop function swh_content_mimetype_add(conflict_update boolean);
+create or replace function swh_content_mimetype_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
+ select id, mimetype, encoding, indexer_configuration_id
+ from tmp_content_mimetype tcm
+ on conflict(id, indexer_configuration_id)
+ do update set mimetype = excluded.mimetype,
+ encoding = excluded.encoding;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_mimetype_add() IS 'Add new content mimetypes';
+
+
+
+drop function swh_content_language_add(conflict_update boolean);
+create or replace function swh_content_language_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
+ from tmp_content_language tcl
+ on conflict(id, indexer_configuration_id)
+ do update set lang = excluded.lang;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_language_add() IS 'Add new content languages';
+
+
+
+drop function swh_content_ctags_add(conflict_update boolean);
+create or replace function swh_content_ctags_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id)
+ select id, name, kind, line, lang, indexer_configuration_id
+ from tmp_content_ctags tct
+ on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id)
+ do nothing;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content';
+
+
+
+drop function swh_content_fossology_license_add(conflict_update boolean);
+create or replace function swh_content_fossology_license_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ -- insert unknown licenses first
+ insert into fossology_license (name)
+ select distinct license from tmp_content_fossology_license tmp
+ where not exists (select 1 from fossology_license where name=tmp.license)
+ on conflict(name) do nothing;
+
+ insert into content_fossology_license (id, license_id, indexer_configuration_id)
+ select tcl.id,
+ (select id from fossology_license where name = tcl.license) as license,
+ indexer_configuration_id
+ from tmp_content_fossology_license tcl
+ on conflict(id, license_id, indexer_configuration_id)
+ do update set license_id = excluded.license_id;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_fossology_license_add() IS 'Add new content licenses';
+
+
+
+drop function swh_content_metadata_add(conflict_update boolean);
+create or replace function swh_content_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_metadata (id, metadata, indexer_configuration_id)
+ select id, metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set metadata = excluded.metadata;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_metadata_add() IS 'Add new content metadata';
+
+
+
+drop function swh_revision_intrinsic_metadata_add(conflict_update boolean);
+create or replace function swh_revision_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_revision_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
+
+
+
+drop function swh_origin_intrinsic_metadata_add(conflict_update boolean);
+create or replace function swh_origin_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_intrinsic_metadata_compute_tsvector();
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -205,7 +205,7 @@
origins = list_origins_by_producer(idx_storage, mappings, tool_ids)
- kwargs = {"policy_update": "update-dups", "retries_left": 1}
+ kwargs = {"retries_left": 1}
schedule_origin_batches(scheduler, task_type, origins, origin_batch_size, kwargs)
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -136,16 +136,12 @@
return ctags
def persist_index_computations(
- self, results: List[ContentCtagsRow], policy_update: str
+ self, results: List[ContentCtagsRow]
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of ctags returned by index()
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
- return self.idx_storage.content_ctags_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_ctags_add(results)
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -114,7 +114,7 @@
]
def persist_index_computations(
- self, results: List[ContentLicenseRow], policy_update: str
+ self, results: List[ContentLicenseRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -126,13 +126,8 @@
- license (bytes): license in bytes
- path (bytes): path
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
-
"""
- return self.idx_storage.content_fossology_license_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_fossology_license_add(results)
class FossologyLicenseIndexer(
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -10,6 +10,7 @@
import shutil
import tempfile
from typing import Any, Dict, Generic, Iterator, List, Optional, Set, TypeVar, Union
+import warnings
from swh.core import utils
from swh.core.config import load_from_envvar, merge_configs
@@ -250,17 +251,13 @@
yield from ids
@abc.abstractmethod
- def persist_index_computations(
- self, results: List[TResult], policy_update: str
- ) -> Dict[str, int]:
+ def persist_index_computations(self, results: List[TResult]) -> Dict[str, int]:
"""Persist the computation resulting from the index.
Args:
results: List of results. One result is the
result of the index function.
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
Returns:
a summary dict of what has been inserted in the storage
@@ -281,24 +278,28 @@
"""
- def run(self, ids: List[Sha1], policy_update: str, **kwargs) -> Dict:
+ def run(self, ids: List[Sha1], **kwargs) -> Dict:
"""Given a list of ids:
- retrieve the content from the storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
ids (Iterable[Union[bytes, str]]): sha1's identifier list
- policy_update (str): either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore
- them
**kwargs: passed to the `index` method
Returns:
A summary Dict of the task's status
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
+
sha1s = [
hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
]
@@ -318,7 +319,7 @@
if res: # If no results, skip it
results.extend(res)
summary["status"] = "eventful"
- summary = self.persist_index_computations(results, policy_update)
+ summary = self.persist_index_computations(results)
self.results = results
except Exception:
if not self.catch_exceptions:
@@ -478,9 +479,7 @@
count_object_added_key: Optional[str] = None
for contents in utils.grouper(gen, n=self.config["write_batch_size"]):
- res = self.persist_index_computations(
- list(contents), policy_update="update-dups"
- )
+ res = self.persist_index_computations(list(contents))
if not count_object_added_key:
count_object_added_key = list(res.keys())[0]
count += res[count_object_added_key]
@@ -508,22 +507,24 @@
"""
- def run(
- self, origin_urls: List[str], policy_update: str = "update-dups", **kwargs
- ) -> Dict:
+ def run(self, origin_urls: List[str], **kwargs) -> Dict:
"""Given a list of origin urls:
- retrieve origins from storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
origin_urls: list of origin urls.
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates (default) or ignore them
**kwargs: passed to the `index` method
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
summary: Dict[str, Any] = {"status": "uneventful"}
try:
results = self.index_list(origin_urls, **kwargs)
@@ -533,7 +534,7 @@
summary["status"] = "failed"
return summary
- summary_persist = self.persist_index_computations(results, policy_update)
+ summary_persist = self.persist_index_computations(results)
self.results = results
if summary_persist:
for value in summary_persist.values():
@@ -564,19 +565,23 @@
"""
- def run(self, ids: List[Sha1Git], policy_update: str) -> Dict:
+ def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
"""Given a list of sha1_gits:
- retrieve revisions from storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
ids: sha1_git's identifier list
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
summary: Dict[str, Any] = {"status": "uneventful"}
results = []
@@ -599,7 +604,7 @@
summary["status"] = "failed"
return summary
- summary_persist = self.persist_index_computations(results, policy_update)
+ summary_persist = self.persist_index_computations(results)
if summary_persist:
for value in summary_persist.values():
if value > 0:
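
For callers migrating to this API, the shim above is worth illustrating: a leftover `policy_update` keyword no longer changes behavior, it only emits a DeprecationWarning before being dropped. A minimal sketch, assuming a concrete indexer (MimetypeIndexer here) that can be instantiated with a valid configuration:

    import warnings

    from swh.indexer.mimetype import MimetypeIndexer

    indexer = MimetypeIndexer()  # assumes configuration is available (e.g. via envvar)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Deprecated keyword: accepted, warned about, then ignored by run().
        indexer.run(ids=[b"\x00" * 20], policy_update="update-dups")

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
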
diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -36,7 +36,6 @@
task_names["origin_metadata"],
"oneshot",
visit_urls,
- policy_update="update-dups",
retries_left=1,
)
)
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -109,7 +109,7 @@
]
def persist_index_computations(
- self, results: List[ContentMetadataRow], policy_update: str
+ self, results: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -118,13 +118,9 @@
following keys:
- id (bytes): content's identifier (sha1)
- metadata (jsonb): detected metadata
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
- return self.idx_storage.content_metadata_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_metadata_add(results)
DEFAULT_CONFIG: Dict[str, Any] = {
@@ -217,7 +213,7 @@
]
def persist_index_computations(
- self, results: List[RevisionIntrinsicMetadataRow], policy_update: str
+ self, results: List[RevisionIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -227,15 +223,11 @@
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in
# revision_intrinsic_metadata
- return self.idx_storage.revision_intrinsic_metadata_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.revision_intrinsic_metadata_add(results)
def translate_revision_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
@@ -291,9 +283,7 @@
# content indexing
try:
c_metadata_indexer.run(
- sha1s_filtered,
- policy_update="ignore-dups",
- log_suffix=log_suffix,
+ sha1s_filtered, log_suffix=log_suffix,
)
# on the fly possibility:
for result in c_metadata_indexer.results:
@@ -364,10 +354,7 @@
def persist_index_computations(
self,
results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
- policy_update: str,
) -> Dict[str, int]:
- conflict_update = policy_update == "update-dups"
-
# Deduplicate revisions
rev_metadata: List[RevisionIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
@@ -382,14 +369,10 @@
orig_metadata.append(orig_item)
if rev_metadata:
- summary_rev = self.idx_storage.revision_intrinsic_metadata_add(
- rev_metadata, conflict_update=conflict_update
- )
+ summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
summary.update(summary_rev)
if orig_metadata:
- summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
- orig_metadata, conflict_update=conflict_update
- )
+ summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
return summary
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -95,7 +95,7 @@
]
def persist_index_computations(
- self, results: List[ContentMimetypeRow], policy_update: str
+ self, results: List[ContentMimetypeRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -103,13 +103,8 @@
results: list of content's mimetype dicts
(see :meth:`.index`)
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
-
"""
- return self.idx_storage.content_mimetype_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_mimetype_add(results)
class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer[ContentMimetypeRow]):
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -25,9 +25,7 @@
USE_TOOLS = False
- def persist_index_computations(
- self, results: Any, policy_update: str
- ) -> Dict[str, int]:
+ def persist_index_computations(self, results: Any) -> Dict[str, int]:
"""Do nothing. The indexer's results are not persistent, they
should only be piped to another indexer."""
return {}
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(132, now(), 'Work In Progress');
+ values(133, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -41,7 +41,7 @@
comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information';
-- add tmp_content_mimetype entries to content_mimetype, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_mimetype_missing must take place before calling this
@@ -49,36 +49,28 @@
--
-- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype,
-- 2. call this function
-create or replace function swh_content_mimetype_add(conflict_update boolean)
+create or replace function swh_content_mimetype_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
- select id, mimetype, encoding, indexer_configuration_id
- from tmp_content_mimetype tcm
- on conflict(id, indexer_configuration_id)
- do update set mimetype = excluded.mimetype,
- encoding = excluded.encoding;
- else
- insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
- select id, mimetype, encoding, indexer_configuration_id
- from tmp_content_mimetype tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
+ select id, mimetype, encoding, indexer_configuration_id
+ from tmp_content_mimetype tcm
+ on conflict(id, indexer_configuration_id)
+ do update set mimetype = excluded.mimetype,
+ encoding = excluded.encoding;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes';
+comment on function swh_content_mimetype_add() IS 'Add new content mimetypes';
--- add tmp_content_language entries to content_language, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_language entries to content_language, overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_language_missing must take place before calling this
@@ -86,32 +78,25 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_content_language, 2. call this function
-create or replace function swh_content_language_add(conflict_update boolean)
+create or replace function swh_content_language_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
- from tmp_content_language tcl
- on conflict(id, indexer_configuration_id)
- do update set lang = excluded.lang;
- else
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
- from tmp_content_language tcl
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
+ from tmp_content_language tcl
+ on conflict(id, indexer_configuration_id)
+ do update set lang = excluded.lang;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_language_add(boolean) IS 'Add new content languages';
+comment on function swh_content_language_add() IS 'Add new content languages';
-- create a temporary table for retrieving content_language
create or replace function swh_mktemp_content_language()
@@ -139,36 +124,29 @@
comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags';
--- add tmp_content_ctags entries to content_ctags, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_ctags entries to content_ctags, skipping duplicates.
--
-- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags,
-- 2. call this function
-create or replace function swh_content_ctags_add(conflict_update boolean)
+create or replace function swh_content_ctags_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- delete from content_ctags
- where id in (select tmp.id
- from tmp_content_ctags tmp
- inner join indexer_configuration i on i.id=tmp.indexer_configuration_id);
- end if;
-
insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id)
select id, name, kind, line, lang, indexer_configuration_id
from tmp_content_ctags tct
on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id)
do nothing;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content';
+comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content';
create type content_ctags_signature as (
id sha1,
@@ -218,12 +196,12 @@
comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license';
--- add tmp_content_fossology_license entries to content_fossology_license, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_fossology_license entries to content_fossology_license,
+-- overwriting duplicates.
--
-- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to
-- tmp_content_fossology_license, 2. call this function
-create or replace function swh_content_fossology_license_add(conflict_update boolean)
+create or replace function swh_content_fossology_license_add()
returns bigint
language plpgsql
as $$
@@ -236,36 +214,25 @@
where not exists (select 1 from fossology_license where name=tmp.license)
on conflict(name) do nothing;
- if conflict_update then
- insert into content_fossology_license (id, license_id, indexer_configuration_id)
- select tcl.id,
- (select id from fossology_license where name = tcl.license) as license,
- indexer_configuration_id
- from tmp_content_fossology_license tcl
- on conflict(id, license_id, indexer_configuration_id)
- do update set license_id = excluded.license_id;
- else
- insert into content_fossology_license (id, license_id, indexer_configuration_id)
- select tcl.id,
- (select id from fossology_license where name = tcl.license) as license,
- indexer_configuration_id
- from tmp_content_fossology_license tcl
- on conflict(id, license_id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_fossology_license (id, license_id, indexer_configuration_id)
+ select tcl.id,
+ (select id from fossology_license where name = tcl.license) as license,
+ indexer_configuration_id
+ from tmp_content_fossology_license tcl
+ on conflict(id, license_id, indexer_configuration_id)
+ do update set license_id = excluded.license_id;
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';
+comment on function swh_content_fossology_license_add() IS 'Add new content licenses';
-- content_metadata functions
--- add tmp_content_metadata entries to content_metadata, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_metadata entries to content_metadata, overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_metadata_missing must take place before calling this
@@ -273,32 +240,25 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_content_metadata, 2. call this function
-create or replace function swh_content_metadata_add(conflict_update boolean)
+create or replace function swh_content_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_metadata (id, metadata, indexer_configuration_id)
- select id, metadata, indexer_configuration_id
- from tmp_content_metadata tcm
- on conflict(id, indexer_configuration_id)
- do update set metadata = excluded.metadata;
- else
- insert into content_metadata (id, metadata, indexer_configuration_id)
- select id, metadata, indexer_configuration_id
- from tmp_content_metadata tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_metadata (id, metadata, indexer_configuration_id)
+ select id, metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set metadata = excluded.metadata;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata';
+comment on function swh_content_metadata_add() IS 'Add new content metadata';
-- create a temporary table for retrieving content_metadata
create or replace function swh_mktemp_content_metadata()
@@ -315,8 +275,7 @@
-- end content_metadata functions
-- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
--- overwriting duplicates if conflict_update is true, skipping duplicates
--- otherwise.
+-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_revision_intrinsic_metadata_missing must take place before calling this
@@ -324,34 +283,27 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_revision_intrinsic_metadata, 2. call this function
-create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean)
+create or replace function swh_revision_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
- select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
- on conflict(id, indexer_configuration_id)
- do update set
- metadata = excluded.metadata,
- mappings = excluded.mappings;
- else
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
- select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_revision_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_revision_intrinsic_metadata_add(boolean) IS 'Add new revision intrinsic metadata';
+comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
-- create a temporary table for retrieving revision_intrinsic_metadata
create or replace function swh_mktemp_revision_intrinsic_metadata()
@@ -389,7 +341,7 @@
-- add tmp_indexer_configuration entries to indexer_configuration,
--- skipping duplicates if any.
+-- overwriting duplicates if any.
--
-- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to
-- it, 2. call this function to insert and filtering out duplicates
@@ -412,8 +364,7 @@
$$;
-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata,
--- overwriting duplicates if conflict_update is true, skipping duplicates
--- otherwise.
+-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_origin_intrinsic_metadata_missing must take place before calling this
@@ -421,8 +372,7 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_origin_intrinsic_metadata, 2. call this function
-create or replace function swh_origin_intrinsic_metadata_add(
- conflict_update boolean)
+create or replace function swh_origin_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
@@ -430,31 +380,24 @@
res bigint;
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
- if conflict_update then
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector, mappings
- from tmp_origin_intrinsic_metadata
- on conflict(id, indexer_configuration_id)
- do update set
- metadata = excluded.metadata,
- metadata_tsvector = excluded.metadata_tsvector,
- mappings = excluded.mappings,
- from_revision = excluded.from_revision;
- else
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector, mappings
- from tmp_origin_intrinsic_metadata
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata';
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
-- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata.
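
The comments in 50-func.sql spell out the bulk protocol each of these functions expects: 0. create the matching temporary table, 1. COPY rows into it, 2. call the add function, which now always upserts and returns the affected row count. A hedged sketch of that flow for mimetypes, driving PostgreSQL directly with psycopg2 (the DSN and CSV payload are illustrative; production code goes through swh.indexer.storage.db.Db below):

    import io

    import psycopg2  # illustrative; swh.indexer wraps this in its Db class

    conn = psycopg2.connect("dbname=softwareheritage-indexer")  # hypothetical DSN
    with conn, conn.cursor() as cur:
        # 0. temporary table mirroring content_mimetype
        cur.execute("select swh_mktemp_content_mimetype()")
        # 1. COPY rows in; column order matches the INSERT in swh_content_mimetype_add
        rows = io.StringIO("...")  # elided: one CSV line per mimetype row
        cur.copy_expert(
            "copy tmp_content_mimetype"
            " (id, mimetype, encoding, indexer_configuration_id) from stdin csv",
            rows,
        )
        # 2. argument-less upsert; returns the number of rows inserted/updated
        cur.execute("select swh_content_mimetype_add()")
        (count,) = cur.fetchone()
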
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -267,11 +267,7 @@
@process_metrics
@db_transaction()
def content_mimetype_add(
- self,
- mimetypes: List[ContentMimetypeRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, mimetypes: List[ContentMimetypeRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(mimetypes)
mimetypes.sort(key=lambda m: m.id)
@@ -282,7 +278,7 @@
["id", "mimetype", "encoding", "indexer_configuration_id"],
cur,
)
- count = db.content_mimetype_add_from_temp(conflict_update, cur)
+ count = db.content_mimetype_add_from_temp(cur)
return {"content_mimetype:add": count}
@timed
@@ -320,11 +316,7 @@
@process_metrics
@db_transaction()
def content_language_add(
- self,
- languages: List[ContentLanguageRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, languages: List[ContentLanguageRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(languages)
languages.sort(key=lambda m: m.id)
@@ -344,7 +336,7 @@
cur,
)
- count = db.content_language_add_from_temp(conflict_update, cur)
+ count = db.content_language_add_from_temp(cur)
return {"content_language:add": count}
@timed
@@ -370,11 +362,7 @@
@process_metrics
@db_transaction()
def content_ctags_add(
- self,
- ctags: List[ContentCtagsRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, ctags: List[ContentCtagsRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(ctags)
ctags.sort(key=lambda m: m.id)
@@ -387,7 +375,7 @@
cur=cur,
)
- count = db.content_ctags_add_from_temp(conflict_update, cur)
+ count = db.content_ctags_add_from_temp(cur)
return {"content_ctags:add": count}
@timed
@@ -425,11 +413,7 @@
@process_metrics
@db_transaction()
def content_fossology_license_add(
- self,
- licenses: List[ContentLicenseRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, licenses: List[ContentLicenseRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(licenses)
licenses.sort(key=lambda m: m.id)
@@ -440,7 +424,7 @@
columns=["id", "license", "indexer_configuration_id"],
cur=cur,
)
- count = db.content_fossology_license_add_from_temp(conflict_update, cur)
+ count = db.content_fossology_license_add_from_temp(cur)
return {"content_fossology_license:add": count}
@timed
@@ -490,11 +474,7 @@
@process_metrics
@db_transaction()
def content_metadata_add(
- self,
- metadata: List[ContentMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[ContentMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -507,7 +487,7 @@
["id", "metadata", "indexer_configuration_id"],
cur,
)
- count = db.content_metadata_add_from_temp(conflict_update, cur)
+ count = db.content_metadata_add_from_temp(cur)
return {
"content_metadata:add": count,
}
@@ -540,11 +520,7 @@
@process_metrics
@db_transaction()
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[RevisionIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -557,7 +533,7 @@
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
- count = db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ count = db.revision_intrinsic_metadata_add_from_temp(cur)
return {
"revision_intrinsic_metadata:add": count,
}
@@ -580,11 +556,7 @@
@process_metrics
@db_transaction()
def origin_intrinsic_metadata_add(
- self,
- metadata: List[OriginIntrinsicMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[OriginIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -597,7 +569,7 @@
["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
cur,
)
- count = db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ count = db.origin_intrinsic_metadata_add_from_temp(cur)
return {
"origin_intrinsic_metadata:add": count,
}
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -74,9 +74,9 @@
def mktemp_content_mimetype(self, cur=None):
pass
- def content_mimetype_add_from_temp(self, conflict_update, cur=None):
+ def content_mimetype_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_mimetype_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_mimetype_add()")
return cur.fetchone()[0]
def _convert_key(self, key, main_table="c"):
@@ -209,9 +209,9 @@
def mktemp_content_language(self, cur=None):
pass
- def content_language_add_from_temp(self, conflict_update, cur=None):
+ def content_language_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_language_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_language_add()")
return cur.fetchone()[0]
def content_language_get_from_list(self, ids, cur=None):
@@ -245,9 +245,9 @@
def mktemp_content_ctags(self, cur=None):
pass
- def content_ctags_add_from_temp(self, conflict_update, cur=None):
+ def content_ctags_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_ctags_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_ctags_add()")
return cur.fetchone()[0]
def content_ctags_get_from_list(self, ids, cur=None):
@@ -303,14 +303,12 @@
def mktemp_content_fossology_license(self, cur=None):
pass
- def content_fossology_license_add_from_temp(self, conflict_update, cur=None):
+ def content_fossology_license_add_from_temp(self, cur=None):
"""Add new licenses per content.
"""
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_content_fossology_license_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_content_fossology_license_add()")
return cur.fetchone()[0]
def content_fossology_license_get_from_list(self, ids, cur=None):
@@ -355,9 +353,9 @@
def mktemp_content_metadata(self, cur=None):
pass
- def content_metadata_add_from_temp(self, conflict_update, cur=None):
+ def content_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_metadata_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_metadata_add()")
return cur.fetchone()[0]
def content_metadata_get_from_list(self, ids, cur=None):
@@ -392,11 +390,9 @@
def mktemp_revision_intrinsic_metadata(self, cur=None):
pass
- def revision_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None):
+ def revision_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_revision_intrinsic_metadata_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_revision_intrinsic_metadata_add()")
return cur.fetchone()[0]
def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
@@ -429,11 +425,9 @@
def mktemp_origin_intrinsic_metadata(self, cur=None):
pass
- def origin_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None):
+ def origin_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_origin_intrinsic_metadata_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_origin_intrinsic_metadata_add()")
return cur.fetchone()[0]
def origin_intrinsic_metadata_get_from_list(self, ids, cur=None):
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -193,7 +193,7 @@
assert len(ids) <= limit
return PagedResult(results=ids, next_page_token=next_page_token)
- def add(self, data: Iterable[TValue], conflict_update: bool) -> int:
+ def add(self, data: Iterable[TValue]) -> int:
"""Add data not present in storage.
Args:
@@ -204,9 +204,6 @@
results
- arbitrary data
- conflict_update (bool): Flag to determine if we want to overwrite
- (true) or skip duplicates (false)
-
"""
data = list(data)
check_id_duplicates(data)
@@ -216,9 +213,6 @@
id_ = item.pop("id")
tool_id = item["indexer_configuration_id"]
key = _key_from_dict(obj.unique_key())
- if not conflict_update and key in self._data[id_]:
- # Duplicate, should not be updated
- continue
self._data[id_][key] = item
self._tools_per_id[id_].add(tool_id)
count += 1
@@ -265,9 +259,9 @@
)
def content_mimetype_add(
- self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False
+ self, mimetypes: List[ContentMimetypeRow]
) -> Dict[str, int]:
- added = self._mimetypes.add(mimetypes, conflict_update)
+ added = self._mimetypes.add(mimetypes)
return {"content_mimetype:add": added}
def content_mimetype_get(self, ids: Iterable[Sha1]) -> List[ContentMimetypeRow]:
@@ -282,9 +276,9 @@
return self._languages.get(ids)
def content_language_add(
- self, languages: List[ContentLanguageRow], conflict_update: bool = False
+ self, languages: List[ContentLanguageRow]
) -> Dict[str, int]:
- added = self._languages.add(languages, conflict_update)
+ added = self._languages.add(languages)
return {"content_language:add": added}
def content_ctags_missing(self, ctags: Iterable[Dict]) -> List[Tuple[Sha1, int]]:
@@ -293,10 +287,8 @@
def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]:
return self._content_ctags.get(ids)
- def content_ctags_add(
- self, ctags: List[ContentCtagsRow], conflict_update: bool = False
- ) -> Dict[str, int]:
- added = self._content_ctags.add(ctags, conflict_update,)
+ def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]:
+ added = self._content_ctags.add(ctags)
return {"content_ctags:add": added}
def content_ctags_search(
@@ -329,9 +321,9 @@
return self._licenses.get(ids)
def content_fossology_license_add(
- self, licenses: List[ContentLicenseRow], conflict_update: bool = False
+ self, licenses: List[ContentLicenseRow]
) -> Dict[str, int]:
- added = self._licenses.add(licenses, conflict_update)
+ added = self._licenses.add(licenses)
return {"content_fossology_license:add": added}
def content_fossology_license_get_partition(
@@ -355,9 +347,9 @@
return self._content_metadata.get(ids)
def content_metadata_add(
- self, metadata: List[ContentMetadataRow], conflict_update: bool = False
+ self, metadata: List[ContentMetadataRow]
) -> Dict[str, int]:
- added = self._content_metadata.add(metadata, conflict_update)
+ added = self._content_metadata.add(metadata)
return {"content_metadata:add": added}
def revision_intrinsic_metadata_missing(
@@ -371,11 +363,9 @@
return self._revision_intrinsic_metadata.get(ids)
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
+ self, metadata: List[RevisionIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._revision_intrinsic_metadata.add(metadata, conflict_update)
+ added = self._revision_intrinsic_metadata.add(metadata)
return {"revision_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_get(
@@ -384,9 +374,9 @@
return self._origin_intrinsic_metadata.get(urls)
def origin_intrinsic_metadata_add(
- self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: bool = False
+ self, metadata: List[OriginIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._origin_intrinsic_metadata.add(metadata, conflict_update)
+ added = self._origin_intrinsic_metadata.add(metadata)
return {"origin_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_search_fulltext(
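
With `conflict_update` gone, every `*_add` endpoint upserts unconditionally; the interface and test changes below codify that. A minimal sketch against the in-memory backend just modified (the model import path and the no-argument constructor are assumptions, not shown in this diff):

    from swh.indexer.storage.in_memory import IndexerStorage
    from swh.indexer.storage.model import ContentMimetypeRow  # assumed module path

    storage = IndexerStorage()  # assumed no-argument constructor
    sha1 = b"\x01" * 20

    v1 = ContentMimetypeRow(
        id=sha1, mimetype="text/plain", encoding="utf-8",
        indexer_configuration_id=1,  # assumes this tool id is registered
    )
    storage.content_mimetype_add([v1])

    # Same (id, indexer_configuration_id) key, new payload: this used to need
    # conflict_update=True; now the second call simply overwrites the first.
    v2 = ContentMimetypeRow(
        id=sha1, mimetype="text/x-python", encoding="us-ascii",
        indexer_configuration_id=1,
    )
    storage.content_mimetype_add([v2])

    assert storage.content_mimetype_get([sha1])[0].mimetype == "text/x-python"
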
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -84,14 +84,11 @@
@remote_api_endpoint("content_mimetype/add")
def content_mimetype_add(
- self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False
+ self, mimetypes: List[ContentMimetypeRow]
) -> Dict[str, int]:
"""Add mimetypes not present in storage.
Args:
mimetypes: mimetype rows to be added, with their `tool` attribute set to
None.
- conflict_update: Flag to determine if we want to
- overwrite (``True``) or skip duplicates (``False``, the
- default)
@@ -148,17 +147,13 @@
@remote_api_endpoint("content_language/add")
def content_language_add(
- self, languages: List[ContentLanguageRow], conflict_update: bool = False
+ self, languages: List[ContentLanguageRow]
) -> Dict[str, int]:
"""Add languages not present in storage.
Args:
languages: language row objects
- conflict_update (bool): Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the
- default)
-
Returns:
Dict summary of number of rows added
@@ -198,9 +193,7 @@
...
@remote_api_endpoint("content/ctags/add")
- def content_ctags_add(
- self, ctags: List[ContentCtagsRow], conflict_update: bool = False
- ) -> Dict[str, int]:
+ def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]:
"""Add ctags not present in storage
Args:
@@ -251,7 +244,7 @@
@remote_api_endpoint("content/fossology_license/add")
def content_fossology_license_add(
- self, licenses: List[ContentLicenseRow], conflict_update: bool = False
+ self, licenses: List[ContentLicenseRow]
) -> Dict[str, int]:
"""Add licenses not present in storage.
@@ -259,9 +252,6 @@
license: license rows to be added, with their `tool` attribute set to
None.
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -335,7 +325,7 @@
@remote_api_endpoint("content_metadata/add")
def content_metadata_add(
- self, metadata: List[ContentMetadataRow], conflict_update: bool = False
+ self, metadata: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Add metadata not present in storage.
@@ -345,9 +335,6 @@
- **id**: sha1
- **metadata**: arbitrary dict
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -390,18 +377,13 @@
@remote_api_endpoint("revision_intrinsic_metadata/add")
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
+ self, metadata: List[RevisionIntrinsicMetadataRow],
) -> Dict[str, int]:
"""Add metadata not present in storage.
Args:
metadata: ContentMetadataRow objects
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -423,16 +405,13 @@
@remote_api_endpoint("origin_intrinsic_metadata/add")
def origin_intrinsic_metadata_add(
- self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: bool = False
+ self, metadata: List[OriginIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Add origin metadata not present in storage.
Args:
metadata: list of OriginIntrinsicMetadataRow objects
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -137,45 +137,6 @@
actual_missing = endpoint(storage, etype, "missing")(query)
assert list(actual_missing) == [data.sha1_1]
- def test_add__drop_duplicate(
- self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
- ) -> None:
- storage, data = swh_indexer_storage_with_data
- etype = self.endpoint_type
- tool_id = data.tools[self.tool_name]["id"]
-
- # add the first object
- data_v1 = {
- "id": data.sha1_2,
- **self.example_data[0],
- "indexer_configuration_id": tool_id,
- }
- summary = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v1)])
- assert summary == expected_summary(1, etype)
-
- # should be able to retrieve it
- actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
- expected_data_v1 = [
- self.row_class.from_dict(
- {
- "id": data.sha1_2,
- **self.example_data[0],
- "tool": data.tools[self.tool_name],
- }
- )
- ]
- assert actual_data == expected_data_v1
-
- # now if we add a modified version of the same object (same id)
- data_v2 = data_v1.copy()
- data_v2.update(self.example_data[1])
- summary2 = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
- assert summary2 == expected_summary(0, etype) # not added
-
- # we expect to retrieve the original data, not the modified one
- actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
- assert actual_data == expected_data_v1
-
def test_add__update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
@@ -209,9 +170,7 @@
data_v2 = data_v1.copy()
data_v2.update(self.example_data[1])
- endpoint(storage, etype, "add")(
- [self.row_class.from_dict(data_v2)], conflict_update=True
- )
+ endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
assert summary == expected_summary(1, etype) # modified so counted
actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
@@ -225,7 +184,7 @@
# data did change as the v2 was used to overwrite v1
assert actual_data == expected_data_v2
- def test_add__update_in_place_deadlock(
+ def test_add_deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
@@ -267,7 +226,9 @@
endpoint(storage, etype, "add")(data_v1)
# when
- actual_data = list(endpoint(storage, etype, "get")(hashes))
+ actual_data = sorted(
+ endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
+ )
expected_data_v1 = [
self.row_class.from_dict(
@@ -281,10 +242,10 @@
# given
def f1() -> None:
- endpoint(storage, etype, "add")(data_v2a, conflict_update=True)
+ endpoint(storage, etype, "add")(data_v2a)
def f2() -> None:
- endpoint(storage, etype, "add")(data_v2b, conflict_update=True)
+ endpoint(storage, etype, "add")(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
@@ -295,15 +256,21 @@
t2.join()
actual_data = sorted(
- (row.to_dict() for row in endpoint(storage, etype, "get")(hashes)),
- key=lambda x: x["id"],
+ endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
)
expected_data_v2 = [
- {"id": hash_, **self.example_data[1], "tool": tool} for hash_ in hashes
+ self.row_class.from_dict(
+ {"id": hash_, **self.example_data[1], "tool": tool}
+ )
+ for hash_ in hashes
]
- assert actual_data == expected_data_v2
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
def test_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
@@ -333,9 +300,7 @@
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
- endpoint(storage, etype, "add")(
- [data_rev2, data_rev2], conflict_update=True
- )
+ endpoint(storage, etype, "add")([data_rev2, data_rev2])
# then
actual_data = list(
@@ -545,16 +510,12 @@
row_class = ContentCtagsRow
# the following tests are disabled because CTAGS behaves differently
- @pytest.mark.skip
- def test_add__drop_duplicate(self):
- pass
-
@pytest.mark.skip
def test_add__update_in_place_duplicate(self):
pass
@pytest.mark.skip
- def test_add__update_in_place_deadlock(self):
+ def test_add_deadlock(self):
pass
@pytest.mark.skip
@@ -736,7 +697,7 @@
)
ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool)
- storage.content_ctags_add([ctag1, ctag2], conflict_update=True)
+ storage.content_ctags_add([ctag1, ctag2])
actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
@@ -889,7 +850,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -925,7 +886,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -951,7 +912,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -993,7 +954,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -1093,71 +1054,6 @@
assert actual_metadata == expected_metadata
- def test_origin_intrinsic_metadata_add_drop_duplicate(
- self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
- ) -> None:
- storage, data = swh_indexer_storage_with_data
- # given
- tool_id = data.tools["swh-metadata-detector"]["id"]
-
- metadata_v1: Dict[str, Any] = {
- "version": None,
- "name": None,
- }
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
- metadata=metadata_v1.copy(),
- mappings=[],
- indexer_configuration_id=tool_id,
- )
- metadata_origin_v1 = OriginIntrinsicMetadataRow(
- id=data.origin_url_1,
- metadata=metadata_v1.copy(),
- indexer_configuration_id=tool_id,
- mappings=[],
- from_revision=data.revision_id_1,
- )
-
- # given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
- storage.origin_intrinsic_metadata_add([metadata_origin_v1])
-
- # when
- actual_metadata = list(
- storage.origin_intrinsic_metadata_get([data.origin_url_1, "no://where"])
- )
-
- expected_metadata_v1 = [
- OriginIntrinsicMetadataRow(
- id=data.origin_url_1,
- metadata=metadata_v1,
- tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_1,
- mappings=[],
- )
- ]
-
- assert actual_metadata == expected_metadata_v1
-
- # given
- metadata_v2 = metadata_v1.copy()
- metadata_v2.update(
- {"name": "test_metadata", "author": "MG",}
- )
- metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
- metadata_origin_v2 = attr.evolve(metadata_origin_v1, metadata=metadata_v2)
-
- storage.revision_intrinsic_metadata_add([metadata_rev_v2])
- storage.origin_intrinsic_metadata_add([metadata_origin_v2])
-
- # then
- actual_metadata = list(
- storage.origin_intrinsic_metadata_get([data.origin_url_1])
- )
-
- # metadata did not change as the v2 was dropped.
- assert actual_metadata == expected_metadata_v1
-
def test_origin_intrinsic_metadata_add_update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
@@ -1218,10 +1114,8 @@
from_revision=data.revision_id_1,
)
- storage.revision_intrinsic_metadata_add([metadata_rev_v2], conflict_update=True)
- storage.origin_intrinsic_metadata_add(
- [metadata_origin_v2], conflict_update=True
- )
+ storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+ storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
storage.origin_intrinsic_metadata_get([data.origin_url_1])
@@ -1240,7 +1134,7 @@
# metadata did change as the v2 was used to overwrite v1
assert actual_metadata == expected_metadata_v2
- def test_origin_intrinsic_metadata_add__update_in_place_deadlock(
+ def test_origin_intrinsic_metadata_add__deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
@@ -1312,10 +1206,10 @@
# given
def f1() -> None:
- storage.origin_intrinsic_metadata_add(data_v2a, conflict_update=True)
+ storage.origin_intrinsic_metadata_add(data_v2a)
def f2() -> None:
- storage.origin_intrinsic_metadata_add(data_v2b, conflict_update=True)
+ storage.origin_intrinsic_metadata_add(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
@@ -1337,8 +1231,11 @@
for id_ in ids
]
- assert len(actual_data) == len(expected_data_v2)
- assert sorted(actual_data, key=lambda x: x.id) == expected_data_v2
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
def test_origin_intrinsic_metadata_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -74,7 +74,7 @@
def _assert_tasks_for_origins(tasks, origins):
- expected_kwargs = {"policy_update": "update-dups"}
+ expected_kwargs = {}
assert {task["type"] for task in tasks} == {"index-origin-metadata"}
assert all(len(task["arguments"]["args"]) == 1 for task in tasks)
for task in tasks:
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -31,7 +31,7 @@
) -> List[Dict[str, Any]]:
raise _TestException()
- def persist_index_computations(self, results, policy_update) -> Dict[str, int]:
+ def persist_index_computations(self, results) -> Dict[str, int]:
return {}
def indexed_contents_in_partition(
@@ -61,12 +61,12 @@
indexer.objstorage = Mock()
indexer.objstorage.get.return_value = b"content"
- assert indexer.run([b"foo"], policy_update=True) == {"status": "failed"}
+ assert indexer.run([b"foo"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run([b"foo"], policy_update=True)
+ indexer.run([b"foo"])
def test_revision_indexer_catch_exceptions():
@@ -74,25 +74,23 @@
indexer.storage = Mock()
indexer.storage.revision_get.return_value = ["rev"]
- assert indexer.run([b"foo"], policy_update=True) == {"status": "failed"}
+ assert indexer.run([b"foo"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run([b"foo"], policy_update=True)
+ indexer.run([b"foo"])
def test_origin_indexer_catch_exceptions():
indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG)
- assert indexer.run(["http://example.org"], policy_update=True) == {
- "status": "failed"
- }
+ assert indexer.run(["http://example.org"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run(["http://example.org"], policy_update=True)
+ indexer.run(["http://example.org"])
def test_content_partition_indexer_catch_exceptions():
@@ -100,9 +98,9 @@
config={**BASE_TEST_CONFIG, "write_batch_size": 42}
)
- assert indexer.run(0, 42, policy_update=True) == {"status": "failed"}
+ assert indexer.run(0, 42) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run(0, 42, policy_update=True)
+ indexer.run(0, 42)
diff --git a/swh/indexer/tests/test_journal_client.py b/swh/indexer/tests/test_journal_client.py
--- a/swh/indexer/tests/test_journal_client.py
+++ b/swh/indexer/tests/test_journal_client.py
@@ -30,10 +30,7 @@
(
[
{
- "arguments": {
- "kwargs": {"policy_update": "update-dups"},
- "args": (["file:///dev/zero"],),
- },
+ "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),},
"policy": "oneshot",
"type": "task-name",
"retries_left": 1,
@@ -64,10 +61,7 @@
(
[
{
- "arguments": {
- "kwargs": {"policy_update": "update-dups"},
- "args": (["file:///dev/zero"],),
- },
+ "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),},
"policy": "oneshot",
"type": "task-name",
"retries_left": 1,
@@ -100,7 +94,7 @@
[
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///dev/zero", "file:///tmp/foobar"],),
},
"policy": "oneshot",
@@ -138,7 +132,7 @@
[
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///dev/zero", "file:///tmp/foobar"],),
},
"policy": "oneshot",
@@ -147,7 +141,7 @@
},
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///tmp/spamegg"],),
},
"policy": "oneshot",
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -140,7 +140,7 @@
fill_storage(metadata_indexer.storage)
# when
- metadata_indexer.run(sha1s, policy_update="ignore-dups")
+ metadata_indexer.run(sha1s)
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
@@ -1122,7 +1122,7 @@
]
)
- metadata_indexer.run([rev.id], "update-dups")
+ metadata_indexer.run([rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
@@ -1183,7 +1183,7 @@
]
)
- metadata_indexer.run([new_rev.id], "update-dups")
+ metadata_indexer.run([new_rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -46,7 +46,7 @@
indexing tests.
"""
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(self, results):
self.results = results
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -262,7 +262,6 @@
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
- self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
@@ -606,12 +605,12 @@
sha1s = [self.id0, self.id1, self.id2]
# when
- self.indexer.run(sha1s, policy_update="update-dups")
+ self.indexer.run(sha1s)
self.assert_results_ok(sha1s)
# 2nd pass
- self.indexer.run(sha1s, policy_update="ignore-dups")
+ self.indexer.run(sha1s)
self.assert_results_ok(sha1s)
@@ -624,7 +623,7 @@
] # unknown
# when
- self.indexer.run(sha1s, policy_update="update-dups")
+ self.indexer.run(sha1s)
# then
expected_results = [
