D4397: Remove conflict_update/policy_update option.
File: D4397.id15580.diff (70 KB)
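This diff removes the conflict_update/policy_update knobs throughout swh-indexer: every *_add storage endpoint and every indexer run() now unconditionally overwrites duplicates (upsert semantics). A rough before/after sketch of the caller-facing change (indexer, storage, rows, and ids are placeholders, not names from this diff):

    # before: callers chose a duplicate-handling policy per call
    indexer.run(ids, policy_update="update-dups")
    storage.content_mimetype_add(rows, conflict_update=True)

    # after: duplicates are always overwritten; policy_update is still
    # accepted but ignored, with a DeprecationWarning (see swh/indexer/indexer.py)
    indexer.run(ids)
    storage.content_mimetype_add(rows)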
diff --git a/sql/upgrades/133.sql b/sql/upgrades/133.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/133.sql
@@ -0,0 +1,177 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 132
+-- to_version: 133
+-- description: remove 'conflict_update' argument
+
+insert into dbversion(version, release, description)
+ values(133, now(), 'Work In Progress');
+
+drop function swh_content_mimetype_add(conflict_update boolean);
+create or replace function swh_content_mimetype_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
+ select id, mimetype, encoding, indexer_configuration_id
+ from tmp_content_mimetype tcm
+ on conflict(id, indexer_configuration_id)
+ do update set mimetype = excluded.mimetype,
+ encoding = excluded.encoding;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_mimetype_add() IS 'Add new content mimetypes';
+
+
+
+drop function swh_content_language_add(conflict_update boolean);
+create or replace function swh_content_language_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
+ from tmp_content_language tcl
+ on conflict(id, indexer_configuration_id)
+ do update set lang = excluded.lang;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_language_add() IS 'Add new content languages';
+
+
+
+drop function swh_content_ctags_add(conflict_update boolean);
+create or replace function swh_content_ctags_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id)
+ select id, name, kind, line, lang, indexer_configuration_id
+ from tmp_content_ctags tct
+  on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id)
+  do nothing;
+
+  get diagnostics res = ROW_COUNT;
+  return res;
+end
+$$;
+
+comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content';
+
+
+
+drop function swh_content_fossology_license_add(conflict_update boolean);
+create or replace function swh_content_fossology_license_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ -- insert unknown licenses first
+ insert into fossology_license (name)
+ select distinct license from tmp_content_fossology_license tmp
+ where not exists (select 1 from fossology_license where name=tmp.license)
+ on conflict(name) do nothing;
+
+ insert into content_fossology_license (id, license_id, indexer_configuration_id)
+ select tcl.id,
+ (select id from fossology_license where name = tcl.license) as license,
+ indexer_configuration_id
+ from tmp_content_fossology_license tcl
+ on conflict(id, license_id, indexer_configuration_id)
+ do update set license_id = excluded.license_id;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_fossology_license_add() IS 'Add new content licenses';
+
+
+
+drop function swh_content_metadata_add(conflict_update boolean);
+create or replace function swh_content_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into content_metadata (id, metadata, indexer_configuration_id)
+ select id, metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set metadata = excluded.metadata;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_content_metadata_add() IS 'Add new content metadata';
+
+
+
+drop function swh_revision_intrinsic_metadata_add(conflict_update boolean);
+create or replace function swh_revision_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_revision_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
+
+
+
+drop function swh_origin_intrinsic_metadata_add(conflict_update boolean);
+create or replace function swh_origin_intrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_intrinsic_metadata_compute_tsvector();
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
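The upgrade script only changes the functions' signatures; the bulk-load protocol documented in swh/indexer/sql/50-func.sql stays the same. A minimal sketch of that protocol from Python, assuming a psycopg2 cursor named cur (this mirrors content_mimetype_add_from_temp in swh/indexer/storage/db.py below):

    # 0. create the temporary staging table
    cur.execute("select swh_mktemp_content_mimetype()")
    # 1. COPY the rows to index into tmp_content_mimetype (see storage/__init__.py)
    # 2. flush the staging table, overwriting duplicates, and read the row count
    cur.execute("select * from swh_content_mimetype_add()")
    count = cur.fetchone()[0]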
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -205,7 +205,7 @@
origins = list_origins_by_producer(idx_storage, mappings, tool_ids)
- kwargs = {"policy_update": "update-dups", "retries_left": 1}
+ kwargs = {"retries_left": 1}
schedule_origin_batches(scheduler, task_type, origins, origin_batch_size, kwargs)
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -136,16 +136,12 @@
return ctags
def persist_index_computations(
- self, results: List[ContentCtagsRow], policy_update: str
+ self, results: List[ContentCtagsRow]
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of ctags returned by index()
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
- return self.idx_storage.content_ctags_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_ctags_add(results)
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -114,7 +114,7 @@
]
def persist_index_computations(
- self, results: List[ContentLicenseRow], policy_update: str
+ self, results: List[ContentLicenseRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -126,13 +126,8 @@
- license (bytes): license in bytes
- path (bytes): path
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
-
"""
- return self.idx_storage.content_fossology_license_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_fossology_license_add(results)
class FossologyLicenseIndexer(
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -10,6 +10,7 @@
import shutil
import tempfile
from typing import Any, Dict, Generic, Iterator, List, Optional, Set, TypeVar, Union
+import warnings
from swh.core import utils
from swh.core.config import load_from_envvar, merge_configs
@@ -250,17 +251,13 @@
yield from ids
@abc.abstractmethod
- def persist_index_computations(
- self, results: List[TResult], policy_update: str
- ) -> Dict[str, int]:
+ def persist_index_computations(self, results: List[TResult]) -> Dict[str, int]:
"""Persist the computation resulting from the index.
Args:
results: List of results. One result is the
result of the index function.
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
Returns:
a summary dict of what has been inserted in the storage
@@ -281,24 +278,28 @@
"""
- def run(self, ids: List[Sha1], policy_update: str, **kwargs) -> Dict:
+ def run(self, ids: List[Sha1], **kwargs) -> Dict:
"""Given a list of ids:
- retrieve the content from the storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
ids (Iterable[Union[bytes, str]]): sha1's identifier list
- policy_update (str): either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore
- them
**kwargs: passed to the `index` method
Returns:
A summary Dict of the task's status
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
+
sha1s = [
hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
]
@@ -318,7 +319,7 @@
if res: # If no results, skip it
results.extend(res)
summary["status"] = "eventful"
- summary = self.persist_index_computations(results, policy_update)
+ summary = self.persist_index_computations(results)
self.results = results
except Exception:
if not self.catch_exceptions:
@@ -478,9 +479,7 @@
count_object_added_key: Optional[str] = None
for contents in utils.grouper(gen, n=self.config["write_batch_size"]):
- res = self.persist_index_computations(
- list(contents), policy_update="update-dups"
- )
+ res = self.persist_index_computations(list(contents))
if not count_object_added_key:
count_object_added_key = list(res.keys())[0]
count += res[count_object_added_key]
@@ -508,22 +507,24 @@
"""
- def run(
- self, origin_urls: List[str], policy_update: str = "update-dups", **kwargs
- ) -> Dict:
+ def run(self, origin_urls: List[str], **kwargs) -> Dict:
"""Given a list of origin urls:
- retrieve origins from storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
origin_urls: list of origin urls.
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates (default) or ignore them
**kwargs: passed to the `index` method
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
summary: Dict[str, Any] = {"status": "uneventful"}
try:
results = self.index_list(origin_urls, **kwargs)
@@ -533,7 +534,7 @@
summary["status"] = "failed"
return summary
- summary_persist = self.persist_index_computations(results, policy_update)
+ summary_persist = self.persist_index_computations(results)
self.results = results
if summary_persist:
for value in summary_persist.values():
@@ -564,19 +565,23 @@
"""
- def run(self, ids: List[Sha1Git], policy_update: str) -> Dict:
+ def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
"""Given a list of sha1_gits:
- retrieve revisions from storage
- execute the indexing computations
- - store the results (according to policy_update)
+ - store the results
Args:
ids: sha1_git's identifier list
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
+ if "policy_update" in kwargs:
+ warnings.warn(
+ "'policy_update' argument is deprecated and ignored.",
+ DeprecationWarning,
+ )
+ del kwargs["policy_update"]
summary: Dict[str, Any] = {"status": "uneventful"}
results = []
@@ -599,7 +604,7 @@
summary["status"] = "failed"
return summary
- summary_persist = self.persist_index_computations(results, policy_update)
+ summary_persist = self.persist_index_computations(results)
if summary_persist:
for value in summary_persist.values():
if value > 0:
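The run() shims above keep old callers working: policy_update is still accepted, but it only triggers a warning and is dropped before any indexing happens. A small sketch of the observable behavior, assuming indexer is an instance of one of these indexer classes and ids is a valid input list:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        indexer.run(ids, policy_update="update-dups")  # still accepted
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    # the argument never reaches index() or persist_index_computations()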
diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -36,7 +36,6 @@
task_names["origin_metadata"],
"oneshot",
visit_urls,
- policy_update="update-dups",
retries_left=1,
)
)
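Tasks scheduled by the journal client therefore carry no kwargs at all; retries_left travels as a task attribute rather than a keyword argument. The resulting task shape, taken from the updated expectations in swh/indexer/tests/test_journal_client.py further below (the type name is illustrative):

    task = {
        "type": "task-name",
        "policy": "oneshot",
        "retries_left": 1,
        "arguments": {"args": (["file:///dev/zero"],), "kwargs": {}},
    }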
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -109,7 +109,7 @@
]
def persist_index_computations(
- self, results: List[ContentMetadataRow], policy_update: str
+ self, results: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -118,13 +118,9 @@
following keys:
- id (bytes): content's identifier (sha1)
- metadata (jsonb): detected metadata
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
- return self.idx_storage.content_metadata_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_metadata_add(results)
DEFAULT_CONFIG: Dict[str, Any] = {
@@ -217,7 +213,7 @@
]
def persist_index_computations(
- self, results: List[RevisionIntrinsicMetadataRow], policy_update: str
+ self, results: List[RevisionIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -227,15 +223,11 @@
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in
# revision_intrinsic_metadata
- return self.idx_storage.revision_intrinsic_metadata_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.revision_intrinsic_metadata_add(results)
def translate_revision_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
@@ -291,9 +283,7 @@
# content indexing
try:
c_metadata_indexer.run(
- sha1s_filtered,
- policy_update="ignore-dups",
- log_suffix=log_suffix,
+ sha1s_filtered, log_suffix=log_suffix,
)
# on the fly possibility:
for result in c_metadata_indexer.results:
@@ -364,10 +354,7 @@
def persist_index_computations(
self,
results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
- policy_update: str,
) -> Dict[str, int]:
- conflict_update = policy_update == "update-dups"
-
# Deduplicate revisions
rev_metadata: List[RevisionIntrinsicMetadataRow] = []
orig_metadata: List[OriginIntrinsicMetadataRow] = []
@@ -382,14 +369,10 @@
orig_metadata.append(orig_item)
if rev_metadata:
- summary_rev = self.idx_storage.revision_intrinsic_metadata_add(
- rev_metadata, conflict_update=conflict_update
- )
+ summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
summary.update(summary_rev)
if orig_metadata:
- summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
- orig_metadata, conflict_update=conflict_update
- )
+ summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
summary.update(summary_ori)
return summary
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -95,7 +95,7 @@
]
def persist_index_computations(
- self, results: List[ContentMimetypeRow], policy_update: str
+ self, results: List[ContentMimetypeRow]
) -> Dict[str, int]:
"""Persist the results in storage.
@@ -103,13 +103,8 @@
results: list of content's mimetype dicts
(see :meth:`.index`)
- policy_update: either 'update-dups' or 'ignore-dups' to
- respectively update duplicates or ignore them
-
"""
- return self.idx_storage.content_mimetype_add(
- results, conflict_update=(policy_update == "update-dups")
- )
+ return self.idx_storage.content_mimetype_add(results)
class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer[ContentMimetypeRow]):
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -25,9 +25,7 @@
USE_TOOLS = False
- def persist_index_computations(
- self, results: Any, policy_update: str
- ) -> Dict[str, int]:
+ def persist_index_computations(self, results: Any) -> Dict[str, int]:
"""Do nothing. The indexer's results are not persistent, they
should only be piped to another indexer."""
return {}
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(132, now(), 'Work In Progress');
+ values(133, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -41,7 +41,7 @@
comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information';
-- add tmp_content_mimetype entries to content_mimetype, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_mimetype_missing must take place before calling this
@@ -49,36 +49,28 @@
--
-- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype,
-- 2. call this function
-create or replace function swh_content_mimetype_add(conflict_update boolean)
+create or replace function swh_content_mimetype_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
- select id, mimetype, encoding, indexer_configuration_id
- from tmp_content_mimetype tcm
- on conflict(id, indexer_configuration_id)
- do update set mimetype = excluded.mimetype,
- encoding = excluded.encoding;
- else
- insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
- select id, mimetype, encoding, indexer_configuration_id
- from tmp_content_mimetype tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id)
+ select id, mimetype, encoding, indexer_configuration_id
+ from tmp_content_mimetype tcm
+ on conflict(id, indexer_configuration_id)
+ do update set mimetype = excluded.mimetype,
+ encoding = excluded.encoding;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes';
+comment on function swh_content_mimetype_add() IS 'Add new content mimetypes';
--- add tmp_content_language entries to content_language, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_language entries to content_language, overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_content_language_missing must take place before calling this
@@ -86,32 +78,25 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_content_language, 2. call this function
-create or replace function swh_content_language_add(conflict_update boolean)
+create or replace function swh_content_language_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
- from tmp_content_language tcl
- on conflict(id, indexer_configuration_id)
- do update set lang = excluded.lang;
- else
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
- from tmp_content_language tcl
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
+ from tmp_content_language tcl
+ on conflict(id, indexer_configuration_id)
+ do update set lang = excluded.lang;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_language_add(boolean) IS 'Add new content languages';
+comment on function swh_content_language_add() IS 'Add new content languages';
-- create a temporary table for retrieving content_language
create or replace function swh_mktemp_content_language()
@@ -139,36 +124,29 @@
comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags';
--- add tmp_content_ctags entries to content_ctags, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_ctags entries to content_ctags, overwriting duplicates
--
-- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags,
-- 2. call this function
-create or replace function swh_content_ctags_add(conflict_update boolean)
+create or replace function swh_content_ctags_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- delete from content_ctags
- where id in (select tmp.id
- from tmp_content_ctags tmp
- inner join indexer_configuration i on i.id=tmp.indexer_configuration_id);
- end if;
-
insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id)
select id, name, kind, line, lang, indexer_configuration_id
from tmp_content_ctags tct
on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id)
do nothing;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content';
+comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content';
create type content_ctags_signature as (
id sha1,
@@ -218,12 +196,12 @@
comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license';
--- add tmp_content_fossology_license entries to content_fossology_license, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_fossology_license entries to content_fossology_license,
+-- overwriting duplicates.
--
-- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to
-- tmp_content_fossology_license, 2. call this function
-create or replace function swh_content_fossology_license_add(conflict_update boolean)
+create or replace function swh_content_fossology_license_add()
returns bigint
language plpgsql
as $$
@@ -236,36 +214,25 @@
where not exists (select 1 from fossology_license where name=tmp.license)
on conflict(name) do nothing;
- if conflict_update then
- insert into content_fossology_license (id, license_id, indexer_configuration_id)
- select tcl.id,
- (select id from fossology_license where name = tcl.license) as license,
- indexer_configuration_id
- from tmp_content_fossology_license tcl
- on conflict(id, license_id, indexer_configuration_id)
- do update set license_id = excluded.license_id;
- else
- insert into content_fossology_license (id, license_id, indexer_configuration_id)
- select tcl.id,
- (select id from fossology_license where name = tcl.license) as license,
- indexer_configuration_id
- from tmp_content_fossology_license tcl
- on conflict(id, license_id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_fossology_license (id, license_id, indexer_configuration_id)
+ select tcl.id,
+ (select id from fossology_license where name = tcl.license) as license,
+ indexer_configuration_id
+ from tmp_content_fossology_license tcl
+ on conflict(id, license_id, indexer_configuration_id)
+ do update set license_id = excluded.license_id;
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';
+comment on function swh_content_fossology_license_add() IS 'Add new content licenses';
-- content_metadata functions
--- add tmp_content_metadata entries to content_metadata, overwriting
--- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- add tmp_content_metadata entries to content_metadata, overwriting duplicates
--
-- If filtering duplicates is in order, the call to
-- swh_content_metadata_missing must take place before calling this
@@ -273,32 +240,25 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_content_metadata, 2. call this function
-create or replace function swh_content_metadata_add(conflict_update boolean)
+create or replace function swh_content_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into content_metadata (id, metadata, indexer_configuration_id)
- select id, metadata, indexer_configuration_id
- from tmp_content_metadata tcm
- on conflict(id, indexer_configuration_id)
- do update set metadata = excluded.metadata;
- else
- insert into content_metadata (id, metadata, indexer_configuration_id)
- select id, metadata, indexer_configuration_id
- from tmp_content_metadata tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into content_metadata (id, metadata, indexer_configuration_id)
+ select id, metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set metadata = excluded.metadata;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata';
+comment on function swh_content_metadata_add() IS 'Add new content metadata';
-- create a temporary table for retrieving content_metadata
create or replace function swh_mktemp_content_metadata()
@@ -315,8 +275,7 @@
-- end content_metadata functions
-- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata,
--- overwriting duplicates if conflict_update is true, skipping duplicates
--- otherwise.
+-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_revision_intrinsic_metadata_missing must take place before calling this
@@ -324,34 +283,27 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_revision_intrinsic_metadata, 2. call this function
-create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean)
+create or replace function swh_revision_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
declare
res bigint;
begin
- if conflict_update then
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
- select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
- on conflict(id, indexer_configuration_id)
- do update set
- metadata = excluded.metadata,
- mappings = excluded.mappings;
- else
- insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
- select id, metadata, mappings, indexer_configuration_id
- from tmp_revision_intrinsic_metadata tcm
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+ insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id)
+ select id, metadata, mappings, indexer_configuration_id
+ from tmp_revision_intrinsic_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ mappings = excluded.mappings;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_revision_intrinsic_metadata_add(boolean) IS 'Add new revision intrinsic metadata';
+comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata';
-- create a temporary table for retrieving revision_intrinsic_metadata
create or replace function swh_mktemp_revision_intrinsic_metadata()
@@ -389,7 +341,7 @@
-- add tmp_indexer_configuration entries to indexer_configuration,
--- skipping duplicates if any.
+-- overwriting duplicates if any.
--
-- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to
-- it, 2. call this function to insert and filtering out duplicates
@@ -412,8 +364,7 @@
$$;
-- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata,
--- overwriting duplicates if conflict_update is true, skipping duplicates
--- otherwise.
+-- overwriting duplicates.
--
-- If filtering duplicates is in order, the call to
-- swh_origin_intrinsic_metadata_missing must take place before calling this
@@ -421,8 +372,7 @@
--
-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
-- tmp_origin_intrinsic_metadata, 2. call this function
-create or replace function swh_origin_intrinsic_metadata_add(
- conflict_update boolean)
+create or replace function swh_origin_intrinsic_metadata_add()
returns bigint
language plpgsql
as $$
@@ -430,31 +380,24 @@
res bigint;
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
- if conflict_update then
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector, mappings
- from tmp_origin_intrinsic_metadata
- on conflict(id, indexer_configuration_id)
- do update set
- metadata = excluded.metadata,
- metadata_tsvector = excluded.metadata_tsvector,
- mappings = excluded.mappings,
- from_revision = excluded.from_revision;
- else
- insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, metadata, indexer_configuration_id, from_revision,
- metadata_tsvector, mappings
- from tmp_origin_intrinsic_metadata
- on conflict(id, indexer_configuration_id)
- do nothing;
- end if;
+
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
get diagnostics res = ROW_COUNT;
return res;
end
$$;
-comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata';
+comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata';
-- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata.
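End to end, every *_add endpoint now behaves as an upsert. A minimal sketch of the resulting storage semantics, assuming storage implements IndexerStorageInterface and row_v1/row_v2 are ContentMimetypeRow values sharing the same (id, indexer_configuration_id):

    storage.content_mimetype_add([row_v1])  # inserts
    storage.content_mimetype_add([row_v2])  # overwrites row_v1 in place
    rows = storage.content_mimetype_get([row_v2.id])
    # rows now carries row_v2's mimetype/encoding; this is exactly what
    # test_add__update_in_place_duplicate exercises further below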
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -267,11 +267,7 @@
@process_metrics
@db_transaction()
def content_mimetype_add(
- self,
- mimetypes: List[ContentMimetypeRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, mimetypes: List[ContentMimetypeRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(mimetypes)
mimetypes.sort(key=lambda m: m.id)
@@ -282,7 +278,7 @@
["id", "mimetype", "encoding", "indexer_configuration_id"],
cur,
)
- count = db.content_mimetype_add_from_temp(conflict_update, cur)
+ count = db.content_mimetype_add_from_temp(cur)
return {"content_mimetype:add": count}
@timed
@@ -320,11 +316,7 @@
@process_metrics
@db_transaction()
def content_language_add(
- self,
- languages: List[ContentLanguageRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, languages: List[ContentLanguageRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(languages)
languages.sort(key=lambda m: m.id)
@@ -344,7 +336,7 @@
cur,
)
- count = db.content_language_add_from_temp(conflict_update, cur)
+ count = db.content_language_add_from_temp(cur)
return {"content_language:add": count}
@timed
@@ -370,11 +362,7 @@
@process_metrics
@db_transaction()
def content_ctags_add(
- self,
- ctags: List[ContentCtagsRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, ctags: List[ContentCtagsRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(ctags)
ctags.sort(key=lambda m: m.id)
@@ -387,7 +375,7 @@
cur=cur,
)
- count = db.content_ctags_add_from_temp(conflict_update, cur)
+ count = db.content_ctags_add_from_temp(cur)
return {"content_ctags:add": count}
@timed
@@ -425,11 +413,7 @@
@process_metrics
@db_transaction()
def content_fossology_license_add(
- self,
- licenses: List[ContentLicenseRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, licenses: List[ContentLicenseRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(licenses)
licenses.sort(key=lambda m: m.id)
@@ -440,7 +424,7 @@
columns=["id", "license", "indexer_configuration_id"],
cur=cur,
)
- count = db.content_fossology_license_add_from_temp(conflict_update, cur)
+ count = db.content_fossology_license_add_from_temp(cur)
return {"content_fossology_license:add": count}
@timed
@@ -490,11 +474,7 @@
@process_metrics
@db_transaction()
def content_metadata_add(
- self,
- metadata: List[ContentMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[ContentMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -507,7 +487,7 @@
["id", "metadata", "indexer_configuration_id"],
cur,
)
- count = db.content_metadata_add_from_temp(conflict_update, cur)
+ count = db.content_metadata_add_from_temp(cur)
return {
"content_metadata:add": count,
}
@@ -540,11 +520,7 @@
@process_metrics
@db_transaction()
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[RevisionIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -557,7 +533,7 @@
["id", "metadata", "mappings", "indexer_configuration_id"],
cur,
)
- count = db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ count = db.revision_intrinsic_metadata_add_from_temp(cur)
return {
"revision_intrinsic_metadata:add": count,
}
@@ -580,11 +556,7 @@
@process_metrics
@db_transaction()
def origin_intrinsic_metadata_add(
- self,
- metadata: List[OriginIntrinsicMetadataRow],
- conflict_update: bool = False,
- db=None,
- cur=None,
+ self, metadata: List[OriginIntrinsicMetadataRow], db=None, cur=None,
) -> Dict[str, int]:
check_id_duplicates(metadata)
metadata.sort(key=lambda m: m.id)
@@ -597,7 +569,7 @@
["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"],
cur,
)
- count = db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ count = db.origin_intrinsic_metadata_add_from_temp(cur)
return {
"origin_intrinsic_metadata:add": count,
}
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -74,9 +74,9 @@
def mktemp_content_mimetype(self, cur=None):
pass
- def content_mimetype_add_from_temp(self, conflict_update, cur=None):
+ def content_mimetype_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_mimetype_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_mimetype_add()")
return cur.fetchone()[0]
def _convert_key(self, key, main_table="c"):
@@ -209,9 +209,9 @@
def mktemp_content_language(self, cur=None):
pass
- def content_language_add_from_temp(self, conflict_update, cur=None):
+ def content_language_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_language_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_language_add()")
return cur.fetchone()[0]
def content_language_get_from_list(self, ids, cur=None):
@@ -245,9 +245,9 @@
def mktemp_content_ctags(self, cur=None):
pass
- def content_ctags_add_from_temp(self, conflict_update, cur=None):
+ def content_ctags_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_ctags_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_ctags_add()")
return cur.fetchone()[0]
def content_ctags_get_from_list(self, ids, cur=None):
@@ -303,14 +303,12 @@
def mktemp_content_fossology_license(self, cur=None):
pass
- def content_fossology_license_add_from_temp(self, conflict_update, cur=None):
+ def content_fossology_license_add_from_temp(self, cur=None):
"""Add new licenses per content.
"""
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_content_fossology_license_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_content_fossology_license_add()")
return cur.fetchone()[0]
def content_fossology_license_get_from_list(self, ids, cur=None):
@@ -355,9 +353,9 @@
def mktemp_content_metadata(self, cur=None):
pass
- def content_metadata_add_from_temp(self, conflict_update, cur=None):
+ def content_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute("select * from swh_content_metadata_add(%s)", (conflict_update,))
+ cur.execute("select * from swh_content_metadata_add()")
return cur.fetchone()[0]
def content_metadata_get_from_list(self, ids, cur=None):
@@ -392,11 +390,9 @@
def mktemp_revision_intrinsic_metadata(self, cur=None):
pass
- def revision_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None):
+ def revision_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_revision_intrinsic_metadata_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_revision_intrinsic_metadata_add()")
return cur.fetchone()[0]
def revision_intrinsic_metadata_get_from_list(self, ids, cur=None):
@@ -429,11 +425,9 @@
def mktemp_origin_intrinsic_metadata(self, cur=None):
pass
- def origin_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None):
+ def origin_intrinsic_metadata_add_from_temp(self, cur=None):
cur = self._cursor(cur)
- cur.execute(
- "select * from swh_origin_intrinsic_metadata_add(%s)", (conflict_update,)
- )
+ cur.execute("select * from swh_origin_intrinsic_metadata_add()")
return cur.fetchone()[0]
def origin_intrinsic_metadata_get_from_list(self, ids, cur=None):
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -193,7 +193,7 @@
assert len(ids) <= limit
return PagedResult(results=ids, next_page_token=next_page_token)
- def add(self, data: Iterable[TValue], conflict_update: bool) -> int:
+ def add(self, data: Iterable[TValue]) -> int:
"""Add data not present in storage.
Args:
@@ -204,9 +204,6 @@
results
- arbitrary data
- conflict_update (bool): Flag to determine if we want to overwrite
- (true) or skip duplicates (false)
-
"""
data = list(data)
check_id_duplicates(data)
@@ -216,9 +213,6 @@
id_ = item.pop("id")
tool_id = item["indexer_configuration_id"]
key = _key_from_dict(obj.unique_key())
- if not conflict_update and key in self._data[id_]:
- # Duplicate, should not be updated
- continue
self._data[id_][key] = item
self._tools_per_id[id_].add(tool_id)
count += 1
@@ -265,9 +259,9 @@
)
def content_mimetype_add(
- self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False
+ self, mimetypes: List[ContentMimetypeRow]
) -> Dict[str, int]:
- added = self._mimetypes.add(mimetypes, conflict_update)
+ added = self._mimetypes.add(mimetypes)
return {"content_mimetype:add": added}
def content_mimetype_get(self, ids: Iterable[Sha1]) -> List[ContentMimetypeRow]:
@@ -282,9 +276,9 @@
return self._languages.get(ids)
def content_language_add(
- self, languages: List[ContentLanguageRow], conflict_update: bool = False
+ self, languages: List[ContentLanguageRow]
) -> Dict[str, int]:
- added = self._languages.add(languages, conflict_update)
+ added = self._languages.add(languages)
return {"content_language:add": added}
def content_ctags_missing(self, ctags: Iterable[Dict]) -> List[Tuple[Sha1, int]]:
@@ -293,10 +287,8 @@
def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]:
return self._content_ctags.get(ids)
- def content_ctags_add(
- self, ctags: List[ContentCtagsRow], conflict_update: bool = False
- ) -> Dict[str, int]:
- added = self._content_ctags.add(ctags, conflict_update,)
+ def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]:
+ added = self._content_ctags.add(ctags)
return {"content_ctags:add": added}
def content_ctags_search(
@@ -329,9 +321,9 @@
return self._licenses.get(ids)
def content_fossology_license_add(
- self, licenses: List[ContentLicenseRow], conflict_update: bool = False
+ self, licenses: List[ContentLicenseRow]
) -> Dict[str, int]:
- added = self._licenses.add(licenses, conflict_update)
+ added = self._licenses.add(licenses)
return {"content_fossology_license:add": added}
def content_fossology_license_get_partition(
@@ -355,9 +347,9 @@
return self._content_metadata.get(ids)
def content_metadata_add(
- self, metadata: List[ContentMetadataRow], conflict_update: bool = False
+ self, metadata: List[ContentMetadataRow]
) -> Dict[str, int]:
- added = self._content_metadata.add(metadata, conflict_update)
+ added = self._content_metadata.add(metadata)
return {"content_metadata:add": added}
def revision_intrinsic_metadata_missing(
@@ -371,11 +363,9 @@
return self._revision_intrinsic_metadata.get(ids)
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
+ self, metadata: List[RevisionIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._revision_intrinsic_metadata.add(metadata, conflict_update)
+ added = self._revision_intrinsic_metadata.add(metadata)
return {"revision_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_get(
@@ -384,9 +374,9 @@
return self._origin_intrinsic_metadata.get(urls)
def origin_intrinsic_metadata_add(
- self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: bool = False
+ self, metadata: List[OriginIntrinsicMetadataRow]
) -> Dict[str, int]:
- added = self._origin_intrinsic_metadata.add(metadata, conflict_update)
+ added = self._origin_intrinsic_metadata.add(metadata)
return {"origin_intrinsic_metadata:add": added}
def origin_intrinsic_metadata_search_fulltext(
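In the in-memory backend the upsert collapses to a plain dict assignment keyed by each row's unique key, as the simplified add() above shows. A standalone toy model of that semantics (illustrative only, not code from this diff):

    data: dict = {}

    def add(rows) -> int:
        count = 0
        for key, item in rows:  # key plays the role of _key_from_dict(obj.unique_key())
            data[key] = item    # unconditional overwrite, like ON CONFLICT DO UPDATE
            count += 1
        return count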
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -84,14 +84,13 @@
@remote_api_endpoint("content_mimetype/add")
def content_mimetype_add(
- self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False
+ self, mimetypes: List[ContentMimetypeRow]
) -> Dict[str, int]:
"""Add mimetypes not present in storage.
Args:
mimetypes: mimetype rows to be added, with their `tool` attribute set to
None.
- conflict_update: Flag to determine if we want to
- overwrite (``True``) or skip duplicates (``False``, the
- default)
@@ -148,17 +147,13 @@
@remote_api_endpoint("content_language/add")
def content_language_add(
- self, languages: List[ContentLanguageRow], conflict_update: bool = False
+ self, languages: List[ContentLanguageRow]
) -> Dict[str, int]:
"""Add languages not present in storage.
Args:
languages: language row objects
- conflict_update (bool): Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the
- default)
-
Returns:
Dict summary of number of rows added
@@ -198,9 +193,7 @@
...
@remote_api_endpoint("content/ctags/add")
- def content_ctags_add(
- self, ctags: List[ContentCtagsRow], conflict_update: bool = False
- ) -> Dict[str, int]:
+ def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]:
"""Add ctags not present in storage
Args:
@@ -251,7 +244,7 @@
@remote_api_endpoint("content/fossology_license/add")
def content_fossology_license_add(
- self, licenses: List[ContentLicenseRow], conflict_update: bool = False
+ self, licenses: List[ContentLicenseRow]
) -> Dict[str, int]:
"""Add licenses not present in storage.
@@ -259,9 +252,6 @@
license: license rows to be added, with their `tool` attribute set to
None.
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -335,7 +325,7 @@
@remote_api_endpoint("content_metadata/add")
def content_metadata_add(
- self, metadata: List[ContentMetadataRow], conflict_update: bool = False
+ self, metadata: List[ContentMetadataRow]
) -> Dict[str, int]:
"""Add metadata not present in storage.
@@ -345,9 +335,6 @@
- **id**: sha1
- **metadata**: arbitrary dict
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -390,18 +377,13 @@
@remote_api_endpoint("revision_intrinsic_metadata/add")
def revision_intrinsic_metadata_add(
- self,
- metadata: List[RevisionIntrinsicMetadataRow],
- conflict_update: bool = False,
+ self, metadata: List[RevisionIntrinsicMetadataRow],
) -> Dict[str, int]:
"""Add metadata not present in storage.
Args:
metadata: ContentMetadataRow objects
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
@@ -423,16 +405,13 @@
@remote_api_endpoint("origin_intrinsic_metadata/add")
def origin_intrinsic_metadata_add(
- self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: bool = False
+ self, metadata: List[OriginIntrinsicMetadataRow]
) -> Dict[str, int]:
"""Add origin metadata not present in storage.
Args:
metadata: list of OriginIntrinsicMetadataRow objects
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
Returns:
Dict summary of number of rows added
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -137,45 +137,6 @@
actual_missing = endpoint(storage, etype, "missing")(query)
assert list(actual_missing) == [data.sha1_1]
- def test_add__drop_duplicate(
- self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
- ) -> None:
- storage, data = swh_indexer_storage_with_data
- etype = self.endpoint_type
- tool_id = data.tools[self.tool_name]["id"]
-
- # add the first object
- data_v1 = {
- "id": data.sha1_2,
- **self.example_data[0],
- "indexer_configuration_id": tool_id,
- }
- summary = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v1)])
- assert summary == expected_summary(1, etype)
-
- # should be able to retrieve it
- actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
- expected_data_v1 = [
- self.row_class.from_dict(
- {
- "id": data.sha1_2,
- **self.example_data[0],
- "tool": data.tools[self.tool_name],
- }
- )
- ]
- assert actual_data == expected_data_v1
-
- # now if we add a modified version of the same object (same id)
- data_v2 = data_v1.copy()
- data_v2.update(self.example_data[1])
- summary2 = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
- assert summary2 == expected_summary(0, etype) # not added
-
- # we expect to retrieve the original data, not the modified one
- actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
- assert actual_data == expected_data_v1
-
def test_add__update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
@@ -209,9 +170,7 @@
data_v2 = data_v1.copy()
data_v2.update(self.example_data[1])
- endpoint(storage, etype, "add")(
- [self.row_class.from_dict(data_v2)], conflict_update=True
- )
+ endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)])
assert summary == expected_summary(1, etype) # modified so counted
actual_data = list(endpoint(storage, etype, "get")([data.sha1_2]))
@@ -225,7 +184,7 @@
# data did change as the v2 was used to overwrite v1
assert actual_data == expected_data_v2
- def test_add__update_in_place_deadlock(
+ def test_add_deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
@@ -267,7 +226,9 @@
endpoint(storage, etype, "add")(data_v1)
# when
- actual_data = list(endpoint(storage, etype, "get")(hashes))
+ actual_data = sorted(
+ endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
+ )
expected_data_v1 = [
self.row_class.from_dict(
@@ -281,10 +242,10 @@
# given
def f1() -> None:
- endpoint(storage, etype, "add")(data_v2a, conflict_update=True)
+ endpoint(storage, etype, "add")(data_v2a)
def f2() -> None:
- endpoint(storage, etype, "add")(data_v2b, conflict_update=True)
+ endpoint(storage, etype, "add")(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
@@ -295,15 +256,21 @@
t2.join()
actual_data = sorted(
- (row.to_dict() for row in endpoint(storage, etype, "get")(hashes)),
- key=lambda x: x["id"],
+ endpoint(storage, etype, "get")(hashes), key=lambda x: x.id,
)
expected_data_v2 = [
- {"id": hash_, **self.example_data[1], "tool": tool} for hash_ in hashes
+ self.row_class.from_dict(
+ {"id": hash_, **self.example_data[1], "tool": tool}
+ )
+ for hash_ in hashes
]
- assert actual_data == expected_data_v2
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
def test_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
@@ -333,9 +300,7 @@
assert summary == expected_summary(1, etype)
with pytest.raises(DuplicateId):
- endpoint(storage, etype, "add")(
- [data_rev2, data_rev2], conflict_update=True
- )
+ endpoint(storage, etype, "add")([data_rev2, data_rev2])
# then
actual_data = list(
@@ -545,16 +510,12 @@
row_class = ContentCtagsRow
# the following tests are disabled because CTAGS behaves differently
- @pytest.mark.skip
- def test_add__drop_duplicate(self):
- pass
-
@pytest.mark.skip
def test_add__update_in_place_duplicate(self):
pass
@pytest.mark.skip
- def test_add__update_in_place_deadlock(self):
+ def test_add_deadlock(self):
pass
@pytest.mark.skip
@@ -736,7 +697,7 @@
)
ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool)
- storage.content_ctags_add([ctag1, ctag2], conflict_update=True)
+ storage.content_ctags_add([ctag1, ctag2])
actual_ctags = list(storage.content_ctags_get([data.sha1_2]))
@@ -889,7 +850,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -925,7 +886,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -951,7 +912,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -993,7 +954,7 @@
mimetypes = prepare_mimetypes_from_licenses(fossology_licenses)
indexer_configuration_id = fossology_licenses[0].indexer_configuration_id
- storage.content_mimetype_add(mimetypes, conflict_update=True)
+ storage.content_mimetype_add(mimetypes)
# add fossology_licenses to storage
storage.content_fossology_license_add(fossology_licenses)
@@ -1093,71 +1054,6 @@
assert actual_metadata == expected_metadata
- def test_origin_intrinsic_metadata_add_drop_duplicate(
- self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
- ) -> None:
- storage, data = swh_indexer_storage_with_data
- # given
- tool_id = data.tools["swh-metadata-detector"]["id"]
-
- metadata_v1: Dict[str, Any] = {
- "version": None,
- "name": None,
- }
- metadata_rev_v1 = RevisionIntrinsicMetadataRow(
- id=data.revision_id_1,
- metadata=metadata_v1.copy(),
- mappings=[],
- indexer_configuration_id=tool_id,
- )
- metadata_origin_v1 = OriginIntrinsicMetadataRow(
- id=data.origin_url_1,
- metadata=metadata_v1.copy(),
- indexer_configuration_id=tool_id,
- mappings=[],
- from_revision=data.revision_id_1,
- )
-
- # given
- storage.revision_intrinsic_metadata_add([metadata_rev_v1])
- storage.origin_intrinsic_metadata_add([metadata_origin_v1])
-
- # when
- actual_metadata = list(
- storage.origin_intrinsic_metadata_get([data.origin_url_1, "no://where"])
- )
-
- expected_metadata_v1 = [
- OriginIntrinsicMetadataRow(
- id=data.origin_url_1,
- metadata=metadata_v1,
- tool=data.tools["swh-metadata-detector"],
- from_revision=data.revision_id_1,
- mappings=[],
- )
- ]
-
- assert actual_metadata == expected_metadata_v1
-
- # given
- metadata_v2 = metadata_v1.copy()
- metadata_v2.update(
- {"name": "test_metadata", "author": "MG",}
- )
- metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
- metadata_origin_v2 = attr.evolve(metadata_origin_v1, metadata=metadata_v2)
-
- storage.revision_intrinsic_metadata_add([metadata_rev_v2])
- storage.origin_intrinsic_metadata_add([metadata_origin_v2])
-
- # then
- actual_metadata = list(
- storage.origin_intrinsic_metadata_get([data.origin_url_1])
- )
-
- # metadata did not change as the v2 was dropped.
- assert actual_metadata == expected_metadata_v1
-
def test_origin_intrinsic_metadata_add_update_in_place_duplicate(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
@@ -1218,10 +1114,8 @@
from_revision=data.revision_id_1,
)
- storage.revision_intrinsic_metadata_add([metadata_rev_v2], conflict_update=True)
- storage.origin_intrinsic_metadata_add(
- [metadata_origin_v2], conflict_update=True
- )
+ storage.revision_intrinsic_metadata_add([metadata_rev_v2])
+ storage.origin_intrinsic_metadata_add([metadata_origin_v2])
actual_metadata = list(
storage.origin_intrinsic_metadata_get([data.origin_url_1])
@@ -1240,7 +1134,7 @@
# metadata did change as the v2 was used to overwrite v1
assert actual_metadata == expected_metadata_v2
- def test_origin_intrinsic_metadata_add__update_in_place_deadlock(
+ def test_origin_intrinsic_metadata_add__deadlock(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
) -> None:
storage, data = swh_indexer_storage_with_data
@@ -1312,10 +1206,10 @@
# given
def f1() -> None:
- storage.origin_intrinsic_metadata_add(data_v2a, conflict_update=True)
+ storage.origin_intrinsic_metadata_add(data_v2a)
def f2() -> None:
- storage.origin_intrinsic_metadata_add(data_v2b, conflict_update=True)
+ storage.origin_intrinsic_metadata_add(data_v2b)
t1 = threading.Thread(target=f1)
t2 = threading.Thread(target=f2)
@@ -1337,8 +1231,11 @@
for id_ in ids
]
- assert len(actual_data) == len(expected_data_v2)
- assert sorted(actual_data, key=lambda x: x.id) == expected_data_v2
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
def test_origin_intrinsic_metadata_add__duplicate_twice(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -74,7 +74,7 @@
def _assert_tasks_for_origins(tasks, origins):
- expected_kwargs = {"policy_update": "update-dups"}
+ expected_kwargs = {}
assert {task["type"] for task in tasks} == {"index-origin-metadata"}
assert all(len(task["arguments"]["args"]) == 1 for task in tasks)
for task in tasks:
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -31,7 +31,7 @@
) -> List[Dict[str, Any]]:
raise _TestException()
- def persist_index_computations(self, results, policy_update) -> Dict[str, int]:
+ def persist_index_computations(self, results) -> Dict[str, int]:
return {}
def indexed_contents_in_partition(
@@ -61,12 +61,12 @@
indexer.objstorage = Mock()
indexer.objstorage.get.return_value = b"content"
- assert indexer.run([b"foo"], policy_update=True) == {"status": "failed"}
+ assert indexer.run([b"foo"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run([b"foo"], policy_update=True)
+ indexer.run([b"foo"])
def test_revision_indexer_catch_exceptions():
@@ -74,25 +74,23 @@
indexer.storage = Mock()
indexer.storage.revision_get.return_value = ["rev"]
- assert indexer.run([b"foo"], policy_update=True) == {"status": "failed"}
+ assert indexer.run([b"foo"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run([b"foo"], policy_update=True)
+ indexer.run([b"foo"])
def test_origin_indexer_catch_exceptions():
indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG)
- assert indexer.run(["http://example.org"], policy_update=True) == {
- "status": "failed"
- }
+ assert indexer.run(["http://example.org"]) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run(["http://example.org"], policy_update=True)
+ indexer.run(["http://example.org"])
def test_content_partition_indexer_catch_exceptions():
@@ -100,9 +98,9 @@
config={**BASE_TEST_CONFIG, "write_batch_size": 42}
)
- assert indexer.run(0, 42, policy_update=True) == {"status": "failed"}
+ assert indexer.run(0, 42) == {"status": "failed"}
indexer.catch_exceptions = False
with pytest.raises(_TestException):
- indexer.run(0, 42, policy_update=True)
+ indexer.run(0, 42)
diff --git a/swh/indexer/tests/test_journal_client.py b/swh/indexer/tests/test_journal_client.py
--- a/swh/indexer/tests/test_journal_client.py
+++ b/swh/indexer/tests/test_journal_client.py
@@ -30,10 +30,7 @@
(
[
{
- "arguments": {
- "kwargs": {"policy_update": "update-dups"},
- "args": (["file:///dev/zero"],),
- },
+ "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),},
"policy": "oneshot",
"type": "task-name",
"retries_left": 1,
@@ -64,10 +61,7 @@
(
[
{
- "arguments": {
- "kwargs": {"policy_update": "update-dups"},
- "args": (["file:///dev/zero"],),
- },
+ "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),},
"policy": "oneshot",
"type": "task-name",
"retries_left": 1,
@@ -100,7 +94,7 @@
[
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///dev/zero", "file:///tmp/foobar"],),
},
"policy": "oneshot",
@@ -138,7 +132,7 @@
[
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///dev/zero", "file:///tmp/foobar"],),
},
"policy": "oneshot",
@@ -147,7 +141,7 @@
},
{
"arguments": {
- "kwargs": {"policy_update": "update-dups"},
+ "kwargs": {},
"args": (["file:///tmp/spamegg"],),
},
"policy": "oneshot",
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -140,7 +140,7 @@
fill_storage(metadata_indexer.storage)
# when
- metadata_indexer.run(sha1s, policy_update="ignore-dups")
+ metadata_indexer.run(sha1s)
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
@@ -1122,7 +1122,7 @@
]
)
- metadata_indexer.run([rev.id], "update-dups")
+ metadata_indexer.run([rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
@@ -1183,7 +1183,7 @@
]
)
- metadata_indexer.run([new_rev.id], "update-dups")
+ metadata_indexer.run([new_rev.id])
results = list(
metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -46,7 +46,7 @@
indexing tests.
"""
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(self, results):
self.results = results
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -262,7 +262,6 @@
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
- self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
@@ -606,12 +605,12 @@
sha1s = [self.id0, self.id1, self.id2]
# when
- self.indexer.run(sha1s, policy_update="update-dups")
+ self.indexer.run(sha1s)
self.assert_results_ok(sha1s)
# 2nd pass
- self.indexer.run(sha1s, policy_update="ignore-dups")
+ self.indexer.run(sha1s)
self.assert_results_ok(sha1s)
@@ -624,7 +623,7 @@
] # unknown
# when
- self.indexer.run(sha1s, policy_update="update-dups")
+ self.indexer.run(sha1s)
# then
expected_results = [