diff --git a/sql/upgrades/133.sql b/sql/upgrades/133.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/133.sql @@ -0,0 +1,184 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 132 +-- to_version: 133 +-- description: remove 'conflict_update' argument + +insert into dbversion(version, release, description) + values(133, now(), 'Work In Progress'); + +drop function swh_content_mimetype_add(conflict_update boolean); +create or replace function swh_content_mimetype_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + on conflict(id, indexer_configuration_id) + do update set mimetype = excluded.mimetype, + encoding = excluded.encoding; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_content_mimetype_add() IS 'Add new content mimetypes'; + + + +drop function swh_content_language_add(conflict_update boolean); +create or replace function swh_content_language_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + on conflict(id, indexer_configuration_id) + do update set lang = excluded.lang; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_content_language_add() IS 'Add new content languages'; + + + +drop function swh_content_ctags_add(conflict_update boolean); +create or replace function swh_content_ctags_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) + select id, name, kind, line, lang, indexer_configuration_id + from tmp_content_ctags tct + on conflict(id, hash_sha1(name), kind, line, lang, 
indexer_configuration_id) + do nothing; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content'; + + + +drop function swh_content_fossology_license_add(conflict_update boolean); +create or replace function swh_content_fossology_license_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + -- insert unknown licenses first + insert into fossology_license (name) + select distinct license from tmp_content_fossology_license tmp + where not exists (select 1 from fossology_license where name=tmp.license) + on conflict(name) do nothing; + + insert into content_fossology_license (id, license_id, indexer_configuration_id) + select tcl.id, + (select id from fossology_license where name = tcl.license) as license, + indexer_configuration_id + from tmp_content_fossology_license tcl + on conflict(id, license_id, indexer_configuration_id) + do update set license_id = excluded.license_id; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_content_fossology_license_add() IS 'Add new content licenses'; + + + +drop function swh_content_metadata_add(conflict_update boolean); +create or replace function swh_content_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do update set metadata = excluded.metadata; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_content_metadata_add() IS 'Add new content metadata'; + + + +drop function swh_revision_intrinsic_metadata_add(conflict_update boolean); +create or replace function swh_revision_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + insert into 
revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata'; + + + +drop function swh_origin_intrinsic_metadata_add(conflict_update boolean); +create or replace function swh_origin_intrinsic_metadata_add() + returns bigint + language plpgsql +as $$ +declare + res bigint; +begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); + + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_revision = excluded.from_revision; + + get diagnostics res = ROW_COUNT; + return res; +end +$$; + +comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata'; diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql --- a/swh/indexer/sql/30-schema.sql +++ b/swh/indexer/sql/30-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(132, now(), 'Work In Progress'); + values(133, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql --- a/swh/indexer/sql/50-func.sql +++ b/swh/indexer/sql/50-func.sql @@ -40,8 +40,7 @@ comment on function swh_mktemp_content_mimetype() IS 
'Helper table to add mimetype information'; --- add tmp_content_mimetype entries to content_mimetype, overwriting --- duplicates if conflict_update is true, skipping duplicates otherwise. +-- add tmp_content_mimetype entries to content_mimetype, overwriting duplicates. -- -- If filtering duplicates is in order, the call to -- swh_content_mimetype_missing must take place before calling this @@ -49,36 +48,28 @@ -- -- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype, -- 2. call this function -create or replace function swh_content_mimetype_add(conflict_update boolean) +create or replace function swh_content_mimetype_add() returns bigint language plpgsql as $$ declare res bigint; begin - if conflict_update then - insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) - select id, mimetype, encoding, indexer_configuration_id - from tmp_content_mimetype tcm - on conflict(id, indexer_configuration_id) - do update set mimetype = excluded.mimetype, - encoding = excluded.encoding; - else - insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) - select id, mimetype, encoding, indexer_configuration_id - from tmp_content_mimetype tcm - on conflict(id, indexer_configuration_id) - do nothing; - end if; + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + on conflict(id, indexer_configuration_id) + do update set mimetype = excluded.mimetype, + encoding = excluded.encoding; + get diagnostics res = ROW_COUNT; return res; end $$; -comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; +comment on function swh_content_mimetype_add() IS 'Add new content mimetypes'; --- add tmp_content_language entries to content_language, overwriting --- duplicates if conflict_update is true, skipping duplicates otherwise. 
+-- add tmp_content_language entries to content_language, overwriting duplicates. -- -- If filtering duplicates is in order, the call to -- swh_content_language_missing must take place before calling this @@ -86,32 +77,25 @@ -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_language, 2. call this function -create or replace function swh_content_language_add(conflict_update boolean) +create or replace function swh_content_language_add() returns bigint language plpgsql as $$ declare res bigint; begin - if conflict_update then - insert into content_language (id, lang, indexer_configuration_id) - select id, lang, indexer_configuration_id - from tmp_content_language tcl - on conflict(id, indexer_configuration_id) - do update set lang = excluded.lang; - else - insert into content_language (id, lang, indexer_configuration_id) - select id, lang, indexer_configuration_id - from tmp_content_language tcl - on conflict(id, indexer_configuration_id) - do nothing; - end if; + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + on conflict(id, indexer_configuration_id) + do update set lang = excluded.lang; + get diagnostics res = ROW_COUNT; return res; end $$; -comment on function swh_content_language_add(boolean) IS 'Add new content languages'; +comment on function swh_content_language_add() IS 'Add new content languages'; -- create a temporary table for retrieving content_language create or replace function swh_mktemp_content_language() @@ -139,36 +123,29 @@ comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; --- add tmp_content_ctags entries to content_ctags, overwriting --- duplicates if conflict_update is true, skipping duplicates otherwise. +-- add tmp_content_ctags entries to content_ctags, skipping duplicates. -- -- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, -- 2. 
call this function -create or replace function swh_content_ctags_add(conflict_update boolean) +create or replace function swh_content_ctags_add() returns bigint language plpgsql as $$ declare res bigint; begin - if conflict_update then - delete from content_ctags - where id in (select tmp.id - from tmp_content_ctags tmp - inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); - end if; - insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) select id, name, kind, line, lang, indexer_configuration_id from tmp_content_ctags tct on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) do nothing; + get diagnostics res = ROW_COUNT; return res; end $$; -comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; +comment on function swh_content_ctags_add() IS 'Add new ctags symbols per content'; create type content_ctags_signature as ( id sha1, @@ -218,12 +195,12 @@ comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; --- add tmp_content_fossology_license entries to content_fossology_license, overwriting --- duplicates if conflict_update is true, skipping duplicates otherwise. +-- add tmp_content_fossology_license entries to content_fossology_license, +-- overwriting duplicates. -- -- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to -- tmp_content_fossology_license, 2. 
call this function -create or replace function swh_content_fossology_license_add(conflict_update boolean) +create or replace function swh_content_fossology_license_add() returns bigint language plpgsql as $$ @@ -236,36 +213,25 @@ where not exists (select 1 from fossology_license where name=tmp.license) on conflict(name) do nothing; - if conflict_update then - insert into content_fossology_license (id, license_id, indexer_configuration_id) - select tcl.id, - (select id from fossology_license where name = tcl.license) as license, - indexer_configuration_id - from tmp_content_fossology_license tcl - on conflict(id, license_id, indexer_configuration_id) - do update set license_id = excluded.license_id; - else - insert into content_fossology_license (id, license_id, indexer_configuration_id) - select tcl.id, - (select id from fossology_license where name = tcl.license) as license, - indexer_configuration_id - from tmp_content_fossology_license tcl - on conflict(id, license_id, indexer_configuration_id) - do nothing; - end if; + insert into content_fossology_license (id, license_id, indexer_configuration_id) + select tcl.id, + (select id from fossology_license where name = tcl.license) as license, + indexer_configuration_id + from tmp_content_fossology_license tcl + on conflict(id, license_id, indexer_configuration_id) + do update set license_id = excluded.license_id; get diagnostics res = ROW_COUNT; return res; end $$; -comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; +comment on function swh_content_fossology_license_add() IS 'Add new content licenses'; -- content_metadata functions --- add tmp_content_metadata entries to content_metadata, overwriting --- duplicates if conflict_update is true, skipping duplicates otherwise. 
+-- add tmp_content_metadata entries to content_metadata, overwriting duplicates -- -- If filtering duplicates is in order, the call to -- swh_content_metadata_missing must take place before calling this @@ -273,32 +239,25 @@ -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_metadata, 2. call this function -create or replace function swh_content_metadata_add(conflict_update boolean) +create or replace function swh_content_metadata_add() returns bigint language plpgsql as $$ declare res bigint; begin - if conflict_update then - insert into content_metadata (id, metadata, indexer_configuration_id) - select id, metadata, indexer_configuration_id - from tmp_content_metadata tcm - on conflict(id, indexer_configuration_id) - do update set metadata = excluded.metadata; - else - insert into content_metadata (id, metadata, indexer_configuration_id) - select id, metadata, indexer_configuration_id - from tmp_content_metadata tcm - on conflict(id, indexer_configuration_id) - do nothing; - end if; + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do update set metadata = excluded.metadata; + get diagnostics res = ROW_COUNT; return res; end $$; -comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata'; +comment on function swh_content_metadata_add() IS 'Add new content metadata'; -- create a temporary table for retrieving content_metadata create or replace function swh_mktemp_content_metadata() @@ -315,8 +274,7 @@ -- end content_metadata functions -- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata, --- overwriting duplicates if conflict_update is true, skipping duplicates --- otherwise. +-- overwriting duplicates. 
-- -- If filtering duplicates is in order, the call to -- swh_revision_intrinsic_metadata_missing must take place before calling this -- @@ -324,34 +282,27 @@ -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_revision_intrinsic_metadata, 2. call this function -create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean) +create or replace function swh_revision_intrinsic_metadata_add() returns bigint language plpgsql as $$ declare res bigint; begin - if conflict_update then - insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) - select id, metadata, mappings, indexer_configuration_id - from tmp_revision_intrinsic_metadata tcm - on conflict(id, indexer_configuration_id) - do update set - metadata = excluded.metadata, - mappings = excluded.mappings; - else - insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) - select id, metadata, mappings, indexer_configuration_id - from tmp_revision_intrinsic_metadata tcm - on conflict(id, indexer_configuration_id) - do nothing; - end if; + insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + get diagnostics res = ROW_COUNT; return res; end $$; -comment on function swh_revision_intrinsic_metadata_add(boolean) IS 'Add new revision intrinsic metadata'; +comment on function swh_revision_intrinsic_metadata_add() IS 'Add new revision intrinsic metadata'; -- create a temporary table for retrieving revision_intrinsic_metadata create or replace function swh_mktemp_revision_intrinsic_metadata() @@ -389,7 +340,7 @@ -- add tmp_indexer_configuration entries to indexer_configuration, --- skipping duplicates if any. +-- skipping duplicates if any. 
-- -- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to -- it, 2. call this function to insert and filtering out duplicates @@ -412,8 +363,7 @@ $$; -- add tmp_origin_intrinsic_metadata entries to origin_intrinsic_metadata, --- overwriting duplicates if conflict_update is true, skipping duplicates --- otherwise. +-- overwriting duplicates. -- -- If filtering duplicates is in order, the call to -- swh_origin_intrinsic_metadata_missing must take place before calling this @@ -421,8 +371,7 @@ -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_origin_intrinsic_metadata, 2. call this function -create or replace function swh_origin_intrinsic_metadata_add( - conflict_update boolean) +create or replace function swh_origin_intrinsic_metadata_add() returns bigint language plpgsql as $$ @@ -430,31 +379,24 @@ res bigint; begin perform swh_origin_intrinsic_metadata_compute_tsvector(); - if conflict_update then - insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select id, metadata, indexer_configuration_id, from_revision, - metadata_tsvector, mappings - from tmp_origin_intrinsic_metadata - on conflict(id, indexer_configuration_id) - do update set - metadata = excluded.metadata, - metadata_tsvector = excluded.metadata_tsvector, - mappings = excluded.mappings, - from_revision = excluded.from_revision; - else - insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select id, metadata, indexer_configuration_id, from_revision, - metadata_tsvector, mappings - from tmp_origin_intrinsic_metadata - on conflict(id, indexer_configuration_id) - do nothing; - end if; + + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, + metadata_tsvector, mappings + from 
tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + metadata_tsvector = excluded.metadata_tsvector, + mappings = excluded.mappings, + from_revision = excluded.from_revision; + get diagnostics res = ROW_COUNT; return res; end $$; -comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata'; +comment on function swh_origin_intrinsic_metadata_add() IS 'Add new origin intrinsic metadata'; -- Compute the metadata_tsvector column in tmp_origin_intrinsic_metadata. diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -267,11 +267,7 @@ @process_metrics @db_transaction() def content_mimetype_add( - self, - mimetypes: List[ContentMimetypeRow], - conflict_update: bool = False, - db=None, - cur=None, + self, mimetypes: List[ContentMimetypeRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m.id) @@ -282,7 +278,7 @@ ["id", "mimetype", "encoding", "indexer_configuration_id"], cur, ) - count = db.content_mimetype_add_from_temp(conflict_update, cur) + count = db.content_mimetype_add_from_temp(cur) return {"content_mimetype:add": count} @timed @@ -320,11 +316,7 @@ @process_metrics @db_transaction() def content_language_add( - self, - languages: List[ContentLanguageRow], - conflict_update: bool = False, - db=None, - cur=None, + self, languages: List[ContentLanguageRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(languages) languages.sort(key=lambda m: m.id) @@ -344,7 +336,7 @@ cur, ) - count = db.content_language_add_from_temp(conflict_update, cur) + count = db.content_language_add_from_temp(cur) return {"content_language:add": count} @timed @@ -370,11 +362,7 @@ @process_metrics @db_transaction() def content_ctags_add( - self, - ctags: List[ContentCtagsRow], - conflict_update: bool = False, - 
db=None, - cur=None, + self, ctags: List[ContentCtagsRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(ctags) ctags.sort(key=lambda m: m.id) @@ -387,7 +375,7 @@ cur=cur, ) - count = db.content_ctags_add_from_temp(conflict_update, cur) + count = db.content_ctags_add_from_temp(cur) return {"content_ctags:add": count} @timed @@ -425,11 +413,7 @@ @process_metrics @db_transaction() def content_fossology_license_add( - self, - licenses: List[ContentLicenseRow], - conflict_update: bool = False, - db=None, - cur=None, + self, licenses: List[ContentLicenseRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(licenses) licenses.sort(key=lambda m: m.id) @@ -440,7 +424,7 @@ columns=["id", "license", "indexer_configuration_id"], cur=cur, ) - count = db.content_fossology_license_add_from_temp(conflict_update, cur) + count = db.content_fossology_license_add_from_temp(cur) return {"content_fossology_license:add": count} @timed @@ -490,11 +474,7 @@ @process_metrics @db_transaction() def content_metadata_add( - self, - metadata: List[ContentMetadataRow], - conflict_update: bool = False, - db=None, - cur=None, + self, metadata: List[ContentMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) @@ -507,7 +487,7 @@ ["id", "metadata", "indexer_configuration_id"], cur, ) - count = db.content_metadata_add_from_temp(conflict_update, cur) + count = db.content_metadata_add_from_temp(cur) return { "content_metadata:add": count, } @@ -540,11 +520,7 @@ @process_metrics @db_transaction() def revision_intrinsic_metadata_add( - self, - metadata: List[RevisionIntrinsicMetadataRow], - conflict_update: bool = False, - db=None, - cur=None, + self, metadata: List[RevisionIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) @@ -557,7 +533,7 @@ ["id", "metadata", "mappings", "indexer_configuration_id"], cur, ) - count = 
db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) + count = db.revision_intrinsic_metadata_add_from_temp(cur) return { "revision_intrinsic_metadata:add": count, } @@ -580,11 +556,7 @@ @process_metrics @db_transaction() def origin_intrinsic_metadata_add( - self, - metadata: List[OriginIntrinsicMetadataRow], - conflict_update: bool = False, - db=None, - cur=None, + self, metadata: List[OriginIntrinsicMetadataRow], db=None, cur=None, ) -> Dict[str, int]: check_id_duplicates(metadata) metadata.sort(key=lambda m: m.id) @@ -597,7 +569,7 @@ ["id", "metadata", "indexer_configuration_id", "from_revision", "mappings"], cur, ) - count = db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) + count = db.origin_intrinsic_metadata_add_from_temp(cur) return { "origin_intrinsic_metadata:add": count, } diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -74,9 +74,9 @@ def mktemp_content_mimetype(self, cur=None): pass - def content_mimetype_add_from_temp(self, conflict_update, cur=None): + def content_mimetype_add_from_temp(self, cur=None): cur = self._cursor(cur) - cur.execute("select * from swh_content_mimetype_add(%s)", (conflict_update,)) + cur.execute("select * from swh_content_mimetype_add()") return cur.fetchone()[0] def _convert_key(self, key, main_table="c"): @@ -209,9 +209,9 @@ def mktemp_content_language(self, cur=None): pass - def content_language_add_from_temp(self, conflict_update, cur=None): + def content_language_add_from_temp(self, cur=None): cur = self._cursor(cur) - cur.execute("select * from swh_content_language_add(%s)", (conflict_update,)) + cur.execute("select * from swh_content_language_add()") return cur.fetchone()[0] def content_language_get_from_list(self, ids, cur=None): @@ -245,9 +245,9 @@ def mktemp_content_ctags(self, cur=None): pass - def content_ctags_add_from_temp(self, conflict_update, cur=None): + def content_ctags_add_from_temp(self, 
cur=None): cur = self._cursor(cur) - cur.execute("select * from swh_content_ctags_add(%s)", (conflict_update,)) + cur.execute("select * from swh_content_ctags_add()") return cur.fetchone()[0] def content_ctags_get_from_list(self, ids, cur=None): @@ -303,14 +303,12 @@ def mktemp_content_fossology_license(self, cur=None): pass - def content_fossology_license_add_from_temp(self, conflict_update, cur=None): + def content_fossology_license_add_from_temp(self, cur=None): """Add new licenses per content. """ cur = self._cursor(cur) - cur.execute( - "select * from swh_content_fossology_license_add(%s)", (conflict_update,) - ) + cur.execute("select * from swh_content_fossology_license_add()") return cur.fetchone()[0] def content_fossology_license_get_from_list(self, ids, cur=None): @@ -355,9 +353,9 @@ def mktemp_content_metadata(self, cur=None): pass - def content_metadata_add_from_temp(self, conflict_update, cur=None): + def content_metadata_add_from_temp(self, cur=None): cur = self._cursor(cur) - cur.execute("select * from swh_content_metadata_add(%s)", (conflict_update,)) + cur.execute("select * from swh_content_metadata_add()") return cur.fetchone()[0] def content_metadata_get_from_list(self, ids, cur=None): @@ -392,11 +390,9 @@ def mktemp_revision_intrinsic_metadata(self, cur=None): pass - def revision_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None): + def revision_intrinsic_metadata_add_from_temp(self, cur=None): cur = self._cursor(cur) - cur.execute( - "select * from swh_revision_intrinsic_metadata_add(%s)", (conflict_update,) - ) + cur.execute("select * from swh_revision_intrinsic_metadata_add()") return cur.fetchone()[0] def revision_intrinsic_metadata_get_from_list(self, ids, cur=None): @@ -429,11 +425,9 @@ def mktemp_origin_intrinsic_metadata(self, cur=None): pass - def origin_intrinsic_metadata_add_from_temp(self, conflict_update, cur=None): + def origin_intrinsic_metadata_add_from_temp(self, cur=None): cur = self._cursor(cur) - cur.execute( - 
"select * from swh_origin_intrinsic_metadata_add(%s)", (conflict_update,) - ) + cur.execute("select * from swh_origin_intrinsic_metadata_add()") return cur.fetchone()[0] def origin_intrinsic_metadata_get_from_list(self, ids, cur=None): diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -193,7 +193,7 @@ assert len(ids) <= limit return PagedResult(results=ids, next_page_token=next_page_token) - def add(self, data: Iterable[TValue], conflict_update: bool) -> int: + def add(self, data: Iterable[TValue]) -> int: """Add data not present in storage. Args: @@ -204,9 +204,6 @@ results - arbitrary data - conflict_update (bool): Flag to determine if we want to overwrite - (true) or skip duplicates (false) - """ data = list(data) check_id_duplicates(data) @@ -216,9 +213,6 @@ id_ = item.pop("id") tool_id = item["indexer_configuration_id"] key = _key_from_dict(obj.unique_key()) - if not conflict_update and key in self._data[id_]: - # Duplicate, should not be updated - continue self._data[id_][key] = item self._tools_per_id[id_].add(tool_id) count += 1 @@ -265,9 +259,9 @@ ) def content_mimetype_add( - self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False + self, mimetypes: List[ContentMimetypeRow] ) -> Dict[str, int]: - added = self._mimetypes.add(mimetypes, conflict_update) + added = self._mimetypes.add(mimetypes) return {"content_mimetype:add": added} def content_mimetype_get(self, ids: Iterable[Sha1]) -> List[ContentMimetypeRow]: @@ -282,9 +276,9 @@ return self._languages.get(ids) def content_language_add( - self, languages: List[ContentLanguageRow], conflict_update: bool = False + self, languages: List[ContentLanguageRow] ) -> Dict[str, int]: - added = self._languages.add(languages, conflict_update) + added = self._languages.add(languages) return {"content_language:add": added} def content_ctags_missing(self, ctags: Iterable[Dict]) -> 
List[Tuple[Sha1, int]]: @@ -293,10 +287,8 @@ def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]: return self._content_ctags.get(ids) - def content_ctags_add( - self, ctags: List[ContentCtagsRow], conflict_update: bool = False - ) -> Dict[str, int]: - added = self._content_ctags.add(ctags, conflict_update,) + def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]: + added = self._content_ctags.add(ctags) return {"content_ctags:add": added} def content_ctags_search( @@ -329,9 +321,9 @@ return self._licenses.get(ids) def content_fossology_license_add( - self, licenses: List[ContentLicenseRow], conflict_update: bool = False + self, licenses: List[ContentLicenseRow] ) -> Dict[str, int]: - added = self._licenses.add(licenses, conflict_update) + added = self._licenses.add(licenses) return {"content_fossology_license:add": added} def content_fossology_license_get_partition( @@ -355,9 +347,9 @@ return self._content_metadata.get(ids) def content_metadata_add( - self, metadata: List[ContentMetadataRow], conflict_update: bool = False + self, metadata: List[ContentMetadataRow] ) -> Dict[str, int]: - added = self._content_metadata.add(metadata, conflict_update) + added = self._content_metadata.add(metadata) return {"content_metadata:add": added} def revision_intrinsic_metadata_missing( @@ -371,11 +363,9 @@ return self._revision_intrinsic_metadata.get(ids) def revision_intrinsic_metadata_add( - self, - metadata: List[RevisionIntrinsicMetadataRow], - conflict_update: bool = False, + self, metadata: List[RevisionIntrinsicMetadataRow] ) -> Dict[str, int]: - added = self._revision_intrinsic_metadata.add(metadata, conflict_update) + added = self._revision_intrinsic_metadata.add(metadata) return {"revision_intrinsic_metadata:add": added} def origin_intrinsic_metadata_get( @@ -384,9 +374,9 @@ return self._origin_intrinsic_metadata.get(urls) def origin_intrinsic_metadata_add( - self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: 
bool = False + self, metadata: List[OriginIntrinsicMetadataRow] ) -> Dict[str, int]: - added = self._origin_intrinsic_metadata.add(metadata, conflict_update) + added = self._origin_intrinsic_metadata.add(metadata) return {"origin_intrinsic_metadata:add": added} def origin_intrinsic_metadata_search_fulltext( diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py --- a/swh/indexer/storage/interface.py +++ b/swh/indexer/storage/interface.py @@ -84,14 +84,12 @@ @remote_api_endpoint("content_mimetype/add") def content_mimetype_add( - self, mimetypes: List[ContentMimetypeRow], conflict_update: bool = False + self, mimetypes: List[ContentMimetypeRow] ) -> Dict[str, int]: """Add mimetypes not present in storage. Args: mimetypes: mimetype rows to be added, with their `tool` attribute set to None. - conflict_update: Flag to determine if we want to overwrite (``True``) - or skip duplicates (``False``, the default) @@ -148,17 +147,13 @@ @remote_api_endpoint("content_language/add") def content_language_add( - self, languages: List[ContentLanguageRow], conflict_update: bool = False + self, languages: List[ContentLanguageRow] ) -> Dict[str, int]: """Add languages not present in storage. Args: languages: language row objects - conflict_update (bool): Flag to determine if we want to - overwrite (true) or skip duplicates (false, the - default) - Returns: Dict summary of number of rows added @@ -198,9 +193,7 @@ ... 
@remote_api_endpoint("content/ctags/add") - def content_ctags_add( - self, ctags: List[ContentCtagsRow], conflict_update: bool = False - ) -> Dict[str, int]: + def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]: """Add ctags not present in storage Args: @@ -251,7 +244,7 @@ @remote_api_endpoint("content/fossology_license/add") def content_fossology_license_add( - self, licenses: List[ContentLicenseRow], conflict_update: bool = False + self, licenses: List[ContentLicenseRow] ) -> Dict[str, int]: """Add licenses not present in storage. @@ -259,9 +252,6 @@ license: license rows to be added, with their `tool` attribute set to None. - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - Returns: Dict summary of number of rows added @@ -335,7 +325,7 @@ @remote_api_endpoint("content_metadata/add") def content_metadata_add( - self, metadata: List[ContentMetadataRow], conflict_update: bool = False + self, metadata: List[ContentMetadataRow] ) -> Dict[str, int]: """Add metadata not present in storage. @@ -345,9 +335,6 @@ - **id**: sha1 - **metadata**: arbitrary dict - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - Returns: Dict summary of number of rows added @@ -390,18 +377,13 @@ @remote_api_endpoint("revision_intrinsic_metadata/add") def revision_intrinsic_metadata_add( - self, - metadata: List[RevisionIntrinsicMetadataRow], - conflict_update: bool = False, + self, metadata: List[RevisionIntrinsicMetadataRow], ) -> Dict[str, int]: """Add metadata not present in storage. 
Args: metadata: ContentMetadataRow objects - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - Returns: Dict summary of number of rows added @@ -423,16 +405,13 @@ @remote_api_endpoint("origin_intrinsic_metadata/add") def origin_intrinsic_metadata_add( - self, metadata: List[OriginIntrinsicMetadataRow], conflict_update: bool = False + self, metadata: List[OriginIntrinsicMetadataRow] ) -> Dict[str, int]: """Add origin metadata not present in storage. Args: metadata: list of OriginIntrinsicMetadataRow objects - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - Returns: Dict summary of number of rows added diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -137,45 +137,6 @@ actual_missing = endpoint(storage, etype, "missing")(query) assert list(actual_missing) == [data.sha1_1] - def test_add__drop_duplicate( - self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] - ) -> None: - storage, data = swh_indexer_storage_with_data - etype = self.endpoint_type - tool_id = data.tools[self.tool_name]["id"] - - # add the first object - data_v1 = { - "id": data.sha1_2, - **self.example_data[0], - "indexer_configuration_id": tool_id, - } - summary = endpoint(storage, etype, "add")([self.row_class.from_dict(data_v1)]) - assert summary == expected_summary(1, etype) - - # should be able to retrieve it - actual_data = list(endpoint(storage, etype, "get")([data.sha1_2])) - expected_data_v1 = [ - self.row_class.from_dict( - { - "id": data.sha1_2, - **self.example_data[0], - "tool": data.tools[self.tool_name], - } - ) - ] - assert actual_data == expected_data_v1 - - # now if we add a modified version of the same object (same id) - data_v2 = data_v1.copy() - data_v2.update(self.example_data[1]) - summary2 = 
endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)]) - assert summary2 == expected_summary(0, etype) # not added - - # we expect to retrieve the original data, not the modified one - actual_data = list(endpoint(storage, etype, "get")([data.sha1_2])) - assert actual_data == expected_data_v1 - def test_add__update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: @@ -209,9 +170,7 @@ data_v2 = data_v1.copy() data_v2.update(self.example_data[1]) - endpoint(storage, etype, "add")( - [self.row_class.from_dict(data_v2)], conflict_update=True - ) + endpoint(storage, etype, "add")([self.row_class.from_dict(data_v2)]) assert summary == expected_summary(1, etype) # modified so counted actual_data = list(endpoint(storage, etype, "get")([data.sha1_2])) @@ -225,7 +184,7 @@ # data did change as the v2 was used to overwrite v1 assert actual_data == expected_data_v2 - def test_add__update_in_place_deadlock( + def test_add_deadlock( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data @@ -267,7 +226,9 @@ endpoint(storage, etype, "add")(data_v1) # when - actual_data = list(endpoint(storage, etype, "get")(hashes)) + actual_data = sorted( + endpoint(storage, etype, "get")(hashes), key=lambda x: x.id, + ) expected_data_v1 = [ self.row_class.from_dict( @@ -281,10 +242,10 @@ # given def f1() -> None: - endpoint(storage, etype, "add")(data_v2a, conflict_update=True) + endpoint(storage, etype, "add")(data_v2a) def f2() -> None: - endpoint(storage, etype, "add")(data_v2b, conflict_update=True) + endpoint(storage, etype, "add")(data_v2b) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) @@ -295,15 +256,21 @@ t2.join() actual_data = sorted( - (row.to_dict() for row in endpoint(storage, etype, "get")(hashes)), - key=lambda x: x["id"], + endpoint(storage, etype, "get")(hashes), key=lambda x: x.id, ) expected_data_v2 = [ - {"id": hash_, 
**self.example_data[1], "tool": tool} for hash_ in hashes + self.row_class.from_dict( + {"id": hash_, **self.example_data[1], "tool": tool} + ) + for hash_ in hashes ] - assert actual_data == expected_data_v2 + assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2) + for (item, expected_item_v1, expected_item_v2) in zip( + actual_data, expected_data_v1, expected_data_v2 + ): + assert item in (expected_item_v1, expected_item_v2) def test_add__duplicate_twice( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] @@ -333,9 +300,7 @@ assert summary == expected_summary(1, etype) with pytest.raises(DuplicateId): - endpoint(storage, etype, "add")( - [data_rev2, data_rev2], conflict_update=True - ) + endpoint(storage, etype, "add")([data_rev2, data_rev2]) # then actual_data = list( @@ -545,16 +510,12 @@ row_class = ContentCtagsRow # the following tests are disabled because CTAGS behaves differently - @pytest.mark.skip - def test_add__drop_duplicate(self): - pass - @pytest.mark.skip def test_add__update_in_place_duplicate(self): pass @pytest.mark.skip - def test_add__update_in_place_deadlock(self): + def test_add_deadlock(self): pass @pytest.mark.skip @@ -736,7 +697,7 @@ ) ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool) - storage.content_ctags_add([ctag1, ctag2], conflict_update=True) + storage.content_ctags_add([ctag1, ctag2]) actual_ctags = list(storage.content_ctags_get([data.sha1_2])) @@ -889,7 +850,7 @@ mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id - storage.content_mimetype_add(mimetypes, conflict_update=True) + storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) @@ -925,7 +886,7 @@ mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id - 
storage.content_mimetype_add(mimetypes, conflict_update=True) + storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) @@ -951,7 +912,7 @@ mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id - storage.content_mimetype_add(mimetypes, conflict_update=True) + storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) @@ -993,7 +954,7 @@ mimetypes = prepare_mimetypes_from_licenses(fossology_licenses) indexer_configuration_id = fossology_licenses[0].indexer_configuration_id - storage.content_mimetype_add(mimetypes, conflict_update=True) + storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) @@ -1093,71 +1054,6 @@ assert actual_metadata == expected_metadata - def test_origin_intrinsic_metadata_add_drop_duplicate( - self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] - ) -> None: - storage, data = swh_indexer_storage_with_data - # given - tool_id = data.tools["swh-metadata-detector"]["id"] - - metadata_v1: Dict[str, Any] = { - "version": None, - "name": None, - } - metadata_rev_v1 = RevisionIntrinsicMetadataRow( - id=data.revision_id_1, - metadata=metadata_v1.copy(), - mappings=[], - indexer_configuration_id=tool_id, - ) - metadata_origin_v1 = OriginIntrinsicMetadataRow( - id=data.origin_url_1, - metadata=metadata_v1.copy(), - indexer_configuration_id=tool_id, - mappings=[], - from_revision=data.revision_id_1, - ) - - # given - storage.revision_intrinsic_metadata_add([metadata_rev_v1]) - storage.origin_intrinsic_metadata_add([metadata_origin_v1]) - - # when - actual_metadata = list( - storage.origin_intrinsic_metadata_get([data.origin_url_1, "no://where"]) - ) - - expected_metadata_v1 = [ - OriginIntrinsicMetadataRow( - 
id=data.origin_url_1, - metadata=metadata_v1, - tool=data.tools["swh-metadata-detector"], - from_revision=data.revision_id_1, - mappings=[], - ) - ] - - assert actual_metadata == expected_metadata_v1 - - # given - metadata_v2 = metadata_v1.copy() - metadata_v2.update( - {"name": "test_metadata", "author": "MG",} - ) - metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2) - metadata_origin_v2 = attr.evolve(metadata_origin_v1, metadata=metadata_v2) - - storage.revision_intrinsic_metadata_add([metadata_rev_v2]) - storage.origin_intrinsic_metadata_add([metadata_origin_v2]) - - # then - actual_metadata = list( - storage.origin_intrinsic_metadata_get([data.origin_url_1]) - ) - - # metadata did not change as the v2 was dropped. - assert actual_metadata == expected_metadata_v1 - def test_origin_intrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: @@ -1218,10 +1114,8 @@ from_revision=data.revision_id_1, ) - storage.revision_intrinsic_metadata_add([metadata_rev_v2], conflict_update=True) - storage.origin_intrinsic_metadata_add( - [metadata_origin_v2], conflict_update=True - ) + storage.revision_intrinsic_metadata_add([metadata_rev_v2]) + storage.origin_intrinsic_metadata_add([metadata_origin_v2]) actual_metadata = list( storage.origin_intrinsic_metadata_get([data.origin_url_1]) @@ -1240,7 +1134,7 @@ # metadata did change as the v2 was used to overwrite v1 assert actual_metadata == expected_metadata_v2 - def test_origin_intrinsic_metadata_add__update_in_place_deadlock( + def test_origin_intrinsic_metadata_add__deadlock( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: storage, data = swh_indexer_storage_with_data @@ -1312,10 +1206,10 @@ # given def f1() -> None: - storage.origin_intrinsic_metadata_add(data_v2a, conflict_update=True) + storage.origin_intrinsic_metadata_add(data_v2a) def f2() -> None: - 
storage.origin_intrinsic_metadata_add(data_v2b, conflict_update=True) + storage.origin_intrinsic_metadata_add(data_v2b) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) @@ -1337,8 +1231,11 @@ for id_ in ids ] - assert len(actual_data) == len(expected_data_v2) - assert sorted(actual_data, key=lambda x: x.id) == expected_data_v2 + assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2) + for (item, expected_item_v1, expected_item_v2) in zip( + actual_data, expected_data_v1, expected_data_v2 + ): + assert item in (expected_item_v1, expected_item_v2) def test_origin_intrinsic_metadata_add__duplicate_twice( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]