diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -59,6 +59,7 @@ DirectoryEntryRow, DirectoryRow, ExtIDByTargetRow, + ExtIDByTargetVersionRow, ExtIDRow, MetadataAuthorityRow, MetadataFetcherRow, @@ -1346,10 +1347,27 @@ self._execute_with_retries(statement, [extid_type, extid]), ) + @_prepared_select_statement( + ExtIDRow, "WHERE extid_type=? AND extid=? AND extid_version = ?", + ) + def extid_get_from_extid_and_version( + self, extid_type: str, extid: bytes, extid_version: int, *, statement + ) -> Iterable[ExtIDRow]: + return map( + ExtIDRow.from_dict, + self._execute_with_retries(statement, [extid_type, extid, extid_version]), + ) + def extid_get_from_target( - self, target_type: str, target: bytes + self, target_type: str, target: bytes, version: Optional[int] = None, ) -> Iterable[ExtIDRow]: - for token in self._extid_get_tokens_from_target(target_type, target): + if version is not None: + extids = self._extid_get_tokens_from_target_and_version( + target_type, target, version + ) + else: + extids = self._extid_get_tokens_from_target(target_type, target) + for token in extids: if token is not None: for extid in self.extid_get_from_token(token): # re-check the extid against target (in case of murmur3 collision) @@ -1357,6 +1375,7 @@ extid is not None and extid.target_type == target_type and extid.target == target + and (version is None or extid.extid_version == version) ): yield extid @@ -1370,6 +1389,16 @@ the main 'extid' table.""" self._add_one(statement, row) + @_prepared_insert_statement(ExtIDByTargetVersionRow) + def extid_index_version_add_one( + self, row: ExtIDByTargetVersionRow, *, statement + ) -> None: + """Adds a row mapping extid[target_type, target, version] to the token of the + ExtID in the main 'extid' table. + + """ + self._add_one(statement, row) + @_prepared_statement( f""" SELECT target_token @@ -1385,6 +1414,23 @@ for row in self._execute_with_retries(statement, [target_type, target]) ) + @_prepared_statement( + f""" + SELECT target_token + FROM {ExtIDByTargetVersionRow.TABLE} + WHERE target_type = ? AND target = ? and extid_version = ? + """ + ) + def _extid_get_tokens_from_target_and_version( + self, target_type: str, target: bytes, version: int, *, statement + ) -> Iterable[int]: + return ( + row["target_token"] + for row in self._execute_with_retries( + statement, [target_type, target, version] + ) + ) + ########################## # Miscellaneous ########################## diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py --- a/swh/storage/cassandra/model.py +++ b/swh/storage/cassandra/model.py @@ -332,3 +332,15 @@ target_type: str target: bytes target_token: int + + +@dataclasses.dataclass +class ExtIDByTargetVersionRow(BaseRow): + TABLE = "extid_by_target_version" + PARTITION_KEY = ("target_type", "target", "extid_version") + CLUSTERING_KEY = ("target_token",) + + target_type: str + target: bytes + extid_version: int + target_token: int diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -281,6 +281,14 @@ target blob, target_token bigint, -- value of token(pk) on the "primary" table PRIMARY KEY ((target_type, target), target_token) +);""", + """ +CREATE TABLE IF NOT EXISTS extid_by_target_version ( + target_type ascii, + target blob, + extid_version smallint, + target_token bigint, -- value of token(pk) on the "primary" table + PRIMARY KEY ((target_type, target, extid_version), target_token) );""", ] @@ -317,6 +325,7 @@ "metadata_fetcher", "extid", "extid_by_target", + "extid_by_target_version", ] HASH_ALGORITHMS = ["sha1", "sha1_git", "sha256", "blake2s256"] diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -72,6 +72,7 @@ DirectoryEntryRow, DirectoryRow, ExtIDByTargetRow, + ExtIDByTargetVersionRow, ExtIDRow, MetadataAuthorityRow, MetadataFetcherRow, @@ -1629,15 +1630,29 @@ target_type=target_type, target=target, target_token=token, ) self._cql_runner.extid_index_add_one(indexrow) + index2row = ExtIDByTargetVersionRow( + target_type=target_type, + target=target, + extid_version=extid.extid_version, + target_token=token, + ) + self._cql_runner.extid_index_version_add_one(index2row) insertion_finalizer() inserted += 1 return {"extid:add": inserted} @timed - def extid_get_from_extid(self, id_type: str, ids: List[bytes]) -> List[ExtID]: + def extid_get_from_extid( + self, id_type: str, ids: List[bytes], version: Optional[int] = None + ) -> List[ExtID]: result: List[ExtID] = [] for extid in ids: - extidrows = list(self._cql_runner.extid_get_from_extid(id_type, extid)) + if version is not None: + extidrows = self._cql_runner.extid_get_from_extid_and_version( + id_type, extid, version + ) + else: + extidrows = self._cql_runner.extid_get_from_extid(id_type, extid) result.extend( ExtID( extid_type=extidrow.extid_type, @@ -1653,13 +1668,17 @@ @timed def extid_get_from_target( - self, target_type: SwhidObjectType, ids: List[Sha1Git] + self, + target_type: SwhidObjectType, + ids: List[Sha1Git], + version: Optional[int] = None, ) -> List[ExtID]: result: List[ExtID] = [] for target in ids: - extidrows = list( - self._cql_runner.extid_get_from_target(target_type.value, target) + extidrows = self._cql_runner.extid_get_from_target( + target_type.value, target, version=version ) + result.extend( ExtID( extid_type=extidrow.extid_type, diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -31,6 +31,7 @@ DirectoryEntryRow, DirectoryRow, ExtIDByTargetRow, + ExtIDByTargetVersionRow, ExtIDRow, MetadataAuthorityRow, MetadataFetcherRow, @@ -692,6 +693,9 @@ def extid_index_add_one(self, row: ExtIDByTargetRow) -> None: pass + def extid_index_version_add_one(self, row: ExtIDByTargetVersionRow) -> None: + pass + def extid_get_from_pk( self, extid_type: str, extid: bytes, extid_version: int, target: ExtendedSWHID, ) -> Optional[ExtIDRow]: @@ -706,20 +710,35 @@ ) return self._extid.get_from_primary_key(primary_key) - def extid_get_from_extid(self, extid_type: str, extid: bytes) -> Iterable[ExtIDRow]: + def extid_get_from_extid( + self, extid_type: str, extid: bytes, + ) -> Iterable[ExtIDRow]: return ( row for pk, row in self._extid.iter_all() if row.extid_type == extid_type and row.extid == extid ) + def extid_get_from_extid_and_version( + self, extid_type: str, extid: bytes, extid_version: int, + ) -> Iterable[ExtIDRow]: + return ( + row + for pk, row in self._extid.iter_all() + if row.extid_type == extid_type + and row.extid == extid + and (extid_version is None or row.extid_version == extid_version) + ) + def extid_get_from_target( - self, target_type: str, target: bytes + self, target_type: str, target: bytes, version: Optional[int] = None ) -> Iterable[ExtIDRow]: return ( row for pk, row in self._extid.iter_all() - if row.target_type == target_type and row.target == target + if row.target_type == target_type + and row.target == target + and (version is None or row.extid_version == version) ) diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -532,12 +532,15 @@ ... @remote_api_endpoint("extid/from_extid") - def extid_get_from_extid(self, id_type: str, ids: List[bytes]) -> List[ExtID]: + def extid_get_from_extid( + self, id_type: str, ids: List[bytes], version: Optional[int] = None + ) -> List[ExtID]: """Get ExtID objects from external IDs Args: id_type: type of the given external identifiers (e.g. 'mercurial') ids: list of external IDs + version: (Optional) version to use as filter Returns: list of ExtID objects @@ -547,13 +550,17 @@ @remote_api_endpoint("extid/from_target") def extid_get_from_target( - self, target_type: ObjectType, ids: List[Sha1Git] + self, + target_type: ObjectType, + ids: List[Sha1Git], + version: Optional[int] = None, ) -> List[ExtID]: """Get ExtID objects from target IDs and target_type Args: target_type: type the SWH object ids: list of target IDs + version: (Optional) version to use as filter Returns: list of ExtID objects diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -845,15 +845,24 @@ extid_cols = ["extid", "extid_version", "extid_type", "target", "target_type"] - def extid_get_from_extid_list(self, extid_type, ids, cur=None): + def extid_get_from_extid_list( + self, extid_type: str, ids: List[bytes], version: Optional[int] = None, cur=None + ): cur = self._cursor(cur) query_keys = ", ".join( self.mangle_query_key(k, "extid") for k in self.extid_cols ) - sql = """ + filter_query = "" + if version is not None: + if not isinstance(version, int): + raise ValueError("ExtID version should be an integer") + filter_query = f"WHERE extid_version={version}" + + sql = f""" SELECT %s FROM (VALUES %%s) as t(sortkey, extid, extid_type) LEFT JOIN extid USING (extid, extid_type) + {filter_query} ORDER BY sortkey """ % ( query_keys, @@ -865,7 +874,13 @@ (((sortkey, extid, extid_type) for sortkey, extid in enumerate(ids))), ) - def extid_get_from_swhid_list(self, target_type, ids, cur=None): + def extid_get_from_swhid_list( + self, + target_type: str, + ids: List[bytes], + version: Optional[int] = None, + cur=None, + ): cur = self._cursor(cur) target_type = ObjectType( target_type @@ -873,10 +888,16 @@ query_keys = ", ".join( self.mangle_query_key(k, "extid") for k in self.extid_cols ) - sql = """ + filter_query = "" + if version is not None: + if not isinstance(version, int): + raise ValueError("ExtID version should be an integer") + filter_query = f"WHERE extid_version={version}" + sql = f""" SELECT %s FROM (VALUES %%s) as t(sortkey, target, target_type) LEFT JOIN extid USING (target, target_type) + {filter_query} ORDER BY sortkey """ % ( query_keys, diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -692,10 +692,16 @@ @timed @db_transaction() def extid_get_from_extid( - self, id_type: str, ids: List[bytes], *, db: Db, cur=None + self, + id_type: str, + ids: List[bytes], + version: Optional[int] = None, + *, + db: Db, + cur=None, ) -> List[ExtID]: extids = [] - for row in db.extid_get_from_extid_list(id_type, ids, cur): + for row in db.extid_get_from_extid_list(id_type, ids, version=version, cur=cur): if row[0] is not None: extids.append(converters.db_to_extid(dict(zip(db.extid_cols, row)))) return extids @@ -703,10 +709,18 @@ @timed @db_transaction() def extid_get_from_target( - self, target_type: ObjectType, ids: List[Sha1Git], *, db: Db, cur=None + self, + target_type: ObjectType, + ids: List[Sha1Git], + version: Optional[int] = None, + *, + db: Db, + cur=None, ) -> List[ExtID]: extids = [] - for row in db.extid_get_from_swhid_list(target_type.value, ids, cur): + for row in db.extid_get_from_swhid_list( + target_type.value, ids, version=version, cur=cur + ): if row[0] is not None: extids.append(converters.db_to_extid(dict(zip(db.extid_cols, row)))) return extids diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -1373,6 +1373,19 @@ objs = swh_storage.extid_get_from_target(ObjectType.REVISION, [swhid]) assert len(objs) == 2 assert set(obj.extid_version for obj in objs) == {0, 1} + for version in [0, 1]: + for git_id in ids: + objs = swh_storage.extid_get_from_extid( + "git", [git_id], version=version + ) + assert len(objs) == 1 + assert objs[0].extid_version == version + for swhid in ids: + objs = swh_storage.extid_get_from_target( + ObjectType.REVISION, [swhid], version=version + ) + assert len(objs) == 1 + assert objs[0].extid_version == version def test_release_add(self, swh_storage, sample_data): release, release2 = sample_data.releases[:2]