diff --git a/CONTRIBUTORS b/CONTRIBUTORS
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -1,3 +1,4 @@
 Daniele Serafini
 Ishan Bhanuka
+Kumar Shivendu
 Quentin Campos
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -191,8 +191,12 @@
 
         return summary
 
     def content_add(self, content: List[Content]) -> Dict:
-        contents = [attr.evolve(c, ctime=now()) for c in content]
+        # Drop duplicates within the batch, preserving first-seen order.
+        # NOTE(review): keyed on swhid(); confirm that two contents sharing a
+        # SWHID can never differ on their other hashes, else key on all of them.
+        unique = {c.swhid(): c for c in content}.values()
+        contents = [attr.evolve(c, ctime=now()) for c in unique]
         return self._content_add(list(contents), with_data=True)
 
     def content_update(
@@ -353,7 +357,9 @@
             if not self._cql_runner.skipped_content_get_from_pk(content):
                 yield {algo: content[algo] for algo in DEFAULT_ALGORITHMS}
 
     def directory_add(self, directories: List[Directory]) -> Dict:
+        # Drop duplicates within the batch, preserving first-seen order.
+        directories = list({dir_.id: dir_ for dir_ in directories}.values())
         # Filter out directories that are already inserted.
         missing = self.directory_missing([dir_.id for dir_ in directories])
         directories = [dir_ for dir_ in directories if dir_.id in missing]
@@ -477,8 +483,10 @@
             assert directory, "Could not find any directory"
         return directory.id
 
     def revision_add(self, revisions: List[Revision]) -> Dict:
+        # Drop duplicates within the batch, preserving first-seen order.
+        revisions = list({rev.id: rev for rev in revisions}.values())
         # Filter-out revisions already in the database
         missing = self.revision_missing([rev.id for rev in revisions])
         revisions = [rev for rev in revisions if rev.id in missing]
         self.journal_writer.revision_add(revisions)
@@ -588,12 +596,10 @@
             assert revision, "Could not find any revision"
         return revision.id
 
     def release_add(self, releases: List[Release]) -> Dict:
-        to_add = []
-        for rel in releases:
-            if rel not in to_add:
-                to_add.append(rel)
+        # Drop duplicates within the batch, preserving first-seen order.
+        to_add = list({rel.id: rel for rel in releases}.values())
         missing = set(self.release_missing([rel.id for rel in to_add]))
         to_add = [rel for rel in to_add if rel.id in missing]
 
         self.journal_writer.release_add(to_add)
@@ -621,7 +627,9 @@
             assert release, "Could not find any release"
         return release.id
 
     def snapshot_add(self, snapshots: List[Snapshot]) -> Dict:
+        # Drop duplicates within the batch, preserving first-seen order.
+        snapshots = list({snp.id: snp for snp in snapshots}.values())
         missing = self._cql_runner.snapshot_missing([snp.id for snp in snapshots])
         snapshots = [snp for snp in snapshots if snp.id in missing]
 
@@ -891,7 +899,10 @@
             "The Cassandra backend does not implement origin_count"
         )
 
     def origin_add(self, origins: List[Origin]) -> Dict[str, int]:
+        # Drop duplicates within the batch, keyed on the URL this method
+        # already uses to look origins up; preserves first-seen order.
+        origins = list({ori.url: ori for ori in origins}.values())
         to_add = [ori for ori in origins if self.origin_get_one(ori.url) is None]
         # keep only one occurrence of each given origin while keeping the list
         # sorted as originally given