diff --git a/swh/storage/algos/origin.py b/swh/storage/algos/origin.py --- a/swh/storage/algos/origin.py +++ b/swh/storage/algos/origin.py @@ -92,4 +92,8 @@ require_snapshot=require_snapshot, ) if visit_status is not None: + # storage api gives us too many data which no longer map to an + # origin-visit, so we drop those + for key in ["metadata", "status", "snapshot"]: + visit.pop(key, None) return (OriginVisit.from_dict(visit), visit_status) diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -843,26 +843,6 @@ for visit_status in visit_statuses: self._origin_visit_status_add(visit_status) - def _origin_visit_merge( - self, visit: Dict[str, Any], visit_status: OriginVisitStatus, - ) -> Dict[str, Any]: - """Merge origin_visit and visit_status together. - - """ - return OriginVisit.from_dict( - { - # default to the values in visit - **visit, - # override with the last update - **visit_status.to_dict(), - # visit['origin'] is the URL (via a join), while - # visit_status['origin'] is only an id. - "origin": visit["origin"], - # but keep the date of the creation of the origin visit - "date": visit["date"], - } - ).to_dict() - def _origin_visit_apply_last_status(self, visit: Dict[str, Any]) -> Dict[str, Any]: """Retrieve the latest visit status information for the origin visit. Then merge it with the visit and return it. @@ -872,7 +852,18 @@ visit["origin"], visit["visit"] ) assert row is not None - return self._origin_visit_merge(visit, row_to_visit_status(row)) + visit_status = row_to_visit_status(row) + return { + # default to the values in visit + **visit, + # override with the last update + **visit_status.to_dict(), + # visit['origin'] is the URL (via a join), while + # visit_status['origin'] is only an id. + "origin": visit["origin"], + # but keep the date of the creation of the origin visit + "date": visit["date"], + } def _origin_visit_get_updated(self, origin: str, visit_id: int) -> Dict[str, Any]: """Retrieve origin visit and latest origin visit status and merge them diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -581,7 +581,7 @@ return None visit = self._origin_visit_get_updated(origin_url, visit) - snapshot_id = visit.snapshot + snapshot_id = visit["snapshot"] if snapshot_id: return self.snapshot_get(snapshot_id) else: @@ -739,7 +739,8 @@ for ov in self._origin_visits[orig["url"]] ) for ov in visits: - if ov.snapshot and ov.snapshot in self._snapshots: + snapshot = ov["snapshot"] + if snapshot and snapshot in self._snapshots: filtered_origins.append(orig) break else: @@ -845,7 +846,7 @@ for visit_status in visit_statuses: self._origin_visit_status_add_one(visit_status) - def _origin_visit_get_updated(self, origin: str, visit_id: int) -> OriginVisit: + def _origin_visit_get_updated(self, origin: str, visit_id: int) -> Dict[str, Any]: """Merge origin visit and latest origin visit status """ @@ -855,16 +856,14 @@ visit_key = (origin, visit_id) visit_update = max(self._origin_visit_statuses[visit_key], key=lambda v: v.date) - return OriginVisit.from_dict( - { - # default to the values in visit - **visit.to_dict(), - # override with the last update - **visit_update.to_dict(), - # but keep the date of the creation of the origin visit - "date": visit.date, - } - ) + return { + # default to the values in visit + **visit.to_dict(), + # override with the last update + **visit_update.to_dict(), + # but keep the date of the creation of the origin visit + "date": visit.date, + } def origin_visit_get( self, @@ -893,7 +892,7 @@ visit_update = self._origin_visit_get_updated(origin_url, visit_id) assert visit_update is not None - yield visit_update.to_dict() + yield visit_update def origin_visit_find_by_date( self, origin: str, visit_date: datetime.datetime @@ -904,7 +903,7 @@ visit = min(visits, key=lambda v: (abs(v.date - visit_date), -v.visit)) visit_update = self._origin_visit_get_updated(origin, visit.visit) assert visit_update is not None - return visit_update.to_dict() + return visit_update return None def origin_visit_get_by(self, origin: str, visit: int) -> Optional[Dict[str, Any]]: @@ -914,7 +913,7 @@ ): visit_update = self._origin_visit_get_updated(origin_url, visit) assert visit_update is not None - return visit_update.to_dict() + return visit_update return None def origin_visit_get_latest( @@ -936,16 +935,16 @@ ] if type is not None: - visits = [visit for visit in visits if visit.type == type] + visits = [visit for visit in visits if visit["type"] == type] if allowed_statuses is not None: - visits = [visit for visit in visits if visit.status in allowed_statuses] + visits = [visit for visit in visits if visit["status"] in allowed_statuses] if require_snapshot: - visits = [visit for visit in visits if visit.snapshot] + visits = [visit for visit in visits if visit["snapshot"]] - visit = max(visits, key=lambda v: (v.date, v.visit), default=None) + visit = max(visits, key=lambda v: (v["date"], v["visit"]), default=None) if visit is None: return None - return visit.to_dict() + return visit def origin_visit_status_get_latest( self, @@ -987,8 +986,11 @@ for visit in random_origin_visits: updated_visit = self._origin_visit_get_updated(url, visit.visit) assert updated_visit is not None - if updated_visit.date > back_in_the_day and updated_visit.status == "full": - return updated_visit.to_dict() + if ( + updated_visit["date"] > back_in_the_day + and updated_visit["status"] == "full" + ): + return updated_visit else: return None diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -881,19 +881,6 @@ return None return OriginVisitStatus.from_dict(row) - def _origin_visit_get_updated( - self, origin: str, visit_id: int, db, cur - ) -> Optional[Dict[str, Any]]: - """Retrieve origin visit and latest origin visit status and merge them - into an origin visit. - - """ - row_visit = db.origin_visit_get(origin, visit_id) - if row_visit is None: - return None - visit = dict(zip(db.origin_visit_get_cols, row_visit)) - return self._origin_visit_apply_update(visit, db=db, cur=cur) - def _origin_visit_apply_update( self, visit: Dict[str, Any], db, cur=None ) -> Dict[str, Any]: @@ -904,27 +891,17 @@ visit_status = db.origin_visit_status_get_latest( visit["origin"], visit["visit"], cur=cur ) - return self._origin_visit_merge(visit, visit_status) - - def _origin_visit_merge( - self, visit: Dict[str, Any], visit_status: Dict[str, Any] - ) -> Dict[str, Any]: - """Merge origin_visit and origin_visit_status together. - - """ - return OriginVisit.from_dict( - { - # default to the values in visit - **visit, - # override with the last update - **visit_status, - # visit['origin'] is the URL (via a join), while - # visit_status['origin'] is only an id. - "origin": visit["origin"], - # but keep the date of the creation of the origin visit - "date": visit["date"], - } - ).to_dict() + return { + # default to the values in visit + **visit, + # override with the last update + **visit_status, + # visit['origin'] is the URL (via a join), while + # visit_status['origin'] is only an id. + "origin": visit["origin"], + # but keep the date of the creation of the origin visit + "date": visit["date"], + } @timed @db_transaction_generator(statement_timeout=500)