diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -208,6 +208,13 @@ return self.post('origin/visit/getby', {'origin': origin, 'visit': visit}) + def origin_visit_get_latest(self, origin, allowed_statuses=None, + require_snapshot=False): + return self.post( + 'origin/visit/get_latest', + {'origin': origin, 'allowed_statuses': allowed_statuses, + 'require_snapshot': require_snapshot}) + def person_get(self, person): return self.post('person', {'person': person}) diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py --- a/swh/storage/api/server.py +++ b/swh/storage/api/server.py @@ -397,6 +397,13 @@ get_storage().origin_visit_get_by(**decode_request(request))) +@app.route('/origin/visit/get_latest', methods=['POST']) +@timed +def origin_visit_get_latest(): + return encode_data( + get_storage().origin_visit_get_latest(**decode_request(request))) + + @app.route('/origin/visit/add', methods=['POST']) @timed @encode diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -431,34 +431,45 @@ return bool(cur.fetchone()) - def origin_visit_get_latest_snapshot(self, origin_id, - allowed_statuses=None, - cur=None): - """Retrieve the most recent origin_visit which references a snapshot + def origin_visit_get_latest( + self, origin_id, allowed_statuses=None, require_snapshot=False, + cur=None): + """Retrieve the most recent origin_visit of the given origin, + with optional filters. Args: origin_id: the origin concerned allowed_statuses: the visit statuses allowed for the returned visit + require_snapshot (bool): If True, only a visit with a known + snapshot will be returned. Returns: The origin_visit information, or None if no visit matches. """ cur = self._cursor(cur) - extra_clause = "" + query_parts = [ + 'SELECT %s' % ', '.join(self.origin_visit_get_cols), + 'FROM origin_visit'] + + if require_snapshot: + # Makes sure the snapshot is known + query_parts.append( + 'INNER JOIN snapshot ON (origin_visit.snapshot=snapshot.id)') + + query_parts.append('WHERE origin = %s') + + if require_snapshot: + query_parts.append('AND snapshot is not null') + if allowed_statuses: - extra_clause = cur.mogrify("AND status IN %s", - (tuple(allowed_statuses),)).decode() + query_parts.append( + cur.mogrify('AND status IN %s', + (tuple(allowed_statuses),)).decode()) - query = """\ - SELECT %s - FROM origin_visit - INNER JOIN snapshot ON (origin_visit.snapshot=snapshot.id) - WHERE - origin = %%s AND snapshot is not null %s - ORDER BY date DESC, visit DESC - LIMIT 1 - """ % (', '.join(self.origin_visit_get_cols), extra_clause) + query_parts.append('ORDER BY date DESC, visit DESC LIMIT 1') + + query = '\n'.join(query_parts) cur.execute(query, (origin_id,)) r = cur.fetchone() diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -792,13 +792,13 @@ .. warning:: At most 1000 branches contained in the snapshot will be returned for performance reasons. In order to browse the whole - set of branches, the method :meth:`snapshot_get_branches` - should be used instead. + set of branches, the methods :meth:`origin_visit_get_latest` + and :meth:`snapshot_get_branches` should be used instead. Args: origin (Union[str,int]): the origin's URL or identifier allowed_statuses (list of str): list of visit statuses considered - to find the latest snapshot for the visit. For instance, + to find the latest snapshot for the origin. For instance, ``allowed_statuses=['full']`` will only consider visits that have successfully run to completion. Returns: @@ -810,21 +810,13 @@ or :const:`None` if the snapshot has less than 1000 branches. """ - if isinstance(origin, str): - origin = self.origin_get({'url': origin})['id'] - visits = self._origin_visits[origin-1] - if allowed_statuses is not None: - visits = [visit for visit in visits - if visit['status'] in allowed_statuses] - snapshot = None - for visit in sorted(visits, key=lambda v: (v['date'], v['visit']), - reverse=True): - snapshot_id = visit['snapshot'] - snapshot = self.snapshot_get(snapshot_id) - if snapshot: - break + if isinstance(origin, int): + origin = self.origin_get({'id': origin})['url'] - return snapshot + visit = self.origin_visit_get_latest( + origin, allowed_statuses=allowed_statuses, require_snapshot=True) + if visit and visit['snapshot']: + return self.snapshot_get(visit['snapshot']) def snapshot_count_branches(self, snapshot_id, db=None, cur=None): """Count the number of branches in the snapshot with the given id @@ -1323,12 +1315,52 @@ it does not exist """ + if isinstance(origin, str): + origin = self.origin_get({'url': origin})['id'] origin_visit = None if origin <= len(self._origin_visits) and \ visit <= len(self._origin_visits[origin-1]): origin_visit = self._origin_visits[origin-1][visit-1] return copy.deepcopy(origin_visit) + def origin_visit_get_latest( + self, origin, allowed_statuses=None, require_snapshot=False): + """Get the latest origin visit for the given origin, optionally + looking only for those with one of the given allowed_statuses + or for those with a known snapshot. + + Args: + origin (str): the origin's URL + allowed_statuses (list of str): list of visit statuses considered + to find the latest visit. For instance, + ``allowed_statuses=['full']`` will only consider visits that + have successfully run to completion. + require_snapshot (bool): If True, only a visit with a snapshot + will be returned. + Returns: + dict: a dict with the following keys: + + origin: the URL of the origin + visit: origin visit id + type: type of loader used for the visit + date: timestamp of such visit + status: Visit's new status + metadata: Data associated to the visit + snapshot (Optional[sha1_git]): identifier of the snapshot + associated to the visit + """ + origin = self.origin_get({'url': origin})['id'] + visits = self._origin_visits[origin-1] + if allowed_statuses is not None: + visits = [visit for visit in visits + if visit['status'] in allowed_statuses] + if require_snapshot: + visits = [visit for visit in visits + if visit['snapshot'] + and visit['snapshot'] in self._snapshots] + + return max(visits, key=lambda v: (v['date'], v['visit']), default=None) + def person_get(self, person): """Return the persons identified by their ids. diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1063,13 +1063,13 @@ or :const:`None` if the snapshot has less than 1000 branches. """ - if isinstance(origin, str): - origin = self.origin_get({'url': origin})['id'] + if isinstance(origin, int): + origin = self.origin_get({'id': origin}, db=db, cur=cur)['url'] - origin_visit = db.origin_visit_get_latest_snapshot( - origin, allowed_statuses=allowed_statuses, cur=cur) - if origin_visit: - origin_visit = dict(zip(db.origin_visit_get_cols, origin_visit)) + origin_visit = self.origin_visit_get_latest( + origin, allowed_statuses=allowed_statuses, require_snapshot=True, + db=db, cur=cur) + if origin_visit and origin_visit['snapshot']: return self.snapshot_get(origin_visit['snapshot'], db=db, cur=cur) @db_transaction(statement_timeout=2000) @@ -1328,12 +1328,50 @@ it does not exist """ + if isinstance(origin, str): + origin = self.origin_get({'url': origin}, db=db, cur=cur)['id'] ori_visit = db.origin_visit_get(origin, visit, cur) if not ori_visit: return None return dict(zip(db.origin_visit_get_cols, ori_visit)) + @db_transaction(statement_timeout=4000) + def origin_visit_get_latest( + self, origin, allowed_statuses=None, require_snapshot=False, + db=None, cur=None): + """Get the latest origin visit for the given origin, optionally + looking only for those with one of the given allowed_statuses + or for those with a known snapshot. + + Args: + origin (str): the origin's URL + allowed_statuses (list of str): list of visit statuses considered + to find the latest visit. For instance, + ``allowed_statuses=['full']`` will only consider visits that + have successfully run to completion. + require_snapshot (bool): If True, only a visit with a snapshot + will be returned. + Returns: + dict: a dict with the following keys: + + origin: the URL of the origin + visit: origin visit id + type: type of loader used for the visit + date: timestamp of such visit + status: Visit's new status + metadata: Data associated to the visit + snapshot (Optional[sha1_git]): identifier of the snapshot + associated to the visit + """ + origin = self.origin_get({'url': origin}, db=db, cur=cur)['id'] + + origin_visit = db.origin_visit_get_latest( + origin, allowed_statuses=allowed_statuses, + require_snapshot=require_snapshot, cur=cur) + if origin_visit: + return dict(zip(db.origin_visit_get_cols, origin_visit)) + @db_transaction(statement_timeout=2000) def object_find_by_sha1_git(self, ids, db=None, cur=None): """Return the objects found with the given ids. diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -2089,6 +2089,125 @@ expected_persons.reverse() self.assertEqual(list(actual_persons), expected_persons) + def test_origin_visit_get_latest(self): + origin_id = self.storage.origin_add_one(self.origin) + origin_url = self.origin['url'] + origin_visit1 = self.storage.origin_visit_add(origin_id, + self.date_visit1) + visit1_id = origin_visit1['visit'] + origin_visit2 = self.storage.origin_visit_add(origin_id, + self.date_visit2) + visit2_id = origin_visit2['visit'] + + # Add a visit with the same date as the previous one + origin_visit3 = self.storage.origin_visit_add(origin_id, + self.date_visit2) + visit3_id = origin_visit3['visit'] + + origin_visit1 = self.storage.origin_visit_get_by(origin_url, visit1_id) + origin_visit2 = self.storage.origin_visit_get_by(origin_url, visit2_id) + origin_visit3 = self.storage.origin_visit_get_by(origin_url, visit3_id) + + # Two visits, both with no snapshot + self.assertEqual( + origin_visit3, + self.storage.origin_visit_get_latest(origin_url)) + self.assertIsNone( + self.storage.origin_visit_get_latest(origin_url, + require_snapshot=True)) + + # Add snapshot to visit1; require_snapshot=True makes it return + # visit1 and require_snapshot=False still returns visit2 + self.storage.snapshot_add([self.complete_snapshot]) + self.storage.origin_visit_update( + origin_id, visit1_id, snapshot=self.complete_snapshot['id']) + self.assertEqual( + {**origin_visit1, 'snapshot': self.complete_snapshot['id']}, + self.storage.origin_visit_get_latest( + origin_url, require_snapshot=True) + ) + self.assertEqual( + origin_visit3, + self.storage.origin_visit_get_latest(origin_url) + ) + + # Status filter: all three visits are status=ongoing, so no visit + # returned + self.assertIsNone( + self.storage.origin_visit_get_latest( + origin_url, allowed_statuses=['full']) + ) + + # Mark the first visit as completed and check status filter again + self.storage.origin_visit_update(origin_id, visit1_id, status='full') + self.assertEqual( + { + **origin_visit1, + 'snapshot': self.complete_snapshot['id'], + 'status': 'full'}, + self.storage.origin_visit_get_latest( + origin_url, allowed_statuses=['full']), + ) + self.assertEqual( + origin_visit3, + self.storage.origin_visit_get_latest(origin_url), + ) + + # Add snapshot to visit2 and check that the new snapshot is returned + self.storage.snapshot_add([self.empty_snapshot]) + self.storage.origin_visit_update( + origin_id, visit2_id, snapshot=self.empty_snapshot['id']) + self.assertEqual( + {**origin_visit2, 'snapshot': self.empty_snapshot['id']}, + self.storage.origin_visit_get_latest( + origin_url, require_snapshot=True), + ) + self.assertEqual( + origin_visit3, + self.storage.origin_visit_get_latest(origin_url), + ) + + # Check that the status filter is still working + self.assertEqual( + { + **origin_visit1, + 'snapshot': self.complete_snapshot['id'], + 'status': 'full'}, + self.storage.origin_visit_get_latest( + origin_url, allowed_statuses=['full']), + ) + + # Add snapshot to visit3 (same date as visit2) + self.storage.snapshot_add([self.complete_snapshot]) + self.storage.origin_visit_update( + origin_id, visit3_id, snapshot=self.complete_snapshot['id']) + self.assertEqual( + { + **origin_visit1, + 'snapshot': self.complete_snapshot['id'], + 'status': 'full'}, + self.storage.origin_visit_get_latest( + origin_url, allowed_statuses=['full']), + ) + self.assertEqual( + { + **origin_visit1, + 'snapshot': self.complete_snapshot['id'], + 'status': 'full'}, + self.storage.origin_visit_get_latest( + origin_url, allowed_statuses=['full'], require_snapshot=True), + ) + self.assertEqual( + {**origin_visit3, 'snapshot': self.complete_snapshot['id']}, + self.storage.origin_visit_get_latest( + origin_url), + ) + self.assertEqual( + {**origin_visit3, 'snapshot': self.complete_snapshot['id']}, + self.storage.origin_visit_get_latest( + origin_url, require_snapshot=True), + ) + def test_person_get_fullname_unicity(self): # given (person injection through revisions for example) revision = self.revision @@ -2670,7 +2789,7 @@ self.assertEqual(self.complete_snapshot, self.storage.snapshot_get_latest(origin_id)) - # Status filter: both visits are status=ongoing, so no snapshot + # Status filter: all three visits are status=ongoing, so no snapshot # returned self.assertIsNone( self.storage.snapshot_get_latest(origin_id,