Page MenuHomeSoftware Heritage

D1581.diff
No OneTemporary

D1581.diff

diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -208,6 +208,13 @@
return self.post('origin/visit/getby', {'origin': origin,
'visit': visit})
+ def origin_visit_get_latest(self, origin, allowed_statuses=None,
+ require_snapshot=False):
+ return self.post(
+ 'origin/visit/get_latest',
+ {'origin': origin, 'allowed_statuses': allowed_statuses,
+ 'require_snapshot': require_snapshot})
+
def person_get(self, person):
return self.post('person', {'person': person})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -397,6 +397,13 @@
get_storage().origin_visit_get_by(**decode_request(request)))
+@app.route('/origin/visit/get_latest', methods=['POST'])
+@timed
+def origin_visit_get_latest():
+ return encode_data(
+ get_storage().origin_visit_get_latest(**decode_request(request)))
+
+
@app.route('/origin/visit/add', methods=['POST'])
@timed
@encode
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -431,34 +431,45 @@
return bool(cur.fetchone())
- def origin_visit_get_latest_snapshot(self, origin_id,
- allowed_statuses=None,
- cur=None):
- """Retrieve the most recent origin_visit which references a snapshot
+ def origin_visit_get_latest(
+ self, origin_id, allowed_statuses=None, require_snapshot=False,
+ cur=None):
+ """Retrieve the most recent origin_visit of the given origin,
+ with optional filters.
Args:
origin_id: the origin concerned
allowed_statuses: the visit statuses allowed for the returned visit
+ require_snapshot (bool): If True, only a visit with a known
+ snapshot will be returned.
Returns:
The origin_visit information, or None if no visit matches.
"""
cur = self._cursor(cur)
- extra_clause = ""
+ query_parts = [
+ 'SELECT %s' % ', '.join(self.origin_visit_get_cols),
+ 'FROM origin_visit']
+
+ if require_snapshot:
+ # Makes sure the snapshot is known
+ query_parts.append(
+ 'INNER JOIN snapshot ON (origin_visit.snapshot=snapshot.id)')
+
+ query_parts.append('WHERE origin = %s')
+
+ if require_snapshot:
+ query_parts.append('AND snapshot is not null')
+
if allowed_statuses:
- extra_clause = cur.mogrify("AND status IN %s",
- (tuple(allowed_statuses),)).decode()
+ query_parts.append(
+ cur.mogrify('AND status IN %s',
+ (tuple(allowed_statuses),)).decode())
- query = """\
- SELECT %s
- FROM origin_visit
- INNER JOIN snapshot ON (origin_visit.snapshot=snapshot.id)
- WHERE
- origin = %%s AND snapshot is not null %s
- ORDER BY date DESC, visit DESC
- LIMIT 1
- """ % (', '.join(self.origin_visit_get_cols), extra_clause)
+ query_parts.append('ORDER BY date DESC, visit DESC LIMIT 1')
+
+ query = '\n'.join(query_parts)
cur.execute(query, (origin_id,))
r = cur.fetchone()
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -792,13 +792,13 @@
.. warning:: At most 1000 branches contained in the snapshot will be
returned for performance reasons. In order to browse the whole
- set of branches, the method :meth:`snapshot_get_branches`
- should be used instead.
+ set of branches, the methods :meth:`origin_visit_get_latest`
+ and :meth:`snapshot_get_branches` should be used instead.
Args:
origin (Union[str,int]): the origin's URL or identifier
allowed_statuses (list of str): list of visit statuses considered
- to find the latest snapshot for the visit. For instance,
+ to find the latest snapshot for the origin. For instance,
``allowed_statuses=['full']`` will only consider visits that
have successfully run to completion.
Returns:
@@ -810,21 +810,13 @@
or :const:`None` if the snapshot has less than 1000
branches.
"""
- if isinstance(origin, str):
- origin = self.origin_get({'url': origin})['id']
- visits = self._origin_visits[origin-1]
- if allowed_statuses is not None:
- visits = [visit for visit in visits
- if visit['status'] in allowed_statuses]
- snapshot = None
- for visit in sorted(visits, key=lambda v: (v['date'], v['visit']),
- reverse=True):
- snapshot_id = visit['snapshot']
- snapshot = self.snapshot_get(snapshot_id)
- if snapshot:
- break
+ if isinstance(origin, int):
+ origin = self.origin_get({'id': origin})['url']
- return snapshot
+ visit = self.origin_visit_get_latest(
+ origin, allowed_statuses=allowed_statuses, require_snapshot=True)
+ if visit and visit['snapshot']:
+ return self.snapshot_get(visit['snapshot'])
def snapshot_count_branches(self, snapshot_id, db=None, cur=None):
"""Count the number of branches in the snapshot with the given id
@@ -1323,12 +1315,52 @@
it does not exist
"""
+ if isinstance(origin, str):
+ origin = self.origin_get({'url': origin})['id']
origin_visit = None
if origin <= len(self._origin_visits) and \
visit <= len(self._origin_visits[origin-1]):
origin_visit = self._origin_visits[origin-1][visit-1]
return copy.deepcopy(origin_visit)
+ def origin_visit_get_latest(
+ self, origin, allowed_statuses=None, require_snapshot=False):
+ """Get the latest origin visit for the given origin, optionally
+ looking only for those with one of the given allowed_statuses
+ or for those with a known snapshot.
+
+ Args:
+ origin (str): the origin's URL
+ allowed_statuses (list of str): list of visit statuses considered
+ to find the latest visit. For instance,
+ ``allowed_statuses=['full']`` will only consider visits that
+ have successfully run to completion.
+ require_snapshot (bool): If True, only a visit with a snapshot
+ will be returned.
+ Returns:
+ dict: a dict with the following keys:
+
+ origin: the URL of the origin
+ visit: origin visit id
+ type: type of loader used for the visit
+ date: timestamp of such visit
+ status: Visit's new status
+ metadata: Data associated to the visit
+ snapshot (Optional[sha1_git]): identifier of the snapshot
+ associated to the visit
+ """
+ origin = self.origin_get({'url': origin})['id']
+ visits = self._origin_visits[origin-1]
+ if allowed_statuses is not None:
+ visits = [visit for visit in visits
+ if visit['status'] in allowed_statuses]
+ if require_snapshot:
+ visits = [visit for visit in visits
+ if visit['snapshot']
+ and visit['snapshot'] in self._snapshots]
+
+ return max(visits, key=lambda v: (v['date'], v['visit']), default=None)
+
def person_get(self, person):
"""Return the persons identified by their ids.
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1063,13 +1063,13 @@
or :const:`None` if the snapshot has less than 1000
branches.
"""
- if isinstance(origin, str):
- origin = self.origin_get({'url': origin})['id']
+ if isinstance(origin, int):
+ origin = self.origin_get({'id': origin}, db=db, cur=cur)['url']
- origin_visit = db.origin_visit_get_latest_snapshot(
- origin, allowed_statuses=allowed_statuses, cur=cur)
- if origin_visit:
- origin_visit = dict(zip(db.origin_visit_get_cols, origin_visit))
+ origin_visit = self.origin_visit_get_latest(
+ origin, allowed_statuses=allowed_statuses, require_snapshot=True,
+ db=db, cur=cur)
+ if origin_visit and origin_visit['snapshot']:
return self.snapshot_get(origin_visit['snapshot'], db=db, cur=cur)
@db_transaction(statement_timeout=2000)
@@ -1328,12 +1328,50 @@
it does not exist
"""
+ if isinstance(origin, str):
+ origin = self.origin_get({'url': origin}, db=db, cur=cur)['id']
ori_visit = db.origin_visit_get(origin, visit, cur)
if not ori_visit:
return None
return dict(zip(db.origin_visit_get_cols, ori_visit))
+ @db_transaction(statement_timeout=4000)
+ def origin_visit_get_latest(
+ self, origin, allowed_statuses=None, require_snapshot=False,
+ db=None, cur=None):
+ """Get the latest origin visit for the given origin, optionally
+ looking only for those with one of the given allowed_statuses
+ or for those with a known snapshot.
+
+ Args:
+ origin (str): the origin's URL
+ allowed_statuses (list of str): list of visit statuses considered
+ to find the latest visit. For instance,
+ ``allowed_statuses=['full']`` will only consider visits that
+ have successfully run to completion.
+ require_snapshot (bool): If True, only a visit with a snapshot
+ will be returned.
+ Returns:
+ dict: a dict with the following keys:
+
+ origin: the URL of the origin
+ visit: origin visit id
+ type: type of loader used for the visit
+ date: timestamp of such visit
+ status: Visit's new status
+ metadata: Data associated to the visit
+ snapshot (Optional[sha1_git]): identifier of the snapshot
+ associated to the visit
+ """
+ origin = self.origin_get({'url': origin}, db=db, cur=cur)['id']
+
+ origin_visit = db.origin_visit_get_latest(
+ origin, allowed_statuses=allowed_statuses,
+ require_snapshot=require_snapshot, cur=cur)
+ if origin_visit:
+ return dict(zip(db.origin_visit_get_cols, origin_visit))
+
@db_transaction(statement_timeout=2000)
def object_find_by_sha1_git(self, ids, db=None, cur=None):
"""Return the objects found with the given ids.
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -2089,6 +2089,125 @@
expected_persons.reverse()
self.assertEqual(list(actual_persons), expected_persons)
+ def test_origin_visit_get_latest(self):
+ origin_id = self.storage.origin_add_one(self.origin)
+ origin_url = self.origin['url']
+ origin_visit1 = self.storage.origin_visit_add(origin_id,
+ self.date_visit1)
+ visit1_id = origin_visit1['visit']
+ origin_visit2 = self.storage.origin_visit_add(origin_id,
+ self.date_visit2)
+ visit2_id = origin_visit2['visit']
+
+ # Add a visit with the same date as the previous one
+ origin_visit3 = self.storage.origin_visit_add(origin_id,
+ self.date_visit2)
+ visit3_id = origin_visit3['visit']
+
+ origin_visit1 = self.storage.origin_visit_get_by(origin_url, visit1_id)
+ origin_visit2 = self.storage.origin_visit_get_by(origin_url, visit2_id)
+ origin_visit3 = self.storage.origin_visit_get_by(origin_url, visit3_id)
+
+ # Two visits, both with no snapshot
+ self.assertEqual(
+ origin_visit3,
+ self.storage.origin_visit_get_latest(origin_url))
+ self.assertIsNone(
+ self.storage.origin_visit_get_latest(origin_url,
+ require_snapshot=True))
+
+ # Add snapshot to visit1; require_snapshot=True makes it return
+ # visit1 and require_snapshot=False still returns visit2
+ self.storage.snapshot_add([self.complete_snapshot])
+ self.storage.origin_visit_update(
+ origin_id, visit1_id, snapshot=self.complete_snapshot['id'])
+ self.assertEqual(
+ {**origin_visit1, 'snapshot': self.complete_snapshot['id']},
+ self.storage.origin_visit_get_latest(
+ origin_url, require_snapshot=True)
+ )
+ self.assertEqual(
+ origin_visit3,
+ self.storage.origin_visit_get_latest(origin_url)
+ )
+
+ # Status filter: all three visits are status=ongoing, so no visit
+ # returned
+ self.assertIsNone(
+ self.storage.origin_visit_get_latest(
+ origin_url, allowed_statuses=['full'])
+ )
+
+ # Mark the first visit as completed and check status filter again
+ self.storage.origin_visit_update(origin_id, visit1_id, status='full')
+ self.assertEqual(
+ {
+ **origin_visit1,
+ 'snapshot': self.complete_snapshot['id'],
+ 'status': 'full'},
+ self.storage.origin_visit_get_latest(
+ origin_url, allowed_statuses=['full']),
+ )
+ self.assertEqual(
+ origin_visit3,
+ self.storage.origin_visit_get_latest(origin_url),
+ )
+
+ # Add snapshot to visit2 and check that the new snapshot is returned
+ self.storage.snapshot_add([self.empty_snapshot])
+ self.storage.origin_visit_update(
+ origin_id, visit2_id, snapshot=self.empty_snapshot['id'])
+ self.assertEqual(
+ {**origin_visit2, 'snapshot': self.empty_snapshot['id']},
+ self.storage.origin_visit_get_latest(
+ origin_url, require_snapshot=True),
+ )
+ self.assertEqual(
+ origin_visit3,
+ self.storage.origin_visit_get_latest(origin_url),
+ )
+
+ # Check that the status filter is still working
+ self.assertEqual(
+ {
+ **origin_visit1,
+ 'snapshot': self.complete_snapshot['id'],
+ 'status': 'full'},
+ self.storage.origin_visit_get_latest(
+ origin_url, allowed_statuses=['full']),
+ )
+
+ # Add snapshot to visit3 (same date as visit2)
+ self.storage.snapshot_add([self.complete_snapshot])
+ self.storage.origin_visit_update(
+ origin_id, visit3_id, snapshot=self.complete_snapshot['id'])
+ self.assertEqual(
+ {
+ **origin_visit1,
+ 'snapshot': self.complete_snapshot['id'],
+ 'status': 'full'},
+ self.storage.origin_visit_get_latest(
+ origin_url, allowed_statuses=['full']),
+ )
+ self.assertEqual(
+ {
+ **origin_visit1,
+ 'snapshot': self.complete_snapshot['id'],
+ 'status': 'full'},
+ self.storage.origin_visit_get_latest(
+ origin_url, allowed_statuses=['full'], require_snapshot=True),
+ )
+ self.assertEqual(
+ {**origin_visit3, 'snapshot': self.complete_snapshot['id']},
+ self.storage.origin_visit_get_latest(
+ origin_url),
+ )
+ self.assertEqual(
+ {**origin_visit3, 'snapshot': self.complete_snapshot['id']},
+ self.storage.origin_visit_get_latest(
+ origin_url, require_snapshot=True),
+ )
+
def test_person_get_fullname_unicity(self):
# given (person injection through revisions for example)
revision = self.revision
@@ -2670,7 +2789,7 @@
self.assertEqual(self.complete_snapshot,
self.storage.snapshot_get_latest(origin_id))
- # Status filter: both visits are status=ongoing, so no snapshot
+ # Status filter: all three visits are status=ongoing, so no snapshot
# returned
self.assertIsNone(
self.storage.snapshot_get_latest(origin_id,

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 9:53 AM (18 h, 52 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224016

Event Timeline