Page MenuHomeSoftware Heritage

D2404.id8520.diff
No OneTemporary

D2404.id8520.diff

diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -185,6 +185,11 @@
return self.post('origin/visit/get', {
'origin': origin, 'last_visit': last_visit, 'limit': limit})
+    def origin_visit_get_random(self, type):
+        """Ask the remote storage for a random origin visit of a given type.
+
+        Args:
+            type: visit type, forwarded as the ``type`` entry of the request
+
+        Returns:
+            whatever ``self.post`` returns for the
+            ``origin/visit/get_random`` endpoint
+
+        """
+        payload = {'type': type}
+        return self.post('origin/visit/get_random', payload)
+
def origin_visit_find_by_date(self, origin, visit_date, limit=None):
return self.post('origin/visit/find_by_date', {
'origin': origin, 'visit_date': visit_date})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -402,6 +402,13 @@
**decode_request(request)))
+@app.route('/origin/visit/get_random', methods=['POST'])
+@timed
+def origin_visit_get_random():
+    """POST endpoint exposing the storage's ``origin_visit_get_random``.
+
+    The decoded request is passed as keyword arguments to the storage
+    method, and the result is sent back encoded.
+
+    """
+    storage = get_storage()
+    kwargs = decode_request(request)
+    random_visit = storage.origin_visit_get_random(**kwargs)
+    return encode_data(random_visit)
+
+
@app.route('/origin/visit/find_by_date', methods=['POST'])
@timed
def origin_visit_find_by_date():
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -648,6 +648,25 @@
yield from execute_values_generator(
cur, query, ((sha1,) for sha1 in sha1s))
+ def origin_visit_get_random(self, type, cur=None):
+ """Randomly select one visit of the given type whose status is 'full'
+ and whose date is within the last 3 months, joined with its origin.
+
+ Note that any matching visit qualifies, not only the latest visit of
+ an origin.
+
+ Args:
+     type: visit type the selected visit must have (bound to the
+         ``%s`` placeholder of the query)
+     cur: optional cursor, resolved through ``self._cursor``
+
+ Returns:
+     the row given by ``cur.fetchone()``, with the columns listed in
+     ``self.origin_visit_select_cols``; presumably None when the query
+     yields no row (DB-API ``fetchone`` behaviour) -- to be confirmed
+
+ """
+ cur = self._cursor(cur)
+ columns = ','.join(self.origin_visit_select_cols)
+ # NOTE(review): per PostgreSQL's TABLESAMPLE semantics, ``bernoulli (1)``
+ # keeps about 1% of the origin_visit rows *before* the ``where`` clause
+ # is applied. The query may thus return nothing even though matching
+ # visits exist (likely on small tables) -- confirm callers accept this.
+ query = f"""select {columns}
+ from origin_visit tablesample bernoulli (1)
+ inner join origin
+ on origin_visit.origin = origin.id
+ where origin_visit.status='full' and
+ origin_visit.type=%s and
+ origin_visit.date > now() - '3 months'::interval
+ limit 1
+ """
+ # The visit type is passed as a bound parameter, not interpolated
+ cur.execute(query, (type, ))
+ return cur.fetchone()
+
def origin_id_get_by_url(self, origins, cur=None):
"""Retrieve origin `(type, url)` from urls if found."""
cur = self._cursor(cur)
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -7,12 +7,15 @@
import bisect
import dateutil
import collections
-from collections import defaultdict
import copy
import datetime
import itertools
import random
+from collections import defaultdict
+from datetime import timedelta
+from typing import Any, Dict, Mapping, Optional
+
import attr
from swh.model.model import \
@@ -1089,6 +1092,36 @@
for sha1 in sha1s
]
+    def _select_random_origin_by_type(self, type: str) -> Optional[str]:
+        """Pick at random the url of an origin whose first stored visit has
+        the given type.
+
+        Args:
+            type: visit type, compared to the ``type`` of the first visit
+                stored for each origin
+
+        Returns:
+            the url of one matching origin, or None when no origin matches
+
+        """
+        # Filter first, then draw: drawing among all origins until one
+        # matches never terminates when no origin has the requested type.
+        # Origins without any visit are skipped instead of raising.
+        candidates = [
+            url for url, visits in self._origin_visits.items()
+            if visits and visits[0].type == type
+        ]
+        if not candidates:
+            return None
+        return random.choice(candidates)
+
+    def origin_visit_get_random(self, type: str) -> Mapping[str, Any]:
+        """Randomly select one origin with <type> and return one of its
+        visits whose status is 'full' and which happened in the last 3
+        months (12 weeks).
+
+        The visits of the selected origin are scanned in reverse storage
+        order and the first matching one is returned.
+
+        Args:
+            type: visit type of the origin to select
+
+        Returns:
+            the selected visit as a dict; an empty dict when the dataset is
+            empty, when no origin has the requested type, or when the
+            selected origin has no matching visit
+
+        """
+        url = self._select_random_origin_by_type(type)
+        if url is None:  # empty dataset or no origin of that type
+            return {}
+        back_in_the_day = now() - timedelta(weeks=12)  # 3 months back
+        # This should be enough for tests
+        for visit in reversed(self._origin_visits[url]):
+            if visit.date > back_in_the_day and visit.status == 'full':
+                # Only the returned visit is copied, so that the result
+                # stays isolated from the stored object
+                return copy.deepcopy(visit).to_dict()
+        return {}
+
def origin_get_range(self, origin_from=1, origin_count=100):
"""Retrieve ``origin_count`` origins whose ids are greater
or equal than ``origin_from``.
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -3,15 +3,16 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-from collections import defaultdict
import copy
-from concurrent.futures import ThreadPoolExecutor
-from contextlib import contextmanager
import datetime
import itertools
import json
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager
+from typing import Any, Dict, Mapping
+
import dateutil.parser
import psycopg2
import psycopg2.pool
@@ -1507,6 +1508,21 @@
else:
yield None
+    @db_transaction()
+    def origin_visit_get_random(
+            self, type, db=None, cur=None) -> Mapping[str, Any]:
+        """Randomly select one origin visit of the given type.
+
+        Args:
+            type: visit type, forwarded to ``db.origin_visit_get_random``
+            db: database wrapper used to run the query
+            cur: cursor handed over to the database wrapper
+
+        Returns:
+            dict mapping ``db.origin_visit_get_cols`` to the values of the
+            selected row; empty dict if nothing was found
+
+        """
+        row = db.origin_visit_get_random(type, cur)
+        if not row:
+            return {}
+        return dict(zip(db.origin_visit_get_cols, row))
+
@db_transaction_generator()
def origin_get_range(self, origin_from=1, origin_count=100,
db=None, cur=None):
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -8,8 +8,11 @@
import datetime
import itertools
import queue
+import random
import threading
+
from collections import defaultdict
+from datetime import timedelta
from unittest.mock import Mock
import psycopg2
@@ -936,6 +939,70 @@
assert len(actual_origin0) == 1
assert actual_origin0[0]['url'] == data.origin['url']
+    def _generate_random_visits(self, nb_visits=100, start=0, end=7):
+        """Generate ``nb_visits`` random timezone-aware (UTC) dates in the
+        past.
+
+        Each date is the current time minus a random offset made of
+        ``start`` to ``end`` weeks (both included), plus up to 28 days,
+        23 hours, 59 minutes and 59 seconds. With the default bounds every
+        date is therefore less than 12 weeks old.
+
+        Args:
+            nb_visits: number of dates to generate
+            start: minimal number of weeks to go back in time
+            end: maximal number of weeks to go back in time
+
+        Returns:
+            list of datetime.datetime, in generation order
+
+        """
+        today = datetime.datetime.now(tz=datetime.timezone.utc)
+        visits = []
+        for _ in range(nb_visits):
+            # randint includes both bounds, so hours/minutes/seconds are
+            # capped at 23/59/59 to stay within their natural ranges
+            offset = timedelta(
+                weeks=random.randint(start, end),
+                days=random.randint(0, 28),
+                hours=random.randint(0, 23),
+                minutes=random.randint(0, 59),
+                seconds=random.randint(0, 59))
+            visits.append(today - offset)
+        return visits
+
+    def test_origin_visit_get_random(self, swh_storage):
+        """A random visit is returned when recent 'full' visits exist, and
+        it refers to one of the origins added by the test.
+
+        """
+        swh_storage.origin_add(data.origins)
+
+        # Generate enough visit dates for the random selection to have
+        # something to pick from
+        visit_dates = self._generate_random_visits()
+        visit_type = 'git'
+
+        # Each origin gets one 'full' visit per generated date
+        for origin, visit_date in itertools.product(
+                data.origins, visit_dates):
+            added = swh_storage.origin_visit_add(
+                origin['url'], date=visit_date, type=visit_type)
+            swh_storage.origin_visit_update(
+                origin['url'], visit_id=added['visit'], status='full')
+
+        swh_storage.refresh_stat_counters()
+
+        counters = swh_storage.stat_counters()
+        assert counters['origin'] == len(data.origins)
+        expected_nb_visits = len(data.origins) * len(visit_dates)
+        assert counters['origin_visit'] == expected_nb_visits
+
+        found = swh_storage.origin_visit_get_random(visit_type)
+        assert found
+        assert found['origin'] is not None
+        known_urls = [o['url'] for o in data.origins]
+        assert found['origin'] in known_urls
+
+    def test_origin_visit_get_random_nothing_found(self, swh_storage):
+        """An empty dict is returned when every visit is older than the
+        period considered by the random selection.
+
+        """
+        swh_storage.origin_add(data.origins)
+        visit_type = 'hg'
+        # Only add visits dated outside of the selection window, so that
+        # nothing can be found by the random selection
+        old_dates = self._generate_random_visits(
+            nb_visits=3, start=13, end=24)
+        for origin, visit_date in itertools.product(
+                data.origins, old_dates):
+            added = swh_storage.origin_visit_add(
+                origin['url'], date=visit_date, type=visit_type)
+            swh_storage.origin_visit_update(
+                origin['url'], visit_id=added['visit'], status='full')
+
+        found = swh_storage.origin_visit_get_random(visit_type)
+        assert found == {}
+
def test_origin_get_by_sha1(self, swh_storage):
assert swh_storage.origin_get(data.origin) is None
swh_storage.origin_add_one(data.origin)
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -6,6 +6,7 @@
testing
deps =
pytest-cov
+ dev: ipdb
commands =
pytest \
!slow: --hypothesis-profile=fast \

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 12:35 PM (11 w, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223923

Event Timeline