Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066498
D2404.id8520.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
8 KB
Subscribers
None
D2404.id8520.diff
View Options
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -185,6 +185,11 @@
return self.post('origin/visit/get', {
'origin': origin, 'last_visit': last_visit, 'limit': limit})
+ def origin_visit_get_random(self, type):
+ return self.post('origin/visit/get_random', {
+ 'type': type,
+ })
+
def origin_visit_find_by_date(self, origin, visit_date, limit=None):
return self.post('origin/visit/find_by_date', {
'origin': origin, 'visit_date': visit_date})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -402,6 +402,13 @@
**decode_request(request)))
+@app.route('/origin/visit/get_random', methods=['POST'])
+@timed
+def origin_visit_get_random():
+ return encode_data(get_storage().origin_visit_get_random(
+ **decode_request(request)))
+
+
@app.route('/origin/visit/find_by_date', methods=['POST'])
@timed
def origin_visit_find_by_date():
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -648,6 +648,25 @@
yield from execute_values_generator(
cur, query, ((sha1,) for sha1 in sha1s))
+ def origin_visit_get_random(self, type, cur=None):
+ """Randomly select one origin visit of the given type whose status is
+ 'full' and whose date falls within the last 3 months
+
+ """
+ cur = self._cursor(cur)
+ columns = ','.join(self.origin_visit_select_cols)
+ query = f"""select {columns}
+ from origin_visit tablesample bernoulli (1)
+ inner join origin
+ on origin_visit.origin = origin.id
+ where origin_visit.status='full' and
+ origin_visit.type=%s and
+ origin_visit.date > now() - '3 months'::interval
+ limit 1
+ """
+ cur.execute(query, (type, ))
+ return cur.fetchone()
+
def origin_id_get_by_url(self, origins, cur=None):
"""Retrieve origin `(type, url)` from urls if found."""
cur = self._cursor(cur)
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -7,12 +7,15 @@
import bisect
import dateutil
import collections
-from collections import defaultdict
import copy
import datetime
import itertools
import random
+from collections import defaultdict
+from datetime import timedelta
+from typing import Any, Dict, Mapping
+
import attr
from swh.model.model import \
@@ -1089,6 +1092,36 @@
for sha1 in sha1s
]
+ def _select_random_origin_by_type(self, type: str) -> str:
+ """Randomly pick the url of an origin whose first visit has the given type"""
+ while True:
+ url = random.choice(list(self._origin_visits.keys()))
+ random_origin_visits = self._origin_visits[url]
+ if random_origin_visits[0].type == type:
+ return url
+
+ def origin_visit_get_random(self, type: str) -> Mapping[str, Any]:
+ """Randomly select one origin with <type> whose visit was successful
+ in the last 3 months.
+
+ Returns:
+ origin dict selected randomly on the dataset
+
+ """
+ random_visit: Dict[str, Any] = {}
+ if not self._origin_visits: # empty dataset
+ return random_visit
+ url = self._select_random_origin_by_type(type)
+ random_origin_visits = copy.deepcopy(self._origin_visits[url])
+ random_origin_visits.reverse()
+ back_in_the_day = now() - timedelta(weeks=12) # 3 months back
+ # This should be enough for tests
+ for visit in random_origin_visits:
+ if visit.date > back_in_the_day and visit.status == 'full':
+ random_visit = visit.to_dict()
+ break
+ return random_visit
+
def origin_get_range(self, origin_from=1, origin_count=100):
"""Retrieve ``origin_count`` origins whose ids are greater
or equal than ``origin_from``.
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -3,15 +3,16 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-from collections import defaultdict
import copy
-from concurrent.futures import ThreadPoolExecutor
-from contextlib import contextmanager
import datetime
import itertools
import json
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager
+from typing import Any, Dict, Mapping
+
import dateutil.parser
import psycopg2
import psycopg2.pool
@@ -1507,6 +1508,21 @@
else:
yield None
+ @db_transaction()
+ def origin_visit_get_random(
+ self, type, db=None, cur=None) -> Mapping[str, Any]:
+ """Randomly select one origin visit of the given type from the archive
+
+ Returns:
+ origin dict selected randomly on the dataset if found
+
+ """
+ data: Dict[str, Any] = {}
+ result = db.origin_visit_get_random(type, cur)
+ if result:
+ data = dict(zip(db.origin_visit_get_cols, result))
+ return data
+
@db_transaction_generator()
def origin_get_range(self, origin_from=1, origin_count=100,
db=None, cur=None):
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -8,8 +8,11 @@
import datetime
import itertools
import queue
+import random
import threading
+
from collections import defaultdict
+from datetime import timedelta
from unittest.mock import Mock
import psycopg2
@@ -936,6 +939,70 @@
assert len(actual_origin0) == 1
assert actual_origin0[0]['url'] == data.origin['url']
+ def _generate_random_visits(self, nb_visits=100, start=0, end=7):
+ """Generate nb_visits random dates, each `start` to `end` weeks before now,
+ minus a further random offset of up to 28 days, 24 hours, 60 min and 60 s
+
+ """
+ visits = []
+ today = datetime.datetime.now(tz=datetime.timezone.utc)
+ for weeks in range(nb_visits, 0, -1):
+ hours = random.randint(0, 24)
+ minutes = random.randint(0, 60)
+ seconds = random.randint(0, 60)
+ days = random.randint(0, 28)
+ weeks = random.randint(start, end)
+ date_visit = today - timedelta(
+ weeks=weeks, hours=hours, minutes=minutes,
+ seconds=seconds, days=days)
+ visits.append(date_visit)
+ return visits
+
+ def test_origin_visit_get_random(self, swh_storage):
+ swh_storage.origin_add(data.origins)
+
+ # Add some visits, enough for the sample used in choosing randomly
+ # origin
+ visits = self._generate_random_visits()
+
+ type = 'git'
+
+ # Add visits to those origins
+ for origin in data.origins:
+ for date_visit in visits:
+ visit = swh_storage.origin_visit_add(
+ origin['url'], date=date_visit, type=type)
+ swh_storage.origin_visit_update(
+ origin['url'], visit_id=visit['visit'], status='full')
+
+ swh_storage.refresh_stat_counters()
+
+ stats = swh_storage.stat_counters()
+ assert stats['origin'] == len(data.origins)
+ assert stats['origin_visit'] == len(data.origins) * len(visits)
+
+ random_origin_visit = swh_storage.origin_visit_get_random(type)
+ assert random_origin_visit
+ assert random_origin_visit['origin'] is not None
+ original_urls = [o['url'] for o in data.origins]
+ assert random_origin_visit['origin'] in original_urls
+
+ def test_origin_visit_get_random_nothing_found(self, swh_storage):
+ swh_storage.origin_add(data.origins)
+ type = 'hg'
+ # Add some visits older than the window used by the random selection
+ # so nothing will be found by the random selection
+ visits = self._generate_random_visits(nb_visits=3, start=13, end=24)
+ for origin in data.origins:
+ for date_visit in visits:
+ visit = swh_storage.origin_visit_add(
+ origin['url'], date=date_visit, type=type)
+ swh_storage.origin_visit_update(
+ origin['url'], visit_id=visit['visit'], status='full')
+
+ random_origin_visit = swh_storage.origin_visit_get_random(type)
+ assert random_origin_visit == {}
+
def test_origin_get_by_sha1(self, swh_storage):
assert swh_storage.origin_get(data.origin) is None
swh_storage.origin_add_one(data.origin)
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -6,6 +6,7 @@
testing
deps =
pytest-cov
+ dev: ipdb
commands =
pytest \
!slow: --hypothesis-profile=fast \
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 12:35 PM (11 w, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223923
Attached To
D2404: storage: Add endpoint to randomly pick an origin
Event Timeline
Log In to Comment