diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -492,6 +492,30 @@ return None return r + def origin_visit_get_random(self, type, cur=None): + """Randomly select one origin whose last visit was full in the last 3 + months + + """ + cur = self._cursor(cur) + columns = ','.join(self.origin_visit_select_cols) + query = f"""with visits as ( + select * + from origin_visit + where origin_visit.status='full' and + origin_visit.type=%s and + origin_visit.date > now() - '3 months'::interval + ) + select {columns} + from visits as origin_visit + inner join origin + on origin_visit.origin=origin.id + where random() < 0.1 + limit 1 + """ + cur.execute(query, (type, )) + return cur.fetchone() + @staticmethod def mangle_query_key(key, main_table): if key == 'id': @@ -648,30 +672,6 @@ yield from execute_values_generator( cur, query, ((sha1,) for sha1 in sha1s)) - def origin_visit_get_random(self, type, cur=None): - """Randomly select one origin whose last visit was full in the last 3 - months - - """ - cur = self._cursor(cur) - columns = ','.join(self.origin_visit_select_cols) - query = f"""with visits as ( - select * - from origin_visit - where origin_visit.status='full' and - origin_visit.type=%s and - origin_visit.date > now() - '3 months'::interval - ) - select {columns} - from visits as origin_visit - inner join origin - on origin_visit.origin=origin.id - where random() < 0.1 - limit 1 - """ - cur.execute(query, (type, )) - return cur.fetchone() - def origin_id_get_by_url(self, origins, cur=None): """Retrieve origin `(type, url)` from urls if found.""" cur = self._cursor(cur) diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -1092,36 +1092,6 @@ for sha1 in sha1s ] - def _select_random_origin_by_type(self, type: str) -> str: - """Select randomly an origin visit """ - while True: - url = random.choice(list(self._origin_visits.keys())) - 
random_origin_visits = self._origin_visits[url] - if random_origin_visits[0].type == type: - return url - - def origin_visit_get_random(self, type: str) -> Mapping[str, Any]: - """Randomly select one origin with whose visit was successful - in the last 3 months. - - Returns: - origin dict selected randomly on the dataset - - """ - random_visit: Dict[str, Any] = {} - if not self._origin_visits: # empty dataset - return random_visit - url = self._select_random_origin_by_type(type) - random_origin_visits = copy.deepcopy(self._origin_visits[url]) - random_origin_visits.reverse() - back_in_the_day = now() - timedelta(weeks=12) # 3 months back - # This should be enough for tests - for visit in random_origin_visits: - if visit.date > back_in_the_day and visit.status == 'full': - random_visit = visit.to_dict() - break - return random_visit - def origin_get_range(self, origin_from=1, origin_count=100): """Retrieve ``origin_count`` origins whose ids are greater or equal than ``origin_from``. @@ -1507,6 +1477,36 @@ visits, key=lambda v: (v.date, v.visit), default=None) return self._convert_visit(visit) + def _select_random_origin_visit_by_type(self, type: str) -> str: + """Select randomly an origin visit """ + while True: + url = random.choice(list(self._origin_visits.keys())) + random_origin_visits = self._origin_visits[url] + if random_origin_visits[0].type == type: + return url + + def origin_visit_get_random(self, type: str) -> Mapping[str, Any]: + """Randomly select one origin whose visit was successful + in the last 3 months. 
+ + Returns: + origin dict selected randomly on the dataset + + """ + random_visit: Dict[str, Any] = {} + if not self._origin_visits: # empty dataset + return random_visit + url = self._select_random_origin_visit_by_type(type) + random_origin_visits = copy.deepcopy(self._origin_visits[url]) + random_origin_visits.reverse() + back_in_the_day = now() - timedelta(weeks=12) # 3 months back + # This should be enough for tests + for visit in random_origin_visits: + if visit.date > back_in_the_day and visit.status == 'full': + random_visit = visit.to_dict() + break + return random_visit + def stat_counters(self): """compute statistics about the number of tuples in various tables diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1496,6 +1496,24 @@ if origin_visit: return dict(zip(db.origin_visit_get_cols, origin_visit)) + @remote_api_endpoint('origin/visit/get_random') + @timed + @db_transaction() + def origin_visit_get_random( + self, type, db=None, cur=None) -> Mapping[str, Any]: + """Randomly select one origin visit from the archive + + Returns: + dict representing an origin visit, in the same format as + `origin_visit_get`. 
+ + """ + data: Dict[str, Any] = {} + result = db.origin_visit_get_random(type, cur) + if result: + data = dict(zip(db.origin_visit_get_cols, result)) + return data + @remote_api_endpoint('object/find_by_sha1_git') @timed @db_transaction(statement_timeout=2000) @@ -1595,23 +1613,6 @@ else: yield None - @remote_api_endpoint('origin/visit/get_random') - @timed - @db_transaction() - def origin_visit_get_random( - self, type, db=None, cur=None) -> Mapping[str, Any]: - """Randomly select one origin from the archive - - Returns: - origin dict selected randomly on the dataset if found - - """ - data: Dict[str, Any] = {} - result = db.origin_visit_get_random(type, cur) - if result: - data = dict(zip(db.origin_visit_get_cols, result)) - return data - @remote_api_endpoint('origin/get_range') @timed @db_transaction_generator()