diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ click flask psycopg2 -python-dateutil vcversioner aiohttp tenacity diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -8,10 +8,9 @@ import json import random import re -from typing import Any, Dict, List, Iterable, Optional, Union +from typing import Any, Dict, List, Iterable, Optional import attr -import dateutil from swh.core.api.serializers import msgpack_loads, msgpack_dumps from swh.model.model import ( @@ -795,48 +794,32 @@ return origin_url - def origin_visit_add( - self, origin_url: str, date: Union[str, datetime.datetime], type: str - ) -> OriginVisit: - if isinstance(date, str): - # FIXME: Converge on iso8601 at some point - date = dateutil.parser.parse(date) - elif not isinstance(date, datetime.datetime): - raise StorageArgumentException("Date must be a datetime or a string") - - if not self.origin_get_one({"url": origin_url}): - raise StorageArgumentException("Unknown origin %s", origin_url) - - visit_id = self._cql_runner.origin_generate_unique_visit_id(origin_url) - visit_state = "ongoing" - with convert_validation_exceptions(): - visit = OriginVisit.from_dict( - { - "origin": origin_url, - "date": date, - "type": type, - "status": visit_state, - "snapshot": None, - "metadata": None, - "visit": visit_id, - } - ) + def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]: + for visit in visits: + origin = self.origin_get({"url": visit.origin}) + if not origin: # Cannot add a visit without an origin + raise StorageArgumentException("Unknown origin %s", visit.origin) - self.journal_writer.origin_visit_add([visit]) - self._cql_runner.origin_visit_add_one(visit) + all_visits = [] + nb_visits = 0 + for visit in visits: + nb_visits += 1 + if not visit.visit: + visit_id = self._cql_runner.origin_generate_unique_visit_id( + visit.origin + ) + visit = OriginVisit.from_dict({**visit.to_dict(), "visit": visit_id}) + self.journal_writer.origin_visit_add([visit]) + self._cql_runner.origin_visit_add_one(visit) + assert visit.visit is not None + all_visits.append(visit) - with convert_validation_exceptions(): - visit_status = OriginVisitStatus( - origin=origin_url, - visit=visit_id, - date=date, - status=visit_state, - snapshot=None, - metadata=None, - ) - self._origin_visit_status_add(visit_status) + visit_status_dict = visit.to_dict() + visit_status_dict.pop("type") + visit_status = OriginVisitStatus.from_dict(visit_status_dict) + self._origin_visit_status_add(visit_status) - return visit + return all_visits def _origin_visit_status_add(self, visit_status: OriginVisitStatus) -> None: """Add an origin visit status""" diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -5,7 +5,6 @@ import re import bisect -import dateutil import collections import copy import datetime @@ -26,7 +25,6 @@ Optional, Tuple, TypeVar, - Union, ) import attr @@ -794,54 +792,36 @@ return origin.url - def origin_visit_add( - self, origin_url: str, date: Union[str, datetime.datetime], type: str - ) -> OriginVisit: - if isinstance(date, str): - # FIXME: Converge on iso8601 at some point - date = dateutil.parser.parse(date) - elif not isinstance(date, datetime.datetime): - raise StorageArgumentException("Date must be a datetime or a string") - - origin = self.origin_get({"url": origin_url}) - if not origin: # Cannot add a visit without an origin - raise StorageArgumentException("Unknown origin %s", origin_url) - - if origin_url in self._origins: - origin = self._origins[origin_url] - # visit ids are in the range [1, +inf[ - visit_id = len(self._origin_visits[origin_url]) + 1 - status = "ongoing" - with convert_validation_exceptions(): - visit = OriginVisit( - origin=origin_url, - date=date, - type=type, - # TODO: Remove when we remove those fields from the model - status=status, - snapshot=None, - metadata=None, - visit=visit_id, - ) - self.journal_writer.origin_visit_add([visit]) - self._origin_visits[origin_url].append(visit) - assert visit.visit is not None - visit_key = (origin_url, visit.visit) + def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]: + for visit in visits: + origin = self.origin_get({"url": visit.origin}) + if not origin: # Cannot add a visit without an origin + raise StorageArgumentException("Unknown origin %s", visit.origin) - with convert_validation_exceptions(): - visit_update = OriginVisitStatus( - origin=origin_url, - visit=visit_id, - date=date, - status=status, - snapshot=None, - metadata=None, - ) - self._origin_visit_status_add_one(visit_update) - self._objects[visit_key].append(("origin_visit", None)) + all_visits = [] + for visit in visits: + origin_url = visit.origin + if origin_url in self._origins: + origin = self._origins[origin_url] + if visit.visit: # assume already present, do nothing + all_visits.append(visit) + continue + # visit ids are in the range [1, +inf[ + visit_id = len(self._origin_visits[origin_url]) + 1 + visit = OriginVisit.from_dict({**visit.to_dict(), "visit": visit_id}) + assert visit.visit is not None + self.journal_writer.origin_visit_add([visit]) + all_visits.append(visit) + self._origin_visits[origin_url].append(visit) + visit_key = (origin_url, visit.visit) + + visit_status_dict = visit.to_dict() + visit_status_dict.pop("type") + visit_status = OriginVisitStatus.from_dict(visit_status_dict) + self._origin_visit_status_add_one(visit_status) + self._objects[visit_key].append(("origin_visit", None)) - # return last visit - return visit + return all_visits def _origin_visit_status_add_one(self, visit_status: OriginVisitStatus) -> None: """Add an origin visit status without checks. diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -5,7 +5,7 @@ import datetime -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional from swh.core.api import remote_api_endpoint from swh.model.model import ( @@ -780,24 +780,18 @@ ... @remote_api_endpoint("origin/visit/add") - def origin_visit_add( - self, origin_url: str, date: Union[str, datetime.datetime], type: str - ) -> OriginVisit: - """Add an origin_visit for the origin at ts with status 'ongoing'. + def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]: + """Add visits to storage. If the visits have no id, they will be created and assigned + one. The resulted visits are visit with their visit id set. Args: - origin_url: visited origin identifier (its URL) - date: timestamp of such visit - type: the type of loader used for the visit (hg, git, ...) + visits: Iterable of OriginVisit objects to add Raises: - StorageArgumentException if the date is mistyped, or the origin - is unknown. + StorageArgumentException if some origin visit reference unknown origins Returns: - dict: dictionary with keys origin and visit where: - - origin: origin object - - visit: the visit object for the new visit occurrence + Iterable[OriginVisit] stored """ ... diff --git a/swh/storage/retry.py b/swh/storage/retry.py --- a/swh/storage/retry.py +++ b/swh/storage/retry.py @@ -7,7 +7,7 @@ import traceback from datetime import datetime -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any, Dict, Iterable, Optional from tenacity import ( retry, @@ -108,10 +108,8 @@ return self.storage.origin_add_one(origin) @swh_retry - def origin_visit_add( - self, origin_url: str, date: Union[datetime, str], type: str - ) -> OriginVisit: - return self.storage.origin_visit_add(origin_url, date, type) + def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]: + return self.storage.origin_visit_add(visits) @swh_retry def origin_visit_update( diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -9,10 +9,9 @@ from collections import defaultdict from contextlib import contextmanager -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional import attr -import dateutil.parser import psycopg2 import psycopg2.pool import psycopg2.errors @@ -824,55 +823,36 @@ @timed @db_transaction() def origin_visit_add( - self, - origin_url: str, - date: Union[str, datetime.datetime], - type: str, - db=None, - cur=None, - ) -> OriginVisit: - if isinstance(date, str): - # FIXME: Converge on iso8601 at some point - date = dateutil.parser.parse(date) - elif not isinstance(date, datetime.datetime): - raise StorageArgumentException("Date must be a datetime or a string") - - origin = self.origin_get({"url": origin_url}, db=db, cur=cur) - if not origin: # Cannot add a visit without an origin - raise StorageArgumentException("Unknown origin %s", origin_url) - - with convert_validation_exceptions(): - visit_id = db.origin_visit_add(origin_url, date, type, cur=cur) - - status = "ongoing" - # We can write to the journal only after inserting to the - # DB, because we want the id of the visit - visit = OriginVisit.from_dict( - { - "origin": origin_url, - "date": date, - "type": type, - "visit": visit_id, - # TODO: Remove when we remove those fields from the model - "status": status, - "metadata": None, - "snapshot": None, - } - ) - self.journal_writer.origin_visit_add([visit]) + self, visits: Iterable[OriginVisit], db=None, cur=None + ) -> Iterable[OriginVisit]: + for visit in visits: + origin = self.origin_get({"url": visit.origin}, db=db, cur=cur) + if not origin: # Cannot add a visit without an origin + raise StorageArgumentException("Unknown origin %s", visit.origin) - with convert_validation_exceptions(): - visit_status = OriginVisitStatus( - origin=origin_url, - visit=visit_id, - date=date, - status=status, - snapshot=None, - metadata=None, - ) - self._origin_visit_status_add(visit_status, db=db, cur=cur) - send_metric("origin_visit:add", count=1, method_name="origin_visit") - return visit + all_visits = [] + nb_visits = 0 + for visit in visits: + nb_visits += 1 + if not visit.visit: + with convert_validation_exceptions(): + visit_id = db.origin_visit_add( + visit.origin, visit.date, visit.type, cur=cur + ) + visit = OriginVisit.from_dict({**visit.to_dict(), "visit": visit_id}) + else: + db.origin_visit_upsert(visit) + assert visit.visit is not None + all_visits.append(visit) + # Forced to write after for the case when the visit has no id + self.journal_writer.origin_visit_add([visit]) + visit_status_dict = visit.to_dict() + visit_status_dict.pop("type") + visit_status = OriginVisitStatus.from_dict(visit_status_dict) + self._origin_visit_status_add(visit_status, db=db, cur=cur) + + send_metric("origin_visit:add", count=nb_visits, method_name="origin_visit") + return all_visits def _origin_visit_status_add( self, visit_status: OriginVisitStatus, db, cur diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1412,9 +1412,15 @@ for origin in data.origins: origin_url = origin["url"] for date_visit in visits: - visit = swh_storage.origin_visit_add( - origin_url, date=date_visit, type=visit_type + visit = OriginVisit( + origin=origin_url, + date=date_visit, + type=visit_type, + status="ongoing", + snapshot=None, ) + visits_ = swh_storage.origin_visit_add([visit]) + visit = visits_[0] swh_storage.origin_visit_update( origin_url, visit_id=visit.visit, status="full" ) @@ -1440,9 +1446,15 @@ for origin in data.origins: origin_url = origin["url"] for date_visit in visits: - visit = swh_storage.origin_visit_add( - origin_url, date=date_visit, type=visit_type + visit = OriginVisit( + origin=origin_url, + date=date_visit, + type=visit_type, + status="ongoing", + snapshot=None, ) + visits_ = swh_storage.origin_visit_add([visit]) + visit = visits_[0] swh_storage.origin_visit_update(origin_url, visit.visit, status="full") random_origin_visit = swh_storage.origin_visit_get_random(visit_type) @@ -1593,11 +1605,23 @@ date_visit = date_visit.replace(microsecond=round(date_visit.microsecond, -3)) # when - origin_visit = swh_storage.origin_visit_add( - origin_url, type=data.type_visit1, date=date_visit + visit = OriginVisit( + origin=origin_url, + type=data.type_visit1, + date=date_visit, + status="ongoing", + snapshot=None, ) - expected_origin_visit = { + # multiple calls to add the same visit changes nothing + origin_visit = swh_storage.origin_visit_add([visit])[0] + + # insert the same origin visit should be ok as well + origin_visit2 = swh_storage.origin_visit_add([origin_visit])[0] + + assert origin_visit == origin_visit2 # idempotent! + + expected_ori_visit_dict = { "origin": origin_url, "date": date_visit, "visit": origin_visit.visit, @@ -1606,14 +1630,22 @@ "metadata": None, "snapshot": None, } - assert origin_visit == OriginVisit.from_dict(expected_origin_visit) - actual_origin_visits = list(swh_storage.origin_visit_get(origin_url)) + visit_status = copy.deepcopy(expected_ori_visit_dict) + visit_status.pop("type") - assert expected_origin_visit in actual_origin_visits + expected_origin_visit_status = OriginVisitStatus.from_dict(visit_status) + expected_origin_visit = OriginVisit.from_dict(expected_ori_visit_dict) + assert origin_visit == expected_origin_visit + + actual_origin_visits = list(swh_storage.origin_visit_get(origin_url)) + assert expected_ori_visit_dict in actual_origin_visits objects = list(swh_storage.journal_writer.journal.objects) + assert ("origin", Origin.from_dict(data.origin2)) in objects - assert ("origin_visit", OriginVisit.from_dict(expected_origin_visit)) in objects + assert ("origin_visit", expected_origin_visit) in objects + assert ("origin_visit", expected_origin_visit) in objects + assert ("origin_visit_status", expected_origin_visit_status) in objects def test_origin_visit_get__unknown_origin(self, swh_storage): assert [] == list(swh_storage.origin_visit_get("foo")) @@ -1633,13 +1665,27 @@ microsecond=round(date_visit2.microsecond, -3) ) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=date_visit, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=date_visit, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) - origin_visit2 = swh_storage.origin_visit_add( - origin_url, date=date_visit2, type=data.type_visit2 + visit2 = OriginVisit( + origin=origin_url, + date=date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) + origin_visits = swh_storage.origin_visit_add([visit1, visit2]) + + assert len(origin_visits) == 2 + origin_visit1 = origin_visits[0] + origin_visit2 = origin_visits[1] + # then assert origin_visit1.origin == origin_url assert origin_visit1.visit is not None @@ -1676,12 +1722,16 @@ assert ("origin_visit", OriginVisit.from_dict(visit)) in objects def test_origin_visit_add_validation(self, swh_storage): - origin_url = swh_storage.origin_add_one(data.origin2) - with pytest.raises(StorageArgumentException) as cm: - swh_storage.origin_visit_add(origin_url, date=[b"foo"], type="git") - - if type(cm.value) == psycopg2.ProgrammingError: - assert cm.value.pgcode == psycopg2.errorcodes.UNDEFINED_FUNCTION + """Unknown origin when adding visits should raise""" + visit = OriginVisit( + origin="something-unknown", + date=now(), + type=data.type_visit1, + status="ongoing", + snapshot=None, + ) + with pytest.raises(StorageArgumentException, match="Unknown origin"): + swh_storage.origin_visit_add([visit]) def test_origin_visit_status_add_validation(self, swh_storage): """Wrong origin_visit_status input should raise storage argument error""" @@ -1701,9 +1751,25 @@ """ origin_url = swh_storage.origin_add_one(data.origin2) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_url2 = swh_storage.origin_add_one({"url": "new-origin"}) + visit2 = OriginVisit( + origin=origin_url2, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, + ) + origin_visits = swh_storage.origin_visit_add([visit1, visit2]) + origin_visit1 = origin_visits[0] + origin_visit2 = origin_visits[1] + snapshot_id = data.snapshot["id"] date_visit_now = now() visit_status1 = OriginVisitStatus( @@ -1714,10 +1780,6 @@ snapshot=snapshot_id, ) - origin_url2 = swh_storage.origin_add_one({"url": "new-origin"}) - origin_visit2 = swh_storage.origin_visit_add( - origin_url2, date=data.date_visit2, type=data.type_visit2 - ) date_visit_now = now() visit_status2 = OriginVisitStatus( origin=origin_visit2.origin, @@ -1760,15 +1822,28 @@ microsecond=round(date_visit2.microsecond, -3) ) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=date_visit, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=date_visit, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) - origin_visit2 = swh_storage.origin_visit_add( - origin_url, date=date_visit2, type=data.type_visit2 + visit2 = OriginVisit( + origin=origin_url, + date=date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) - origin_visit3 = swh_storage.origin_visit_add( - origin_url2, date=date_visit2, type=data.type_visit3 + visit3 = OriginVisit( + origin=origin_url2, + date=date_visit2, + type=data.type_visit3, # noqa + status="ongoing", + snapshot=None, ) + ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) # when visit1_metadata = { @@ -1776,11 +1851,9 @@ "directories": 22, } swh_storage.origin_visit_update( - origin_url, origin_visit1.visit, status="full", metadata=visit1_metadata - ) - swh_storage.origin_visit_update( - origin_url2, origin_visit3.visit, status="partial" + origin_url, ov1.visit, status="full", metadata=visit1_metadata ) + swh_storage.origin_visit_update(origin_url2, ov3.visit, status="partial") # then actual_origin_visits = list(swh_storage.origin_visit_get(origin_url)) @@ -1788,7 +1861,7 @@ { "origin": origin_url, "date": date_visit, - "visit": origin_visit1.visit, + "visit": ov1.visit, "type": data.type_visit1, "status": "full", "metadata": visit1_metadata, @@ -1797,7 +1870,7 @@ { "origin": origin_url, "date": date_visit2, - "visit": origin_visit2.visit, + "visit": ov2.visit, "type": data.type_visit2, "status": "ongoing", "metadata": None, @@ -1814,7 +1887,7 @@ { "origin": origin_url, "date": date_visit, - "visit": origin_visit1.visit, + "visit": ov1.visit, "type": data.type_visit1, "status": "full", "metadata": visit1_metadata, @@ -1823,13 +1896,13 @@ ] actual_origin_visits_ter = list( - swh_storage.origin_visit_get(origin_url, last_visit=origin_visit1.visit) + swh_storage.origin_visit_get(origin_url, last_visit=ov1.visit) ) assert actual_origin_visits_ter == [ { "origin": origin_url, "date": date_visit2, - "visit": origin_visit2.visit, + "visit": ov2.visit, "type": data.type_visit2, "status": "ongoing", "metadata": None, @@ -1842,7 +1915,7 @@ { "origin": origin_url2, "date": date_visit2, - "visit": origin_visit3.visit, + "visit": ov3.visit, "type": data.type_visit3, "status": "partial", "metadata": None, @@ -1853,7 +1926,7 @@ data1 = { "origin": origin_url, "date": date_visit, - "visit": origin_visit1.visit, + "visit": ov1.visit, "type": data.type_visit1, "status": "ongoing", "metadata": None, @@ -1862,7 +1935,7 @@ data2 = { "origin": origin_url, "date": date_visit2, - "visit": origin_visit2.visit, + "visit": ov2.visit, "type": data.type_visit2, "status": "ongoing", "metadata": None, @@ -1871,7 +1944,7 @@ data3 = { "origin": origin_url2, "date": date_visit2, - "visit": origin_visit3.visit, + "visit": ov3.visit, "type": data.type_visit3, "status": "ongoing", "metadata": None, @@ -1880,7 +1953,7 @@ data4 = { "origin": origin_url, "date": date_visit, - "visit": origin_visit1.visit, + "visit": ov1.visit, "type": data.type_visit1, "metadata": visit1_metadata, "status": "full", @@ -1889,7 +1962,7 @@ data5 = { "origin": origin_url2, "date": date_visit2, - "visit": origin_visit3.visit, + "visit": ov3.visit, "type": data.type_visit3, "status": "partial", "metadata": None, @@ -1906,9 +1979,14 @@ def test_origin_visit_update_validation(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) - visit = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) + visit = swh_storage.origin_visit_add([visit])[0] with pytest.raises( (StorageArgumentException, ValueError), match="status" ) as cm: @@ -1920,24 +1998,37 @@ def test_origin_visit_find_by_date(self, swh_storage): # given origin_url = swh_storage.origin_add_one(data.origin) - swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) - - origin_visit2 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit3, type=data.type_visit2 + visit2 = OriginVisit( + origin=origin_url, + date=data.date_visit3, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) - origin_visit3 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit3 + visit3 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit3, + status="ongoing", + snapshot=None, ) + _, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) + # Simple case visit = swh_storage.origin_visit_find_by_date(origin_url, data.date_visit3) - assert visit["visit"] == origin_visit2.visit + assert visit["visit"] == ov2.visit # There are two visits at the same date, the latest must be returned visit = swh_storage.origin_visit_find_by_date(origin_url, data.date_visit2) - assert visit["visit"] == origin_visit3.visit + assert visit["visit"] == ov3.visit def test_origin_visit_find_by_date__unknown_origin(self, swh_storage): swh_storage.origin_visit_find_by_date("foo", data.date_visit2) @@ -1945,9 +2036,14 @@ def test_origin_visit_update_missing_snapshot(self, swh_storage): # given origin_url = swh_storage.origin_add_one(data.origin) - origin_visit = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_visit = swh_storage.origin_visit_add([visit])[0] # when swh_storage.origin_visit_update( @@ -1970,9 +2066,14 @@ def test_origin_visit_get_by(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) origin_url2 = swh_storage.origin_add_one(data.origin2) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit])[0] swh_storage.snapshot_add([data.snapshot]) swh_storage.origin_visit_update( @@ -1983,12 +2084,21 @@ ) # Add some other {origin, visit} entries - swh_storage.origin_visit_add( - origin_url, date=data.date_visit3, type=data.type_visit3 + visit2 = OriginVisit( + origin=origin_url, + date=data.date_visit3, + type=data.type_visit3, + status="ongoing", + snapshot=None, ) - swh_storage.origin_visit_add( - origin_url2, date=data.date_visit3, type=data.type_visit3 + visit3 = OriginVisit( + origin=origin_url2, + date=data.date_visit3, + type=data.type_visit3, + status="ongoing", + snapshot=None, ) + swh_storage.origin_visit_add([visit2, visit3]) # when visit1_metadata = { @@ -2167,9 +2277,14 @@ origin_url = swh_storage.origin_add_one(data.origin2) # when - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit1])[0] swh_storage.origin_visit_upsert( [ @@ -2271,16 +2386,29 @@ def test_origin_visit_get_latest(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) - ov1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) - ov2 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit2 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) # Add a visit with the same date as the previous one - ov3 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit3 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) + ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) origin_visit1 = swh_storage.origin_visit_get_by(origin_url, ov1.visit) origin_visit2 = swh_storage.origin_visit_get_by(origin_url, ov2.visit) @@ -2396,9 +2524,14 @@ def test_snapshot_add_get_empty(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit])[0] visit_id = origin_visit1.visit actual_result = swh_storage.snapshot_add([data.empty_snapshot]) @@ -2458,9 +2591,14 @@ def test_snapshot_add_get_complete(self, swh_storage): origin_url = data.origin["url"] origin_url = swh_storage.origin_add_one(data.origin) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit])[0] visit_id = origin_visit1.visit actual_result = swh_storage.snapshot_add([data.complete_snapshot]) @@ -2620,9 +2758,14 @@ def test_snapshot_add_get_filtered(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit])[0] swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( @@ -2734,9 +2877,14 @@ def test_snapshot_add_get(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit])[0] visit_id = origin_visit1.visit swh_storage.snapshot_add([data.snapshot]) @@ -2775,9 +2923,14 @@ def test_snapshot_add_twice__by_origin_visit(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit])[0] visit1_id = origin_visit1.visit swh_storage.snapshot_add([data.snapshot]) date_now2 = now() @@ -2792,9 +2945,14 @@ by_ov1 = swh_storage.snapshot_get_by_origin_visit(origin_url, visit1_id) assert by_ov1 == {**data.snapshot, "next_branch": None} - origin_visit2 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit2 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) + origin_visit2 = swh_storage.origin_visit_add([visit2])[0] visit2_id = origin_visit2.visit swh_storage.snapshot_add([data.snapshot]) @@ -2876,16 +3034,29 @@ def test_snapshot_get_latest(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) - origin_visit2 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit2 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) # Add a visit with the same date as the previous one - origin_visit3 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit3 + visit3 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit3, + status="ongoing", + snapshot=None, ) + ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) # Two visits, both with no snapshot: latest snapshot is None assert swh_storage.snapshot_get_latest(origin_url) is None @@ -2893,7 +3064,7 @@ swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( origin_url, - origin_visit1.visit, + ov1.visit, status="ongoing", snapshot=data.complete_snapshot["id"], ) @@ -2910,7 +3081,7 @@ ) # Mark the first visit as completed and check status filter again - swh_storage.origin_visit_update(origin_url, origin_visit1.visit, status="full") + swh_storage.origin_visit_update(origin_url, ov1.visit, status="full") assert { **data.complete_snapshot, "next_branch": None, @@ -2919,10 +3090,7 @@ # Add snapshot to visit2 and check that the new snapshot is returned swh_storage.snapshot_add([data.empty_snapshot]) swh_storage.origin_visit_update( - origin_url, - origin_visit2.visit, - status="ongoing", - snapshot=data.empty_snapshot["id"], + origin_url, ov2.visit, status="ongoing", snapshot=data.empty_snapshot["id"], ) assert { **data.empty_snapshot, @@ -2940,7 +3108,7 @@ swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( origin_url, - origin_visit3.visit, + ov3.visit, status="ongoing", snapshot=data.complete_snapshot["id"], ) @@ -2952,12 +3120,21 @@ def test_snapshot_get_latest__missing_snapshot(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin) assert swh_storage.snapshot_get_latest(origin_url) is None - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit1, type=data.type_visit1 + visit1 = OriginVisit( + origin=origin_url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, ) - origin_visit2 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit2 = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) + ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2]) # Two visits, both with no snapshot: latest snapshot is None assert swh_storage.snapshot_get_latest(origin_url) is None @@ -2966,7 +3143,7 @@ # detected swh_storage.origin_visit_update( origin_url, - origin_visit1.visit, + ov1.visit, status="ongoing", snapshot=data.complete_snapshot["id"], ) @@ -2982,7 +3159,7 @@ ) # Mark the first visit as completed and check status filter again - swh_storage.origin_visit_update(origin_url, origin_visit1.visit, status="full") + swh_storage.origin_visit_update(origin_url, ov1.visit, status="full") with pytest.raises(Exception): # XXX: should the exception be more specific than this? swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]), @@ -2997,10 +3174,7 @@ # Add unknown snapshot to visit2 and check that the inconsistency # is detected swh_storage.origin_visit_update( - origin_url, - origin_visit2.visit, - status="ongoing", - snapshot=data.snapshot["id"], + origin_url, ov2.visit, status="ongoing", snapshot=data.snapshot["id"], ) with pytest.raises(Exception): # XXX: should the exception be more specific than this? @@ -3061,9 +3235,14 @@ # Add other objects. Check their counter increased as well. origin_url = swh_storage.origin_add_one(data.origin2) - origin_visit1 = swh_storage.origin_visit_add( - origin_url, date=data.date_visit2, type=data.type_visit2 + visit = OriginVisit( + origin=origin_url, + date=data.date_visit2, + type=data.type_visit2, + status="ongoing", + snapshot=None, ) + origin_visit1 = swh_storage.origin_visit_add([visit])[0] swh_storage.snapshot_add([data.snapshot]) swh_storage.origin_visit_update( @@ -3778,7 +3957,10 @@ swh_storage.origin_add([{"url": url} for url in self.ORIGINS]) origin_url = "https://github.com/user1/repo1" - swh_storage.origin_visit_add(origin_url, date=now(), type="git") + visit = OriginVisit( + origin=origin_url, date=now(), type="git", status="ongoing", snapshot=None + ) + swh_storage.origin_visit_add([visit]) assert swh_storage.origin_count("github", with_visit=False) == 3 # it has a visit, but no snapshot, so with_visit=True => 0 @@ -3802,7 +3984,10 @@ swh_storage.snapshot_add([data.snapshot]) origin_url = "https://github.com/user1/repo1" - visit = swh_storage.origin_visit_add(origin_url, date=now(), type="git") + visit = OriginVisit( + origin=origin_url, date=now(), type="git", status="ongoing", snapshot=None + ) + visit = swh_storage.origin_visit_add([visit])[0] swh_storage.origin_visit_update( origin_url, visit.visit, status="ongoing", snapshot=data.snapshot["id"] ) @@ -3830,7 +4015,14 @@ swh_storage.origin_add_one({"url": origin_url}) if "visit" in obj: del obj["visit"] - swh_storage.origin_visit_add(origin_url, obj["date"], obj["type"]) + visit = OriginVisit( + origin=origin_url, + date=obj["date"], + type=obj["type"], + status="ongoing", + snapshot=None, + ) + swh_storage.origin_visit_add([visit]) else: if obj_type == "content" and obj["status"] == "absent": obj_type = "skipped_content" diff --git a/swh/storage/validate.py b/swh/storage/validate.py --- a/swh/storage/validate.py +++ b/swh/storage/validate.py @@ -3,12 +3,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime import contextlib from typing import Dict, Iterable, Iterator, List, Optional, Tuple from swh.model.model import ( - BaseModel, SkippedContent, Content, Directory, @@ -112,14 +110,8 @@ snapshots = [Snapshot.from_dict(s) for s in snapshots] return self.storage.snapshot_add(snapshots) - def origin_visit_add( - self, origin_url: str, date: datetime.datetime, type: str - ) -> Dict[str, BaseModel]: - with convert_validation_exceptions(): - visit = OriginVisit( - origin=origin_url, date=date, type=type, status="ongoing", snapshot=None - ) - return self.storage.origin_visit_add(visit.origin, visit.date, visit.type) + def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]: + return self.storage.origin_visit_add(visits) def origin_add(self, origins: Iterable[Dict]) -> List[Dict]: with convert_validation_exceptions():