Page MenuHomeSoftware Heritage

D3262.id11576.diff
No OneTemporary

D3262.id11576.diff

diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
click
flask
psycopg2
-python-dateutil
vcversioner
aiohttp
tenacity
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -8,10 +8,9 @@
import json
import random
import re
-from typing import Any, Dict, List, Iterable, Optional, Union
+from typing import Any, Dict, List, Iterable, Optional
import attr
-import dateutil
from swh.core.api.serializers import msgpack_loads, msgpack_dumps
from swh.model.model import (
@@ -795,48 +794,32 @@
return origin_url
- def origin_visit_add(
- self, origin_url: str, date: Union[str, datetime.datetime], type: str
- ) -> OriginVisit:
- if isinstance(date, str):
- # FIXME: Converge on iso8601 at some point
- date = dateutil.parser.parse(date)
- elif not isinstance(date, datetime.datetime):
- raise StorageArgumentException("Date must be a datetime or a string")
-
- if not self.origin_get_one({"url": origin_url}):
- raise StorageArgumentException("Unknown origin %s", origin_url)
-
- visit_id = self._cql_runner.origin_generate_unique_visit_id(origin_url)
- visit_state = "ongoing"
- with convert_validation_exceptions():
- visit = OriginVisit.from_dict(
- {
- "origin": origin_url,
- "date": date,
- "type": type,
- "status": visit_state,
- "snapshot": None,
- "metadata": None,
- "visit": visit_id,
- }
- )
+ def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]:
+ for visit in visits:
+ origin = self.origin_get({"url": visit.origin})
+ if not origin: # Cannot add a visit without an origin
+ raise StorageArgumentException("Unknown origin %s", visit.origin)
- self.journal_writer.origin_visit_add([visit])
- self._cql_runner.origin_visit_add_one(visit)
+ all_visits = []
+ nb_visits = 0
+ for visit in visits:
+ nb_visits += 1
+ if not visit.visit:
+ visit_id = self._cql_runner.origin_generate_unique_visit_id(
+ visit.origin
+ )
+ visit = attr.evolve(visit, visit=visit_id)
+ self.journal_writer.origin_visit_add([visit])
+ self._cql_runner.origin_visit_add_one(visit)
+ assert visit.visit is not None
+ all_visits.append(visit)
- with convert_validation_exceptions():
- visit_status = OriginVisitStatus(
- origin=origin_url,
- visit=visit_id,
- date=date,
- status=visit_state,
- snapshot=None,
- metadata=None,
- )
- self._origin_visit_status_add(visit_status)
+ visit_status_dict = visit.to_dict()
+ visit_status_dict.pop("type")
+ visit_status = OriginVisitStatus.from_dict(visit_status_dict)
+ self._origin_visit_status_add(visit_status)
- return visit
+ return all_visits
def _origin_visit_status_add(self, visit_status: OriginVisitStatus) -> None:
"""Add an origin visit status"""
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -5,7 +5,6 @@
import re
import bisect
-import dateutil
import collections
import copy
import datetime
@@ -26,7 +25,6 @@
Optional,
Tuple,
TypeVar,
- Union,
)
import attr
@@ -794,54 +792,43 @@
return origin.url
- def origin_visit_add(
- self, origin_url: str, date: Union[str, datetime.datetime], type: str
- ) -> OriginVisit:
- if isinstance(date, str):
- # FIXME: Converge on iso8601 at some point
- date = dateutil.parser.parse(date)
- elif not isinstance(date, datetime.datetime):
- raise StorageArgumentException("Date must be a datetime or a string")
-
- origin = self.origin_get({"url": origin_url})
- if not origin: # Cannot add a visit without an origin
- raise StorageArgumentException("Unknown origin %s", origin_url)
-
- if origin_url in self._origins:
- origin = self._origins[origin_url]
- # visit ids are in the range [1, +inf[
- visit_id = len(self._origin_visits[origin_url]) + 1
- status = "ongoing"
- with convert_validation_exceptions():
- visit = OriginVisit(
- origin=origin_url,
- date=date,
- type=type,
- # TODO: Remove when we remove those fields from the model
- status=status,
- snapshot=None,
- metadata=None,
- visit=visit_id,
- )
- self.journal_writer.origin_visit_add([visit])
- self._origin_visits[origin_url].append(visit)
- assert visit.visit is not None
- visit_key = (origin_url, visit.visit)
+ def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]:
+ for visit in visits:
+ origin = self.origin_get({"url": visit.origin})
+ if not origin: # Cannot add a visit without an origin
+ raise StorageArgumentException("Unknown origin %s", visit.origin)
- with convert_validation_exceptions():
- visit_update = OriginVisitStatus(
- origin=origin_url,
- visit=visit_id,
- date=date,
- status=status,
- snapshot=None,
- metadata=None,
- )
- self._origin_visit_status_add_one(visit_update)
- self._objects[visit_key].append(("origin_visit", None))
+ all_visits = []
+ for visit in visits:
+ origin_url = visit.origin
+ if origin_url in self._origins:
+ origin = self._origins[origin_url]
+ if visit.visit:
+ self.journal_writer.origin_visit_add([visit])
+ while len(self._origin_visits[origin_url]) < visit.visit:
+ self._origin_visits[origin_url].append(None)
+ self._origin_visits[origin_url][visit.visit - 1] = visit
+ visit_status_dict = visit.to_dict()
+ visit_status_dict.pop("type")
+ visit_status = OriginVisitStatus.from_dict(visit_status_dict)
+ self._origin_visit_status_add_one(visit_status)
+ else:
+ # visit ids are in the range [1, +inf[
+ visit_id = len(self._origin_visits[origin_url]) + 1
+ visit = attr.evolve(visit, visit=visit_id)
+ self.journal_writer.origin_visit_add([visit])
+ self._origin_visits[origin_url].append(visit)
+ visit_key = (origin_url, visit.visit)
+
+ visit_status_dict = visit.to_dict()
+ visit_status_dict.pop("type")
+ visit_status = OriginVisitStatus.from_dict(visit_status_dict)
+ self._origin_visit_status_add_one(visit_status)
+ self._objects[visit_key].append(("origin_visit", None))
+ assert visit.visit is not None
+ all_visits.append(visit)
- # return last visit
- return visit
+ return all_visits
def _origin_visit_status_add_one(self, visit_status: OriginVisitStatus) -> None:
"""Add an origin visit status without checks.
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -5,7 +5,7 @@
import datetime
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional
from swh.core.api import remote_api_endpoint
from swh.model.model import (
@@ -780,24 +780,18 @@
...
@remote_api_endpoint("origin/visit/add")
- def origin_visit_add(
- self, origin_url: str, date: Union[str, datetime.datetime], type: str
- ) -> OriginVisit:
- """Add an origin_visit for the origin at ts with status 'ongoing'.
+ def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]:
+ """Add visits to storage. If the visits have no id, they will be created and assigned
+ one. The resulted visits are visit with their visit id set.
Args:
- origin_url: visited origin identifier (its URL)
- date: timestamp of such visit
- type: the type of loader used for the visit (hg, git, ...)
+ visits: Iterable of OriginVisit objects to add
Raises:
- StorageArgumentException if the date is mistyped, or the origin
- is unknown.
+ StorageArgumentException if some origin visit reference unknown origins
Returns:
- dict: dictionary with keys origin and visit where:
- - origin: origin object
- - visit: the visit object for the new visit occurrence
+ Iterable[OriginVisit] stored
"""
...
diff --git a/swh/storage/retry.py b/swh/storage/retry.py
--- a/swh/storage/retry.py
+++ b/swh/storage/retry.py
@@ -7,7 +7,7 @@
import traceback
from datetime import datetime
-from typing import Any, Dict, Iterable, Optional, Union
+from typing import Any, Dict, Iterable, Optional
from tenacity import (
retry,
@@ -108,10 +108,8 @@
return self.storage.origin_add_one(origin)
@swh_retry
- def origin_visit_add(
- self, origin_url: str, date: Union[datetime, str], type: str
- ) -> OriginVisit:
- return self.storage.origin_visit_add(origin_url, date, type)
+ def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]:
+ return self.storage.origin_visit_add(visits)
@swh_retry
def origin_visit_update(
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -9,10 +9,9 @@
from collections import defaultdict
from contextlib import contextmanager
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional
import attr
-import dateutil.parser
import psycopg2
import psycopg2.pool
import psycopg2.errors
@@ -824,55 +823,36 @@
@timed
@db_transaction()
def origin_visit_add(
- self,
- origin_url: str,
- date: Union[str, datetime.datetime],
- type: str,
- db=None,
- cur=None,
- ) -> OriginVisit:
- if isinstance(date, str):
- # FIXME: Converge on iso8601 at some point
- date = dateutil.parser.parse(date)
- elif not isinstance(date, datetime.datetime):
- raise StorageArgumentException("Date must be a datetime or a string")
-
- origin = self.origin_get({"url": origin_url}, db=db, cur=cur)
- if not origin: # Cannot add a visit without an origin
- raise StorageArgumentException("Unknown origin %s", origin_url)
-
- with convert_validation_exceptions():
- visit_id = db.origin_visit_add(origin_url, date, type, cur=cur)
-
- status = "ongoing"
- # We can write to the journal only after inserting to the
- # DB, because we want the id of the visit
- visit = OriginVisit.from_dict(
- {
- "origin": origin_url,
- "date": date,
- "type": type,
- "visit": visit_id,
- # TODO: Remove when we remove those fields from the model
- "status": status,
- "metadata": None,
- "snapshot": None,
- }
- )
- self.journal_writer.origin_visit_add([visit])
+ self, visits: Iterable[OriginVisit], db=None, cur=None
+ ) -> Iterable[OriginVisit]:
+ for visit in visits:
+ origin = self.origin_get({"url": visit.origin}, db=db, cur=cur)
+ if not origin: # Cannot add a visit without an origin
+ raise StorageArgumentException("Unknown origin %s", visit.origin)
- with convert_validation_exceptions():
- visit_status = OriginVisitStatus(
- origin=origin_url,
- visit=visit_id,
- date=date,
- status=status,
- snapshot=None,
- metadata=None,
- )
- self._origin_visit_status_add(visit_status, db=db, cur=cur)
- send_metric("origin_visit:add", count=1, method_name="origin_visit")
- return visit
+ all_visits = []
+ nb_visits = 0
+ for visit in visits:
+ nb_visits += 1
+ if not visit.visit:
+ with convert_validation_exceptions():
+ visit_id = db.origin_visit_add(
+ visit.origin, visit.date, visit.type, cur=cur
+ )
+ visit = attr.evolve(visit, visit=visit_id)
+ else:
+ db.origin_visit_upsert(visit)
+ assert visit.visit is not None
+ all_visits.append(visit)
+ # Forced to write after for the case when the visit has no id
+ self.journal_writer.origin_visit_add([visit])
+ visit_status_dict = visit.to_dict()
+ visit_status_dict.pop("type")
+ visit_status = OriginVisitStatus.from_dict(visit_status_dict)
+ self._origin_visit_status_add(visit_status, db=db, cur=cur)
+
+ send_metric("origin_visit:add", count=nb_visits, method_name="origin_visit")
+ return all_visits
def _origin_visit_status_add(
self, visit_status: OriginVisitStatus, db, cur
diff --git a/swh/storage/tests/test_kafka_writer.py b/swh/storage/tests/test_kafka_writer.py
--- a/swh/storage/tests/test_kafka_writer.py
+++ b/swh/storage/tests/test_kafka_writer.py
@@ -53,7 +53,7 @@
for obj in objs:
assert isinstance(obj, OriginVisit)
storage.origin_add_one(Origin(url=obj.origin))
- visit = method(obj.origin, date=obj.date, type=obj.type)
+ visit = method([obj])[0]
expected_messages += 1 + 1 # 1 visit + 1 visit status
obj_d = obj.to_dict()
diff --git a/swh/storage/tests/test_replay.py b/swh/storage/tests/test_replay.py
--- a/swh/storage/tests/test_replay.py
+++ b/swh/storage/tests/test_replay.py
@@ -70,16 +70,10 @@
# Fill Kafka using a source storage
nb_sent = 0
for object_type, objects in TEST_OBJECTS.items():
+ method = getattr(src, object_type + "_add")
+ method(objects)
if object_type == "origin_visit":
- # src.origin_visit_upsert(objects)
- for visit in objects:
- src.origin_visit_add(
- origin_url=visit.origin, date=visit.date, type=visit.type
- )
- nb_sent += 1 # this adds origin-visit-status as well
- else:
- method = getattr(src, object_type + "_add")
- method(objects)
+ nb_sent += len(objects) # origin-visit-add adds origin-visit-status as well
nb_sent += len(objects)
caplog.set_level(logging.ERROR, "swh.journal.replay")
@@ -115,15 +109,10 @@
# Fill Kafka using a source storage
nb_sent = 0
for object_type, objects in TEST_OBJECTS.items():
+ method = getattr(src, object_type + "_add")
+ method(objects)
if object_type == "origin_visit":
- for visit in objects:
- src.origin_visit_add(
- origin_url=visit.origin, date=visit.date, type=visit.type
- )
- nb_sent += 1 # this adds origin-visit-status as well
- else:
- method = getattr(src, object_type + "_add")
- method(objects)
+ nb_sent += len(objects) # origin-visit-add adds origin-visit-status as well
nb_sent += len(objects)
# Create collision in input data
diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py
--- a/swh/storage/tests/test_retry.py
+++ b/swh/storage/tests/test_retry.py
@@ -16,6 +16,7 @@
Snapshot,
SkippedContent,
Origin,
+ OriginVisit,
)
from swh.storage import get_storage
@@ -355,7 +356,10 @@
origin = list(swh_storage.origin_visit_get(origin_url))
assert not origin
- origin_visit = swh_storage.origin_visit_add(origin_url, date_visit1, "hg")
+ visit = OriginVisit(
+ origin=origin_url, date=date_visit1, type="hg", status="ongoing", snapshot=None
+ )
+ origin_visit = swh_storage.origin_visit_add([visit])[0]
assert origin_visit.origin == origin_url
assert isinstance(origin_visit.visit, int)
@@ -374,27 +378,26 @@
origin_url = swh_storage.origin_add_one(sample_origin)
mock_memory = mocker.patch("swh.storage.in_memory.InMemoryStorage.origin_visit_add")
+ visit = OriginVisit(
+ origin=origin_url, date=date_visit1, type="git", status="ongoing", snapshot=None
+ )
mock_memory.side_effect = [
# first try goes ko
fake_hash_collision,
# second try goes ko
psycopg2.IntegrityError("origin already inserted"),
# ok then!
- {"origin": origin_url, "visit": 1},
+ [visit],
]
origin = list(swh_storage.origin_visit_get(origin_url))
assert not origin
- r = swh_storage.origin_visit_add(origin_url, date_visit1, "git")
- assert r == {"origin": origin_url, "visit": 1}
+ r = swh_storage.origin_visit_add([visit])
+ assert r == [visit]
mock_memory.assert_has_calls(
- [
- call(origin_url, date_visit1, "git"),
- call(origin_url, date_visit1, "git"),
- call(origin_url, date_visit1, "git"),
- ]
+ [call([visit]), call([visit]), call([visit]),]
)
@@ -413,10 +416,17 @@
assert not origin
with pytest.raises(StorageArgumentException, match="Refuse to add"):
- swh_storage.origin_visit_add(origin_url, date_visit1, "svn")
+ visit = OriginVisit(
+ origin=origin_url,
+ date=date_visit1,
+ type="svn",
+ status="ongoing",
+ snapshot=None,
+ )
+ swh_storage.origin_visit_add([visit])
mock_memory.assert_has_calls(
- [call(origin_url, date_visit1, "svn"),]
+ [call([visit]),]
)
@@ -675,7 +685,11 @@
"""
sample_origin = sample_data["origin"][0]
origin_url = swh_storage.origin_add_one(sample_origin)
- origin_visit = swh_storage.origin_visit_add(origin_url, date_visit1, "hg")
+ visit = OriginVisit(
+ origin=origin_url, date=date_visit1, type="hg", status="ongoing", snapshot=None
+ )
+
+ origin_visit = swh_storage.origin_visit_add([visit])[0]
ov = next(swh_storage.origin_visit_get(origin_url))
assert ov["origin"] == origin_url
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1412,9 +1412,15 @@
for origin in data.origins:
origin_url = origin["url"]
for date_visit in visits:
- visit = swh_storage.origin_visit_add(
- origin_url, date=date_visit, type=visit_type
+ visit = OriginVisit(
+ origin=origin_url,
+ date=date_visit,
+ type=visit_type,
+ status="ongoing",
+ snapshot=None,
)
+ visits_ = swh_storage.origin_visit_add([visit])
+ visit = visits_[0]
swh_storage.origin_visit_update(
origin_url, visit_id=visit.visit, status="full"
)
@@ -1440,9 +1446,15 @@
for origin in data.origins:
origin_url = origin["url"]
for date_visit in visits:
- visit = swh_storage.origin_visit_add(
- origin_url, date=date_visit, type=visit_type
+ visit = OriginVisit(
+ origin=origin_url,
+ date=date_visit,
+ type=visit_type,
+ status="ongoing",
+ snapshot=None,
)
+ visits_ = swh_storage.origin_visit_add([visit])
+ visit = visits_[0]
swh_storage.origin_visit_update(origin_url, visit.visit, status="full")
random_origin_visit = swh_storage.origin_visit_get_random(visit_type)
@@ -1593,11 +1605,23 @@
date_visit = date_visit.replace(microsecond=round(date_visit.microsecond, -3))
# when
- origin_visit = swh_storage.origin_visit_add(
- origin_url, type=data.type_visit1, date=date_visit
+ visit = OriginVisit(
+ origin=origin_url,
+ type=data.type_visit1,
+ date=date_visit,
+ status="ongoing",
+ snapshot=None,
)
- expected_origin_visit = {
+ # multiple calls to add the same visit changes nothing
+ origin_visit = swh_storage.origin_visit_add([visit])[0]
+
+ # insert the same origin visit should be ok as well
+ origin_visit2 = swh_storage.origin_visit_add([origin_visit])[0]
+
+ assert origin_visit == origin_visit2 # idempotent!
+
+ expected_ori_visit_dict = {
"origin": origin_url,
"date": date_visit,
"visit": origin_visit.visit,
@@ -1606,14 +1630,24 @@
"metadata": None,
"snapshot": None,
}
- assert origin_visit == OriginVisit.from_dict(expected_origin_visit)
- actual_origin_visits = list(swh_storage.origin_visit_get(origin_url))
+ expected_ori_visit_status_dict = copy.deepcopy(expected_ori_visit_dict)
+ expected_ori_visit_status_dict.pop("type")
- assert expected_origin_visit in actual_origin_visits
+ expected_origin_visit_status = OriginVisitStatus.from_dict(
+ expected_ori_visit_status_dict
+ )
+ expected_origin_visit = OriginVisit.from_dict(expected_ori_visit_dict)
+ assert origin_visit == expected_origin_visit
+
+ actual_origin_visits = list(swh_storage.origin_visit_get(origin_url))
+ assert expected_ori_visit_dict in actual_origin_visits
objects = list(swh_storage.journal_writer.journal.objects)
+
assert ("origin", Origin.from_dict(data.origin2)) in objects
- assert ("origin_visit", OriginVisit.from_dict(expected_origin_visit)) in objects
+ assert ("origin_visit", expected_origin_visit) in objects
+ assert ("origin_visit", expected_origin_visit) in objects
+ assert ("origin_visit_status", expected_origin_visit_status) in objects
def test_origin_visit_get__unknown_origin(self, swh_storage):
assert [] == list(swh_storage.origin_visit_get("foo"))
@@ -1633,13 +1667,27 @@
microsecond=round(date_visit2.microsecond, -3)
)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=date_visit, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=date_visit,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
- origin_visit2 = swh_storage.origin_visit_add(
- origin_url, date=date_visit2, type=data.type_visit2
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visits = swh_storage.origin_visit_add([visit1, visit2])
+
+ assert len(origin_visits) == 2
+ origin_visit1 = origin_visits[0]
+ origin_visit2 = origin_visits[1]
+
# then
assert origin_visit1.origin == origin_url
assert origin_visit1.visit is not None
@@ -1676,12 +1724,16 @@
assert ("origin_visit", OriginVisit.from_dict(visit)) in objects
def test_origin_visit_add_validation(self, swh_storage):
- origin_url = swh_storage.origin_add_one(data.origin2)
- with pytest.raises(StorageArgumentException) as cm:
- swh_storage.origin_visit_add(origin_url, date=[b"foo"], type="git")
-
- if type(cm.value) == psycopg2.ProgrammingError:
- assert cm.value.pgcode == psycopg2.errorcodes.UNDEFINED_FUNCTION
+ """Unknown origin when adding visits should raise"""
+ visit = OriginVisit(
+ origin="something-unknown",
+ date=now(),
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
+ )
+ with pytest.raises(StorageArgumentException, match="Unknown origin"):
+ swh_storage.origin_visit_add([visit])
def test_origin_visit_status_add_validation(self, swh_storage):
"""Wrong origin_visit_status input should raise storage argument error"""
@@ -1701,9 +1753,23 @@
"""
origin_url = swh_storage.origin_add_one(data.origin2)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
+ )
+ origin_url2 = swh_storage.origin_add_one({"url": "new-origin"})
+ visit2 = OriginVisit(
+ origin=origin_url2,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1, origin_visit2 = swh_storage.origin_visit_add([visit1, visit2])
+
snapshot_id = data.snapshot["id"]
date_visit_now = now()
visit_status1 = OriginVisitStatus(
@@ -1714,10 +1780,6 @@
snapshot=snapshot_id,
)
- origin_url2 = swh_storage.origin_add_one({"url": "new-origin"})
- origin_visit2 = swh_storage.origin_visit_add(
- origin_url2, date=data.date_visit2, type=data.type_visit2
- )
date_visit_now = now()
visit_status2 = OriginVisitStatus(
origin=origin_visit2.origin,
@@ -1760,15 +1822,28 @@
microsecond=round(date_visit2.microsecond, -3)
)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=date_visit, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=date_visit,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
- origin_visit2 = swh_storage.origin_visit_add(
- origin_url, date=date_visit2, type=data.type_visit2
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
- origin_visit3 = swh_storage.origin_visit_add(
- origin_url2, date=date_visit2, type=data.type_visit3
+ visit3 = OriginVisit(
+ origin=origin_url2,
+ date=date_visit2,
+ type=data.type_visit3, # noqa
+ status="ongoing",
+ snapshot=None,
)
+ ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
# when
visit1_metadata = {
@@ -1776,11 +1851,9 @@
"directories": 22,
}
swh_storage.origin_visit_update(
- origin_url, origin_visit1.visit, status="full", metadata=visit1_metadata
- )
- swh_storage.origin_visit_update(
- origin_url2, origin_visit3.visit, status="partial"
+ origin_url, ov1.visit, status="full", metadata=visit1_metadata
)
+ swh_storage.origin_visit_update(origin_url2, ov3.visit, status="partial")
# then
actual_origin_visits = list(swh_storage.origin_visit_get(origin_url))
@@ -1788,7 +1861,7 @@
{
"origin": origin_url,
"date": date_visit,
- "visit": origin_visit1.visit,
+ "visit": ov1.visit,
"type": data.type_visit1,
"status": "full",
"metadata": visit1_metadata,
@@ -1797,7 +1870,7 @@
{
"origin": origin_url,
"date": date_visit2,
- "visit": origin_visit2.visit,
+ "visit": ov2.visit,
"type": data.type_visit2,
"status": "ongoing",
"metadata": None,
@@ -1814,7 +1887,7 @@
{
"origin": origin_url,
"date": date_visit,
- "visit": origin_visit1.visit,
+ "visit": ov1.visit,
"type": data.type_visit1,
"status": "full",
"metadata": visit1_metadata,
@@ -1823,13 +1896,13 @@
]
actual_origin_visits_ter = list(
- swh_storage.origin_visit_get(origin_url, last_visit=origin_visit1.visit)
+ swh_storage.origin_visit_get(origin_url, last_visit=ov1.visit)
)
assert actual_origin_visits_ter == [
{
"origin": origin_url,
"date": date_visit2,
- "visit": origin_visit2.visit,
+ "visit": ov2.visit,
"type": data.type_visit2,
"status": "ongoing",
"metadata": None,
@@ -1842,7 +1915,7 @@
{
"origin": origin_url2,
"date": date_visit2,
- "visit": origin_visit3.visit,
+ "visit": ov3.visit,
"type": data.type_visit3,
"status": "partial",
"metadata": None,
@@ -1853,7 +1926,7 @@
data1 = {
"origin": origin_url,
"date": date_visit,
- "visit": origin_visit1.visit,
+ "visit": ov1.visit,
"type": data.type_visit1,
"status": "ongoing",
"metadata": None,
@@ -1862,7 +1935,7 @@
data2 = {
"origin": origin_url,
"date": date_visit2,
- "visit": origin_visit2.visit,
+ "visit": ov2.visit,
"type": data.type_visit2,
"status": "ongoing",
"metadata": None,
@@ -1871,7 +1944,7 @@
data3 = {
"origin": origin_url2,
"date": date_visit2,
- "visit": origin_visit3.visit,
+ "visit": ov3.visit,
"type": data.type_visit3,
"status": "ongoing",
"metadata": None,
@@ -1880,7 +1953,7 @@
data4 = {
"origin": origin_url,
"date": date_visit,
- "visit": origin_visit1.visit,
+ "visit": ov1.visit,
"type": data.type_visit1,
"metadata": visit1_metadata,
"status": "full",
@@ -1889,7 +1962,7 @@
data5 = {
"origin": origin_url2,
"date": date_visit2,
- "visit": origin_visit3.visit,
+ "visit": ov3.visit,
"type": data.type_visit3,
"status": "partial",
"metadata": None,
@@ -1906,9 +1979,14 @@
def test_origin_visit_update_validation(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
- visit = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ visit = swh_storage.origin_visit_add([visit])[0]
with pytest.raises(
(StorageArgumentException, ValueError), match="status"
) as cm:
@@ -1920,24 +1998,37 @@
def test_origin_visit_find_by_date(self, swh_storage):
# given
origin_url = swh_storage.origin_add_one(data.origin)
- swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
-
- origin_visit2 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit3, type=data.type_visit2
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit3,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
- origin_visit3 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit3
+ visit3 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit3,
+ status="ongoing",
+ snapshot=None,
)
+ _, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
+
# Simple case
visit = swh_storage.origin_visit_find_by_date(origin_url, data.date_visit3)
- assert visit["visit"] == origin_visit2.visit
+ assert visit["visit"] == ov2.visit
# There are two visits at the same date, the latest must be returned
visit = swh_storage.origin_visit_find_by_date(origin_url, data.date_visit2)
- assert visit["visit"] == origin_visit3.visit
+ assert visit["visit"] == ov3.visit
def test_origin_visit_find_by_date__unknown_origin(self, swh_storage):
swh_storage.origin_visit_find_by_date("foo", data.date_visit2)
@@ -1945,9 +2036,14 @@
def test_origin_visit_update_missing_snapshot(self, swh_storage):
# given
origin_url = swh_storage.origin_add_one(data.origin)
- origin_visit = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit = swh_storage.origin_visit_add([visit])[0]
# when
swh_storage.origin_visit_update(
@@ -1970,9 +2066,14 @@
def test_origin_visit_get_by(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
origin_url2 = swh_storage.origin_add_one(data.origin2)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit])[0]
swh_storage.snapshot_add([data.snapshot])
swh_storage.origin_visit_update(
@@ -1983,12 +2084,21 @@
)
# Add some other {origin, visit} entries
- swh_storage.origin_visit_add(
- origin_url, date=data.date_visit3, type=data.type_visit3
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit3,
+ type=data.type_visit3,
+ status="ongoing",
+ snapshot=None,
)
- swh_storage.origin_visit_add(
- origin_url2, date=data.date_visit3, type=data.type_visit3
+ visit3 = OriginVisit(
+ origin=origin_url2,
+ date=data.date_visit3,
+ type=data.type_visit3,
+ status="ongoing",
+ snapshot=None,
)
+ swh_storage.origin_visit_add([visit2, visit3])
# when
visit1_metadata = {
@@ -2167,9 +2277,14 @@
origin_url = swh_storage.origin_add_one(data.origin2)
# when
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit1])[0]
swh_storage.origin_visit_upsert(
[
@@ -2271,16 +2386,29 @@
def test_origin_visit_get_latest(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
- ov1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
- ov2 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
# Add a visit with the same date as the previous one
- ov3 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit3 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
origin_visit1 = swh_storage.origin_visit_get_by(origin_url, ov1.visit)
origin_visit2 = swh_storage.origin_visit_get_by(origin_url, ov2.visit)
@@ -2396,9 +2524,14 @@
def test_snapshot_add_get_empty(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit])[0]
visit_id = origin_visit1.visit
actual_result = swh_storage.snapshot_add([data.empty_snapshot])
@@ -2458,9 +2591,14 @@
def test_snapshot_add_get_complete(self, swh_storage):
origin_url = data.origin["url"]
origin_url = swh_storage.origin_add_one(data.origin)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit])[0]
visit_id = origin_visit1.visit
actual_result = swh_storage.snapshot_add([data.complete_snapshot])
@@ -2620,9 +2758,14 @@
def test_snapshot_add_get_filtered(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit])[0]
swh_storage.snapshot_add([data.complete_snapshot])
swh_storage.origin_visit_update(
@@ -2734,9 +2877,14 @@
def test_snapshot_add_get(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit])[0]
visit_id = origin_visit1.visit
swh_storage.snapshot_add([data.snapshot])
@@ -2775,9 +2923,14 @@
def test_snapshot_add_twice__by_origin_visit(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit])[0]
visit1_id = origin_visit1.visit
swh_storage.snapshot_add([data.snapshot])
date_now2 = now()
@@ -2792,9 +2945,14 @@
by_ov1 = swh_storage.snapshot_get_by_origin_visit(origin_url, visit1_id)
assert by_ov1 == {**data.snapshot, "next_branch": None}
- origin_visit2 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit2 = swh_storage.origin_visit_add([visit2])[0]
visit2_id = origin_visit2.visit
swh_storage.snapshot_add([data.snapshot])
@@ -2876,16 +3034,29 @@
def test_snapshot_get_latest(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
- origin_visit2 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
# Add a visit with the same date as the previous one
- origin_visit3 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit3
+ visit3 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit3,
+ status="ongoing",
+ snapshot=None,
)
+ ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
# Two visits, both with no snapshot: latest snapshot is None
assert swh_storage.snapshot_get_latest(origin_url) is None
@@ -2893,7 +3064,7 @@
swh_storage.snapshot_add([data.complete_snapshot])
swh_storage.origin_visit_update(
origin_url,
- origin_visit1.visit,
+ ov1.visit,
status="ongoing",
snapshot=data.complete_snapshot["id"],
)
@@ -2910,7 +3081,7 @@
)
# Mark the first visit as completed and check status filter again
- swh_storage.origin_visit_update(origin_url, origin_visit1.visit, status="full")
+ swh_storage.origin_visit_update(origin_url, ov1.visit, status="full")
assert {
**data.complete_snapshot,
"next_branch": None,
@@ -2919,10 +3090,7 @@
# Add snapshot to visit2 and check that the new snapshot is returned
swh_storage.snapshot_add([data.empty_snapshot])
swh_storage.origin_visit_update(
- origin_url,
- origin_visit2.visit,
- status="ongoing",
- snapshot=data.empty_snapshot["id"],
+ origin_url, ov2.visit, status="ongoing", snapshot=data.empty_snapshot["id"],
)
assert {
**data.empty_snapshot,
@@ -2940,7 +3108,7 @@
swh_storage.snapshot_add([data.complete_snapshot])
swh_storage.origin_visit_update(
origin_url,
- origin_visit3.visit,
+ ov3.visit,
status="ongoing",
snapshot=data.complete_snapshot["id"],
)
@@ -2952,12 +3120,21 @@
def test_snapshot_get_latest__missing_snapshot(self, swh_storage):
origin_url = swh_storage.origin_add_one(data.origin)
assert swh_storage.snapshot_get_latest(origin_url) is None
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit1, type=data.type_visit1
+ visit1 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
)
- origin_visit2 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit2 = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2])
# Two visits, both with no snapshot: latest snapshot is None
assert swh_storage.snapshot_get_latest(origin_url) is None
@@ -2966,7 +3143,7 @@
# detected
swh_storage.origin_visit_update(
origin_url,
- origin_visit1.visit,
+ ov1.visit,
status="ongoing",
snapshot=data.complete_snapshot["id"],
)
@@ -2982,7 +3159,7 @@
)
# Mark the first visit as completed and check status filter again
- swh_storage.origin_visit_update(origin_url, origin_visit1.visit, status="full")
+ swh_storage.origin_visit_update(origin_url, ov1.visit, status="full")
with pytest.raises(Exception):
# XXX: should the exception be more specific than this?
swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]),
@@ -2997,10 +3174,7 @@
# Add unknown snapshot to visit2 and check that the inconsistency
# is detected
swh_storage.origin_visit_update(
- origin_url,
- origin_visit2.visit,
- status="ongoing",
- snapshot=data.snapshot["id"],
+ origin_url, ov2.visit, status="ongoing", snapshot=data.snapshot["id"],
)
with pytest.raises(Exception):
# XXX: should the exception be more specific than this?
@@ -3061,9 +3235,14 @@
# Add other objects. Check their counter increased as well.
origin_url = swh_storage.origin_add_one(data.origin2)
- origin_visit1 = swh_storage.origin_visit_add(
- origin_url, date=data.date_visit2, type=data.type_visit2
+ visit = OriginVisit(
+ origin=origin_url,
+ date=data.date_visit2,
+ type=data.type_visit2,
+ status="ongoing",
+ snapshot=None,
)
+ origin_visit1 = swh_storage.origin_visit_add([visit])[0]
swh_storage.snapshot_add([data.snapshot])
swh_storage.origin_visit_update(
@@ -3778,7 +3957,10 @@
swh_storage.origin_add([{"url": url} for url in self.ORIGINS])
origin_url = "https://github.com/user1/repo1"
- swh_storage.origin_visit_add(origin_url, date=now(), type="git")
+ visit = OriginVisit(
+ origin=origin_url, date=now(), type="git", status="ongoing", snapshot=None
+ )
+ swh_storage.origin_visit_add([visit])
assert swh_storage.origin_count("github", with_visit=False) == 3
# it has a visit, but no snapshot, so with_visit=True => 0
@@ -3802,7 +3984,10 @@
swh_storage.snapshot_add([data.snapshot])
origin_url = "https://github.com/user1/repo1"
- visit = swh_storage.origin_visit_add(origin_url, date=now(), type="git")
+ visit = OriginVisit(
+ origin=origin_url, date=now(), type="git", status="ongoing", snapshot=None
+ )
+ visit = swh_storage.origin_visit_add([visit])[0]
swh_storage.origin_visit_update(
origin_url, visit.visit, status="ongoing", snapshot=data.snapshot["id"]
)
@@ -3830,7 +4015,14 @@
swh_storage.origin_add_one({"url": origin_url})
if "visit" in obj:
del obj["visit"]
- swh_storage.origin_visit_add(origin_url, obj["date"], obj["type"])
+ visit = OriginVisit(
+ origin=origin_url,
+ date=obj["date"],
+ type=obj["type"],
+ status="ongoing",
+ snapshot=None,
+ )
+ swh_storage.origin_visit_add([visit])
else:
if obj_type == "content" and obj["status"] == "absent":
obj_type = "skipped_content"
diff --git a/swh/storage/validate.py b/swh/storage/validate.py
--- a/swh/storage/validate.py
+++ b/swh/storage/validate.py
@@ -3,12 +3,10 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import datetime
import contextlib
from typing import Dict, Iterable, Iterator, List, Optional, Tuple
from swh.model.model import (
- BaseModel,
SkippedContent,
Content,
Directory,
@@ -112,14 +110,8 @@
snapshots = [Snapshot.from_dict(s) for s in snapshots]
return self.storage.snapshot_add(snapshots)
- def origin_visit_add(
- self, origin_url: str, date: datetime.datetime, type: str
- ) -> Dict[str, BaseModel]:
- with convert_validation_exceptions():
- visit = OriginVisit(
- origin=origin_url, date=date, type=type, status="ongoing", snapshot=None
- )
- return self.storage.origin_visit_add(visit.origin, visit.date, visit.type)
+ def origin_visit_add(self, visits: Iterable[OriginVisit]) -> Iterable[OriginVisit]:
+ return self.storage.origin_visit_add(visits)
def origin_add(self, origins: Iterable[Dict]) -> List[Dict]:
with convert_validation_exceptions():

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 2:45 PM (1 w, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222831

Event Timeline