diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -1134,25 +1134,28 @@ # Set origin.next_visit_id = max(origin.next_visit_id, visit.visit+1) # so the next loader run does not reuse the id. self._cql_runner.origin_bump_next_visit_id(visit.origin, visit.visit) + add_status = False else: visit_id = self._cql_runner.origin_generate_unique_visit_id( visit.origin ) visit = attr.evolve(visit, visit=visit_id) + add_status = True self.journal_writer.origin_visit_add([visit]) self._cql_runner.origin_visit_add_one(OriginVisitRow(**visit.to_dict())) assert visit.visit is not None all_visits.append(visit) - self._origin_visit_status_add( - OriginVisitStatus( - origin=visit.origin, - visit=visit.visit, - date=visit.date, - type=visit.type, - status="created", - snapshot=None, + if add_status: + self._origin_visit_status_add( + OriginVisitStatus( + origin=visit.origin, + visit=visit.visit, + date=visit.date, + type=visit.type, + status="created", + snapshot=None, + ) ) - ) return all_visits def _origin_visit_status_add(self, visit_status: OriginVisitStatus) -> None: diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -1039,27 +1039,31 @@ all_visits = [] for visit in visits: - if not visit.visit: + if visit.visit: + self.journal_writer.origin_visit_add([visit]) + db.origin_visit_add_with_id(visit, cur=cur) + else: + # visit_id is not given, it needs to be set by the db with convert_validation_exceptions(): visit_id = db.origin_visit_add( visit.origin, visit.date, visit.type, cur=cur ) visit = attr.evolve(visit, visit=visit_id) - else: - db.origin_visit_add_with_id(visit, cur=cur) + # Forced to write in the journal after the db (since its the db + # call that set the visit id) + self.journal_writer.origin_visit_add([visit]) + # In this case, we also want to create the initial OVS object + visit_status = OriginVisitStatus( + origin=visit.origin, + visit=visit_id, + date=visit.date, + type=visit.type, + status="created", + snapshot=None, + ) + self._origin_visit_status_add(visit_status, db=db, cur=cur) assert visit.visit is not None all_visits.append(visit) - # Forced to write after for the case when the visit has no id - self.journal_writer.origin_visit_add([visit]) - visit_status = OriginVisitStatus( - origin=visit.origin, - visit=visit.visit, - date=visit.date, - type=visit.type, - status="created", - snapshot=None, - ) - self._origin_visit_status_add(visit_status, db=db, cur=cur) return all_visits diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py --- a/swh/storage/tests/algos/test_snapshot.py +++ b/swh/storage/tests/algos/test_snapshot.py @@ -65,13 +65,20 @@ # no snapshot on origin visit so None origin = sample_data.origin - swh_storage.origin_add([origin]) + swh_storage.origin_add(sample_data.origins) origin_visit, origin_visit2 = sample_data.origin_visits[:2] assert origin_visit.origin == origin.url - swh_storage.origin_visit_add([origin_visit]) + swh_storage.origin_visit_add(sample_data.origin_visits) assert snapshot_get_latest(swh_storage, origin.url) is None + # no visit status for now (origin visits in sample_data do have a visit_id, + # so origin_visit_add won't auto-add the OriginVisitStatus: + assert swh_storage.origin_visit_get_latest(origin.url) is None + + # add "created" visit statuses + swh_storage.origin_visit_status_add(sample_data.origin_visit_statuses) + # now we should have a result from origin_visit_get_latest() ov1 = swh_storage.origin_visit_get_latest(origin.url) assert ov1 is not None @@ -96,12 +103,13 @@ def test_snapshot_get_latest(swh_storage, sample_data): origin = sample_data.origin - swh_storage.origin_add([origin]) + swh_storage.origin_add(sample_data.origins) visit1, visit2 = sample_data.origin_visits[:2] assert visit1.origin == origin.url swh_storage.origin_visit_add([visit1]) + swh_storage.origin_visit_status_add([sample_data.origin_visit_status]) ov1 = swh_storage.origin_visit_get_latest(origin.url) # Add snapshot to visit1, latest snapshot = visit 1 snapshot diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -21,6 +21,7 @@ ObjectType, Origin, OriginVisit, + OriginVisitStatus, Person, RawExtrinsicMetadata, Release, @@ -579,6 +580,36 @@ origin_visit3, ) + origin_visit_status = OriginVisitStatus( + origin=origin.url, + visit=1, + date=date_visit1, + type=type_visit1, + status="created", + snapshot=None, + ) + origin_visit2_status = OriginVisitStatus( + origin=origin.url, + visit=2, + date=date_visit2, + type=type_visit1, + status="created", + snapshot=None, + ) + origin_visit3_status = OriginVisitStatus( + origin=origin2.url, + visit=1, + date=date_visit1, + type=type_visit2, + status="created", + snapshot=None, + ) + origin_visit_statuses: Tuple[OriginVisitStatus, ...] = ( + origin_visit_status, + origin_visit2_status, + origin_visit3_status, + ) + release = Release( id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"), name=b"v0.0.1", diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -2389,11 +2389,11 @@ origin=sample_data.origin.url, date=now(), type=sample_data.type_visit1, - visit=42, + visit=0, ) ovs = OriginVisitStatus( origin=ov.origin, - visit=ov.visit, + visit=1, date=now(), status="created", snapshot=None, @@ -2968,6 +2968,14 @@ assert ov1.visit == 42 assert ov2.visit == 43 + # check OriginVisitStatus objects + ovs1 = swh_storage.origin_visit_status_get(ov1.origin, visit=ov1.visit).results + assert not ovs1, f"There should be no OriginVisitStatus for visit {ov1}" + ovs2 = swh_storage.origin_visit_status_get(ov2.origin, visit=ov2.visit).results + assert len(ovs2) == 1 + assert ovs2[0].status == "created" + assert ovs2[0].type == ov2.type + visit3 = OriginVisit( origin=origin1.url, date=date_visit, type=sample_data.type_visit1, visit=12 ) @@ -3558,7 +3566,6 @@ assert actual_visit == ov2 def test_origin_visit_get_latest_order(self, swh_storage, sample_data): - empty_snapshot, complete_snapshot = sample_data.snapshots[1:3] origin = sample_data.origin id1 = 2 @@ -3589,6 +3596,18 @@ ) ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) + ovs = [ + OriginVisitStatus( + origin=origin.url, + visit=ov.visit, + date=ov.date, + type=ov.type, + status="created", + snapshot=None, + ) + for ov in [ov1, ov2, ov3] + ] + swh_storage.origin_visit_status_add(ovs) # no filters actual_visit = swh_storage.origin_visit_get_latest(origin.url) @@ -3598,13 +3617,22 @@ origin = sample_data.origin swh_storage.origin_add([origin]) - visit1, visit2 = sample_data.origin_visits[:2] - assert visit1.origin == origin.url + (date1, date2, date3, date4) = [ + datetime.datetime(2021, 8, i, tzinfo=datetime.timezone.utc) + for i in range(1, 5) + ] + + visit1 = OriginVisit( + origin=origin.url, + visit=0, + date=date1, + type="git", + ) swh_storage.origin_visit_add([visit1]) ov1 = swh_storage.origin_visit_get_latest(origin.url) - # Add snapshot to visit1, latest snapshot = visit 1 snapshot + # Add a snapshot, but do not attach it to visit1 for now complete_snapshot = sample_data.snapshots[2] swh_storage.snapshot_add([complete_snapshot]) @@ -3613,15 +3641,14 @@ OriginVisitStatus( origin=origin.url, visit=ov1.visit, - date=visit2.date, + date=date2, status="partial", snapshot=None, ) ] ) - assert visit1.date < visit2.date - # no snapshot associated to the visit, so None + # no snapshot is associated to the visit, so None visit = swh_storage.origin_visit_get_latest( origin.url, allowed_statuses=["partial"], @@ -3629,32 +3656,36 @@ ) assert visit is None - date_now = now() - assert visit2.date < date_now + # attach the visit to the snapshot swh_storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, - date=date_now, + date=date3, status="full", snapshot=complete_snapshot.id, ) ] ) - - swh_storage.origin_visit_add( + # and add a visit later on + ov2 = swh_storage.origin_visit_add( [ OriginVisit( origin=origin.url, - date=now(), + date=date4, type=visit1.type, ) ] - ) + )[0] + # so now the returned visit should be ov1 (because ov2 has no shapshot, + # so it won't be returned when require_snapshot is True) visit = swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) - assert visit is not None + assert visit == ov1 + # but without require_snapshot, ov2 is returned + visit = swh_storage.origin_visit_get_latest(origin.url, require_snapshot=False) + assert visit == ov2 def test_origin_visit_status_get_latest__validation(self, swh_storage, sample_data): origin = sample_data.origin diff --git a/swh/storage/tests/test_kafka_writer.py b/swh/storage/tests/test_kafka_writer.py --- a/swh/storage/tests/test_kafka_writer.py +++ b/swh/storage/tests/test_kafka_writer.py @@ -12,7 +12,7 @@ from swh.journal.pytest_plugin import assert_all_objects_consumed, consume_messages from swh.model.hypothesis_strategies import objects -from swh.model.model import Origin, OriginVisit, Person +from swh.model.model import Person from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.storage import get_storage @@ -50,17 +50,12 @@ "release", "snapshot", "origin", + "origin_visit", "origin_visit_status", "raw_extrinsic_metadata", ): method(objs) expected_messages += len(objs) - elif obj_type in ("origin_visit",): - for obj in objs: - assert isinstance(obj, OriginVisit) - storage.origin_add([Origin(url=obj.origin)]) - method([obj]) - expected_messages += 1 + 1 # 1 visit + 1 visit status else: assert False, obj_type diff --git a/swh/storage/tests/test_replay.py b/swh/storage/tests/test_replay.py --- a/swh/storage/tests/test_replay.py +++ b/swh/storage/tests/test_replay.py @@ -105,8 +105,6 @@ for object_type, objects in TEST_OBJECTS.items(): method = getattr(src, object_type + "_add") method(objects) - if object_type == "origin_visit": - nb_sent += len(objects) # origin-visit-add adds origin-visit-status as well nb_sent += len(objects) caplog.set_level(logging.ERROR, "swh.journal.replay") @@ -146,8 +144,6 @@ for object_type, objects in TEST_OBJECTS.items(): method = getattr(src, object_type + "_add") method(objects) - if object_type == "origin_visit": - nb_sent += len(objects) # origin-visit-add adds origin-visit-status as well nb_sent += len(objects) # Create collision in input data @@ -407,8 +403,6 @@ for object_type, objects in TEST_OBJECTS.items(): method = getattr(src, object_type + "_add") method(objects) - if object_type == "origin_visit": - nb_sent += len(objects) # origin-visit-add adds origin-visit-status as well nb_sent += len(objects) # Fill the destination storage from Kafka @@ -453,8 +447,6 @@ for object_type, objects in TEST_OBJECTS.items(): method = getattr(src, object_type + "_add") method(objects) - if object_type == "origin_visit": - nb_sent += len(objects) # origin-visit-add adds origin-visit-status as well nb_sent += len(objects) # insert invalid objects @@ -523,8 +515,6 @@ for object_type, objects in TEST_OBJECTS.items(): method = getattr(src, object_type + "_add") method(objects) - if object_type == "origin_visit": - nb_sent += len(objects) # origin-visit-add adds origin-visit-status as well nb_sent += len(objects) # insert invalid objects