diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -14,8 +14,8 @@ import dateutil from swh.model.model import ( - Revision, Release, Directory, DirectoryEntry, Content, SkippedContent, - OriginVisit, Snapshot, Origin + Revision, Release, Directory, DirectoryEntry, Content, + SkippedContent, OriginVisit, Snapshot, Origin ) from swh.model.hashutil import DEFAULT_ALGORITHMS from swh.storage.objstorage import ObjStorage @@ -668,7 +668,8 @@ else: return results - def origin_get_one(self, origin): + def origin_get_one(self, origin: Dict[str, Any]) -> Optional[ + Dict[str, Any]]: if 'id' in origin: raise StorageArgumentException('Origin ids are not supported.') if 'url' not in origin: @@ -757,17 +758,19 @@ return origin_url - def origin_visit_add( - self, origin, date, type) -> Optional[Dict[str, Union[str, int]]]: - origin_url = origin # TODO: rename the argument - + def origin_visit_add(self, origin_url: str, + date: Union[str, datetime.datetime], + type: str) -> OriginVisit: if isinstance(date, str): + # FIXME: Converge on iso8601 at some point date = dateutil.parser.parse(date) + elif not isinstance(date, datetime.datetime): + raise StorageArgumentException( + 'Date must be a datetime or a string') - origin = self.origin_get_one({'url': origin_url}) - - if not origin: - return None + if not self.origin_get_one({'url': origin_url}): + raise StorageArgumentException( + 'Unknown origin %s', origin_url) visit_id = self._cql_runner.origin_generate_unique_visit_id(origin_url) @@ -784,13 +787,8 @@ except (KeyError, TypeError, ValueError) as e: raise StorageArgumentException(*e.args) self.journal_writer.origin_visit_add(visit) - self._cql_runner.origin_visit_add_one(visit) - - return { - 'origin': origin_url, - 'visit': visit_id, - } + return visit def origin_visit_update( self, origin: str, visit_id: int, status: Optional[str] = None, diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -19,8 +19,9 @@ import attr from swh.model.model import ( - BaseContent, Content, SkippedContent, Directory, Revision, Release, - Snapshot, OriginVisit, Origin, SHA1_SIZE) + BaseContent, Content, SkippedContent, Directory, Revision, + Release, Snapshot, OriginVisit, Origin, SHA1_SIZE +) from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex from swh.storage.objstorage import ObjStorage @@ -702,27 +703,28 @@ return origin.url - def origin_visit_add( - self, origin, date, type) -> Optional[Dict[str, Union[str, int]]]: - origin_url = origin - if origin_url is None: - raise StorageArgumentException('Unknown origin.') - + def origin_visit_add(self, origin_url: str, + date: Union[str, datetime.datetime], + type: str) -> OriginVisit: if isinstance(date, str): # FIXME: Converge on iso8601 at some point date = dateutil.parser.parse(date) elif not isinstance(date, datetime.datetime): raise StorageArgumentException( - 'date must be a datetime or a string.') + 'Date must be a datetime or a string') + + origin = self.origin_get({'url': origin_url}) + if not origin: # Cannot add a visit without an origin + raise StorageArgumentException( + 'Unknown origin %s', origin_url) - visit_ret = None if origin_url in self._origins: origin = self._origins[origin_url] # visit ids are in the range [1, +inf[ visit_id = len(self._origin_visits[origin_url]) + 1 status = 'ongoing' visit = OriginVisit( - origin=origin.url, + origin=origin_url, date=date, type=type, status=status, @@ -731,17 +733,15 @@ visit=visit_id, ) self._origin_visits[origin_url].append(visit) - visit_ret = { - 'origin': origin.url, - 'visit': visit_id, - } + visit = visit - self._objects[(origin_url, visit_id)].append( + self._objects[(origin_url, visit.visit)].append( ('origin_visit', None)) self.journal_writer.origin_visit_add(visit) - return visit_ret + # return last visit + return visit def origin_visit_update( self, origin: str, visit_id: int, status: Optional[str] = None, diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -3,6 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime + from typing import Any, Dict, Iterable, List, Optional, Union from swh.core.api import remote_api_endpoint @@ -768,19 +770,24 @@ @remote_api_endpoint('origin/visit/add') def origin_visit_add( - self, origin, date, type) -> Optional[Dict[str, Union[str, int]]]: + self, origin_url: str, + date: Union[str, datetime.datetime], + type: str) -> OriginVisit: """Add an origin_visit for the origin at ts with status 'ongoing'. Args: - origin (str): visited origin's identifier or URL - date (Union[str,datetime]): timestamp of such visit - type (str): the type of loader used for the visit (hg, git, ...) + origin_url: visited origin identifier (its URL) + date: timestamp of such visit + type: the type of loader used for the visit (hg, git, ...) + + Raises: + StorageArgumentException if the date is mistyped, or the origin + is unknown. Returns: dict: dictionary with keys origin and visit where: - - - origin: origin identifier - - visit: the visit identifier for the new visit occurrence + - origin: origin object + - visit: the visit object for the new visit occurrence """ ... diff --git a/swh/storage/retry.py b/swh/storage/retry.py --- a/swh/storage/retry.py +++ b/swh/storage/retry.py @@ -15,7 +15,7 @@ from swh.model.model import ( Content, SkippedContent, Directory, Revision, Release, Snapshot, - Origin, + Origin, OriginVisit ) from swh.storage import get_storage @@ -93,9 +93,9 @@ return self.storage.origin_add_one(origin) @swh_retry - def origin_visit_add(self, origin: Dict, - date: Union[datetime, str], type: str) -> Dict: - return self.storage.origin_visit_add(origin, date, type) + def origin_visit_add(self, origin_url: str, + date: Union[datetime, str], type: str) -> OriginVisit: + return self.storage.origin_visit_add(origin_url, date, type) @swh_retry def origin_visit_update( diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -804,20 +804,26 @@ @timed @db_transaction() def origin_visit_add( - self, origin, date, type, db=None, cur=None - ) -> Optional[Dict[str, Union[str, int]]]: - origin_url = origin - + self, origin_url: str, date: Union[str, datetime.datetime], + type: str, db=None, cur=None) -> OriginVisit: if isinstance(date, str): # FIXME: Converge on iso8601 at some point date = dateutil.parser.parse(date) + elif not isinstance(date, datetime.datetime): + raise StorageArgumentException( + 'Date must be a datetime or a string') + + origin = self.origin_get({'url': origin_url}) + if not origin: # Cannot add a visit without an origin + raise StorageArgumentException( + 'Unknown origin %s', origin_url) with convert_validation_exceptions(): - visit_id = db.origin_visit_add(origin_url, date, type, cur) + visit_id = db.origin_visit_add(origin_url, date, type, cur=cur) # We can write to the journal only after inserting to the # DB, because we want the id of the visit - visit = { + visit = OriginVisit.from_dict({ 'origin': origin_url, 'date': date, 'type': type, @@ -825,14 +831,11 @@ 'status': 'ongoing', 'metadata': None, 'snapshot': None - } + }) self.journal_writer.origin_visit_add(visit) send_metric('origin_visit:add', count=1, method_name='origin_visit') - return { - 'origin': origin_url, - 'visit': visit_id, - } + return visit @timed @db_transaction() diff --git a/swh/storage/tests/test_retry.py b/swh/storage/tests/test_retry.py --- a/swh/storage/tests/test_retry.py +++ b/swh/storage/tests/test_retry.py @@ -255,16 +255,14 @@ """ sample_origin = sample_data['origin'][0] - swh_storage.origin_add_one(sample_origin) - origin_url = sample_origin['url'] + origin_url = swh_storage.origin_add_one(sample_origin) origin = list(swh_storage.origin_visit_get(origin_url)) assert not origin - origin_visit = swh_storage.origin_visit_add( - origin_url, '2020-01-01', 'hg') - assert origin_visit['origin'] == origin_url - assert isinstance(origin_visit['visit'], int) + origin_visit = swh_storage.origin_visit_add(origin_url, '2020-01-01', 'hg') + assert origin_visit.origin == origin_url + assert isinstance(origin_visit.visit, int) origin_visit = next(swh_storage.origin_visit_get(origin_url)) assert origin_visit['origin'] == origin_url @@ -277,8 +275,7 @@ """ sample_origin = sample_data['origin'][1] - swh_storage.origin_add_one(sample_origin) - origin_url = sample_origin['url'] + origin_url = swh_storage.origin_add_one(sample_origin) mock_memory = mocker.patch( 'swh.storage.in_memory.InMemoryStorage.origin_visit_add') @@ -297,13 +294,13 @@ origin = list(swh_storage.origin_visit_get(origin_url)) assert not origin - r = swh_storage.origin_visit_add(sample_origin, '2020-01-01', 'git') + r = swh_storage.origin_visit_add(origin_url, '2020-01-01', 'git') assert r == {'origin': origin_url, 'visit': 1} mock_memory.assert_has_calls([ - call(sample_origin, '2020-01-01', 'git'), - call(sample_origin, '2020-01-01', 'git'), - call(sample_origin, '2020-01-01', 'git') + call(origin_url, '2020-01-01', 'git'), + call(origin_url, '2020-01-01', 'git'), + call(origin_url, '2020-01-01', 'git') ]) assert mock_sleep.call_count == 2 @@ -595,23 +592,22 @@ """ sample_origin = sample_data['origin'][0] - swh_storage.origin_add_one(sample_origin) - origin_url = sample_origin['url'] - origin_visit = swh_storage.origin_visit_add( - origin_url, '2020-01-01', 'hg') + origin_url = swh_storage.origin_add_one(sample_origin) + origin_visit = swh_storage.origin_visit_add(origin_url, '2020-01-01', 'hg') ov = next(swh_storage.origin_visit_get(origin_url)) assert ov['origin'] == origin_url - assert ov['visit'] == origin_visit['visit'] + assert ov['visit'] == origin_visit.visit assert ov['status'] == 'ongoing' assert ov['snapshot'] is None assert ov['metadata'] is None - swh_storage.origin_visit_update(origin_url, ov['visit'], status='full') + swh_storage.origin_visit_update( + origin_url, origin_visit.visit, status='full') ov = next(swh_storage.origin_visit_get(origin_url)) assert ov['origin'] == origin_url - assert ov['visit'] == origin_visit['visit'] + assert ov['visit'] == origin_visit.visit assert ov['status'] == 'full' assert ov['snapshot'] is None assert ov['metadata'] is None diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -26,7 +26,9 @@ from swh.model import from_disk, identifiers from swh.model.hashutil import hash_to_bytes -from swh.model.model import Content, OriginVisit, Release, Revision +from swh.model.model import ( + Content, OriginVisit, Release, Revision +) from swh.model.hypothesis_strategies import objects from swh.storage import HashCollision, get_storage from swh.storage.converters import origin_url_to_sha1 as sha1 @@ -1389,11 +1391,12 @@ # Add visits to those origins for origin in data.origins: + origin_url = origin['url'] for date_visit in visits: visit = swh_storage.origin_visit_add( - origin['url'], date=date_visit, type=visit_type) + origin_url, date=date_visit, type=visit_type) swh_storage.origin_visit_update( - origin['url'], visit_id=visit['visit'], status='full') + origin_url, visit_id=visit.visit, status='full') swh_storage.refresh_stat_counters() @@ -1414,11 +1417,12 @@ # will be found by the random selection visits = self._generate_random_visits(nb_visits=3, start=13, end=24) for origin in data.origins: + origin_url = origin['url'] for date_visit in visits: visit = swh_storage.origin_visit_add( - origin['url'], date=date_visit, type=visit_type) + origin_url, date=date_visit, type=visit_type) swh_storage.origin_visit_update( - origin['url'], visit_id=visit['visit'], status='full') + origin_url, visit.visit, status='full') random_origin_visit = swh_storage.origin_visit_get_random(visit_type) assert random_origin_visit is None @@ -1554,9 +1558,7 @@ def test_origin_visit_add(self, swh_storage): # given - swh_storage.origin_add_one(data.origin2) - - origin_url = data.origin2['url'] + origin_url = swh_storage.origin_add_one(data.origin2) date_visit = datetime.datetime.now(datetime.timezone.utc) # Round to milliseconds before insertion, so equality doesn't fail @@ -1566,31 +1568,22 @@ # when origin_visit1 = swh_storage.origin_visit_add( - origin_url, - type=data.type_visit1, - date=date_visit) + origin_url, type=data.type_visit1, date=date_visit) actual_origin_visits = list(swh_storage.origin_visit_get( origin_url)) - assert { - 'origin': origin_url, - 'date': date_visit, - 'visit': origin_visit1['visit'], - 'type': data.type_visit1, - 'status': 'ongoing', - 'metadata': None, - 'snapshot': None, - } in actual_origin_visits - origin_visit = { 'origin': origin_url, 'date': date_visit, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'ongoing', 'metadata': None, 'snapshot': None, } + + assert origin_visit in actual_origin_visits + objects = list(swh_storage.journal_writer.journal.objects) assert ('origin', data.origin2) in objects assert ('origin_visit', origin_visit) in objects @@ -1600,8 +1593,7 @@ def test_origin_visit_add_default_type(self, swh_storage): # given - swh_storage.origin_add_one(data.origin2) - origin_url = data.origin2['url'] + origin_url = swh_storage.origin_add_one(data.origin2) # when date_visit = datetime.datetime.now(datetime.timezone.utc) @@ -1615,42 +1607,36 @@ microsecond=round(date_visit2.microsecond, -3)) origin_visit1 = swh_storage.origin_visit_add( - origin_url, - date=date_visit, - type=data.type_visit1, - ) + origin_url, date=date_visit, type=data.type_visit1) origin_visit2 = swh_storage.origin_visit_add( - origin_url, - date=date_visit2, - type=data.type_visit2, - ) + origin_url, date=date_visit2, type=data.type_visit2) # then - assert origin_visit1['origin'] == origin_url - assert origin_visit1['visit'] is not None + assert origin_visit1.origin == origin_url + assert origin_visit1.visit is not None actual_origin_visits = list(swh_storage.origin_visit_get( origin_url)) expected_visits = [ - { - 'origin': origin_url, - 'date': date_visit, - 'visit': origin_visit1['visit'], - 'type': data.type_visit1, - 'status': 'ongoing', - 'metadata': None, - 'snapshot': None, - }, - { - 'origin': origin_url, - 'date': date_visit2, - 'visit': origin_visit2['visit'], - 'type': data.type_visit2, - 'status': 'ongoing', - 'metadata': None, - 'snapshot': None, - }, - ] + { + 'origin': origin_url, + 'date': date_visit, + 'visit': origin_visit1.visit, + 'type': data.type_visit1, + 'status': 'ongoing', + 'metadata': None, + 'snapshot': None, + }, + { + 'origin': origin_url, + 'date': date_visit2, + 'visit': origin_visit2.visit, + 'type': data.type_visit2, + 'status': 'ongoing', + 'metadata': None, + 'snapshot': None, + }, + ] for visit in expected_visits: assert visit in actual_origin_visits @@ -1662,7 +1648,6 @@ def test_origin_visit_add_validation(self, swh_storage): origin_url = swh_storage.origin_add_one(data.origin2) - with pytest.raises(StorageArgumentException) as cm: swh_storage.origin_visit_add(origin_url, date=[b'foo'], type='git') @@ -1672,9 +1657,8 @@ def test_origin_visit_update(self, swh_storage): # given - swh_storage.origin_add_one(data.origin) - origin_url = data.origin['url'] - + origin_url = swh_storage.origin_add_one(data.origin) + origin_url2 = swh_storage.origin_add_one(data.origin2) date_visit = datetime.datetime.now(datetime.timezone.utc) date_visit2 = date_visit + datetime.timedelta(minutes=1) @@ -1686,24 +1670,11 @@ microsecond=round(date_visit2.microsecond, -3)) origin_visit1 = swh_storage.origin_visit_add( - origin_url, - date=date_visit, - type=data.type_visit1, - ) - + origin_url, date=date_visit, type=data.type_visit1) origin_visit2 = swh_storage.origin_visit_add( - origin_url, - date=date_visit2, - type=data.type_visit2 - ) - - swh_storage.origin_add_one(data.origin2) - origin_url2 = data.origin2['url'] + origin_url, date=date_visit2, type=data.type_visit2) origin_visit3 = swh_storage.origin_visit_add( - origin_url2, - date=date_visit2, - type=data.type_visit3 - ) + origin_url2, date=date_visit2, type=data.type_visit3) # when visit1_metadata = { @@ -1711,12 +1682,10 @@ 'directories': 22, } swh_storage.origin_visit_update( - origin_url, - origin_visit1['visit'], status='full', + origin_url, origin_visit1.visit, status='full', metadata=visit1_metadata) swh_storage.origin_visit_update( - origin_url2, - origin_visit3['visit'], status='partial') + origin_url2, origin_visit3.visit, status='partial') # then actual_origin_visits = list(swh_storage.origin_visit_get( @@ -1724,7 +1693,7 @@ expected_visits = [{ 'origin': origin_url, 'date': date_visit, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'full', 'metadata': visit1_metadata, @@ -1732,7 +1701,7 @@ }, { 'origin': origin_url, 'date': date_visit2, - 'visit': origin_visit2['visit'], + 'visit': origin_visit2.visit, 'type': data.type_visit2, 'status': 'ongoing', 'metadata': None, @@ -1742,13 +1711,12 @@ assert visit in actual_origin_visits actual_origin_visits_bis = list(swh_storage.origin_visit_get( - origin_url, - limit=1)) + origin_url, limit=1)) assert actual_origin_visits_bis == [ { 'origin': origin_url, 'date': date_visit, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'full', 'metadata': visit1_metadata, @@ -1756,13 +1724,12 @@ }] actual_origin_visits_ter = list(swh_storage.origin_visit_get( - origin_url, - last_visit=origin_visit1['visit'])) + origin_url, last_visit=origin_visit1.visit)) assert actual_origin_visits_ter == [ { 'origin': origin_url, 'date': date_visit2, - 'visit': origin_visit2['visit'], + 'visit': origin_visit2.visit, 'type': data.type_visit2, 'status': 'ongoing', 'metadata': None, @@ -1775,7 +1742,7 @@ { 'origin': origin_url2, 'date': date_visit2, - 'visit': origin_visit3['visit'], + 'visit': origin_visit3.visit, 'type': data.type_visit3, 'status': 'partial', 'metadata': None, @@ -1785,7 +1752,7 @@ data1 = { 'origin': origin_url, 'date': date_visit, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'ongoing', 'metadata': None, @@ -1794,7 +1761,7 @@ data2 = { 'origin': origin_url, 'date': date_visit2, - 'visit': origin_visit2['visit'], + 'visit': origin_visit2.visit, 'type': data.type_visit2, 'status': 'ongoing', 'metadata': None, @@ -1803,7 +1770,7 @@ data3 = { 'origin': origin_url2, 'date': date_visit2, - 'visit': origin_visit3['visit'], + 'visit': origin_visit3.visit, 'type': data.type_visit3, 'status': 'ongoing', 'metadata': None, @@ -1812,7 +1779,7 @@ data4 = { 'origin': origin_url, 'date': date_visit, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'metadata': visit1_metadata, 'status': 'full', @@ -1821,7 +1788,7 @@ data5 = { 'origin': origin_url2, 'date': date_visit2, - 'visit': origin_visit3['visit'], + 'visit': origin_visit3.visit, 'type': data.type_visit3, 'status': 'partial', 'metadata': None, @@ -1837,17 +1804,12 @@ assert ('origin_visit', data5) in objects def test_origin_visit_update_validation(self, swh_storage): - origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) + origin_url = swh_storage.origin_add_one(data.origin) visit = swh_storage.origin_visit_add( - origin_url, - date=data.date_visit2, - type=data.type_visit2, - ) - + origin_url, date=data.date_visit2, type=data.type_visit2) with pytest.raises(StorageArgumentException, match='status') as cm: swh_storage.origin_visit_update( - origin_url, visit['visit'], status='foobar') + origin_url, visit.visit, status='foobar') if type(cm.value) == psycopg2.DataError: assert cm.value.pgcode == \ @@ -1855,60 +1817,43 @@ def test_origin_visit_find_by_date(self, swh_storage): # given - swh_storage.origin_add_one(data.origin) - + origin_url = swh_storage.origin_add_one(data.origin) swh_storage.origin_visit_add( - data.origin['url'], - date=data.date_visit2, - type=data.type_visit1, - ) + origin_url, date=data.date_visit2, type=data.type_visit1) origin_visit2 = swh_storage.origin_visit_add( - data.origin['url'], - date=data.date_visit3, - type=data.type_visit2, - ) - + origin_url, date=data.date_visit3, type=data.type_visit2) origin_visit3 = swh_storage.origin_visit_add( - data.origin['url'], - date=data.date_visit2, - type=data.type_visit3, - ) + origin_url, date=data.date_visit2, type=data.type_visit3) # Simple case visit = swh_storage.origin_visit_find_by_date( - data.origin['url'], data.date_visit3) - assert visit['visit'] == origin_visit2['visit'] + origin_url, data.date_visit3) + assert visit['visit'] == origin_visit2.visit # There are two visits at the same date, the latest must be returned visit = swh_storage.origin_visit_find_by_date( - data.origin['url'], data.date_visit2) - assert visit['visit'] == origin_visit3['visit'] + origin_url, data.date_visit2) + assert visit['visit'] == origin_visit3.visit def test_origin_visit_find_by_date__unknown_origin(self, swh_storage): swh_storage.origin_visit_find_by_date('foo', data.date_visit2) def test_origin_visit_update_missing_snapshot(self, swh_storage): # given - swh_storage.origin_add_one(data.origin) - origin_url = data.origin['url'] - + origin_url = swh_storage.origin_add_one(data.origin) origin_visit = swh_storage.origin_visit_add( - origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) + origin_url, date=data.date_visit1, type=data.type_visit1) # when swh_storage.origin_visit_update( origin_url, - origin_visit['visit'], + origin_visit.visit, snapshot=data.snapshot['id']) # then actual_origin_visit = swh_storage.origin_visit_get_by( - origin_url, - origin_visit['visit']) + origin_url, origin_visit.visit) assert actual_origin_visit['snapshot'] == data.snapshot['id'] # when @@ -1916,35 +1861,22 @@ assert actual_origin_visit['snapshot'] == data.snapshot['id'] def test_origin_visit_get_by(self, swh_storage): - swh_storage.origin_add_one(data.origin) - swh_storage.origin_add_one(data.origin2) - - origin_url = data.origin['url'] - origin2_url = data.origin2['url'] - + origin_url = swh_storage.origin_add_one(data.origin) + origin_url2 = swh_storage.origin_add_one(data.origin2) origin_visit1 = swh_storage.origin_visit_add( - origin_url, - date=data.date_visit2, - type=data.type_visit2, - ) + origin_url, date=data.date_visit2, type=data.type_visit2) swh_storage.snapshot_add([data.snapshot]) swh_storage.origin_visit_update( origin_url, - origin_visit1['visit'], + origin_visit1.visit, snapshot=data.snapshot['id']) # Add some other {origin, visit} entries swh_storage.origin_visit_add( - origin_url, - date=data.date_visit3, - type=data.type_visit3, - ) + origin_url, date=data.date_visit3, type=data.type_visit3) swh_storage.origin_visit_add( - origin2_url, - date=data.date_visit3, - type=data.type_visit3, - ) + origin_url2, date=data.date_visit3, type=data.type_visit3) # when visit1_metadata = { @@ -1953,14 +1885,13 @@ } swh_storage.origin_visit_update( - origin_url, - origin_visit1['visit'], status='full', - metadata=visit1_metadata) + origin_url, origin_visit1.visit, + status='full', metadata=visit1_metadata) - expected_origin_visit = origin_visit1.copy() + expected_origin_visit = origin_visit1.to_dict() expected_origin_visit.update({ 'origin': origin_url, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'date': data.date_visit2, 'type': data.type_visit2, 'metadata': visit1_metadata, @@ -1971,7 +1902,7 @@ # when actual_origin_visit1 = swh_storage.origin_visit_get_by( origin_url, - origin_visit1['visit']) + origin_visit1.visit) # then assert actual_origin_visit1 == expected_origin_visit @@ -1981,8 +1912,7 @@ def test_origin_visit_upsert_new(self, swh_storage): # given - swh_storage.origin_add_one(data.origin2) - origin_url = data.origin2['url'] + origin_url = swh_storage.origin_add_one(data.origin2) # when swh_storage.origin_visit_upsert([ @@ -2055,19 +1985,16 @@ def test_origin_visit_upsert_existing(self, swh_storage): # given - swh_storage.origin_add_one(data.origin2) - origin_url = data.origin2['url'] + origin_url = swh_storage.origin_add_one(data.origin2) # when origin_visit1 = swh_storage.origin_visit_add( - origin_url, - date=data.date_visit2, - type=data.type_visit1, - ) + origin_url, date=data.date_visit2, type=data.type_visit1) + swh_storage.origin_visit_upsert([OriginVisit.from_dict({ 'origin': origin_url, 'date': data.date_visit2, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'full', 'metadata': None, @@ -2075,8 +2002,8 @@ })]) # then - assert origin_visit1['origin'] == origin_url - assert origin_visit1['visit'] is not None + assert origin_visit1.origin == origin_url + assert origin_visit1.visit is not None actual_origin_visits = list(swh_storage.origin_visit_get( origin_url)) @@ -2084,7 +2011,7 @@ { 'origin': origin_url, 'date': data.date_visit2, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'full', 'metadata': None, @@ -2094,7 +2021,7 @@ data1 = { 'origin': origin_url, 'date': data.date_visit2, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'ongoing', 'metadata': None, @@ -2103,7 +2030,7 @@ data2 = { 'origin': origin_url, 'date': data.date_visit2, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'full', 'metadata': None, @@ -2121,33 +2048,21 @@ assert actual_origin_visit is None def test_origin_visit_get_latest(self, swh_storage): - swh_storage.origin_add_one(data.origin) - origin_url = data.origin['url'] - origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit1_id = origin_visit1['visit'] - origin_visit2 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit2, - ) - visit2_id = origin_visit2['visit'] - + origin_url = swh_storage.origin_add_one(data.origin) + ov1 = swh_storage.origin_visit_add( + origin_url, date=data.date_visit1, type=data.type_visit1) + ov2 = swh_storage.origin_visit_add( + origin_url, date=data.date_visit2, type=data.type_visit2) # Add a visit with the same date as the previous one - origin_visit3 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit2, - ) - visit3_id = origin_visit3['visit'] - - origin_visit1 = swh_storage.origin_visit_get_by(origin_url, visit1_id) - origin_visit2 = swh_storage.origin_visit_get_by(origin_url, visit2_id) - origin_visit3 = swh_storage.origin_visit_get_by(origin_url, visit3_id) - + ov3 = swh_storage.origin_visit_add( + origin_url, date=data.date_visit2, type=data.type_visit2) + + origin_visit1 = swh_storage.origin_visit_get_by( + origin_url, ov1.visit) + origin_visit2 = swh_storage.origin_visit_get_by( + origin_url, ov2.visit) + origin_visit3 = swh_storage.origin_visit_get_by( + origin_url, ov3.visit) # Two visits, both with no snapshot assert origin_visit3 == swh_storage.origin_visit_get_latest(origin_url) assert swh_storage.origin_visit_get_latest( @@ -2157,9 +2072,9 @@ # visit1 and require_snapshot=False still returns visit2 swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( - origin_url, visit1_id, - snapshot=data.complete_snapshot['id']) - assert {**origin_visit1, 'snapshot': data.complete_snapshot['id']} \ + origin_url, ov1.visit, snapshot=data.complete_snapshot['id']) + assert {**origin_visit1, + 'snapshot': data.complete_snapshot['id']} \ == swh_storage.origin_visit_get_latest( origin_url, require_snapshot=True) @@ -2172,8 +2087,7 @@ # Mark the first visit as completed and check status filter again swh_storage.origin_visit_update( - origin_url, - visit1_id, status='full') + origin_url, ov1.visit, status='full') assert { **origin_visit1, 'snapshot': data.complete_snapshot['id'], @@ -2185,8 +2099,7 @@ # Add snapshot to visit2 and check that the new snapshot is returned swh_storage.snapshot_add([data.empty_snapshot]) swh_storage.origin_visit_update( - origin_url, visit2_id, - snapshot=data.empty_snapshot['id']) + origin_url, ov2.visit, snapshot=data.empty_snapshot['id']) assert {**origin_visit2, 'snapshot': data.empty_snapshot['id']} == \ swh_storage.origin_visit_get_latest( origin_url, require_snapshot=True) @@ -2203,7 +2116,7 @@ # Add snapshot to visit3 (same date as visit2) swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( - origin_url, visit3_id, snapshot=data.complete_snapshot['id']) + origin_url, ov3.visit, snapshot=data.complete_snapshot['id']) assert { **origin_visit1, 'snapshot': data.complete_snapshot['id'], @@ -2247,20 +2160,17 @@ assert revisions[0]['committer'] == revisions[1]['committer'] def test_snapshot_add_get_empty(self, swh_storage): - origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) + origin_url = swh_storage.origin_add_one(data.origin) origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit_id = origin_visit1['visit'] + origin_url, date=data.date_visit1, type=data.type_visit1) + visit_id = origin_visit1.visit actual_result = swh_storage.snapshot_add([data.empty_snapshot]) assert actual_result == {'snapshot:add': 1} swh_storage.origin_visit_update( - origin_url, visit_id, snapshot=data.empty_snapshot['id']) + origin_url, origin_visit1.visit, + snapshot=data.empty_snapshot['id']) by_id = swh_storage.snapshot_get(data.empty_snapshot['id']) assert by_id == {**data.empty_snapshot, 'next_branch': None} @@ -2271,7 +2181,7 @@ data1 = { 'origin': origin_url, 'date': data.date_visit1, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'ongoing', 'metadata': None, @@ -2280,7 +2190,7 @@ data2 = { 'origin': origin_url, 'date': data.date_visit1, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'ongoing', 'metadata': None, @@ -2294,17 +2204,15 @@ def test_snapshot_add_get_complete(self, swh_storage): origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) + origin_url = swh_storage.origin_add_one(data.origin) origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit_id = origin_visit1['visit'] + origin_url, date=data.date_visit1, type=data.type_visit1) + visit_id = origin_visit1.visit actual_result = swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( - origin_url, visit_id, snapshot=data.complete_snapshot['id']) + origin_url, origin_visit1.visit, + snapshot=data.complete_snapshot['id']) assert actual_result == {'snapshot:add': 1} by_id = swh_storage.snapshot_get(data.complete_snapshot['id']) @@ -2451,18 +2359,14 @@ assert snapshot == expected_snapshot def test_snapshot_add_get_filtered(self, swh_storage): - origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) + origin_url = swh_storage.origin_add_one(data.origin) origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit_id = origin_visit1['visit'] + origin_url, date=data.date_visit1, type=data.type_visit1) swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( - origin_url, visit_id, snapshot=data.complete_snapshot['id']) + origin_url, origin_visit1.visit, + snapshot=data.complete_snapshot['id']) snp_id = data.complete_snapshot['id'] branches = data.complete_snapshot['branches'] @@ -2570,18 +2474,14 @@ assert snapshot == expected_snapshot def test_snapshot_add_get(self, swh_storage): - origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) + origin_url = swh_storage.origin_add_one(data.origin) origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit_id = origin_visit1['visit'] + origin_url, date=data.date_visit1, type=data.type_visit1) + visit_id = origin_visit1.visit swh_storage.snapshot_add([data.snapshot]) swh_storage.origin_visit_update( - origin_url, visit_id, snapshot=data.snapshot['id']) + origin_url, origin_visit1.visit, snapshot=data.snapshot['id']) by_id = swh_storage.snapshot_get(data.snapshot['id']) assert by_id == {**data.snapshot, 'next_branch': None} @@ -2594,9 +2494,16 @@ assert origin_visit_info['snapshot'] == data.snapshot['id'] def test_snapshot_add_nonexistent_visit(self, swh_storage): - origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) - visit_id = 54164461156 + origin_url = swh_storage.origin_add_one(data.origin) + # unknown visit + visit = OriginVisit( + origin=origin_url, + visit=54164461156, + date=data.date_visit1, + status='ongoing', + type='git', + snapshot=None + ) swh_storage.journal_writer.journal.objects[:] = [] @@ -2604,38 +2511,31 @@ with pytest.raises(StorageArgumentException): swh_storage.origin_visit_update( - origin_url, visit_id, snapshot=data.snapshot['id']) + origin_url, visit.visit, snapshot=data.snapshot['id']) assert list(swh_storage.journal_writer.journal.objects) == [ ('snapshot', data.snapshot)] def test_snapshot_add_twice__by_origin_visit(self, swh_storage): - origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) + origin_url = swh_storage.origin_add_one(data.origin) origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit1_id = origin_visit1['visit'] + origin_url, date=data.date_visit1, type=data.type_visit1) + visit1_id = origin_visit1.visit swh_storage.snapshot_add([data.snapshot]) swh_storage.origin_visit_update( - origin_url, visit1_id, snapshot=data.snapshot['id']) + origin_url, origin_visit1.visit, snapshot=data.snapshot['id']) by_ov1 = swh_storage.snapshot_get_by_origin_visit( origin_url, visit1_id) assert by_ov1 == {**data.snapshot, 'next_branch': None} origin_visit2 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit2, - ) - visit2_id = origin_visit2['visit'] + origin_url, date=data.date_visit2, type=data.type_visit2) + visit2_id = origin_visit2.visit swh_storage.snapshot_add([data.snapshot]) swh_storage.origin_visit_update( - origin_url, visit2_id, snapshot=data.snapshot['id']) + origin_url, origin_visit2.visit, snapshot=data.snapshot['id']) by_ov2 = swh_storage.snapshot_get_by_origin_visit( origin_url, visit2_id) @@ -2644,7 +2544,7 @@ data1 = { 'origin': origin_url, 'date': data.date_visit1, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'ongoing', 'metadata': None, @@ -2653,7 +2553,7 @@ data2 = { 'origin': origin_url, 'date': data.date_visit1, - 'visit': origin_visit1['visit'], + 'visit': origin_visit1.visit, 'type': data.type_visit1, 'status': 'ongoing', 'metadata': None, @@ -2662,7 +2562,7 @@ data3 = { 'origin': origin_url, 'date': data.date_visit2, - 'visit': origin_visit2['visit'], + 'visit': origin_visit2.visit, 'type': data.type_visit2, 'status': 'ongoing', 'metadata': None, @@ -2671,7 +2571,7 @@ data4 = { 'origin': origin_url, 'date': data.date_visit2, - 'visit': origin_visit2['visit'], + 'visit': origin_visit2.visit, 'type': data.type_visit2, 'status': 'ongoing', 'metadata': None, @@ -2686,48 +2586,34 @@ ('origin_visit', data4)] def test_snapshot_get_latest(self, swh_storage): - origin_url = data.origin['url'] - swh_storage.origin_add_one(data.origin) - origin_url = data.origin['url'] + origin_url = swh_storage.origin_add_one(data.origin) origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit1_id = origin_visit1['visit'] + origin_url, date=data.date_visit1, type=data.type_visit1) origin_visit2 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit2, - ) - visit2_id = origin_visit2['visit'] - + origin_url, date=data.date_visit2, type=data.type_visit2) # Add a visit with the same date as the previous one origin_visit3 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit3, - ) - visit3_id = origin_visit3['visit'] - + origin_url, date=data.date_visit2, type=data.type_visit3) # Two visits, both with no snapshot: latest snapshot is None assert swh_storage.snapshot_get_latest(origin_url) is None # Add snapshot to visit1, latest snapshot = visit 1 snapshot swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( - origin_url, visit1_id, snapshot=data.complete_snapshot['id']) + origin_url, origin_visit1.visit, + snapshot=data.complete_snapshot['id']) assert {**data.complete_snapshot, 'next_branch': None} \ == swh_storage.snapshot_get_latest(origin_url) # Status filter: all three visits are status=ongoing, so no snapshot # returned assert swh_storage.snapshot_get_latest( - origin_url, - allowed_statuses=['full']) is None + origin_url, + allowed_statuses=['full']) is None # Mark the first visit as completed and check status filter again - swh_storage.origin_visit_update(origin_url, visit1_id, status='full') + swh_storage.origin_visit_update( + origin_url, origin_visit1.visit, status='full') assert {**data.complete_snapshot, 'next_branch': None} \ == swh_storage.snapshot_get_latest( origin_url, @@ -2736,7 +2622,8 @@ # Add snapshot to visit2 and check that the new snapshot is returned swh_storage.snapshot_add([data.empty_snapshot]) swh_storage.origin_visit_update( - origin_url, visit2_id, snapshot=data.empty_snapshot['id']) + origin_url, origin_visit2.visit, + snapshot=data.empty_snapshot['id']) assert {**data.empty_snapshot, 'next_branch': None} \ == swh_storage.snapshot_get_latest(origin_url) @@ -2750,28 +2637,18 @@ # the new snapshot is returned swh_storage.snapshot_add([data.complete_snapshot]) swh_storage.origin_visit_update( - origin_url, visit3_id, snapshot=data.complete_snapshot['id']) + origin_url, origin_visit3.visit, + snapshot=data.complete_snapshot['id']) assert {**data.complete_snapshot, 'next_branch': None} \ == swh_storage.snapshot_get_latest(origin_url) def test_snapshot_get_latest__missing_snapshot(self, swh_storage): - # Origin does not exist - origin_url = data.origin['url'] + origin_url = swh_storage.origin_add_one(data.origin) assert swh_storage.snapshot_get_latest(origin_url) is None - - swh_storage.origin_add_one(data.origin) origin_visit1 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - ) - visit1_id = origin_visit1['visit'] + origin_url, date=data.date_visit1, type=data.type_visit1) origin_visit2 = swh_storage.origin_visit_add( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit2, - ) - visit2_id = origin_visit2['visit'] + origin_url, date=data.date_visit2, type=data.type_visit2) # Two visits, both with no snapshot: latest snapshot is None assert swh_storage.snapshot_get_latest(origin_url) is None @@ -2779,8 +2656,8 @@ # Add unknown snapshot to visit1, check that the inconsistency is # detected swh_storage.origin_visit_update( - origin_url, - visit1_id, snapshot=data.complete_snapshot['id']) + origin_url, origin_visit1.visit, + snapshot=data.complete_snapshot['id']) with pytest.raises(Exception): # XXX: should the exception be more specific than this? swh_storage.snapshot_get_latest(origin_url) @@ -2788,13 +2665,11 @@ # Status filter: both visits are status=ongoing, so no snapshot # returned assert swh_storage.snapshot_get_latest( - origin_url, - allowed_statuses=['full']) is None + origin_url, allowed_statuses=['full']) is None # Mark the first visit as completed and check status filter again swh_storage.origin_visit_update( - origin_url, - visit1_id, status='full') + origin_url, origin_visit1.visit, status='full') with pytest.raises(Exception): # XXX: should the exception be more specific than this? swh_storage.snapshot_get_latest( @@ -2809,8 +2684,7 @@ # Add unknown snapshot to visit2 and check that the inconsistency # is detected swh_storage.origin_visit_update( - origin_url, - visit2_id, snapshot=data.snapshot['id']) + origin_url, origin_visit2.visit, snapshot=data.snapshot['id']) with pytest.raises(Exception): # XXX: should the exception be more specific than this? swh_storage.snapshot_get_latest( @@ -2866,16 +2740,13 @@ # Add other objects. Check their counter increased as well. - swh_storage.origin_add_one(data.origin2) + origin_url = swh_storage.origin_add_one(data.origin2) origin_visit1 = swh_storage.origin_visit_add( - origin=data.origin2['url'], - date=data.date_visit2, - type=data.type_visit2, - ) + origin_url, date=data.date_visit2, type=data.type_visit2) + swh_storage.snapshot_add([data.snapshot]) swh_storage.origin_visit_update( - data.origin2['url'], origin_visit1['visit'], - snapshot=data.snapshot['id']) + origin_url, origin_visit1.visit, snapshot=data.snapshot['id']) swh_storage.directory_add([data.dir]) swh_storage.revision_add([data.revision]) swh_storage.release_add([data.release]) @@ -3737,8 +3608,9 @@ swh_storage.origin_add([{'url': url} for url in self.ORIGINS]) now = datetime.datetime.now(tz=datetime.timezone.utc) + origin_url = 'https://github.com/user1/repo1' swh_storage.origin_visit_add( - origin='https://github.com/user1/repo1', date=now, type='git') + origin_url, date=now, type='git') assert swh_storage.origin_count('github', with_visit=False) == 3 # it has a visit, but no snapshot, so with_visit=True => 0 @@ -3761,11 +3633,11 @@ now = datetime.datetime.now(tz=datetime.timezone.utc) swh_storage.snapshot_add([data.snapshot]) + origin_url = 'https://github.com/user1/repo1' visit = swh_storage.origin_visit_add( - origin='https://github.com/user1/repo1', date=now, type='git') + origin_url, date=now, type='git') swh_storage.origin_visit_update( - origin='https://github.com/user1/repo1', visit_id=visit['visit'], - snapshot=data.snapshot['id']) + origin_url, visit.visit, snapshot=data.snapshot['id']) assert swh_storage.origin_count('github', with_visit=False) == 3 # github/user1 has a visit and a snapshot, so with_visit=True => 1 @@ -3784,12 +3656,12 @@ for (obj_type, obj) in objects: obj = obj.to_dict() if obj_type == 'origin_visit': - origin = obj.pop('origin') - swh_storage.origin_add_one({'url': origin}) + origin_url = obj.pop('origin') + swh_storage.origin_add_one({'url': origin_url}) if 'visit' in obj: del obj['visit'] swh_storage.origin_visit_add( - origin, obj['date'], obj['type']) + origin_url, obj['date'], obj['type']) else: if obj_type == 'content' and obj['status'] == 'absent': obj_type = 'skipped_content' diff --git a/swh/storage/validate.py b/swh/storage/validate.py --- a/swh/storage/validate.py +++ b/swh/storage/validate.py @@ -3,11 +3,12 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime import contextlib from typing import Dict, Iterable, List, Union from swh.model.model import ( - SkippedContent, Content, Directory, Revision, Release, Snapshot, + BaseModel, SkippedContent, Content, Directory, Revision, Release, Snapshot, OriginVisit, Origin ) @@ -81,9 +82,10 @@ return self.storage.snapshot_add(snapshots) def origin_visit_add( - self, origin, date, type) -> Dict[str, Union[str, int]]: + self, origin_url: str, + date: Union[datetime.datetime], type: str) -> Dict[str, BaseModel]: with convert_validation_exceptions(): - visit = OriginVisit(origin=origin, date=date, type=type, + visit = OriginVisit(origin=origin_url, date=date, type=type, status='ongoing', snapshot=None) return self.storage.origin_visit_add( visit.origin, visit.date, visit.type)