diff --git a/sql/swh-func.sql b/sql/swh-func.sql --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -100,7 +100,7 @@ as $$ create temporary table tmp_occurrence_history( like occurrence_history including defaults, - date timestamptz not null + visit bigint not null ) on commit drop; alter table tmp_occurrence_history drop column visits, @@ -832,7 +832,22 @@ end; $$; - +-- add a new origin_visit for origin origin_id at date. +-- +-- Returns the new visit id. +create or replace function swh_origin_visit_add(origin_id bigint, date timestamptz) + returns bigint + language sql +as $$ + with last_known_visit as ( + select coalesce(max(visit), 0) as visit + from origin_visit + where origin = origin_id + ) + insert into origin_visit (origin, date, visit, status) + values (origin_id, date, (select visit from last_known_visit) + 1, 'ongoing') + returning visit; +$$; -- add tmp_occurrence_history entries to occurrence_history -- @@ -845,30 +860,10 @@ declare origin_id origin.id%type; begin - -- Create new visits - with current_visits as ( - select distinct origin, date from tmp_occurrence_history - ), - new_visits as ( - select origin, date, (select coalesce(max(visit), 0) - from origin_visit ov - where ov.origin = cv.origin) as max_visit - from current_visits cv - where not exists (select 1 from origin_visit ov - where ov.origin = cv.origin and - ov.date = cv.date) - ) - insert into origin_visit (origin, date, visit) - select origin, date, max_visit + row_number() over - (partition by origin - order by origin, date) - from new_visits; - -- Create or update occurrence_history with occurrence_history_id_visit as ( - select tmp_occurrence_history.*, object_id, visits, visit from tmp_occurrence_history + select tmp_occurrence_history.*, object_id, visits from tmp_occurrence_history left join occurrence_history using(origin, branch, target, target_type) - left join origin_visit using(origin, date) ), occurrences_to_update as ( select object_id, visit from 
occurrence_history_id_visit where object_id is not null @@ -987,7 +982,7 @@ language sql stable as $$ - select origin, visit, date + select origin, visit, date, status from origin_visit where origin=origin order by date desc diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(74, now(), 'Work In Progress'); + values(76, now(), 'Work In Progress'); -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); @@ -452,15 +452,29 @@ create index on revision_history(parent_id); +create type origin_visit_status as enum ( + 'ongoing', + 'full', + 'partial' +); + +comment on type origin_visit_status IS 'Possible visit status'; + -- The timestamps at which Software Heritage has made a visit of the given origin. create table origin_visit ( origin bigint not null references origin(id), visit bigint not null, date timestamptz not null, + status origin_visit_status not null, primary key (origin, visit) ); +comment on column origin_visit.origin is 'Visited origin'; +comment on column origin_visit.visit is 'Visit number the visit occurred for that origin'; +comment on column origin_visit.date is 'Visit date for that origin'; +comment on column origin_visit.status is 'Visit status for that origin'; + create index on origin_visit(date); -- Asynchronous notification of new origin visits diff --git a/sql/upgrades/075.sql b/sql/upgrades/075.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/075.sql @@ -0,0 +1,47 @@ +-- SWH DB schema upgrade +-- from_version: 74 +-- to_version: 75 +-- description: Add completion information to origin_visit + +INSERT INTO dbversion(version, release, description) + VALUES(75, now(), 'Work In Progress'); + +CREATE TYPE origin_visit_status AS ENUM ( + 'ongoing', + 'full', + 'partial' +); + +COMMENT ON TYPE origin_visit_status IS 'Possible visit status'; + +ALTER 
TABLE origin_visit + ADD COLUMN status origin_visit_status; + +-- Already visited origins are considered full (deliberately updates every row) +UPDATE origin_visit SET status = 'full'; + +-- providing a status for visits is mandatory +ALTER TABLE origin_visit + ALTER COLUMN status SET NOT NULL; + +comment on column origin_visit.origin is 'Visited origin'; +comment on column origin_visit.visit is 'Visit number the visit occurred for that origin'; +comment on column origin_visit.date is 'Visit date for that origin'; +comment on column origin_visit.status is 'Visit status for that origin'; + +-- add a new origin_visit for origin origin_id at date. +-- +-- Returns the new visit id. +create or replace function swh_origin_visit_add(origin_id bigint, date timestamptz) + returns bigint + language sql +as $$ + with last_known_visit as ( + select coalesce(max(visit), 0) as visit + from origin_visit + where origin = origin_id + ) + insert into origin_visit (origin, date, visit, status) + values (origin_id, date, (select visit from last_known_visit) + 1, 'ongoing') + returning visit; +$$; diff --git a/sql/upgrades/076.sql b/sql/upgrades/076.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/076.sql @@ -0,0 +1,61 @@ +-- SWH DB schema upgrade +-- from_version: 75 +-- to_version: 76 +-- description: Use visit id instead of date in tmp_occurrence_history + +INSERT INTO dbversion(version, release, description) + VALUES(76, now(), 'Work In Progress'); + + +create or replace function swh_mktemp_occurrence_history() + returns void + language sql +as $$ + create temporary table tmp_occurrence_history( + like occurrence_history including defaults, + visit bigint not null + ) on commit drop; + alter table tmp_occurrence_history + drop column visits, + drop column object_id; +$$; + + +create or replace function swh_occurrence_history_add() + returns void + language plpgsql +as $$ +declare + origin_id origin.id%type; +begin + -- Create or update occurrence_history + with occurrence_history_id_visit as ( + select 
tmp_occurrence_history.*, object_id, visits from tmp_occurrence_history + left join occurrence_history using(origin, branch, target, target_type) + ), + occurrences_to_update as ( + select object_id, visit from occurrence_history_id_visit where object_id is not null + ), + update_occurrences as ( + update occurrence_history + set visits = array(select unnest(occurrence_history.visits) as e + union + select occurrences_to_update.visit as e + order by e) + from occurrences_to_update + where occurrence_history.object_id = occurrences_to_update.object_id + ) + insert into occurrence_history (origin, branch, target, target_type, visits) + select origin, branch, target, target_type, ARRAY[visit] + from occurrence_history_id_visit + where object_id is null; + + -- update occurrence + for origin_id in + select distinct origin from tmp_occurrence_history + loop + perform swh_occurrence_update_for_origin(origin_id); + end loop; + return; +end +$$; diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -151,6 +151,14 @@ def origin_add_one(self, origin): return self.post('origin/add', {'origin': origin}) + def origin_visit_add(self, origin, ts): + return self.post('origin/visit/add', {'origin': origin, 'ts': ts}) + + def origin_visit_update(self, origin, visit_id, status): + return self.post('origin/visit/update', {'origin': origin, + 'visit_id': visit_id, + 'status': status}) + def origin_visit_get(self, origin): return self.post('origin/visit/get', {'origin': origin}) diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py --- a/swh/storage/api/server.py +++ b/swh/storage/api/server.py @@ -186,6 +186,17 @@ return encode_data(g.storage.origin_visit_get(**decode_request(request))) +@app.route('/origin/visit/add', methods=['POST']) +def origin_visit_add(): + return encode_data(g.storage.origin_visit_add(**decode_request(request))) + + +@app.route('/origin/visit/update', methods=['POST']) 
+def origin_visit_update(): + return encode_data(g.storage.origin_visit_update( + **decode_request(request))) + + @app.route('/person', methods=['POST']) def person_get(): return encode_data(g.storage.person_get(**decode_request(request))) diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -344,12 +344,37 @@ revision_get_cols = revision_add_cols + [ 'author_id', 'committer_id', 'parents'] + def origin_visit_add(self, origin, ts, cur=None): + """Add a new origin_visit for origin origin at timestamp ts with + status 'ongoing'. + + Args: + origin: origin concerned by the visit + ts: the date of the visit + + Returns: + The new visit id for that origin + + """ + cur = self._cursor(cur) + cur.execute('SELECT swh_origin_visit_add(%s, %s)', + (origin, ts)) + return cur.fetchone()[0] + + def origin_visit_update(self, origin, visit_id, status, cur=None): + """Update the status of visit visit_id of origin origin.""" + cur = self._cursor(cur) + update = """UPDATE origin_visit + SET status=%s + WHERE origin=%s AND visit=%s""" + cur.execute(update, (status, origin, visit_id)) + origin_visit_get_cols = [ - 'origin', 'visit', 'date' + 'origin', 'visit', 'date', 'status' ] def origin_visit_get(self, origin_id, cur=None): - """Retrieve occurrence's history information by origin_id. + """Retrieve all visits for origin with id origin_id. 
Args: origin_id: The occurrence's origin @@ -361,7 +386,10 @@ cur = self._cursor(cur) cur.execute( - 'SELECT origin, visit, date FROM origin_visit where origin=%s', + """SELECT origin, visit, date, status + FROM origin_visit + WHERE origin=%s + ORDER BY visit""", (origin_id, )) yield from cursor_to_bytes(cur) diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -701,20 +701,13 @@ the occurrence - target_type (str): the type of object pointed to by the occurrence - - date (datetime.DateTime): the validity date for the given - occurrence + - visit (int): the origin_visit id the occurrence belongs to """ db = self.db - processed = [] - for occurrence in occurrences: - if isinstance(occurrence['date'], str): - occurrence['date'] = dateutil.parser.parse(occurrence['date']) - processed.append(occurrence) - db.mktemp_occurrence_history(cur) - db.copy_to(processed, 'tmp_occurrence_history', - ['origin', 'branch', 'target', 'target_type', 'date'], cur) + db.copy_to(occurrences, 'tmp_occurrence_history', + ['origin', 'branch', 'target', 'target_type', 'visit'], cur) db.occurrence_history_add_from_temp(cur) @@ -738,6 +731,43 @@ 'target_type': line[3], } + @db_transaction + def origin_visit_add(self, origin, ts, cur=None): + """Add an origin_visit for the origin at ts with status 'ongoing'. + + Args: + origin: Visited Origin id + ts: timestamp of such visit + + Returns: + Dict with keys origin and visit where: + - origin: origin identifier + - visit: the visit identifier for the new visit occurrence + + """ + if isinstance(ts, str): + ts = dateutil.parser.parse(ts) + + return { + 'origin': origin, + 'visit': self.db.origin_visit_add(origin, ts, cur) + } + + @db_transaction + def origin_visit_update(self, origin, visit_id, status, cur=None): + """Update an origin_visit's status. 
+ + Args: + origin: Visited Origin id + visit_id: Visit's id + status: Visit's new status + + Returns: + None + + """ + return self.db.origin_visit_update(origin, visit_id, status, cur) + @db_transaction_generator def origin_visit_get(self, origin, cur=None): """Retrieve origin's visit dates. diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -296,28 +296,31 @@ 'type': 'git', } + self.date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, + tzinfo=datetime.timezone.utc) + self.occurrence = { 'branch': b'master', 'target': b'67890123456789012345', 'target_type': 'revision', - 'date': datetime.datetime(2015, 1, 1, 23, 0, 0, - tzinfo=datetime.timezone.utc), } + self.date_visit2 = datetime.datetime(2015, 1, 1, 23, 0, 0, + tzinfo=datetime.timezone.utc) + self.occurrence2 = { 'branch': b'master', 'target': self.revision2['id'], 'target_type': 'revision', - 'date': datetime.datetime(2015, 1, 1, 23, 0, 0, - tzinfo=datetime.timezone.utc), } + self.date_visit3 = datetime.datetime(2015, 1, 1, 23, 0, 0, + tzinfo=datetime.timezone.utc) + # template occurrence to be filled in test (cf. 
revision_log_by) self.occurrence3 = { 'branch': b'master', 'target_type': 'revision', - 'date': datetime.datetime(2015, 1, 1, 23, 0, 0, - tzinfo=datetime.timezone.utc), } self.release = { @@ -764,9 +767,13 @@ # occurrence3 targets 'revision4' # with branch 'master' and origin origin_id occurrence3 = self.occurrence3.copy() + date_visit1 = self.date_visit3 + origin_visit1 = self.storage.origin_visit_add(origin_id, + date_visit1) occurrence3.update({ 'origin': origin_id, 'target': self.revision4['id'], + 'visit': origin_visit1['visit'], }) self.storage.occurrence_add([occurrence3]) @@ -776,7 +783,7 @@ actual_results = list(self.storage.revision_log_by( origin_id, branch_name=occurrence3['branch'], - timestamp=occurrence3['date'])) + timestamp=date_visit1)) # hack: ids generated for actual_result in actual_results: @@ -880,7 +887,12 @@ # occurrence2 points to 'revision2' with branch 'master', we # need to point to the right origin occurrence2 = self.occurrence2.copy() - occurrence2.update({'origin': origin_id}) + date_visit1 = self.date_visit2 + origin_visit1 = self.storage.origin_visit_add(origin_id, date_visit1) + occurrence2.update({ + 'origin': origin_id, + 'visit': origin_visit1['visit'], + }) self.storage.occurrence_add([occurrence2]) # we want only revision 2 @@ -917,17 +929,23 @@ # occurrence2 points to 'revision2' with branch 'master', we # need to point to the right origin + date_visit1 = self.date_visit2 + origin_visit1 = self.storage.origin_visit_add(origin_id, date_visit1) occurrence2 = self.occurrence2.copy() - occurrence2.update({'origin': origin_id, - 'date': occurrence2['date']}) + occurrence2.update({ + 'origin': origin_id, + 'visit': origin_visit1['visit'] + }) dt = datetime.timedelta(days=1) - + date_visit2 = date_visit1 + dt + origin_visit2 = self.storage.origin_visit_add(origin_id, date_visit2) occurrence3 = self.occurrence2.copy() - occurrence3.update({'origin': origin_id, - 'date': occurrence3['date'] + dt, - 'target': self.revision3['id']}) - 
+ occurrence3.update({ + 'origin': origin_id, + 'visit': origin_visit2['visit'], + 'target': self.revision3['id'], + }) # 2 occurrences on same revision with lower validity date with 1h delta self.storage.occurrence_add([occurrence2]) self.storage.occurrence_add([occurrence3]) @@ -936,7 +954,7 @@ actual_results0 = list(self.storage.revision_get_by( origin_id, occurrence2['branch'], - occurrence2['date'])) + date_visit1)) # hack: ids are generated del actual_results0[0]['author']['id'] @@ -949,7 +967,7 @@ actual_results1 = list(self.storage.revision_get_by( origin_id, occurrence2['branch'], - occurrence2['date'] + dt/3)) # closer to occurrence2 + date_visit1 + dt/3)) # closer to first visit # hack: ids are generated del actual_results1[0]['author']['id'] @@ -962,7 +980,7 @@ actual_results2 = list(self.storage.revision_get_by( origin_id, occurrence2['branch'], - occurrence2['date'] + 2*dt/3)) # closer to occurrence3 + date_visit1 + 2*dt/3)) # closer to second visit del actual_results2[0]['author']['id'] del actual_results2[0]['committer']['id'] @@ -974,7 +992,7 @@ actual_results3 = list(self.storage.revision_get_by( origin_id, occurrence3['branch'], - occurrence3['date'])) + date_visit2)) # hack: ids are generated del actual_results3[0]['author']['id'] @@ -1035,8 +1053,14 @@ # occurrence2 points to 'revision2' with branch 'master', we # need to point to the right origin + origin_visit = self.storage.origin_visit_add(origin_id, + self.date_visit2) occurrence2 = self.occurrence2.copy() - occurrence2.update({'origin': origin_id}) + occurrence2.update({ + 'origin': origin_id, + 'visit': origin_visit['visit'], + }) + self.storage.occurrence_add([occurrence2]) # we want only revision 2 @@ -1085,69 +1109,97 @@ 'project': None}) @istest - def origin_visit_get(self): - # 1- given + def origin_visit_add(self): + # given self.assertIsNone(self.storage.origin_get(self.origin2)) - self.storage.content_add([self.cont2]) - self.storage.directory_add([self.dir2]) - 
self.storage.revision_add([self.revision2, self.revision3]) origin_id = self.storage.origin_add_one(self.origin2) + self.assertIsNotNone(origin_id) - # occurrence2 points to 'revision2' with branch 'master', we - # need to point to the right origin - occurrence2 = self.occurrence2.copy() - occurrence2.update({'origin': origin_id, - 'date': occurrence2['date']}) - - dt = datetime.timedelta(days=1) - - occurrence3 = self.occurrence2.copy() - occurrence3.update({'origin': origin_id, - 'date': occurrence3['date'] + dt, - 'target': self.revision3['id']}) + # when + origin_visit1 = self.storage.origin_visit_add( + origin_id, + ts=self.date_visit2) - # 2 occurrences on same revision with lower validity date with 1h delta - self.storage.occurrence_add([occurrence2]) + # then + self.assertEquals(origin_visit1['origin'], origin_id) + self.assertIsNotNone(origin_visit1['visit']) + self.assertTrue(origin_visit1['visit'] > 0) - # when actual_origin_visits = list(self.storage.origin_visit_get(origin_id)) - self.assertEquals(len(actual_origin_visits), 1) self.assertEquals(actual_origin_visits, [{ 'origin': origin_id, - 'date': occurrence2['date'], - 'visit': 1 + 'date': self.date_visit2, + 'visit': origin_visit1['visit'], + 'status': 'ongoing', }]) - # 2- given - self.storage.occurrence_add([occurrence3]) + @istest + def origin_visit_update(self): + # given + origin_id = self.storage.origin_add_one(self.origin2) + origin_id2 = self.storage.origin_add_one(self.origin) + + origin_visit1 = self.storage.origin_visit_add( + origin_id, + ts=self.date_visit2) + + origin_visit2 = self.storage.origin_visit_add( + origin_id, + ts=self.date_visit3) + + origin_visit3 = self.storage.origin_visit_add( + origin_id2, + ts=self.date_visit3) # when + self.storage.origin_visit_update(origin_id, origin_visit1['visit'], + status='full') + self.storage.origin_visit_update(origin_id2, origin_visit3['visit'], + status='partial') + + # then actual_origin_visits = 
list(self.storage.origin_visit_get(origin_id)) - self.assertEquals(len(actual_origin_visits), 2) self.assertEquals(actual_origin_visits, [{ - 'origin': origin_id, - 'date': occurrence2['date'], - 'visit': 1 - }, { - 'origin': origin_id, - 'date': occurrence3['date'], - 'visit': 2 + 'origin': origin_visit2['origin'], + 'date': self.date_visit2, + 'visit': origin_visit1['visit'], + 'status': 'full' + }, + { + 'origin': origin_visit2['origin'], + 'date': self.date_visit3, + 'visit': origin_visit2['visit'], + 'status': 'ongoing' + }]) + + actual_origin_visits2 = list(self.storage.origin_visit_get(origin_id2)) + self.assertEquals(actual_origin_visits2, + [{ + 'origin': origin_visit3['origin'], + 'date': self.date_visit3, + 'visit': origin_visit3['visit'], + 'status': 'partial' }]) @istest def occurrence_add(self): + occur = self.occurrence.copy() + origin_id = self.storage.origin_add_one(self.origin2) + date_visit1 = self.date_visit1 + origin_visit1 = self.storage.origin_visit_add(origin_id, date_visit1) revision = self.revision.copy() - revision['id'] = self.occurrence['target'] + revision['id'] = occur['target'] self.storage.revision_add([revision]) - occur = self.occurrence - occur['origin'] = origin_id - self.storage.occurrence_add([occur]) + occur.update({ + 'origin': origin_id, + 'visit': origin_visit1['visit'], + }) self.storage.occurrence_add([occur]) test_query = ''' @@ -1167,11 +1219,16 @@ (ret[0][0], ret[0][1].tobytes(), ret[0][2].tobytes(), ret[0][3], ret[0][4]), (occur['origin'], occur['branch'], occur['target'], - occur['target_type'], occur['date'])) + occur['target_type'], self.date_visit1)) - orig_date = occur['date'] - occur['date'] += datetime.timedelta(hours=10) - self.storage.occurrence_add([occur]) + date_visit2 = date_visit1 + datetime.timedelta(hours=10) + + origin_visit2 = self.storage.origin_visit_add(origin_id, date_visit2) + occur2 = occur.copy() + occur2.update({ + 'visit': origin_visit2['visit'], + }) + self.storage.occurrence_add([occur2]) 
self.cursor.execute(test_query) ret = self.cursor.fetchall() @@ -1180,24 +1237,29 @@ (ret[0][0], ret[0][1].tobytes(), ret[0][2].tobytes(), ret[0][3], ret[0][4]), (occur['origin'], occur['branch'], occur['target'], - occur['target_type'], orig_date)) + occur['target_type'], date_visit1)) self.assertEqual( (ret[1][0], ret[1][1].tobytes(), ret[1][2].tobytes(), ret[1][3], ret[1][4]), - (occur['origin'], occur['branch'], occur['target'], - occur['target_type'], occur['date'])) + (occur2['origin'], occur2['branch'], occur2['target'], + occur2['target_type'], date_visit2)) @istest def occurrence_get(self): # given + occur = self.occurrence.copy() origin_id = self.storage.origin_add_one(self.origin2) + origin_visit1 = self.storage.origin_visit_add(origin_id, + self.date_visit1) revision = self.revision.copy() - revision['id'] = self.occurrence['target'] + revision['id'] = occur['target'] self.storage.revision_add([revision]) - occur = self.occurrence - occur['origin'] = origin_id + occur.update({ + 'origin': origin_id, + 'visit': origin_visit1['visit'], + }) self.storage.occurrence_add([occur]) self.storage.occurrence_add([occur]) @@ -1205,11 +1267,12 @@ actual_occurrence = list(self.storage.occurrence_get(origin_id)) # then - expected_occur = occur.copy() - del expected_occur['date'] - + expected_occurrence = self.occurrence.copy() + expected_occurrence.update({ + 'origin': origin_id + }) self.assertEquals(len(actual_occurrence), 1) - self.assertEquals(actual_occurrence[0], expected_occur) + self.assertEquals(actual_occurrence[0], expected_occurrence) @istest def content_find_occurrence_with_present_content(self): @@ -1219,8 +1282,15 @@ self.storage.directory_add([self.dir2]) # point to self.cont self.storage.revision_add([self.revision2]) # points to self.dir origin_id = self.storage.origin_add_one(self.origin2) - occurrence = self.occurrence2 - occurrence.update({'origin': origin_id}) + + occurrence = self.occurrence2.copy() + origin_visit1 = 
self.storage.origin_visit_add(origin_id, + self.date_visit2) + occurrence.update({ + 'origin': origin_id, + 'visit': origin_visit1['visit'], + }) + self.storage.occurrence_add([occurrence]) # when