diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
     url: http://uffizi.internal.softwareheritage.org:5002
 provenance:
-    cls: ps
+    cls: direct
     db:
         dbname: provenance
         host: localhost
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
 # Add here internal Software Heritage dependencies, one per line.
-swh.core >= 0.12
+swh.core[db,api] >= 0.12
 swh.model
 swh.storage
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,3 @@
 # should match https://pypi.python.org/pypi names. For the full spec or
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
 methodtools
-pytz
diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py
--- a/swh/provenance/postgresql/archive.py
+++ b/swh/provenance/postgresql/archive.py
@@ -17,46 +17,46 @@
     @lru_cache(maxsize=1000000)
     def directory_ls_internal(self, id: bytes) -> List[Dict[str, Any]]:
         # TODO: add file size filtering
-        cursor = self.conn.cursor()
-        cursor.execute(
-            """WITH
-            dir AS (SELECT id AS dir_id, dir_entries, file_entries, rev_entries
-                    FROM directory WHERE id=%s),
-            ls_d AS (SELECT dir_id, UNNEST(dir_entries) AS entry_id FROM dir),
-            ls_f AS (SELECT dir_id, UNNEST(file_entries) AS entry_id FROM dir),
-            ls_r AS (SELECT dir_id, UNNEST(rev_entries) AS entry_id FROM dir)
-            (SELECT 'dir'::directory_entry_type AS type, e.target, e.name,
-                    NULL::sha1_git
-             FROM ls_d
-             LEFT JOIN directory_entry_dir e ON ls_d.entry_id=e.id)
-            UNION
-            (WITH known_contents AS
-                (SELECT 'file'::directory_entry_type AS type, e.target, e.name,
-                        c.sha1_git
-                 FROM ls_f
-                 LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id
-                 INNER JOIN content c ON e.target=c.sha1_git)
-             SELECT * FROM known_contents
-             UNION
-             (SELECT 'file'::directory_entry_type AS type, e.target, e.name,
-                     c.sha1_git
-              FROM ls_f
-              LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id
-              LEFT JOIN skipped_content c ON e.target=c.sha1_git
-              WHERE NOT EXISTS (
-                  SELECT 1 FROM known_contents
-                  WHERE known_contents.sha1_git=e.target
-              )
-             )
-            )
-            ORDER BY name
-            """,
-            (id,),
-        )
-        return [
-            {"type": row[0], "target": row[1], "name": row[2]}
-            for row in cursor.fetchall()
-        ]
+        with self.conn.cursor() as cursor:
+            cursor.execute(
+                """WITH
+                dir AS (SELECT id AS dir_id, dir_entries, file_entries, rev_entries
+                        FROM directory WHERE id=%s),
+                ls_d AS (SELECT dir_id, UNNEST(dir_entries) AS entry_id FROM dir),
+                ls_f AS (SELECT dir_id, UNNEST(file_entries) AS entry_id FROM dir),
+                ls_r AS (SELECT dir_id, UNNEST(rev_entries) AS entry_id FROM dir)
+                (SELECT 'dir'::directory_entry_type AS type, e.target, e.name,
+                        NULL::sha1_git
+                 FROM ls_d
+                 LEFT JOIN directory_entry_dir e ON ls_d.entry_id=e.id)
+                UNION
+                (WITH known_contents AS
+                    (SELECT 'file'::directory_entry_type AS type, e.target, e.name,
+                            c.sha1_git
+                     FROM ls_f
+                     LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id
+                     INNER JOIN content c ON e.target=c.sha1_git)
+                 SELECT * FROM known_contents
+                 UNION
+                 (SELECT 'file'::directory_entry_type AS type, e.target, e.name,
+                         c.sha1_git
+                  FROM ls_f
+                  LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id
+                  LEFT JOIN skipped_content c ON e.target=c.sha1_git
+                  WHERE NOT EXISTS (
+                      SELECT 1 FROM known_contents
+                      WHERE known_contents.sha1_git=e.target
+                  )
+                 )
+                )
+                ORDER BY name
+                """,
+                (id,),
+            )
+            return [
+                {"type": row[0], "target": row[1], "name": row[2]}
+                for row in cursor.fetchall()
+            ]

     def iter_origins(self):
         raise NotImplementedError
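
A note on the archive.py change above: in psycopg2 the cursor context manager only closes the cursor on exit; it neither commits nor rolls back the transaction that execute() opened on the connection. A minimal sketch of that behaviour (the DSN is a placeholder):

    import psycopg2

    conn = psycopg2.connect(dbname="softwareheritage")  # placeholder DSN
    with conn.cursor() as cursor:
        cursor.execute("SELECT 1")
        assert cursor.fetchone() == (1,)
    # the cursor is closed here, but the transaction opened by execute()
    # is still pending; it must be ended explicitly by the caller
    conn.rollback()

This is also why the archive_pg fixture introduced further down still rolls back the shared connection after each test.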
diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py
--- a/swh/provenance/postgresql/provenancedb_with_path.py
+++ b/swh/provenance/postgresql/provenancedb_with_path.py
@@ -1,6 +1,4 @@
 from datetime import datetime
-import itertools
-import operator
 import os
 from typing import Generator, Optional, Tuple
@@ -140,50 +138,52 @@
         )

     def insert_location(self, src0_table, src1_table, dst_table):
+        """Insert location entries in `dst_table` from the insert_cache
+
+        Also insert missing location entries in the 'location' table.
+        """
+        # TODO: find a better way of doing this; might be doable in a couple
+        # of SQL queries (one to insert missing entries in the 'location'
+        # table, one to insert entries in the dst_table)
+
         # Resolve src0 ids
-        src0_values = dict().fromkeys(
-            map(operator.itemgetter(0), self.insert_cache[dst_table])
-        )
-        values = ", ".join(itertools.repeat("%s", len(src0_values)))
+        src0_sha1s = tuple(set(sha1 for (sha1, _, _) in self.insert_cache[dst_table]))
+        fmt = ",".join(["%s"] * len(src0_sha1s))
         self.cursor.execute(
-            f"""SELECT sha1, id FROM {src0_table} WHERE sha1 IN ({values})""",
-            tuple(src0_values),
+            f"""SELECT sha1, id FROM {src0_table} WHERE sha1 IN ({fmt})""", src0_sha1s,
         )
         src0_values = dict(self.cursor.fetchall())

         # Resolve src1 ids
-        src1_values = dict().fromkeys(
-            map(operator.itemgetter(1), self.insert_cache[dst_table])
-        )
-        values = ", ".join(itertools.repeat("%s", len(src1_values)))
+        src1_sha1s = tuple(set(sha1 for (_, sha1, _) in self.insert_cache[dst_table]))
+        fmt = ",".join(["%s"] * len(src1_sha1s))
         self.cursor.execute(
-            f"""SELECT sha1, id FROM {src1_table} WHERE sha1 IN ({values})""",
-            tuple(src1_values),
+            f"""SELECT sha1, id FROM {src1_table} WHERE sha1 IN ({fmt})""", src1_sha1s,
         )
         src1_values = dict(self.cursor.fetchall())

-        # Resolve location ids
-        location = dict().fromkeys(
-            map(operator.itemgetter(2), self.insert_cache[dst_table])
-        )
-        location = dict(
-            psycopg2.extras.execute_values(
-                self.cursor,
-                """LOCK TABLE ONLY location;
-                INSERT INTO location(path) VALUES %s
-                ON CONFLICT (path) DO
-                UPDATE SET path=EXCLUDED.path
-                RETURNING path, id""",
-                map(lambda path: (path,), location.keys()),
-                fetch=True,
-            )
-        )
+        # insert missing locations
+        locations = tuple(set((loc,) for (_, _, loc) in self.insert_cache[dst_table]))
+        psycopg2.extras.execute_values(
+            self.cursor,
+            """
+            INSERT INTO location(path) VALUES %s
+            ON CONFLICT (path) DO NOTHING
+            """,
+            locations,
+        )
+        # fetch location ids
+        fmt = ",".join(["%s"] * len(locations))
+        self.cursor.execute(
+            f"SELECT path, id FROM location WHERE path IN ({fmt})", locations,
+        )
+        loc_ids = dict(self.cursor.fetchall())

         # Insert values in dst_table
-        rows = map(
-            lambda row: (src0_values[row[0]], src1_values[row[1]], location[row[2]]),
-            self.insert_cache[dst_table],
-        )
+        rows = [
+            (src0_values[sha1_src], src1_values[sha1_dst], loc_ids[loc])
+            for (sha1_src, sha1_dst, loc) in self.insert_cache[dst_table]
+        ]
         psycopg2.extras.execute_values(
             self.cursor,
             f"""INSERT INTO {dst_table} VALUES %s
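
The rewritten insert_location trades the table lock plus INSERT ... ON CONFLICT DO UPDATE ... RETURNING for a two-step pattern: first insert whatever paths are missing with DO NOTHING, then SELECT the ids of all paths back, whether they were just inserted or already present. A standalone sketch of the pattern, with a placeholder DSN and sample paths:

    import psycopg2
    import psycopg2.extras

    conn = psycopg2.connect(dbname="provenance")  # placeholder DSN
    paths = [b"Red/Blue", b"Paris"]  # sample location paths
    with conn.cursor() as cursor:
        # step 1: create any missing rows; no lock and no RETURNING needed
        psycopg2.extras.execute_values(
            cursor,
            "INSERT INTO location(path) VALUES %s ON CONFLICT (path) DO NOTHING",
            [(path,) for path in paths],
        )
        # step 2: resolve ids for new and pre-existing rows alike
        fmt = ",".join(["%s"] * len(paths))
        cursor.execute(f"SELECT path, id FROM location WHERE path IN ({fmt})", paths)
        loc_ids = dict(cursor.fetchall())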
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -1,6 +1,5 @@
-from datetime import datetime
+from datetime import datetime, timezone
 import os
-import pytz
 from typing import Dict, Generator, List, Optional, Tuple

 from typing_extensions import Protocol, runtime_checkable
@@ -10,6 +9,8 @@
 from .origin import OriginEntry
 from .revision import RevisionEntry

+UTCMIN = datetime.min.replace(tzinfo=timezone.utc)
+

 @runtime_checkable
 class ProvenanceInterface(Protocol):
@@ -284,13 +285,12 @@
                     # Recursively analyse directory nodes.
                     stack.append(child)
             else:
-                maxdates = []
-                for child in current.children:
-                    assert child.maxdate is not None
-                    maxdates.append(child.maxdate)
-                current.maxdate = (
-                    max(maxdates) if maxdates else datetime.min.replace(tzinfo=pytz.UTC)
-                )
+                maxdates = [
+                    child.maxdate
+                    for child in current.children
+                    if child.maxdate is not None  # mostly to please mypy
+                ]
+                current.maxdate = max(maxdates) if maxdates else UTCMIN
         else:
             # Directory node in the frontier, just use its known date.
             current.maxdate = current.date
@@ -327,9 +327,7 @@
                 )
                 provenance.directory_add_to_revision(revision, current.entry, path)
                 directory_process_content(
-                    provenance,
-                    directory=current.entry,
-                    relative=current.entry,
+                    provenance, directory=current.entry, relative=current.entry,
                 )
             else:
                 # No point moving the frontier here. Either there are no files or they
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -1,4 +1,4 @@
-from datetime import datetime
+from datetime import datetime, timezone
 from itertools import islice
 import threading
 from typing import Iterable, Iterator, Optional, Tuple
@@ -20,6 +20,7 @@
         self.archive = archive
         self.id = id
         self.date = date
+        assert self.date is None or self.date.tzinfo is not None
        self.parents = parents
         self.root = root
@@ -78,11 +79,11 @@
     def __next__(self):
         with self.mutex:
             id, date, root = next(self.revisions)
+            date = datetime.fromisoformat(date)
+            if date.tzinfo is None:
+                date = date.replace(tzinfo=timezone.utc)
             return RevisionEntry(
-                self.archive,
-                hash_to_bytes(id),
-                date=datetime.fromisoformat(date),
-                root=hash_to_bytes(root),
+                self.archive, hash_to_bytes(id), date=date, root=hash_to_bytes(root),
             )
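
Both hunks above enforce the same policy: every date handled by the provenance code must be timezone-aware, and naive dates coming from the CSV input are interpreted as UTC. The normalization in isolation (the date string is one of the CMDBTS commit timestamps):

    from datetime import datetime, timezone

    date = datetime.fromisoformat("2021-01-14T17:08:42")  # naive input
    if date.tzinfo is None:
        date = date.replace(tzinfo=timezone.utc)
    assert date.tzinfo is not None  # satisfies the new RevisionEntry assert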
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -8,8 +8,11 @@

 import pytest

+from swh.core.api.serializers import msgpack_loads
+from swh.core.db import BaseDb
 from swh.core.db.pytest_plugin import postgresql_fact
 from swh.core.utils import numfile_sortkey as sortkey
+from swh.model.model import Content, Directory, DirectoryEntry, Revision
 from swh.model.tests.swh_model_data import TEST_OBJECTS
 import swh.provenance
 from swh.provenance.postgresql.archive import ArchivePostgreSQL
@@ -34,6 +37,7 @@
         ProvenanceWithPathDB as ProvenanceDB,
     )

+    BaseDb.adapt_conn(provenance_db)
     return ProvenanceDB(provenance_db)
@@ -67,3 +71,113 @@
 @pytest.fixture
 def archive_api(swh_storage_with_objects):
     return ArchiveStorage(swh_storage_with_objects)
+
+
+@pytest.fixture
+def archive_pg(swh_storage_with_objects):
+    # this is a workaround to prevent tests from hanging because of an
+    # unclosed transaction.
+    # TODO: refactor ArchivePostgreSQL to properly deal with transactions
+    # and get rid of this fixture
+    archive = ArchivePostgreSQL(conn=swh_storage_with_objects.get_db().conn)
+    yield archive
+    archive.conn.rollback()
+
+
+@pytest.fixture
+def CMDBTS_data():
+    # imported git tree is https://github.com/grouss/CMDBTS rev 4c5551b496
+    # ([xxx] is the timestamp):
+    # o - [1609757158] first commit 35ccb8dd1b53d2d8a5c1375eb513ef2beaa79ae5
+    # |   `- README.md  *  43f3c871310a8e524004e91f033e7fb3b0bc8475
+    # o - [1610644094] Reset Empty repository 840b91df68e9549c156942ddd5002111efa15604
+    # |
+    # o - [1610644094] R0000 9e36e095b79e36a3da104ce272989b39cd68aefd
+    # |   `- Red/Blue/Green/a  *  6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
+    # o - [1610644097] R0001 bfbfcc72ae7fc35d6941386c36280512e6b38440
+    # |   |- Red/Blue/Green/a     6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
+    # |   `- Red/Blue/Green/b  *  9f6e04be05297905f1275d3f4e0bb0583458b2e8
+    # o - [1610644099] R0002 0a31c9d509783abfd08f9fdfcd3acae20f17dfd0
+    # |   |- Red/Blue/Green/a     6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
+    # |   |- Red/Blue/Green/b     9f6e04be05297905f1275d3f4e0bb0583458b2e8
+    # |   `- Red/Blue/c        *  a28fa70e725ebda781e772795ca080cd737b823c
+    # o - [1610644101] R0003 ca6ec564c69efd2e5c70fb05486fd3f794765a04
+    # |   |- Red/Green/a          6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
+    # |   |- Red/Green/b          9f6e04be05297905f1275d3f4e0bb0583458b2e8
+    # |   `- Red/a                6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
+    # o - [1610644103] R0004 fc6e10b7d41b1d56a94091134e3683ce91e80d91
+    # |   |- Red/Blue/Green/a     6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
+    # |   |- Red/Blue/Green/b     9f6e04be05297905f1275d3f4e0bb0583458b2e8
+    # |   `- Red/Blue/c           a28fa70e725ebda781e772795ca080cd737b823c
+    # o - [1610644105] R0005 1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17
+    # |   `- Purple/d          *  c0229d305adf3edf49f031269a70e3e87665fe88
+    # o - [1610644107] R0006 9a71f967ae1a125be9b6569cc4eccec0aecabb7c
+    # |   `- Purple/Brown/Purple/d  c0229d305adf3edf49f031269a70e3e87665fe88
+    # o - [1610644109] R0007 4fde4ea4494a630030a4bda99d03961d9add00c7
+    # |   |- Dark/Brown/Purple/d    c0229d305adf3edf49f031269a70e3e87665fe88
+    # |   `- Dark/d                 c0229d305adf3edf49f031269a70e3e87665fe88
+    # o - [1610644111] R0008 ba00e89d47dc820bb32c783af7123ffc6e58b56d
+    # |   |- Dark/Brown/Purple/d    c0229d305adf3edf49f031269a70e3e87665fe88
+    # |   |- Dark/Brown/Purple/e    c0229d305adf3edf49f031269a70e3e87665fe88
+    # |   `- Dark/a                 6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1
+    # o - [1610644113] R0009 55d4dc9471de6144f935daf3c38878155ca274d5
+    # |   |- Dark/Brown/Purple/f *  94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Dark/Brown/Purple/g    94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   `- Dark/f                 94ba40161084e8b80943accd9d24e1f9dd47189b
+    # o - [1610644116] R0010 a8939755d0be76cfea136e9e5ebce9bc51c49fef
+    # |   |- Dark/Brown/Purple/f    94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Dark/Brown/Purple/g    94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   `- Dark/h              *  5e8f9ceaee9dafae2e3210e254fdf170295f8b5b
+    # o - [1610644118] R0011 ca1774a07b6e02c1caa7ae678924efa9259ee7c6
+    # |   |- Paris/Brown/Purple/f   94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Paris/Brown/Purple/g   94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   `- Paris/i             *  bbd54b961764094b13f10cef733e3725d0a834c3
+    # o - [1610644120] R0012 611fe71d75b6ea151b06e3845c09777acc783d82
+    # |   |- Paris/Berlin/Purple/f  94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Paris/Berlin/Purple/g  94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   `- Paris/j             *  7ce4fe9a22f589fa1656a752ea371b0ebc2106b1
+    # o - [1610644122] R0013 4c5551b4969eb2160824494d40b8e1f6187fc01e
+    # |   |- Paris/Berlin/Purple/f  94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Paris/Berlin/Purple/g  94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Paris/Munich/Purple/f  94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Paris/Munich/Purple/g  94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Paris/Purple/f         94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   |- Paris/Purple/g         94ba40161084e8b80943accd9d24e1f9dd47189b
+    # |   `- Paris/k             *  cb79b39935c9392fa5193d9f84a6c35dc9c22c75
+    data = {"revision": [], "directory": [], "content": []}
+    with open(
+        path.join(path.dirname(__file__), "data", "CMDBTS.msgpack"), "rb"
+    ) as fobj:
+        for etype, value in msgpack_loads(fobj.read()):
+            data[etype].append(value)
+    return data
+
+
+def filter_dict(d, keys):
+    return {k: v for (k, v) in d.items() if k in keys}
+
+
+@pytest.fixture
+def storage_and_CMDBTS(swh_storage, CMDBTS_data):
+    swh_storage.content_add_metadata(
+        Content.from_dict(content) for content in CMDBTS_data["content"]
+    )
+    swh_storage.directory_add(
+        [
+            Directory(
+                entries=tuple(
+                    [
+                        DirectoryEntry.from_dict(
+                            filter_dict(entry, ("name", "type", "target", "perms"))
+                        )
+                        for entry in dir["entries"]
+                    ]
+                )
+            )
+            for dir in CMDBTS_data["directory"]
+        ]
+    )
+    swh_storage.revision_add(
+        Revision.from_dict(revision) for revision in CMDBTS_data["revision"]
+    )
+    return swh_storage, CMDBTS_data
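
For reference, the CMDBTS.msgpack file read by the CMDBTS_data fixture is a flat msgpack-encoded list of (object type, object dict) pairs, written with msgpack_dumps by the generator script below. A round-trip sketch with a toy entry (not real dataset content; msgpack may decode pairs as lists, hence the normalization):

    from swh.core.api.serializers import msgpack_dumps, msgpack_loads

    entries = [("content", {"sha1_git": b"\x00" * 20})]  # toy entry
    decoded = msgpack_loads(msgpack_dumps(entries))
    assert [(etype, value) for etype, value in decoded] == entries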
diff --git a/swh/provenance/tests/generate_CMDBTS_dataset.py b/swh/provenance/tests/generate_CMDBTS_dataset.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/generate_CMDBTS_dataset.py
@@ -0,0 +1,77 @@
+# Copyright (C) 2021  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict
+
+import click
+
+from swh.core.api.serializers import msgpack_dumps
+from swh.model.hashutil import hash_to_bytes as h2b
+from swh.provenance.tests.test_provenance_db import ts2dt
+from swh.storage import get_storage
+
+
+def dump_file(hash, storage, cache):
+    if hash not in cache:
+        content = storage.content_find({"sha1_git": hash})[0]
+        cache[hash] = content
+        yield "content", content.to_dict()
+
+
+def dump_directory(hash, storage, cache):
+    if hash not in cache:
+        dircontent = list(storage.directory_ls(hash))
+        cache[hash] = dircontent
+        yield "directory", {"id": hash, "entries": list(storage.directory_ls(hash))}
+        for direntry in dircontent:
+            if direntry["type"] == "dir":
+                yield from dump_directory(direntry["target"], storage, cache)
+            elif direntry["type"] == "file":
+                yield from dump_file(direntry["target"], storage, cache)
+            else:
+                raise ValueError(f"Unexpected directory entry type {direntry['type']}")
+
+
+def dump_git_revision(hash, storage, cache):
+    if hash not in cache:
+        rev = storage.revision_get([hash])[0]
+        revd = {
+            "id": rev.id,
+            "date": ts2dt(rev.date.to_dict()),
+            "parents": rev.parents,
+            "directory": rev.directory,
+        }
+        revd = rev.to_dict()
+        cache[hash] = revd
+        for parent in rev.parents:
+            yield from dump_git_revision(parent, storage, cache)
+        yield from dump_directory(rev.directory, storage, cache)
+    yield "revision", cache[hash]
+
+
+@click.command()
+@click.option(
+    "-r",
+    "--head",
+    default="4c5551b4969eb2160824494d40b8e1f6187fc01e",
+    help="head revision to start from",
+)
+@click.option("-o", "--output", default="data/CMDBTS.msgpack", help="output file")
+@click.argument("storage-url")
+def main(head, output, storage_url):
+    "simple tool to generate the CMDBTS.msgpack dataset file used in tests"
+    sto = get_storage(cls="remote", url=storage_url)
+
+    cache: Dict[bytes, dict] = {}
+    outd = []
+    for e in dump_git_revision(h2b(head), storage=sto, cache=cache):
+        outd.append(e)
+    with open(output, "wb") as outf:
+        outf.write(msgpack_dumps(outd))
+    click.echo(f"Wrote {len(outd)} objects in {output}")
+
+
+if __name__ == "__main__":
+    main()
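
The generator walks the revision graph parents-first; the shared cache dict guarantees each object is emitted at most once, and a revision is always emitted after its parents and its directory tree. It can also be driven from Python rather than through the click entry point; a sketch, with a placeholder storage URL:

    from swh.model.hashutil import hash_to_bytes as h2b
    from swh.provenance.tests.generate_CMDBTS_dataset import dump_git_revision
    from swh.storage import get_storage

    storage = get_storage(cls="remote", url="http://localhost:5002")  # placeholder
    head = h2b("4c5551b4969eb2160824494d40b8e1f6187fc01e")  # CMDBTS head revision
    objects = list(dump_git_revision(head, storage=storage, cache={}))
    # objects is a list of ("revision" | "directory" | "content", dict)
    # pairs, ready to be serialized with msgpack_dumps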
diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_provenance_db.py
@@ -0,0 +1,163 @@
+# Copyright (C) 2021  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+
+from swh.model.tests.swh_model_data import TEST_OBJECTS
+from swh.provenance.origin import OriginEntry
+from swh.provenance.provenance import origin_add, revision_add
+from swh.provenance.revision import RevisionEntry
+
+
+def ts2dt(ts: dict) -> datetime.datetime:
+    timestamp = datetime.datetime.fromtimestamp(
+        ts["timestamp"]["seconds"],
+        datetime.timezone(datetime.timedelta(minutes=ts["offset"])),
+    )
+    return timestamp.replace(microsecond=ts["timestamp"]["microseconds"])
+
+
+def test_provenance_origin_add(provenance, swh_storage_with_objects):
+    """Test the ProvenanceDB.origin_add() method"""
+    for origin in TEST_OBJECTS["origin"]:
+        entry = OriginEntry(url=origin.url, revisions=[])
+        origin_add(provenance, entry)
+    # TODO: check some facts here
+
+
+def test_provenance_add_revision(provenance, storage_and_CMDBTS, archive_pg):
+    storage, data = storage_and_CMDBTS
+    for i in range(2):
+        # do it twice, there should be no change in results
+        for revision in data["revision"]:
+            entry = RevisionEntry(
+                archive_pg,
+                id=revision["id"],
+                date=ts2dt(revision["date"]),
+                root=revision["directory"],
+                parents=revision["parents"],
+            )
+            revision_add(provenance, archive_pg, entry)
+
+        # there should be as many entries in 'revision' as revisions from the
+        # test dataset
+        provenance.cursor.execute("SELECT count(*) FROM revision")
+        assert provenance.cursor.fetchone()[0] == len(data["revision"])
+
+        # there should be no 'location' entry for the empty path
+        provenance.cursor.execute("SELECT count(*) FROM location WHERE path=''")
+        assert provenance.cursor.fetchone()[0] == 0
+
+        # there should be 32 'location' entries for non-empty paths
+        provenance.cursor.execute("SELECT count(*) FROM location WHERE path!=''")
+        assert provenance.cursor.fetchone()[0] == 32
+
+    # there should be as many entries in 'revision' as revisions from the
+    # test dataset
+    provenance.cursor.execute("SELECT count(*) FROM revision")
+    assert provenance.cursor.fetchone()[0] == len(data["revision"])
+
+    # 7 directories
+    provenance.cursor.execute("SELECT count(*) FROM directory")
+    assert provenance.cursor.fetchone()[0] == 7
+
+    # 12 D-R entries
+    provenance.cursor.execute("SELECT count(*) FROM directory_in_rev")
+    assert provenance.cursor.fetchone()[0] == 12
+
+    provenance.cursor.execute("SELECT count(*) FROM content")
+    assert provenance.cursor.fetchone()[0] == len(data["content"])
+    provenance.cursor.execute("SELECT count(*) FROM content_in_dir")
+    assert provenance.cursor.fetchone()[0] == 16
+    provenance.cursor.execute("SELECT count(*) FROM content_early_in_rev")
+    assert provenance.cursor.fetchone()[0] == 13
+
+
+def test_provenance_content_find_first(provenance, storage_and_CMDBTS, archive_pg):
+    storage, data = storage_and_CMDBTS
+    for revision in data["revision"]:
+        entry = RevisionEntry(
+            archive_pg,
+            id=revision["id"],
+            date=ts2dt(revision["date"]),
+            root=revision["directory"],
+            parents=revision["parents"],
+        )
+        revision_add(provenance, archive_pg, entry)
+
+    first_expected_content = [
+        {
+            "content": "43f3c871310a8e524004e91f033e7fb3b0bc8475",
+            "rev": "35ccb8dd1b53d2d8a5c1375eb513ef2beaa79ae5",
+            "date": 1609757158,
+            "path": "README.md",
+        },
+        {
+            "content": "6dc7e44ead5c0e300fe94448c3e046dfe33ad4d1",
+            "rev": "9e36e095b79e36a3da104ce272989b39cd68aefd",
+            "date": 1610644094,
+            "path": "Red/Blue/Green/a",
+        },
+        {
+            "content": "9f6e04be05297905f1275d3f4e0bb0583458b2e8",
+            "rev": "bfbfcc72ae7fc35d6941386c36280512e6b38440",
+            "date": 1610644097,
+            "path": "Red/Blue/Green/b",
+        },
+        {
+            "content": "a28fa70e725ebda781e772795ca080cd737b823c",
+            "rev": "0a31c9d509783abfd08f9fdfcd3acae20f17dfd0",
+            "date": 1610644099,
+            "path": "Red/Blue/c",
+        },
+        {
+            "content": "c0229d305adf3edf49f031269a70e3e87665fe88",
+            "rev": "1d1fcf1816a8a2a77f9b1f342ba11d0fe9fd7f17",
+            "date": 1610644105,
+            "path": "Purple/d",
+        },
+        {
+            "content": "94ba40161084e8b80943accd9d24e1f9dd47189b",
+            "rev": "55d4dc9471de6144f935daf3c38878155ca274d5",
+            "date": 1610644113,
+            "path": ("Dark/Brown/Purple/f", "Dark/Brown/Purple/g", "Dark/h"),  # XXX
+        },
+        {
+            "content": "5e8f9ceaee9dafae2e3210e254fdf170295f8b5b",
+            "rev": "a8939755d0be76cfea136e9e5ebce9bc51c49fef",
+            "date": 1610644116,
+            "path": "Dark/h",
+        },
+        {
+            "content": "bbd54b961764094b13f10cef733e3725d0a834c3",
+            "rev": "ca1774a07b6e02c1caa7ae678924efa9259ee7c6",
+            "date": 1610644118,
+            "path": "Paris/i",
+        },
+        {
+            "content": "7ce4fe9a22f589fa1656a752ea371b0ebc2106b1",
+            "rev": "611fe71d75b6ea151b06e3845c09777acc783d82",
+            "date": 1610644120,
+            "path": "Paris/j",
+        },
+        {
+            "content": "cb79b39935c9392fa5193d9f84a6c35dc9c22c75",
+            "rev": "4c5551b4969eb2160824494d40b8e1f6187fc01e",
+            "date": 1610644122,
+            "path": "Paris/k",
+        },
+    ]
+
+    for expected in first_expected_content:
+        contentid = bytes.fromhex(expected["content"])
+        (blob, rev, date, path) = provenance.content_find_first(contentid)
+        if isinstance(expected["path"], tuple):
+            assert bytes(path).decode() in expected["path"]
+        else:
+            assert bytes(path).decode() == expected["path"]
+        assert bytes(blob) == contentid
+        assert bytes(rev).hex() == expected["rev"]
+        assert int(date.timestamp()) == expected["date"]
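
ts2dt is deliberately minimal: it rebuilds an aware datetime from the {"timestamp": {"seconds", "microseconds"}, "offset"} layout of swh.model timestamps. A worked example, inlining the same computation on the first CMDBTS commit date (1609757158, offset 0):

    import datetime

    ts = {"timestamp": {"seconds": 1609757158, "microseconds": 0}, "offset": 0}
    dt = datetime.datetime.fromtimestamp(
        ts["timestamp"]["seconds"],
        datetime.timezone(datetime.timedelta(minutes=ts["offset"])),
    ).replace(microsecond=ts["timestamp"]["microseconds"])
    assert dt.tzinfo is not None
    assert int(dt.timestamp()) == 1609757158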
diff --git a/swh/provenance/tests/test_revision_iterator.py b/swh/provenance/tests/test_revision_iterator.py
--- a/swh/provenance/tests/test_revision_iterator.py
+++ b/swh/provenance/tests/test_revision_iterator.py
@@ -2,27 +2,18 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
-import datetime

-from swh.model.model import TimestampWithTimezone
-from swh.model.tests.swh_model_data import TEST_OBJECTS
 from swh.provenance.revision import CSVRevisionIterator
+from swh.provenance.tests.test_provenance_db import ts2dt


-def ts_to_dt(ts_with_tz: TimestampWithTimezone) -> datetime.datetime:
-    """converts a TimestampWithTimezone into a datetime"""
-    ts = ts_with_tz.timestamp
-    timestamp = datetime.datetime.fromtimestamp(ts.seconds, datetime.timezone.utc)
-    timestamp = timestamp.replace(microsecond=ts.microseconds)
-    return timestamp
-
-
-def test_archive_direct_revision_iterator(swh_storage_with_objects, archive_direct):
-    """Test FileOriginIterator"""
+def test_archive_direct_revision_iterator(storage_and_CMDBTS, archive_direct):
+    """Test CSVRevisionIterator"""
+    storage, data = storage_and_CMDBTS
     revisions_csv = [
-        (rev.id, ts_to_dt(rev.date).isoformat(), rev.directory)
-        for rev in TEST_OBJECTS["revision"]
+        (rev["id"], ts2dt(rev["date"]).isoformat(), rev["directory"])
+        for rev in data["revision"]
     ]
     revisions = list(CSVRevisionIterator(revisions_csv, archive_direct))
     assert revisions
-    assert len(revisions) == len(TEST_OBJECTS["revision"])
+    assert len(revisions) == len(data["revision"])
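
For clarity, the shape of one revisions_csv row consumed by CSVRevisionIterator in the test above. The ids go through hash_to_bytes, and a naive ISO date is treated as UTC per the revision.py hunk earlier in this diff; the directory id below is a dummy placeholder, not a real CMDBTS hash:

    row = (
        "4c5551b4969eb2160824494d40b8e1f6187fc01e",  # revision id (R0013)
        "2021-01-14T17:08:42",  # ISO date; naive, so normalized to UTC
        "00" * 20,  # root directory id (dummy value)
    )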