diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py index daaf1c5..150b593 100644 --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -1,263 +1,263 @@ from datetime import datetime import itertools import logging from typing import Any, Dict, Generator, List, Mapping, Optional, Set, Tuple import psycopg2 import psycopg2.extras from swh.model.model import Sha1Git from ..provenance import ProvenanceResult class ProvenanceDBBase: def __init__(self, conn: psycopg2.extensions.connection): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) conn.set_session(autocommit=True) self.conn = conn - self.cursor = self.conn.cursor() + self.cursor = self.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) # XXX: not sure this is the best place to do it! self.cursor.execute("SET timezone TO 'UTC'") self._flavor: Optional[str] = None @property def flavor(self) -> str: if self._flavor is None: - self.cursor.execute("select swh_get_dbflavor()") - self._flavor = self.cursor.fetchone()[0] + self.cursor.execute("SELECT swh_get_dbflavor() AS flavor") + self._flavor = self.cursor.fetchone()["flavor"] assert self._flavor is not None return self._flavor @property def with_path(self) -> bool: return self.flavor == "with-path" def commit(self, data: Mapping[str, Any], raise_on_commit: bool = False) -> bool: try: # First insert entities for entity in ("content", "directory", "revision"): self.insert_entity( entity, { sha1: data[entity]["data"][sha1] for sha1 in data[entity]["added"] }, ) data[entity]["data"].clear() data[entity]["added"].clear() # Relations should come after ids for entities were resolved for relation in ( "content_in_revision", "content_in_directory", "directory_in_revision", ): self.insert_relation(relation, data[relation]) # Insert origins self.insert_origin( { sha1: data["origin"]["data"][sha1] for sha1 in data["origin"]["added"] }, ) data["origin"]["data"].clear() data["origin"]["added"].clear() # Insert relations from the origin-revision layer self.insert_revision_history(data["revision_before_revision"]) self.insert_origin_head(data["revision_in_origin"]) # Update preferred origins self.update_preferred_origin( { sha1: data["revision_origin"]["data"][sha1] for sha1 in data["revision_origin"]["added"] } ) data["revision_origin"]["data"].clear() data["revision_origin"]["added"].clear() return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message logging.exception("Unexpected error") if raise_on_commit: raise return False def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: ... def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: ... def get_dates(self, entity: str, ids: List[Sha1Git]) -> Dict[Sha1Git, datetime]: - dates = {} + dates: Dict[Sha1Git, datetime] = {} if ids: values = ", ".join(itertools.repeat("%s", len(ids))) self.cursor.execute( f"""SELECT sha1, date FROM {entity} WHERE sha1 IN ({values})""", tuple(ids), ) - dates.update(self.cursor.fetchall()) + dates.update(((row["sha1"], row["date"]) for row in self.cursor.fetchall())) return dates def insert_entity(self, entity: str, data: Dict[Sha1Git, datetime]): if data: psycopg2.extras.execute_values( self.cursor, f""" LOCK TABLE ONLY {entity}; INSERT INTO {entity}(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,{entity}.date) """, data.items(), ) # XXX: not sure if Python takes a reference or a copy. # This might be useless! data.clear() def insert_origin(self, data: Dict[Sha1Git, str]): if data: psycopg2.extras.execute_values( self.cursor, """ LOCK TABLE ONLY origin; INSERT INTO origin(sha1, url) VALUES %s ON CONFLICT DO NOTHING """, data.items(), ) # XXX: not sure if Python takes a reference or a copy. # This might be useless! data.clear() def insert_origin_head(self, data: Set[Tuple[Sha1Git, Sha1Git]]): if data: # Insert revisions first, to ensure "foreign keys" exist # Origins are assumed to be already inserted (they require knowing the url) psycopg2.extras.execute_values( self.cursor, """ LOCK TABLE ONLY revision; INSERT INTO revision(sha1) VALUES %s ON CONFLICT DO NOTHING """, {(rev,) for rev, _ in data}, ) psycopg2.extras.execute_values( self.cursor, # XXX: not clear how conflicts are handled here! """ LOCK TABLE ONLY revision_in_origin; INSERT INTO revision_in_origin SELECT R.id, O.id FROM (VALUES %s) AS V(rev, org) INNER JOIN revision AS R on (R.sha1=V.rev) INNER JOIN origin AS O on (O.sha1=V.org) ON CONFLICT DO NOTHING """, data, ) data.clear() def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]): ... def insert_revision_history(self, data: Dict[Sha1Git, Set[Sha1Git]]): if data: # print(f"Inserting histories: {data}") # Insert revisions first, to ensure "foreign keys" exist revisions = set(data) for rev in data: revisions.update(data[rev]) psycopg2.extras.execute_values( self.cursor, """ LOCK TABLE ONLY revision; INSERT INTO revision(sha1) VALUES %s ON CONFLICT DO NOTHING """, ((rev,) for rev in revisions), ) values = [[(prev, next) for next in data[prev]] for prev in data] psycopg2.extras.execute_values( self.cursor, # XXX: not clear how conflicts are handled here! """ LOCK TABLE ONLY revision_before_revision; INSERT INTO revision_before_revision SELECT P.id, N.id FROM (VALUES %s) AS V(prev, next) INNER JOIN revision AS P on (P.sha1=V.prev) INNER JOIN revision AS N on (N.sha1=V.next) ON CONFLICT DO NOTHING """, sum(values, []), ) data.clear() def revision_get_preferred_origin(self, revision: Sha1Git) -> Optional[Sha1Git]: self.cursor.execute( """ SELECT O.sha1 FROM revision AS R JOIN origin as O ON R.origin=O.id WHERE R.sha1=%s""", (revision,), ) row = self.cursor.fetchone() - return row[0] if row is not None else None + return row["sha1"] if row is not None else None def revision_in_history(self, revision: Sha1Git) -> bool: self.cursor.execute( """ SELECT 1 FROM revision_before_revision JOIN revision ON revision.id=revision_before_revision.prev WHERE revision.sha1=%s """, (revision,), ) return self.cursor.fetchone() is not None def revision_visited(self, revision: Sha1Git) -> bool: self.cursor.execute( """ SELECT 1 FROM revision_in_origin JOIN revision ON revision.id=revision_in_origin.revision WHERE revision.sha1=%s """, (revision,), ) return self.cursor.fetchone() is not None def update_preferred_origin(self, data: Dict[Sha1Git, Sha1Git]): if data: # XXX: this is assuming the revision already exists in the db! It should # be improved by allowing null dates in the revision table. psycopg2.extras.execute_values( self.cursor, """ UPDATE revision R SET origin=O.id FROM (VALUES %s) AS V(rev, org) INNER JOIN origin AS O on (O.sha1=V.org) WHERE R.sha1=V.rev """, data.items(), ) data.clear() diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py index 287f528..a502b8f 100644 --- a/swh/provenance/postgresql/provenancedb_with_path.py +++ b/swh/provenance/postgresql/provenancedb_with_path.py @@ -1,120 +1,108 @@ from typing import Generator, Optional, Set, Tuple import psycopg2 import psycopg2.extras from swh.model.model import Sha1Git from ..provenance import ProvenanceResult from .provenancedb_base import ProvenanceDBBase class ProvenanceWithPathDB(ProvenanceDBBase): def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - self.cursor.execute( - """ - SELECT C.sha1 AS blob, - R.sha1 AS rev, + sql = """ + SELECT C.sha1 AS content, + R.sha1 AS revision, R.date AS date, - O.url AS url, + O.url AS origin, L.path AS path FROM content AS C INNER JOIN content_in_revision AS CR ON (CR.content=C.id) INNER JOIN location as L ON (CR.location=L.id) INNER JOIN revision as R ON (CR.revision=R.id) LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s - ORDER BY date, rev, url, path ASC LIMIT 1 - """, - (id,), - ) + ORDER BY date, revision, origin, path ASC LIMIT 1 + """ + self.cursor.execute(sql, (id,)) row = self.cursor.fetchone() - if row: - return ProvenanceResult( - content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] - ) - else: - return None + return ProvenanceResult(**row) if row is not None else None def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" - self.cursor.execute( - f""" - (SELECT C.sha1 AS blob, - R.sha1 AS rev, + sql = f""" + (SELECT C.sha1 AS content, + R.sha1 AS revision, R.date AS date, - O.url AS url, + O.url AS origin, L.path AS path FROM content AS C INNER JOIN content_in_revision AS CR ON (CR.content=C.id) INNER JOIN location AS L ON (CR.location=L.id) INNER JOIN revision AS R ON (CR.revision=R.id) LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s) UNION (SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, - O.url AS url, + O.url AS origin, CASE DL.path WHEN '' THEN CL.path WHEN '.' THEN CL.path ELSE (DL.path || '/' || CL.path)::unix_path END AS path FROM content AS C INNER JOIN content_in_directory AS CD ON (C.id=CD.content) INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) INNER JOIN revision AS R ON (DR.revision=R.id) INNER JOIN location AS CL ON (CD.location=CL.id) INNER JOIN location AS DL ON (DR.location=DL.id) LEFT JOIN origin AS O ON (R.origin=O.id) WHERE C.sha1=%s) - ORDER BY date, rev, url, path {early_cut} - """, - (id, id), - ) - for row in self.cursor.fetchall(): - yield ProvenanceResult( - content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] - ) + ORDER BY date, revision, origin, path {early_cut} + """ + self.cursor.execute(sql, (id, id)) + yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]): """Insert entries in `relation` from `data` Also insert missing location entries in the 'location' table. """ if data: assert relation in ( "content_in_revision", "content_in_directory", "directory_in_revision", ) src, dst = relation.split("_in_") # insert missing locations locations = tuple(set((loc,) for (_, _, loc) in data)) psycopg2.extras.execute_values( self.cursor, """ LOCK TABLE ONLY location; INSERT INTO location(path) VALUES %s ON CONFLICT (path) DO NOTHING """, locations, ) psycopg2.extras.execute_values( self.cursor, f""" LOCK TABLE ONLY {relation}; INSERT INTO {relation} SELECT {src}.id, {dst}.id, location.id FROM (VALUES %s) AS V(src, dst, path) INNER JOIN {src} on ({src}.sha1=V.src) INNER JOIN {dst} on ({dst}.sha1=V.dst) INNER JOIN location on (location.path=V.path) """, data, ) data.clear() diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py index 4598577..789bc92 100644 --- a/swh/provenance/postgresql/provenancedb_without_path.py +++ b/swh/provenance/postgresql/provenancedb_without_path.py @@ -1,96 +1,84 @@ from typing import Generator, Optional, Set, Tuple import psycopg2 import psycopg2.extras from swh.model.model import Sha1Git from ..provenance import ProvenanceResult from .provenancedb_base import ProvenanceDBBase class ProvenanceWithoutPathDB(ProvenanceDBBase): def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - self.cursor.execute( - """ - SELECT C.sha1 AS blob, - R.sha1 AS rev, + sql = """ + SELECT C.sha1 AS content, + R.sha1 AS revision, R.date AS date, - O.url AS url, + O.url AS origin, '\\x'::bytea as path FROM content AS C INNER JOIN content_in_revision AS CR ON (CR.content=C.id) INNER JOIN revision as R ON (CR.revision=R.id) LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s - ORDER BY date, rev, url ASC LIMIT 1 - """, - (id,), - ) + ORDER BY date, revision, origin ASC LIMIT 1 + """ + self.cursor.execute(sql, (id,)) row = self.cursor.fetchone() - if row: - return ProvenanceResult( - content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] - ) - else: - return None + return ProvenanceResult(**row) if row is not None else None def content_find_all( self, id: Sha1Git, limit: Optional[int] = None ) -> Generator[ProvenanceResult, None, None]: early_cut = f"LIMIT {limit}" if limit is not None else "" - self.cursor.execute( - f""" - (SELECT C.sha1 AS blob, - R.sha1 AS rev, + sql = f""" + (SELECT C.sha1 AS content, + R.sha1 AS revision, R.date AS date, - O.url AS url, + O.url AS origin, '\\x'::bytea as path FROM content AS C INNER JOIN content_in_revision AS CR ON (CR.content=C.id) INNER JOIN revision AS R ON (CR.revision=R.id) LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s) UNION (SELECT C.sha1 AS content, R.sha1 AS revision, R.date AS date, - O.url AS url, + O.url AS origin, '\\x'::bytea as path FROM content AS C INNER JOIN content_in_directory AS CD ON (C.id=CD.content) INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) INNER JOIN revision AS R ON (DR.revision=R.id) LEFT JOIN origin as O ON (R.origin=O.id) WHERE C.sha1=%s) - ORDER BY date, rev, url {early_cut} - """, - (id, id), - ) - for row in self.cursor.fetchall(): - yield ProvenanceResult( - content=row[0], revision=row[1], date=row[2], origin=row[3], path=row[4] - ) + ORDER BY date, revision, origin {early_cut} + """ + self.cursor.execute(sql, (id, id)) + yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) def insert_relation(self, relation: str, data: Set[Tuple[Sha1Git, Sha1Git, bytes]]): if data: assert relation in ( "content_in_revision", "content_in_directory", "directory_in_revision", ) src, dst = relation.split("_in_") psycopg2.extras.execute_values( self.cursor, f""" LOCK TABLE ONLY {relation}; INSERT INTO {relation} SELECT {src}.id, {dst}.id FROM (VALUES %s) AS V(src, dst) INNER JOIN {src} on ({src}.sha1=V.src) INNER JOIN {dst} on ({dst}.sha1=V.dst) """, data, ) data.clear() diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_provenance_heuristics.py index 19dde8c..f6134fb 100644 --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_provenance_heuristics.py @@ -1,350 +1,350 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, List, Tuple import pytest from swh.model.hashutil import hash_to_bytes from swh.provenance.model import RevisionEntry from swh.provenance.revision import revision_add from swh.provenance.tests.conftest import ( fill_storage, get_datafile, load_repo_data, synthetic_result, ) from swh.provenance.tests.test_provenance_db import ts2dt def sha1s(cur, table): """return the 'sha1' column from the DB 'table' (as hex) 'cur' is a cursor to the provenance index DB. """ cur.execute(f"SELECT sha1 FROM {table}") - return set(sha1.hex() for (sha1,) in cur.fetchall()) + return set(row["sha1"].hex() for row in cur.fetchall()) def locations(cur): """return the 'path' column from the DB location table 'cur' is a cursor to the provenance index DB. """ - cur.execute("SELECT encode(location.path::bytea, 'escape') FROM location") - return set(x for (x,) in cur.fetchall()) + cur.execute("SELECT encode(location.path::bytea, 'escape') AS path FROM location") + return set(row["path"] for row in cur.fetchall()) def relations(cur, src, dst): """return the triplets ('sha1', 'sha1', 'path') from the DB for the relation between 'src' table and 'dst' table (i.e. for C-R, C-D and D-R relations). 'cur' is a cursor to the provenance index DB. """ relation = f"{src}_in_{dst}" - cur.execute("select swh_get_dbflavor()") - with_path = cur.fetchone()[0] == "with-path" + cur.execute("SELECT swh_get_dbflavor() AS flavor") + with_path = cur.fetchone()["flavor"] == "with-path" # note that the columns have the same name as the relations they refer to, # so we can write things like "rel.{dst}=src.id" in the query below if with_path: cur.execute( f""" - SELECT encode(src.sha1::bytea, 'hex'), - encode(dst.sha1::bytea, 'hex'), - encode(location.path::bytea, 'escape') + SELECT encode(src.sha1::bytea, 'hex') AS src, + encode(dst.sha1::bytea, 'hex') AS dst, + encode(location.path::bytea, 'escape') AS path FROM {relation} as relation INNER JOIN {src} AS src ON (relation.{src} = src.id) INNER JOIN {dst} AS dst ON (relation.{dst} = dst.id) INNER JOIN location ON (relation.location = location.id) """ ) else: cur.execute( f""" - SELECT encode(src.sha1::bytea, 'hex'), - encode(dst.sha1::bytea, 'hex'), - '' + SELECT encode(src.sha1::bytea, 'hex') AS src, + encode(dst.sha1::bytea, 'hex') AS dst, + '' AS path FROM {relation} as relation INNER JOIN {src} AS src ON (src.id = relation.{src}) INNER JOIN {dst} AS dst ON (dst.id = relation.{dst}) """ ) - return set(cur.fetchall()) + return set((row["src"], row["dst"], row["path"]) for row in cur.fetchall()) def get_timestamp(cur, table, sha1): """return the date for the 'sha1' from the DB 'table' (as hex) 'cur' is a cursor to the provenance index DB. """ if isinstance(sha1, str): sha1 = hash_to_bytes(sha1) cur.execute(f"SELECT date FROM {table} WHERE sha1=%s", (sha1,)) - return [date.timestamp() for (date,) in cur.fetchall()] + return [row["date"].timestamp() for row in cur.fetchall()] @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) def test_provenance_heuristics(provenance, swh_storage, archive, repo, lower, mindepth): # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) revisions = {rev["id"]: rev for rev in data["revision"]} rows = { "content": set(), "content_in_directory": set(), "content_in_revision": set(), "directory": set(), "directory_in_revision": set(), "location": set(), "revision": set(), } cursor = provenance.storage.cursor def maybe_path(path: str) -> str: if provenance.storage.with_path: return path return "" for synth_rev in synthetic_result(syntheticfile): revision = revisions[synth_rev["sha1"]] entry = RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) # each "entry" in the synth file is one new revision rows["revision"].add(synth_rev["sha1"].hex()) assert rows["revision"] == sha1s(cursor, "revision"), synth_rev["msg"] # check the timestamp of the revision rev_ts = synth_rev["date"] assert get_timestamp(cursor, "revision", synth_rev["sha1"].hex()) == [ rev_ts ], synth_rev["msg"] # this revision might have added new content objects rows["content"] |= set(x["dst"].hex() for x in synth_rev["R_C"]) rows["content"] |= set(x["dst"].hex() for x in synth_rev["D_C"]) assert rows["content"] == sha1s(cursor, "content"), synth_rev["msg"] # check for R-C (direct) entries # these are added directly in the content_early_in_rev table rows["content_in_revision"] |= set( (x["dst"].hex(), x["src"].hex(), maybe_path(x["path"])) for x in synth_rev["R_C"] ) assert rows["content_in_revision"] == relations( cursor, "content", "revision" ), synth_rev["msg"] # check timestamps for rc in synth_rev["R_C"]: assert get_timestamp(cursor, "content", rc["dst"]) == [ rev_ts + rc["rel_ts"] ], synth_rev["msg"] # check directories # each directory stored in the provenance index is an entry # in the "directory" table... rows["directory"] |= set(x["dst"].hex() for x in synth_rev["R_D"]) assert rows["directory"] == sha1s(cursor, "directory"), synth_rev["msg"] # ... + a number of rows in the "directory_in_rev" table... # check for R-D entries rows["directory_in_revision"] |= set( (x["dst"].hex(), x["src"].hex(), maybe_path(x["path"])) for x in synth_rev["R_D"] ) assert rows["directory_in_revision"] == relations( cursor, "directory", "revision" ), synth_rev["msg"] # check timestamps for rd in synth_rev["R_D"]: assert get_timestamp(cursor, "directory", rd["dst"]) == [ rev_ts + rd["rel_ts"] ], synth_rev["msg"] # ... + a number of rows in the "content_in_dir" table # for content of the directory. # check for D-C entries rows["content_in_directory"] |= set( (x["dst"].hex(), x["src"].hex(), maybe_path(x["path"])) for x in synth_rev["D_C"] ) assert rows["content_in_directory"] == relations( cursor, "content", "directory" ), synth_rev["msg"] # check timestamps for dc in synth_rev["D_C"]: assert get_timestamp(cursor, "content", dc["dst"]) == [ rev_ts + dc["rel_ts"] ], synth_rev["msg"] if provenance.storage.with_path: # check for location entries rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) assert rows["location"] == locations(cursor), synth_rev["msg"] @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) def test_provenance_heuristics_content_find_all( provenance, swh_storage, archive, repo, lower, mindepth ): # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) revisions = [ RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) for revision in data["revision"] ] def maybe_path(path: str) -> str: if provenance.storage.with_path: return path return "" # XXX adding all revisions at once should be working just fine, but it does not... # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) # ...so add revisions one at a time for now for revision in revisions: revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) expected_occurrences = {} for synth_rev in synthetic_result(syntheticfile): rev_id = synth_rev["sha1"].hex() rev_ts = synth_rev["date"] for rc in synth_rev["R_C"]: expected_occurrences.setdefault(rc["dst"].hex(), []).append( (rev_id, rev_ts, None, maybe_path(rc["path"])) ) for dc in synth_rev["D_C"]: assert dc["prefix"] is not None # to please mypy expected_occurrences.setdefault(dc["dst"].hex(), []).append( (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) ) for content_id, results in expected_occurrences.items(): expected = [(content_id, *result) for result in results] db_occurrences = [ ( occur.content.hex(), occur.revision.hex(), occur.date.timestamp(), occur.origin, occur.path.decode(), ) for occur in provenance.content_find_all(hash_to_bytes(content_id)) ] if provenance.storage.with_path: # this is not true if the db stores no path, because a same content # that appears several times in a given revision may be reported # only once by content_find_all() assert len(db_occurrences) == len(expected) assert set(db_occurrences) == set(expected) @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) def test_provenance_heuristics_content_find_first( provenance, swh_storage, archive, repo, lower, mindepth ): # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) revisions = [ RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) for revision in data["revision"] ] # XXX adding all revisions at once should be working just fine, but it does not... # revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) # ...so add revisions one at a time for now for revision in revisions: revision_add(provenance, archive, [revision], lower=lower, mindepth=mindepth) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) expected_first: Dict[str, Tuple[str, str, List[str]]] = {} # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path # is a list because a content can be added at several places in a single # revision, in which case the result of content_find_first() is one of # those path, but we have no guarantee which one it will return. for synth_rev in synthetic_result(syntheticfile): rev_id = synth_rev["sha1"].hex() rev_ts = synth_rev["date"] for rc in synth_rev["R_C"]: sha1 = rc["dst"].hex() if sha1 not in expected_first: assert rc["rel_ts"] == 0 expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) else: if rev_ts == expected_first[sha1][1]: expected_first[sha1][2].append(rc["path"]) elif rev_ts < expected_first[sha1][1]: expected_first[sha1] = (rev_id, rev_ts, rc["path"]) for dc in synth_rev["D_C"]: sha1 = rc["dst"].hex() assert sha1 in expected_first # nothing to do there, this content cannot be a "first seen file" for content_id, (rev_id, ts, paths) in expected_first.items(): occur = provenance.content_find_first(hash_to_bytes(content_id)) assert occur.content.hex() == content_id assert occur.revision.hex() == rev_id assert occur.date.timestamp() == ts assert occur.origin is None if provenance.storage.with_path: assert occur.path.decode() in paths