diff --git a/mypy.ini b/mypy.ini
index 2276188..f0cf3bb 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,21 +1,24 @@
 [mypy]
 namespace_packages = True
 warn_unused_ignores = True

 # 3rd party libraries without stubs (yet)

 [mypy-iso8601.*]
 ignore_missing_imports = True

 [mypy-methodtools.*]
 ignore_missing_imports = True

+[mypy-msgpack.*]
+ignore_missing_imports = True
+
 [mypy-pkg_resources.*]
 ignore_missing_imports = True

 [mypy-pytest.*]
 ignore_missing_imports = True

 [mypy-psycopg2.*]
 ignore_missing_imports = True
diff --git a/requirements-test.txt b/requirements-test.txt
index fa30296..2d179fb 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,2 +1,3 @@
 pytest
 swh.loader.git >= 0.8
+swh.journal >= 0.8
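Note on the mypy override: like iso8601 and methodtools above, msgpack ships without type stubs, so mypy would otherwise reject the `import msgpack` that conftest.py (below) now performs. A minimal illustration, with a hypothetical module name:

    # typed_example.py (hypothetical) -- without the [mypy-msgpack.*] override,
    # mypy reports a missing-stubs error for this import
    import msgpack

    def roundtrip(d: dict) -> dict:
        # pack then unpack; raw=False decodes map keys/values as str
        return msgpack.unpackb(msgpack.packb(d), raw=False)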
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
index 521dd84..79ae7a4 100644
--- a/swh/provenance/tests/conftest.py
+++ b/swh/provenance/tests/conftest.py
@@ -1,243 +1,233 @@
 # Copyright (C) 2021  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from os import path
 import re
 from typing import Iterable, Iterator, List, Optional

+import msgpack
 import pytest
 from typing_extensions import TypedDict

-from swh.core.api.serializers import msgpack_loads
 from swh.core.db import BaseDb
-from swh.model.model import Content, Directory, DirectoryEntry, Revision
+from swh.journal.serializers import msgpack_ext_hook
 from swh.model.tests.swh_model_data import TEST_OBJECTS
 from swh.provenance.postgresql.archive import ArchivePostgreSQL
 from swh.provenance.storage.archive import ArchiveStorage
+from swh.storage.replay import process_replay_objects


 @pytest.fixture(params=["with-path", "without-path"])
 def provenance(request, postgresql):
     """return a working and initialized provenance db"""
     from swh.core.cli.db import populate_database_for_package

     flavor = request.param
     populate_database_for_package("swh.provenance", postgresql.dsn, flavor=flavor)

     from swh.provenance.provenance import ProvenanceBackend

     BaseDb.adapt_conn(postgresql)
     prov = ProvenanceBackend(postgresql)
     assert prov.storage.flavor == flavor
     # in test sessions, we DO want to raise any exception occurring at commit time
     prov.raise_on_commit = True
     return prov


 @pytest.fixture
 def swh_storage_with_objects(swh_storage):
     """return a Storage object (postgresql-based by default) with a few of
     each object type in it

     The inserted content comes from swh.model.tests.swh_model_data.
     """
     for obj_type in (
         "content",
         "skipped_content",
         "directory",
         "revision",
         "release",
         "snapshot",
         "origin",
         "origin_visit",
         "origin_visit_status",
     ):
         getattr(swh_storage, f"{obj_type}_add")(TEST_OBJECTS[obj_type])
     return swh_storage


 @pytest.fixture
 def archive_direct(swh_storage_with_objects):
     return ArchivePostgreSQL(swh_storage_with_objects.get_db().conn)


 @pytest.fixture
 def archive_api(swh_storage_with_objects):
     return ArchiveStorage(swh_storage_with_objects)


 @pytest.fixture(params=["archive", "db"])
 def archive(request, swh_storage_with_objects):
     """Return an ArchivePostgreSQL-based StorageInterface object"""
     # this is a workaround to prevent tests from hanging because of an unclosed
     # transaction.
     # TODO: refactor the ArchivePostgreSQL to properly deal with
     # transactions and get rid of this fixture
     if request.param == "db":
         archive = ArchivePostgreSQL(conn=swh_storage_with_objects.get_db().conn)
         yield archive
         archive.conn.rollback()
     else:
         yield ArchiveStorage(swh_storage_with_objects)


 def get_datafile(fname):
     return path.join(path.dirname(__file__), "data", fname)


 def load_repo_data(repo):
-    data = {"revision": [], "directory": [], "content": []}
+    data = {}
     with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj:
-        for etype, value in msgpack_loads(fobj.read()):
-            data[etype].append(value)
+        unpacker = msgpack.Unpacker(
+            fobj,
+            raw=False,
+            ext_hook=msgpack_ext_hook,
+            strict_map_key=False,
+            timestamp=3,  # convert Timestamps into tz-aware (UTC) datetime objects
+        )
+        for objtype, objd in unpacker:
+            data.setdefault(objtype, []).append(objd)
     return data


 def filter_dict(d, keys):
     return {k: v for (k, v) in d.items() if k in keys}


 def fill_storage(storage, data):
-    storage.content_add_metadata(
-        Content.from_dict(content) for content in data["content"]
-    )
-    storage.directory_add(
-        [
-            Directory(
-                entries=tuple(
-                    [
-                        DirectoryEntry.from_dict(
-                            filter_dict(entry, ("name", "type", "target", "perms"))
-                        )
-                        for entry in dir["entries"]
-                    ]
-                )
-            )
-            for dir in data["directory"]
-        ]
-    )
-    storage.revision_add(Revision.from_dict(revision) for revision in data["revision"])
+    process_replay_objects(data, storage=storage)


 class SynthRelation(TypedDict):
     prefix: Optional[str]
     path: str
     src: bytes
     dst: bytes
     rel_ts: float


 class SynthRevision(TypedDict):
     sha1: bytes
     date: float
     msg: str
     R_C: List[SynthRelation]
     R_D: List[SynthRelation]
     D_C: List[SynthRelation]


 def synthetic_result(filename: str) -> Iterator[SynthRevision]:
     """Generates dict representations of synthetic revisions found in the synthetic
     file (from the data/ directory) given as argument of the generator.

     Generated SynthRevision (typed dict) with the following elements:

       "sha1": (bytes) sha1 of the revision,
       "date": (float) timestamp of the revision,
       "msg": (str) commit message of the revision,
       "R_C": (list) new R---C relations added by this revision
       "R_D": (list) new R-D relations added by this revision
       "D_C": (list) new D-C relations added by this revision

     Each relation above is a SynthRelation typed dict with:

       "path": (str) location
       "src": (bytes) sha1 of the source of the relation
       "dst": (bytes) sha1 of the destination of the relation
       "rel_ts": (float) timestamp of the target of the relation
                 (related to the timestamp of the revision)

     """
     with open(get_datafile(filename), "r") as fobj:
         yield from _parse_synthetic_file(fobj)


 def _parse_synthetic_file(fobj: Iterable[str]) -> Iterator[SynthRevision]:
     """Read a 'synthetic' file and generate a dict representation of the synthetic
     revision for each revision listed in the synthetic file.
     """
     regs = [
         "(?P<revname>R[0-9]{2,4})?",
         "(?P<reltype>[^| ]*)",
         "([+] )?(?P<path>[^| +]*?)[/]?",
         "(?P<type>[RDC]) (?P<sha1>[0-9a-z]{40})",
         "(?P<ts>-?[0-9]+(.[0-9]+)?)",
     ]
     regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$")
     current_rev: List[dict] = []
     for m in (regex.match(line) for line in fobj):
         if m:
             d = m.groupdict()
             if d["revname"]:
                 if current_rev:
                     yield _mk_synth_rev(current_rev)
                 current_rev.clear()
             current_rev.append(d)
     if current_rev:
         yield _mk_synth_rev(current_rev)


 def _mk_synth_rev(synth_rev) -> SynthRevision:
     assert synth_rev[0]["type"] == "R"
     rev = SynthRevision(
         sha1=bytes.fromhex(synth_rev[0]["sha1"]),
         date=float(synth_rev[0]["ts"]),
         msg=synth_rev[0]["revname"],
         R_C=[],
         R_D=[],
         D_C=[],
     )
     current_path = None
     # path of the last R-D relation we parsed, used as a prefix for next D-C
     # relations
     for row in synth_rev[1:]:
         if row["reltype"] == "R---C":
             assert row["type"] == "C"
             rev["R_C"].append(
                 SynthRelation(
                     prefix=None,
                     path=row["path"],
                     src=rev["sha1"],
                     dst=bytes.fromhex(row["sha1"]),
                     rel_ts=float(row["ts"]),
                 )
             )
             current_path = None
         elif row["reltype"] == "R-D":
             assert row["type"] == "D"
             rev["R_D"].append(
                 SynthRelation(
                     prefix=None,
                     path=row["path"],
                     src=rev["sha1"],
                     dst=bytes.fromhex(row["sha1"]),
                     rel_ts=float(row["ts"]),
                 )
             )
             current_path = row["path"]
         elif row["reltype"] == "D-C":
             assert row["type"] == "C"
             rev["D_C"].append(
                 SynthRelation(
                     prefix=current_path,
                     path=row["path"],
                     src=rev["R_D"][-1]["dst"],
                     dst=bytes.fromhex(row["sha1"]),
                     rel_ts=float(row["ts"]),
                 )
             )
     return rev
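The rewritten load_repo_data/fill_storage pair round-trips the journal serialization: datasets are produced by a journal writer (see generate_storage_from_git.py below) and replayed into a storage through process_replay_objects. A minimal usage sketch, assuming an in-memory storage and the cmdbts2 dataset shipped in data/:

    from swh.provenance.tests.conftest import fill_storage, load_repo_data
    from swh.storage import get_storage

    # deserialize the journal stream into {object_type: [object_dict, ...]} ...
    data = load_repo_data("cmdbts2")
    # ... then replay it into a fresh storage
    storage = get_storage(cls="memory")
    fill_storage(storage, data)
    assert storage.revision_get([data["revision"][0]["id"]])[0] is not None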
""" regs = [ "(?PR[0-9]{2,4})?", "(?P[^| ]*)", "([+] )?(?P[^| +]*?)[/]?", "(?P[RDC]) (?P[0-9a-z]{40})", "(?P-?[0-9]+(.[0-9]+)?)", ] regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") current_rev: List[dict] = [] for m in (regex.match(line) for line in fobj): if m: d = m.groupdict() if d["revname"]: if current_rev: yield _mk_synth_rev(current_rev) current_rev.clear() current_rev.append(d) if current_rev: yield _mk_synth_rev(current_rev) def _mk_synth_rev(synth_rev) -> SynthRevision: assert synth_rev[0]["type"] == "R" rev = SynthRevision( sha1=bytes.fromhex(synth_rev[0]["sha1"]), date=float(synth_rev[0]["ts"]), msg=synth_rev[0]["revname"], R_C=[], R_D=[], D_C=[], ) current_path = None # path of the last R-D relation we parsed, used a prefix for next D-C # relations for row in synth_rev[1:]: if row["reltype"] == "R---C": assert row["type"] == "C" rev["R_C"].append( SynthRelation( prefix=None, path=row["path"], src=rev["sha1"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) current_path = None elif row["reltype"] == "R-D": assert row["type"] == "D" rev["R_D"].append( SynthRelation( prefix=None, path=row["path"], src=rev["sha1"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) current_path = row["path"] elif row["reltype"] == "D-C": assert row["type"] == "C" rev["D_C"].append( SynthRelation( prefix=current_path, path=row["path"], src=rev["R_D"][-1]["dst"], dst=bytes.fromhex(row["sha1"]), rel_ts=float(row["ts"]), ) ) return rev diff --git a/swh/provenance/tests/data/cmdbts2.msgpack b/swh/provenance/tests/data/cmdbts2.msgpack index 0ca8460..5dfd186 100644 Binary files a/swh/provenance/tests/data/cmdbts2.msgpack and b/swh/provenance/tests/data/cmdbts2.msgpack differ diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py index 1c7b628..854a3eb 100644 --- a/swh/provenance/tests/data/generate_storage_from_git.py +++ b/swh/provenance/tests/data/generate_storage_from_git.py @@ -1,115 +1,51 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import os -import re -from subprocess import check_output -from typing import Dict import click -from swh.core.api.serializers import msgpack_dumps from swh.loader.git.from_disk import GitLoaderFromDisk -from swh.model.hashutil import hash_to_bytes as h2b -from swh.provenance.tests.test_provenance_db import ts2dt from swh.storage import get_storage def load_git_repo(url, directory, storage): visit_date = datetime.now(tz=timezone.utc) loader = GitLoaderFromDisk( url=url, directory=directory, visit_date=visit_date, storage=storage, ) return loader.load() def pop_key(d, k): d.pop(k) return d -def dump_file(hash, storage, cache): - if hash not in cache: - content = storage.content_find({"sha1_git": hash})[0] - cache[hash] = content - # we remove ctime to make the resulting data (eg. 
diff --git a/swh/provenance/tests/data/out-of-order.msgpack b/swh/provenance/tests/data/out-of-order.msgpack
index 774dfa8..660fe16 100644
Binary files a/swh/provenance/tests/data/out-of-order.msgpack and b/swh/provenance/tests/data/out-of-order.msgpack differ
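For reference, the Unpacker settings in the new load_repo_data mirror what the journal writer emits: per the comment in that function, datetimes arrive as msgpack Timestamp extension values, and timestamp=3 on the reading side turns them back into timezone-aware datetime objects. A self-contained sketch of that round trip, independent of the swh stack:

    import datetime
    import io

    import msgpack

    now = datetime.datetime.now(tz=datetime.timezone.utc)
    # datetime=True packs tz-aware datetimes as msgpack Timestamps
    packed = msgpack.packb(("revision", {"date": now}), datetime=True)
    # timestamp=3 converts Timestamps back into tz-aware datetimes
    unpacker = msgpack.Unpacker(io.BytesIO(packed), raw=False, timestamp=3)
    objtype, objd = next(iter(unpacker))
    assert objtype == "revision"
    assert objd["date"] == now  # equal, tz-aware datetime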