diff --git a/swh/provenance/tests/data/README.md b/swh/provenance/tests/data/README.md --- a/swh/provenance/tests/data/README.md +++ b/swh/provenance/tests/data/README.md @@ -63,6 +63,9 @@ only depends on the content of the input yaml file. Calling the tool twice on the same input file should generate the exact same git repo twice. +Also note that the tool will add a branch at each revision (using the commit +message as branch name), to make it easier to reference any point in the git +history. ## Msgpack dump of the storage @@ -92,6 +95,31 @@ Wrote 86 objects in repo2.msgpack ``` +### Adding extra visits/snapshots + +It is also possible to generate a storage from a git repo with extra origin +visits, using the `--visits` option of the `generate_storage_from_git` tool. + +This option expects a yaml file as argument. This file contains a description of +extra visits (and snapshots) you want to add to the storage. + +The format is simple, for example: + +``` +# a visit pattern scenario for the 'repo_with_merges' repo + +- origin: http://repo_with_merges/1/ + date: 1000000015 + branches: + - R01 + +``` + +will create an OriginVisit (at the given date) for the given origin URL (the Origin +will be created as well), with a `Snapshot` including the listed +branches. 
+ + ## Synthetic files These files describe the expected content of the provenance database for each diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py --- a/swh/provenance/tests/data/generate_storage_from_git.py +++ b/swh/provenance/tests/data/generate_storage_from_git.py @@ -5,10 +5,20 @@ from datetime import datetime, timezone import os +from subprocess import check_output import click +import yaml from swh.loader.git.from_disk import GitLoaderFromDisk +from swh.model.model import ( + Origin, + OriginVisit, + OriginVisitStatus, + Snapshot, + SnapshotBranch, + TargetType, +) from swh.storage import get_storage @@ -30,8 +40,15 @@ @click.command() @click.option("-o", "--output", default=None, help="output file") +@click.option( + "-v", + "--visits", + type=click.File(mode="rb"), + default=None, + help="additional visits to generate.", +) @click.argument("git-repo", type=click.Path(exists=True, file_okay=False)) -def main(output, git_repo): +def main(output, visits, git_repo): "simple tool to generate the git_repo.msgpack dataset file used in some tests" if output is None: output = f"{git_repo}.msgpack" @@ -44,6 +61,52 @@ reponame = os.path.basename(git_repo) load_git_repo(f"https://{reponame}", git_repo, sto) + + if visits: + # retrieve all branches from the actual git repo + all_branches = { + ref: sha1 + for sha1, ref in ( + line.strip().split() + for line in check_output(["git", "-C", git_repo, "show-ref"]) + .decode() + .splitlines() + ) + } + + for visit in yaml.full_load(visits): + # add the origin (if it already exists, this is a noop) + sto.origin_add([Origin(url=visit["origin"])]) + # add a new visit for this origin + visit_id = sto.origin_visit_add( + [ + OriginVisit( + origin=visit["origin"], + date=datetime.fromtimestamp(visit["date"], tz=timezone.utc), + type="git", + ) + ] + )[0].visit + # add a snapshot with branches from the input file + branches = { + 
f"refs/heads/{name}".encode(): SnapshotBranch( + target=bytes.fromhex(all_branches[f"refs/heads/{name}"]), + target_type=TargetType.REVISION, + ) + for name in visit["branches"] + } + snap = Snapshot(branches=branches) + sto.snapshot_add([snap]) + # add a "closing" origin visit status update referencing the snapshot + status = OriginVisitStatus( + origin=visit["origin"], + visit=visit_id, + date=datetime.fromtimestamp(visit["date"], tz=timezone.utc), + status="full", + snapshot=snap.id, + ) + sto.origin_visit_status_add([status]) + click.echo(f"Serialized the storage made from {reponame} in {output}") diff --git a/swh/provenance/tests/data/repo_with_merges-visits-01.yaml b/swh/provenance/tests/data/repo_with_merges-visits-01.yaml new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/data/repo_with_merges-visits-01.yaml @@ -0,0 +1,34 @@ +# a visit pattern scenario for the 'repo_with_merges' repo + +- origin: http://repo_with_merges/1/ + date: 1000000015 + branches: + - R01 + +- origin: http://repo_with_merges/1/ + date: 1000000025 + branches: + - R03 + - R06 + +- origin: http://repo_with_merges/2/ + date: 1000000035 + branches: + - R05 + - R06 + +- origin: http://repo_with_merges/1/ + date: 1000000045 + branches: + - R06 + - R07 + +- origin: http://repo_with_merges/1/ + date: 1000000055 + branches: + - R08 + +- origin: http://repo_with_merges/2/ + date: 1000000065 + branches: + - R08