diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py --- a/swh/provenance/cli.py +++ b/swh/provenance/cli.py @@ -141,8 +141,20 @@ @cli.command(name="iter-frontiers") @click.argument("filename") -@click.option("-l", "--limit", type=int) -@click.option("-s", "--min-size", default=0, type=int) +@click.option( + "-l", + "--limit", + type=int, + help="""Limit the amount of entries (directories) to read from the input file.""", +) +@click.option( + "-s", + "--min-size", + default=0, + type=int, + help="""Set the minimum size (in bytes) of files to be indexed. """ + """Any smaller file will be ignored.""", +) @click.pass_context def iter_frontiers( ctx: click.core.Context, @@ -178,16 +190,56 @@ @cli.command(name="iter-revisions") @click.argument("filename") -@click.option("-a", "--track-all", default=True, type=bool) -@click.option("-l", "--limit", type=int) -@click.option("-m", "--min-depth", default=1, type=int) -@click.option("-r", "--reuse", default=True, type=bool) -@click.option("-s", "--min-size", default=0, type=int) +@click.option( + "-a", + "--track-all", + default=True, + type=bool, + help="""Index all occurrences of files in the development history.""", +) +@click.option( + "-f", + "--flatten", + default=True, + type=bool, + help="""Create flat models for directories in the isochrone frontier.""", +) +@click.option( + "-l", + "--limit", + type=int, + help="""Limit the amount of entries (revisions) to read from the input file.""", +) +@click.option( + "-m", + "--min-depth", + default=1, + type=int, + help="""Set minimum depth (in the directory tree) at which an isochrone """ + """frontier can be defined.""", +) +@click.option( + "-r", + "--reuse", + default=True, + type=bool, + help="""Prioritize the usage of previously defined isochrone frontiers """ + """whenever possible.""", +) +@click.option( + "-s", + "--min-size", + default=0, + type=int, + help="""Set the minimum size (in bytes) of files to be indexed. """ + """Any smaller file will be ignored.""", +) @click.pass_context def iter_revisions( ctx: click.core.Context, filename: str, track_all: bool, + flatten: bool, limit: Optional[int], min_depth: int, reuse: bool, @@ -208,6 +260,7 @@ archive, [revision], trackall=track_all, + flatten=flatten, lower=reuse, mindepth=min_depth, minsize=min_size, @@ -229,7 +282,12 @@ @cli.command(name="iter-origins") @click.argument("filename") -@click.option("-l", "--limit", type=int) +@click.option( + "-l", + "--limit", + type=int, + help="""Limit the amount of entries (origins) to read from the input file.""", +) @click.pass_context def iter_origins(ctx: click.core.Context, filename: str, limit: Optional[int]) -> None: """Process a provided list of origins.""" @@ -275,7 +333,9 @@ @cli.command(name="find-all") @click.argument("swhid") -@click.option("-l", "--limit", type=int) +@click.option( + "-l", "--limit", type=int, help="""Limit the amount results to be retrieved.""" +) @click.pass_context def find_all(ctx: click.core.Context, swhid: str, limit: Optional[int]) -> None: """Find all occurrences of the requested blob.""" diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -57,6 +57,7 @@ archive: ArchiveInterface, revisions: List[RevisionEntry], trackall: bool = True, + flatten: bool = True, lower: bool = True, mindepth: int = 1, minsize: int = 0, @@ -81,6 +82,7 @@ revision, graph, trackall=trackall, + flatten=flatten, lower=lower, mindepth=mindepth, minsize=minsize, @@ -96,6 +98,7 @@ revision: RevisionEntry, graph: IsochroneNode, trackall: bool = True, + flatten: bool = True, lower: bool = True, mindepth: int = 1, minsize: int = 0, @@ -135,9 +138,10 @@ provenance.directory_add_to_revision( revision, current.entry, current.path ) - directory_flatten( - provenance, archive, current.entry, minsize=minsize - ) + if flatten: + directory_flatten( + provenance, archive, current.entry, minsize=minsize + ) else: # If current node is an invalidated frontier, update its date for future # revisions to get the proper value. diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py --- a/swh/provenance/tests/test_revision_content_layer.py +++ b/swh/provenance/tests/test_revision_content_layer.py @@ -12,8 +12,9 @@ from swh.model.hashutil import hash_to_bytes from swh.model.model import Sha1Git from swh.provenance.archive import ArchiveInterface +from swh.provenance.directory import directory_add from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType -from swh.provenance.model import RevisionEntry +from swh.provenance.model import DirectoryEntry, RevisionEntry from swh.provenance.revision import revision_add from swh.provenance.tests.conftest import ( fill_storage, @@ -148,13 +149,18 @@ @pytest.mark.parametrize( - "repo, lower, mindepth", + "repo, lower, mindepth, flatten", ( - ("cmdbts2", True, 1), - ("cmdbts2", False, 1), - ("cmdbts2", True, 2), - ("cmdbts2", False, 2), - ("out-of-order", True, 1), + ("cmdbts2", True, 1, True), + ("cmdbts2", True, 1, False), + ("cmdbts2", False, 1, True), + ("cmdbts2", False, 1, False), + ("cmdbts2", True, 2, True), + ("cmdbts2", True, 2, False), + ("cmdbts2", False, 2, True), + ("cmdbts2", False, 2, False), + ("out-of-order", True, 1, True), + ("out-of-order", True, 1, False), ), ) def test_revision_content_result( @@ -163,6 +169,7 @@ repo: str, lower: bool, mindepth: int, + flatten: bool, ) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) @@ -195,7 +202,28 @@ date=ts2dt(revision["date"]), root=revision["directory"], ) - revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) + + if flatten: + revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) + else: + prev_directories = provenance.storage.entity_get_all(EntityType.DIRECTORY) + revision_add( + provenance, + archive, + [entry], + lower=lower, + mindepth=mindepth, + flatten=False, + ) + directories = [ + DirectoryEntry(id=sha1) + for sha1 in provenance.storage.entity_get_all( + EntityType.DIRECTORY + ).difference(prev_directories) + ] + for directory in directories: + assert not provenance.directory_already_flattenned(directory) + directory_add(provenance, archive, directories) # each "entry" in the synth file is one new revision rows["revision"].add(synth_rev["sha1"])