diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -388,6 +388,61 @@
         client.close()
 
 
+@cli.group(name="directory")
+@click.pass_context
+def directory(ctx: click.core.Context):
+    """Commands operating on the directories of the provenance database."""
+    from . import get_archive, get_provenance
+
+    archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
+    provenance = get_provenance(**ctx.obj["config"]["provenance"]["storage"])
+
+    # Shared with the sub-commands through the click context object.
+    ctx.obj["provenance"] = provenance
+    ctx.obj["archive"] = archive
+
+
+@directory.command(name="flatten")
+@click.option(
+    "--range-from",
+    type=str,
+    required=True,
+    help="start ID of the range of directories to flatten",
+)
+@click.option(
+    "--range-to",
+    type=str,
+    required=True,
+    help="stop ID of the range of directories to flatten",
+)
+@click.option(
+    "-s",
+    "--min-size",
+    default=0,
+    type=int,
+    help="""Set the minimum size (in bytes) of files to be indexed.
+    Any smaller file will be ignored.""",
+)
+@click.pass_context
+def directory_flatten(ctx: click.core.Context, range_from, range_to, min_size):
+    """Flatten the known directories whose IDs lie in the given range."""
+    from swh.provenance.directory import directory_flatten_range
+
+    provenance = ctx.obj["provenance"]
+    archive = ctx.obj["archive"]
+
+    # Both bounds are required: hash_to_bytes() cannot convert the None that
+    # click passes for an omitted option.
+    directory_flatten_range(
+        provenance,
+        archive,
+        hash_to_bytes(range_from),
+        hash_to_bytes(range_to),
+        min_size,
+    )
+
+
+# old (deprecated) commands
 @cli.command(name="iter-frontiers")
 @click.argument("filename")
 @click.option(
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
--- a/swh/provenance/directory.py
+++ b/swh/provenance/directory.py
@@ -40,7 +40,53 @@
             yield DirectoryEntry(id)
 
 
-@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
+def directory_flatten_range(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    start_id: Sha1Git,
+    end_id: Sha1Git,
+    minsize: int = 0,
+    commit: bool = True,
+) -> None:
+    """Flatten the known, not yet flattened directories of an ID range.
+
+    Candidates are fetched from the provenance storage in batches of 100 IDs
+    and handed to ``directory_add``.
+
+    Args:
+        provenance: provenance object; its storage lists the candidates
+        archive: archive passed on to ``directory_add``
+        start_id: lower bound; the storage is asked for the IDs located
+            *after* it, so ``start_id`` itself is never processed
+        end_id: upper bound (inclusive)
+        minsize: passed on to ``directory_add``
+        commit: passed on to ``directory_add``
+    """
+    current = start_id
+    while current < end_id:
+        batch = provenance.storage.directory_iter_not_flattenned(
+            limit=100, start_id=current
+        )
+        if not batch:
+            break
+        # The storage only applies a lower bound: drop the IDs lying past
+        # ``end_id`` so that the last batch does not flatten directories
+        # outside of the requested range.
+        dirs = [d for d in batch if d <= end_id]
+        if dirs:
+            directory_add(
+                provenance,
+                archive,
+                [DirectoryEntry(id=d) for d in dirs],
+                minsize,
+                commit,
+            )
+        # Resume after the last returned ID (batches are assumed to be sorted
+        # in ascending order); reaching ``end_id`` ends the loop.
+        current = batch[-1]
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "add"})
 def directory_add(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -9,7 +9,7 @@
 from datetime import datetime
 import enum
 from types import TracebackType
-from typing import Dict, Generator, Iterable, Optional, Set, Type, Union
+from typing import Dict, Generator, Iterable, List, Optional, Set, Type, Union
 
 from typing_extensions import Protocol, runtime_checkable
 
@@ -138,6 +138,17 @@
         """
         ...
 
+    @remote_api_endpoint("directory_iter_not_flattenned")
+    def directory_iter_not_flattenned(
+        self, limit: int, start_id: Sha1Git
+    ) -> List[Sha1Git]:
+        """Retrieve the ids of the directories that are not flattened yet.
+
+        Only ids located after ``start_id`` are returned, up to ``limit``
+        entries.
+        """
+        ...
+
     @remote_api_endpoint("entity_get_all")
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
         """Retrieve all sha1 ids for entities of type `entity` present in the provenance
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -194,6 +194,26 @@
             )
         return result
 
+    @statsd.timed(
+        metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattenned"}
+    )
+    def directory_iter_not_flattenned(
+        self, limit: int, start_id: Sha1Git
+    ) -> List[Sha1Git]:
+        """List the sha1 of at most ``limit`` directories that are not flat yet.
+
+        Only directories whose sha1 is strictly greater than ``start_id`` are
+        considered; the result is sorted by sha1.
+        """
+        query = """
+        SELECT sha1 FROM directory
+        WHERE flat=false AND sha1>%s ORDER BY sha1 LIMIT %s
+        """
+        with self.transaction(readonly=True) as cursor:
+            cursor.execute(query=query, vars=(start_id, limit))
+            sha1s = [row["sha1"] for row in cursor]
+        return sha1s
+
     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
         with self.transaction(readonly=True) as cursor:
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -14,6 +14,7 @@
 \endif
 
 create unique index on location(digest(path, 'sha1'));
+create index on directory(sha1) where flat=false;
 
 alter table revision_in_origin add primary key (revision, origin);
 alter table revision_before_revision add primary key (prev, next);
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
--- a/swh/provenance/tests/test_directory_flatten.py
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -5,10 +5,11 @@
 
 
 from datetime import datetime, timezone
+from typing import Tuple
 
 from swh.model.hashutil import hash_to_bytes
 from swh.provenance.archive import ArchiveInterface
-from swh.provenance.directory import directory_add
+from swh.provenance.directory import directory_add, directory_flatten_range
 from swh.provenance.interface import (
     DirectoryData,
     ProvenanceInterface,
@@ -19,10 +20,10 @@
 from swh.provenance.tests.conftest import fill_storage, load_repo_data
 
 
-def test_directory_add(
-    provenance: ProvenanceInterface,
-    archive: ArchiveInterface,
-) -> None:
+def prepare(
+    provenance: ProvenanceInterface, archive: ArchiveInterface
+) -> Tuple[datetime, DirectoryEntry, FileEntry, FileEntry]:
+    """Prepare the provenance database with content suitable for flattening tests."""
     # read data/README.md for more details on how these datasets are generated
     data = load_repo_data("cmdbts2")
     fill_storage(archive.storage, data)
@@ -58,6 +59,16 @@
     flattenned = provenance.directory_already_flattenned(directory)
     assert flattenned is not None and not flattenned
 
+    return date, directory, content1, content2
+
+
+def test_directory_add(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+) -> None:
+    """Flatten one directory with ``directory_add`` and check the stored result."""
+    date, directory, content1, content2 = prepare(provenance, archive)
+
     # flatten the directory and check the expected result
     directory_add(provenance, archive, [directory])
     assert provenance.storage.directory_get([directory.id]) == {
@@ -70,3 +81,25 @@
         },
         content2.id: {RelationData(dst=directory.id, path=b"C/b")},
     }
+
+
+def test_directory_flatten_range(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+) -> None:
+    """Flatten an ID range containing the directory and check the stored result."""
+    date, directory, content1, content2 = prepare(provenance, archive)
+
+    # directory.id[:-1] sorts before directory.id: the range covers the directory
+    directory_flatten_range(provenance, archive, directory.id[:-1], directory.id)
+
+    assert provenance.storage.directory_get([directory.id]) == {
+        directory.id: DirectoryData(date=date, flat=True)
+    }
+    assert provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) == {
+        content1.id: {
+            RelationData(dst=directory.id, path=b"a"),
+            RelationData(dst=directory.id, path=b"C/a"),
+        },
+        content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+    }