diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -388,6 +388,50 @@
     client.close()
 
 
+@cli.group(name="directory")
+@click.pass_context
+def directory(ctx: click.core.Context):
+    from . import get_archive, get_provenance
+
+    archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
+    provenance = get_provenance(**ctx.obj["config"]["provenance"]["storage"])
+
+    ctx.obj["provenance"] = provenance
+    ctx.obj["archive"] = archive
+
+
+@directory.command(name="flatten")
+@click.option(
+    "--range-from", type=str, help="start ID of the range of directories to flatten"
+)
+@click.option(
+    "--range-to", type=str, help="stop ID of the range of directories to flatten"
+)
+@click.option(
+    "-s",
+    "--min-size",
+    default=0,
+    type=int,
+    help="""Set the minimum size (in bytes) of files to be indexed.
+    Any smaller file will be ignored.""",
+)
+@click.pass_context
+def directory_flatten(ctx: click.core.Context, range_from, range_to, min_size):
+    from swh.provenance.directory import directory_flatten_range
+
+    provenance = ctx.obj["provenance"]
+    archive = ctx.obj["archive"]
+
+    directory_flatten_range(
+        provenance,
+        archive,
+        hash_to_bytes(range_from),
+        hash_to_bytes(range_to),
+        min_size,
+    )
+
+
+# old (deprecated) commands
 @cli.command(name="iter-frontiers")
 @click.argument("filename")
 @click.option(
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
--- a/swh/provenance/directory.py
+++ b/swh/provenance/directory.py
@@ -40,6 +40,28 @@
         yield DirectoryEntry(id)
 
 
+def directory_flatten_range(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    start_id: Sha1Git,
+    end_id: Sha1Git,
+    minsize: int = 0,
+    commit: bool = True,
+) -> None:
+    """Flatten the directories in the given sha1 range that are not flattened yet."""
+    current = start_id
+    while current < end_id:
+        dirs = provenance.storage.directory_iter_not_flattenned(
+            limit=100, start_id=current
+        )
+        if not dirs:
+            break
+        directory_add(
+            provenance, archive, [DirectoryEntry(id=d) for d in dirs], minsize, commit
+        )
+        current = dirs[-1]
+
+
 @statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
 def directory_add(
     provenance: ProvenanceInterface,
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -9,7 +9,7 @@
 from datetime import datetime
 import enum
 from types import TracebackType
-from typing import Dict, Generator, Iterable, Optional, Set, Type, Union
+from typing import Dict, Generator, Iterable, List, Optional, Set, Type, Union
 
 from typing_extensions import Protocol, runtime_checkable
 
@@ -138,6 +138,13 @@
         """
         ...
 
+    @remote_api_endpoint("directory_iter_not_flattenned")
+    def directory_iter_not_flattenned(
+        self, limit: int, start_id: Sha1Git
+    ) -> List[Sha1Git]:
+        """Retrieve the ids of directories that have not been flattened yet"""
+        ...
+
     @remote_api_endpoint("entity_get_all")
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
         """Retrieve all sha1 ids for entities of type `entity` present in the provenance
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -194,6 +194,20 @@
         )
         return result
 
+    @statsd.timed(
+        metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattenned"}
+    )
+    def directory_iter_not_flattenned(
+        self, limit: int, start_id: Sha1Git
+    ) -> List[Sha1Git]:
+        sql = """
+        SELECT sha1 FROM directory
+        WHERE flat=false AND sha1>%s ORDER BY sha1 LIMIT %s
+        """
+        with self.transaction(readonly=True) as cursor:
+            cursor.execute(query=sql, vars=(start_id, limit))
+            return [row["sha1"] for row in cursor]
+
     @statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
     def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
         with self.transaction(readonly=True) as cursor:
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -14,6 +14,7 @@
 \endif
 
 create unique index on location(digest(path, 'sha1'));
+create index on directory(sha1) where flat=false;
 alter table revision_in_origin add primary key (revision, origin);
 alter table revision_before_revision add primary key (prev, next);
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
--- a/swh/provenance/tests/test_directory_flatten.py
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -8,7 +8,7 @@
 
 from swh.model.hashutil import hash_to_bytes
 from swh.provenance.archive import ArchiveInterface
-from swh.provenance.directory import directory_add
+from swh.provenance.directory import directory_add, directory_flatten_range
 from swh.provenance.interface import (
     DirectoryData,
     ProvenanceInterface,
@@ -70,3 +70,58 @@
         },
         content2.id: {RelationData(dst=directory.id, path=b"C/b")},
     }
+
+
+def test_directory_flatten_range(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+) -> None:
+    # read data/README.md for more details on how these datasets are generated
+    data = load_repo_data("cmdbts2")
+    fill_storage(archive.storage, data)
+
+    # just take a directory that is known to exist in cmdbts2
+    directory = DirectoryEntry(
+        id=hash_to_bytes("48007c961cc734d1f63886d0413a6dc605e3e2ea")
+    )
+    content1 = FileEntry(
+        id=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), name=b"a"
+    )
+    content2 = FileEntry(
+        id=hash_to_bytes("50e9cdb03f9719261dd39d7f2920b906db3711a3"), name=b"b"
+    )
+    date = datetime.fromtimestamp(1000000010, timezone.utc)
+
+    # directory_add and the internal directory_flatten require the directory and its
+    # content to be known by the provenance object. Otherwise, they do nothing.
+    provenance.directory_set_date_in_isochrone_frontier(directory, date)
+    provenance.content_set_early_date(content1, date)
+    provenance.content_set_early_date(content2, date)
+    provenance.flush()
+
+    assert provenance.storage.directory_get([directory.id]) == {
+        directory.id: DirectoryData(date=date, flat=False)
+    }
+    assert provenance.storage.content_get([content1.id, content2.id]) == {
+        content1.id: date,
+        content2.id: date,
+    }
+
+    # this query forces the directory date to be retrieved from the storage and cached
+    # (otherwise, the flush below won't update the directory flatten flag)
+    flattenned = provenance.directory_already_flattenned(directory)
+    assert flattenned is not None and not flattenned
+
+    # flatten the directory and check the expected result
+    directory_flatten_range(provenance, archive, directory.id[:-1], directory.id)
+
+    assert provenance.storage.directory_get([directory.id]) == {
+        directory.id: DirectoryData(date=date, flat=True)
+    }
+    assert provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) == {
+        content1.id: {
+            RelationData(dst=directory.id, path=b"a"),
+            RelationData(dst=directory.id, path=b"C/a"),
+        },
+        content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+    }
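Usage note (not part of the patch): a minimal sketch of how the new directory_flatten_range helper could be driven outside the click command, mirroring the wiring of the new `directory` CLI group. The helper name flatten_slice, the provenance_config dict, and the sha1 bounds are hypothetical; the config dict is assumed to carry the same "archive" and "storage" entries that the CLI reads from ctx.obj["config"]["provenance"].

from swh.model.hashutil import hash_to_bytes
from swh.provenance import get_archive, get_provenance
from swh.provenance.directory import directory_flatten_range


def flatten_slice(
    provenance_config: dict, range_from: str, range_to: str, min_size: int = 0
) -> None:
    # Hypothetical driver: same wiring as the `directory` CLI group above, reading
    # the "archive" and "storage" sub-configurations from an assumed config dict.
    archive = get_archive(**provenance_config["archive"])
    provenance = get_provenance(**provenance_config["storage"])

    # Flatten, in batches of 100 (the limit hard-coded in directory_flatten_range),
    # every not-yet-flattened directory whose sha1 lies in the given range, ignoring
    # files smaller than min_size bytes.
    directory_flatten_range(
        provenance,
        archive,
        hash_to_bytes(range_from),
        hash_to_bytes(range_to),
        min_size,
    )


# e.g. flatten_slice(provenance_config, "00" * 20, "ff" * 20)
# Real runs would shard the sha1 space into smaller ranges, one worker per slice.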