Page MenuHomeSoftware Heritage

D8100.diff
No OneTemporary

D8100.diff

diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -388,6 +388,50 @@
client.close()
+@cli.group(name="directory")
+@click.pass_context
+def directory(ctx: click.core.Context):
+    # NOTE: kept as comments on purpose; a docstring here would become the
+    # help text click displays for this group.
+    from . import get_archive, get_provenance
+
+    # Build both backends from the "provenance" section of the configuration,
+    # archive first, then the provenance storage.
+    settings = ctx.obj["config"]["provenance"]
+    archive_backend = get_archive(**settings["archive"])
+    provenance_backend = get_provenance(**settings["storage"])
+
+    # Expose them to the subcommands through the shared click context.
+    ctx.obj["provenance"] = provenance_backend
+    ctx.obj["archive"] = archive_backend
+
+
+@directory.command(name="flatten")
+@click.option(
+    "--range-from",
+    type=str,
+    required=True,
+    help="start ID of the range of directories to flatten",
+)
+@click.option(
+    "--range-to",
+    type=str,
+    required=True,
+    help="stop ID of the range of directories to flatten",
+)
+@click.option(
+    "-s",
+    "--min-size",
+    default=0,
+    type=int,
+    help="""Set the minimum size (in bytes) of files to be indexed.
+    Any smaller file will be ignored.""",
+)
+@click.pass_context
+def directory_flatten(ctx: click.core.Context, range_from, range_to, min_size):
+    """Flatten the known directories whose ID lies in the given range.
+
+    Both range bounds are mandatory: they are converted with
+    ``hash_to_bytes`` unconditionally, so a missing bound is reported by
+    click as a usage error instead of failing later inside the command.
+    """
+    from swh.provenance.directory import directory_flatten_range
+
+    # Both objects are stored in the context by the parent ``directory`` group.
+    provenance = ctx.obj["provenance"]
+    archive = ctx.obj["archive"]
+
+    directory_flatten_range(
+        provenance,
+        archive,
+        hash_to_bytes(range_from),
+        hash_to_bytes(range_to),
+        min_size,
+    )
+
+
+# old (deprecated) commands
@cli.command(name="iter-frontiers")
@click.argument("filename")
@click.option(
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
--- a/swh/provenance/directory.py
+++ b/swh/provenance/directory.py
@@ -40,7 +40,29 @@
yield DirectoryEntry(id)
-@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
+def directory_flatten_range(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    start_id: Sha1Git,
+    end_id: Sha1Git,
+    minsize: int = 0,
+    commit: bool = True,
+) -> None:
+    """Flatten the known, not yet flattened directories in a range of ids.
+
+    Directories are fetched from the provenance storage in batches of 100 ids
+    located after ``start_id`` and handed to :func:`directory_add`. Only ids
+    lower than or equal to ``end_id`` are flattened; the loop stops as soon as
+    a batch reaches ``end_id`` or the storage has nothing left to return.
+
+    Args:
+        provenance: provenance object whose storage lists the directories
+        archive: archive object, passed through to :func:`directory_add`
+        start_id: lower bound of the range (the PostgreSQL storage backend
+            compares with a strict ``sha1 > start_id``, so it is excluded)
+        end_id: upper bound of the range (included)
+        minsize: passed through to :func:`directory_add`
+        commit: passed through to :func:`directory_add`
+    """
+    current = start_id
+    while current < end_id:
+        batch = provenance.storage.directory_iter_not_flattenned(
+            limit=100, start_id=current
+        )
+        if not batch:
+            break
+        # The storage query is only bounded by ``limit``, not by ``end_id``:
+        # drop the ids past the requested range so they are left untouched.
+        dirs = [d for d in batch if d <= end_id]
+        if dirs:
+            directory_add(
+                provenance,
+                archive,
+                [DirectoryEntry(id=d) for d in dirs],
+                minsize,
+                commit,
+            )
+        # Advance on the unfiltered batch so the loop condition ends the
+        # iteration once ``end_id`` has been reached or passed.
+        current = batch[-1]
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "add"})
def directory_add(
provenance: ProvenanceInterface,
archive: ArchiveInterface,
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -9,7 +9,7 @@
from datetime import datetime
import enum
from types import TracebackType
-from typing import Dict, Generator, Iterable, Optional, Set, Type, Union
+from typing import Dict, Generator, Iterable, List, Optional, Set, Type, Union
from typing_extensions import Protocol, runtime_checkable
@@ -138,6 +138,13 @@
"""
...
+ @remote_api_endpoint("directory_iter_not_flattenned")
+ def directory_iter_not_flattenned(
+ self, limit: int, start_id: Sha1Git
+ ) -> List[Sha1Git]:
+ """Retrieve the ids of directories that have not been flattened yet.
+
+ Returns at most ``limit`` ids, all located after ``start_id``.
+ """
+ ...
+
@remote_api_endpoint("entity_get_all")
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
"""Retrieve all sha1 ids for entities of type `entity` present in the provenance
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -194,6 +194,20 @@
)
return result
+    @statsd.timed(
+        metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattenned"}
+    )
+    def directory_iter_not_flattenned(
+        self, limit: int, start_id: Sha1Git
+    ) -> List[Sha1Git]:
+        """List up to ``limit`` ids of directories with ``flat=false``.
+
+        Only ids strictly greater than ``start_id`` are returned, in ascending
+        ``sha1`` order.
+        """
+        query = """
+            SELECT sha1 FROM directory
+            WHERE flat=false AND sha1>%s ORDER BY sha1 LIMIT %s
+            """
+        params = (start_id, limit)
+        with self.transaction(readonly=True) as cursor:
+            cursor.execute(query=query, vars=params)
+            # materialize the ids while the transaction is still open
+            found = [record["sha1"] for record in cursor]
+        return found
+
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
with self.transaction(readonly=True) as cursor:
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -14,6 +14,7 @@
\endif
create unique index on location(digest(path, 'sha1'));
+-- partial index restricted to not-yet-flattened directories (flat=false);
+-- intended to support directory_iter_not_flattenned(), which selects those
+-- rows ordered by sha1
+create index on directory(sha1) where flat=false;
alter table revision_in_origin add primary key (revision, origin);
alter table revision_before_revision add primary key (prev, next);
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
--- a/swh/provenance/tests/test_directory_flatten.py
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -5,10 +5,11 @@
from datetime import datetime, timezone
+from typing import Tuple
from swh.model.hashutil import hash_to_bytes
from swh.provenance.archive import ArchiveInterface
-from swh.provenance.directory import directory_add
+from swh.provenance.directory import directory_add, directory_flatten_range
from swh.provenance.interface import (
DirectoryData,
ProvenanceInterface,
@@ -19,10 +20,10 @@
from swh.provenance.tests.conftest import fill_storage, load_repo_data
-def test_directory_add(
- provenance: ProvenanceInterface,
- archive: ArchiveInterface,
-) -> None:
+def prepare(
+ provenance: ProvenanceInterface, archive: ArchiveInterface
+) -> Tuple[datetime, DirectoryEntry, FileEntry, FileEntry]:
+ """Prepare the provenance database with some content suitable for flattening tests"""
# read data/README.md for more details on how these datasets are generated
data = load_repo_data("cmdbts2")
fill_storage(archive.storage, data)
@@ -58,6 +59,16 @@
flattenned = provenance.directory_already_flattenned(directory)
assert flattenned is not None and not flattenned
+ return date, directory, content1, content2
+
+
+def test_directory_add(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+) -> None:
+
+ date, directory, content1, content2 = prepare(provenance, archive)
+
# flatten the directory and check the expected result
directory_add(provenance, archive, [directory])
assert provenance.storage.directory_get([directory.id]) == {
@@ -70,3 +81,25 @@
},
content2.id: {RelationData(dst=directory.id, path=b"C/b")},
}
+
+
+def test_directory_flatten_range(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+) -> None:
+    """Check that flattening a range of ids flattens the prepared directory."""
+    date, directory, content1, content2 = prepare(provenance, archive)
+
+    # A strict prefix of the directory id sorts right before the id itself,
+    # which makes it a convenient lower bound for the range.
+    range_start = directory.id[:-1]
+    range_end = directory.id
+
+    expected_directories = {directory.id: DirectoryData(date=date, flat=True)}
+    expected_relations = {
+        content1.id: {
+            RelationData(dst=directory.id, path=b"a"),
+            RelationData(dst=directory.id, path=b"C/a"),
+        },
+        content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+    }
+
+    # flatten the directory and check the expected result
+    directory_flatten_range(provenance, archive, range_start, range_end)
+
+    assert provenance.storage.directory_get([directory.id]) == expected_directories
+    assert (
+        provenance.storage.relation_get_all(RelationType.CNT_IN_DIR)
+        == expected_relations
+    )

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 3:17 PM (1 d, 5 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218177

Event Timeline