Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123302
D8100.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D8100.diff
View Options
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -388,6 +388,50 @@
client.close()
+@cli.group(name="directory")
+@click.pass_context
+def directory(ctx: click.core.Context):
+ from . import get_archive, get_provenance
+
+ archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
+ provenance = get_provenance(**ctx.obj["config"]["provenance"]["storage"])
+
+ ctx.obj["provenance"] = provenance
+ ctx.obj["archive"] = archive
+
+
+@directory.command(name="flatten")
+@click.option(
+ "--range-from", type=str, help="start ID of the range of directories to flatten"
+)
+@click.option(
+ "--range-to", type=str, help="stop ID of the range of directories to flatten"
+)
+@click.option(
+ "-s",
+ "--min-size",
+ default=0,
+ type=int,
+ help="""Set the minimum size (in bytes) of files to be indexed.
+ Any smaller file will be ignored.""",
+)
+@click.pass_context
+def directory_flatten(ctx: click.core.Context, range_from, range_to, min_size):
+ from swh.provenance.directory import directory_flatten_range
+
+ provenance = ctx.obj["provenance"]
+ archive = ctx.obj["archive"]
+
+ directory_flatten_range(
+ provenance,
+ archive,
+ hash_to_bytes(range_from),
+ hash_to_bytes(range_to),
+ min_size,
+ )
+
+
+# old (deprecated) commands
@cli.command(name="iter-frontiers")
@click.argument("filename")
@click.option(
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
--- a/swh/provenance/directory.py
+++ b/swh/provenance/directory.py
@@ -40,7 +40,29 @@
yield DirectoryEntry(id)
-@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
+def directory_flatten_range(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+ start_id: Sha1Git,
+ end_id: Sha1Git,
+ minsize: int = 0,
+ commit: bool = True,
+) -> None:
+ """Flatten the known directories from ``start_id`` to ``end_id``."""
+ current = start_id
+ while current < end_id:
+ dirs = provenance.storage.directory_iter_not_flattenned(
+ limit=100, start_id=current
+ )
+ if not dirs:
+ break
+ directory_add(
+ provenance, archive, [DirectoryEntry(id=d) for d in dirs], minsize, commit
+ )
+ current = dirs[-1]
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "add"})
def directory_add(
provenance: ProvenanceInterface,
archive: ArchiveInterface,
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -9,7 +9,7 @@
from datetime import datetime
import enum
from types import TracebackType
-from typing import Dict, Generator, Iterable, Optional, Set, Type, Union
+from typing import Dict, Generator, Iterable, List, Optional, Set, Type, Union
from typing_extensions import Protocol, runtime_checkable
@@ -138,6 +138,13 @@
"""
...
+ @remote_api_endpoint("directory_iter_not_flattenned")
+ def directory_iter_not_flattenned(
+ self, limit: int, start_id: Sha1Git
+ ) -> List[Sha1Git]:
+ """Retrieve at most ``limit`` unflattened directories whose id is strictly greater than ``start_id``, in ascending id order."""
+ ...
+
@remote_api_endpoint("entity_get_all")
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
"""Retrieve all sha1 ids for entities of type `entity` present in the provenance
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -194,6 +194,20 @@
)
return result
+ @statsd.timed(
+ metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattenned"}
+ )
+ def directory_iter_not_flattenned(
+ self, limit: int, start_id: Sha1Git
+ ) -> List[Sha1Git]:
+ sql = """
+ SELECT sha1 FROM directory
+ WHERE flat=false AND sha1>%s ORDER BY sha1 LIMIT %s
+ """
+ with self.transaction(readonly=True) as cursor:
+ cursor.execute(query=sql, vars=(start_id, limit))
+ return [row["sha1"] for row in cursor]
+
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
with self.transaction(readonly=True) as cursor:
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -14,6 +14,7 @@
\endif
create unique index on location(digest(path, 'sha1'));
+create index on directory(sha1) where flat=false;
alter table revision_in_origin add primary key (revision, origin);
alter table revision_before_revision add primary key (prev, next);
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
--- a/swh/provenance/tests/test_directory_flatten.py
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -5,10 +5,11 @@
from datetime import datetime, timezone
+from typing import Tuple
from swh.model.hashutil import hash_to_bytes
from swh.provenance.archive import ArchiveInterface
-from swh.provenance.directory import directory_add
+from swh.provenance.directory import directory_add, directory_flatten_range
from swh.provenance.interface import (
DirectoryData,
ProvenanceInterface,
@@ -19,10 +20,10 @@
from swh.provenance.tests.conftest import fill_storage, load_repo_data
-def test_directory_add(
- provenance: ProvenanceInterface,
- archive: ArchiveInterface,
-) -> None:
+def prepare(
+ provenance: ProvenanceInterface, archive: ArchiveInterface
+) -> Tuple[datetime, DirectoryEntry, FileEntry, FileEntry]:
+ """Prepare the provenance database with some content suitable for flattening tests"""
# read data/README.md for more details on how these datasets are generated
data = load_repo_data("cmdbts2")
fill_storage(archive.storage, data)
@@ -58,6 +59,16 @@
flattenned = provenance.directory_already_flattenned(directory)
assert flattenned is not None and not flattenned
+ return date, directory, content1, content2
+
+
+def test_directory_add(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+) -> None:
+
+ date, directory, content1, content2 = prepare(provenance, archive)
+
# flatten the directory and check the expected result
directory_add(provenance, archive, [directory])
assert provenance.storage.directory_get([directory.id]) == {
@@ -70,3 +81,25 @@
},
content2.id: {RelationData(dst=directory.id, path=b"C/b")},
}
+
+
+def test_directory_flatten_range(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+) -> None:
+
+ date, directory, content1, content2 = prepare(provenance, archive)
+
+ # flatten the directory and check the expected result
+ directory_flatten_range(provenance, archive, directory.id[:-1], directory.id)
+
+ assert provenance.storage.directory_get([directory.id]) == {
+ directory.id: DirectoryData(date=date, flat=True)
+ }
+ assert provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) == {
+ content1.id: {
+ RelationData(dst=directory.id, path=b"a"),
+ RelationData(dst=directory.id, path=b"C/a"),
+ },
+ content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+ }
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Dec 18, 3:17 PM (1 d, 5 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218177
Attached To
D8100: Add a `swh provenance directory flatten` command
Event Timeline
Log In to Comment