Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124539
D8100.id29239.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D8100.id29239.diff
View Options
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -388,6 +388,50 @@
client.close()
+@cli.group(name="directory")
+@click.pass_context
+def directory(ctx: click.core.Context):
+ from . import get_archive, get_provenance
+
+ archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
+ provenance = get_provenance(**ctx.obj["config"]["provenance"]["storage"])
+
+ ctx.obj["provenance"] = provenance
+ ctx.obj["archive"] = archive
+
+
+@directory.command(name="flatten")
+@click.option(
+ "--range-from", type=str, help="start ID of the range of directories to flatten"
+)
+@click.option(
+ "--range-to", type=str, help="stop ID of the range of directories to flatten"
+)
+@click.option(
+ "-s",
+ "--min-size",
+ default=0,
+ type=int,
+ help="""Set the minimum size (in bytes) of files to be indexed.
+ Any smaller file will be ignored.""",
+)
+@click.pass_context
+def directory_flatten(ctx: click.core.Context, range_from, range_to, min_size):
+ from swh.provenance.directory import directory_flatten_range
+
+ provenance = ctx.obj["provenance"]
+ archive = ctx.obj["archive"]
+
+ directory_flatten_range(
+ provenance,
+ archive,
+ hash_to_bytes(range_from),
+ hash_to_bytes(range_to),
+ min_size,
+ )
+
+
+# old (deprecated) commands
@cli.command(name="iter-frontiers")
@click.argument("filename")
@click.option(
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
--- a/swh/provenance/directory.py
+++ b/swh/provenance/directory.py
@@ -40,6 +40,28 @@
yield DirectoryEntry(id)
+def directory_flatten_range(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+ start_id: Sha1Git,
+ end_id: Sha1Git,
+ minsize: int = 0,
+ commit: bool = True,
+) -> None:
+ """Flatten not-yet-flattened directories in batches, walking sha1 ids from start_id toward end_id."""
+ current = start_id
+ while current < end_id:
+ dirs = provenance.storage.directory_iter_not_flattenned(
+ limit=100, start_id=current
+ )
+ if not dirs:
+ break
+ directory_add(
+ provenance, archive, [DirectoryEntry(id=d) for d in dirs], minsize, commit
+ )
+ current = dirs[-1]
+
+
@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
def directory_add(
provenance: ProvenanceInterface,
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -9,7 +9,7 @@
from datetime import datetime
import enum
from types import TracebackType
-from typing import Dict, Generator, Iterable, Optional, Set, Type, Union
+from typing import Dict, Generator, Iterable, List, Optional, Set, Type, Union
from typing_extensions import Protocol, runtime_checkable
@@ -138,6 +138,13 @@
"""
...
+ @remote_api_endpoint("directory_iter_not_flattenned")
+ def directory_iter_not_flattenned(
+ self, limit: int, start_id: Sha1Git
+ ) -> List[Sha1Git]:
+ """Return up to `limit` sha1 ids of directories not yet flattened, starting after `start_id`"""
+ ...
+
@remote_api_endpoint("entity_get_all")
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
"""Retrieve all sha1 ids for entities of type `entity` present in the provenance
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -194,6 +194,20 @@
)
return result
+ @statsd.timed(
+ metric=STORAGE_DURATION_METRIC, tags={"method": "directory_iter_not_flattenned"}
+ )
+ def directory_iter_not_flattenned(
+ self, limit: int, start_id: Sha1Git
+ ) -> List[Sha1Git]:
+ sql = """
+ SELECT sha1 FROM directory
+ WHERE flat=false AND sha1>%s ORDER BY sha1 LIMIT %s
+ """
+ with self.transaction(readonly=True) as cursor:
+ cursor.execute(query=sql, vars=(start_id, limit))
+ return [row["sha1"] for row in cursor]
+
@statsd.timed(metric=STORAGE_DURATION_METRIC, tags={"method": "entity_get_all"})
def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]:
with self.transaction(readonly=True) as cursor:
diff --git a/swh/provenance/sql/60-indexes.sql b/swh/provenance/sql/60-indexes.sql
--- a/swh/provenance/sql/60-indexes.sql
+++ b/swh/provenance/sql/60-indexes.sql
@@ -14,6 +14,7 @@
\endif
create unique index on location(digest(path, 'sha1'));
+create index on directory(sha1) where flat=false;
alter table revision_in_origin add primary key (revision, origin);
alter table revision_before_revision add primary key (prev, next);
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
--- a/swh/provenance/tests/test_directory_flatten.py
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -8,7 +8,7 @@
from swh.model.hashutil import hash_to_bytes
from swh.provenance.archive import ArchiveInterface
-from swh.provenance.directory import directory_add
+from swh.provenance.directory import directory_add, directory_flatten_range
from swh.provenance.interface import (
DirectoryData,
ProvenanceInterface,
@@ -70,3 +70,58 @@
},
content2.id: {RelationData(dst=directory.id, path=b"C/b")},
}
+
+
+def test_directory_flatten_range(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+) -> None:
+ # read data/README.md for more details on how these datasets are generated
+ data = load_repo_data("cmdbts2")
+ fill_storage(archive.storage, data)
+
+ # just take a directory that is known to exist in cmdbts2
+ directory = DirectoryEntry(
+ id=hash_to_bytes("48007c961cc734d1f63886d0413a6dc605e3e2ea")
+ )
+ content1 = FileEntry(
+ id=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), name=b"a"
+ )
+ content2 = FileEntry(
+ id=hash_to_bytes("50e9cdb03f9719261dd39d7f2920b906db3711a3"), name=b"b"
+ )
+ date = datetime.fromtimestamp(1000000010, timezone.utc)
+
+ # directory_add and the internal directory_flatten require the directory and its
+ # content to be known by the provenance object. Otherwise, they do nothing
+ provenance.directory_set_date_in_isochrone_frontier(directory, date)
+ provenance.content_set_early_date(content1, date)
+ provenance.content_set_early_date(content2, date)
+ provenance.flush()
+
+ assert provenance.storage.directory_get([directory.id]) == {
+ directory.id: DirectoryData(date=date, flat=False)
+ }
+ assert provenance.storage.content_get([content1.id, content2.id]) == {
+ content1.id: date,
+ content2.id: date,
+ }
+
+ # this query forces the directory date to be retrieved from the storage and cached
+ # (otherwise, the flush below won't update the directory flatten flag)
+ flattenned = provenance.directory_already_flattenned(directory)
+ assert flattenned is not None and not flattenned
+
+ # flatten the directory and check the expected result
+ directory_flatten_range(provenance, archive, directory.id[:-1], directory.id)
+
+ assert provenance.storage.directory_get([directory.id]) == {
+ directory.id: DirectoryData(date=date, flat=True)
+ }
+ assert provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) == {
+ content1.id: {
+ RelationData(dst=directory.id, path=b"a"),
+ RelationData(dst=directory.id, path=b"C/a"),
+ },
+ content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+ }
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 21 2024, 1:07 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3232552
Attached To
D8100: Add a `swh provenance directory flatten` command
Event Timeline
Log In to Comment