Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123889
D6714.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D6714.diff
View Options
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -139,6 +139,43 @@
atexit.register(exit)
+@cli.command(name="iter-frontiers")
+@click.argument("filename")
+@click.option("-l", "--limit", type=int)
+@click.option("-s", "--min-size", default=0, type=int)
+@click.pass_context
+def iter_frontiers(
+ ctx: click.core.Context,
+ filename: str,
+ limit: Optional[int],
+ min_size: int,
+) -> None:
+ """Process a provided list of directories in the isochrone frontier."""
+ from . import get_archive, get_provenance
+ from .directory import CSVDirectoryIterator, directory_add
+
+ archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
+ directories_provider = generate_directory_ids(filename)
+ directories = CSVDirectoryIterator(directories_provider, limit=limit)
+
+ with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
+ for directory in directories:
+ directory_add(
+ provenance,
+ archive,
+ [directory],
+ minsize=min_size,
+ )
+
+
+def generate_directory_ids(
+ filename: str,
+) -> Generator[Sha1Git, None, None]:
+ for line in open(filename, "r"):
+ if line.strip():
+ yield hash_to_bytes(line.strip())
+
+
@cli.command(name="iter-revisions")
@click.argument("filename")
@click.option("-a", "--track-all", default=True, type=bool)
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/directory.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from typing import Generator, Iterable, Iterator, List, Optional
+
+from swh.core.statsd import statsd
+from swh.model.model import Sha1Git
+
+from .archive import ArchiveInterface
+from .interface import ProvenanceInterface
+from .model import DirectoryEntry
+
+REVISION_DURATION_METRIC = "swh_provenance_directory_duration_seconds"
+
+
+class CSVDirectoryIterator:
+    """Iterator over directories, typically listed in a CSV file.
+
+    The input is an iterator that produces ids (sha1_git) of directories.
+    """
+
+ def __init__(
+ self,
+ directories: Iterable[Sha1Git],
+ limit: Optional[int] = None,
+ ) -> None:
+ self.directories: Iterator[Sha1Git]
+ if limit is not None:
+ from itertools import islice
+
+ self.directories = islice(directories, limit)
+ else:
+ self.directories = iter(directories)
+
+ def __iter__(self) -> Generator[DirectoryEntry, None, None]:
+ for id in self.directories:
+ yield DirectoryEntry(id)
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
+def directory_add(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+ directories: List[DirectoryEntry],
+ minsize: int = 0,
+ commit: bool = True,
+) -> None:
+ for directory in directories:
+        # Only flatten directories that are present in the provenance model, but not
+        # flattened yet.
+ flattenned = provenance.directory_already_flattenned(directory)
+ if flattenned is not None and not flattenned:
+ directory_flatten(
+ provenance,
+ archive,
+ directory,
+ minsize=minsize,
+ )
+ if commit:
+ provenance.flush()
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten"})
+def directory_flatten(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+ directory: DirectoryEntry,
+ minsize: int = 0,
+) -> None:
+ """Recursively retrieve all the files of 'directory' and insert them in the
+ 'provenance' database in the 'content_to_directory' table.
+ """
+ stack = [(directory, b"")]
+ while stack:
+ current, prefix = stack.pop()
+ current.retrieve_children(archive, minsize=minsize)
+ for f_child in current.files:
+ # Add content to the directory with the computed prefix.
+ provenance.content_add_to_directory(directory, f_child, prefix)
+ for d_child in current.dirs:
+ # Recursively walk the child directory.
+ stack.append((d_child, os.path.join(prefix, d_child.name)))
+ provenance.directory_flag_as_flattenned(directory)
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -301,6 +301,18 @@
"""
...
+ def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]:
+        """Check if the directory is already flattened in the provenance model. If the
+        directory is unknown to the model, the method returns None.
+        """
+ ...
+
+ def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None:
+        """Mark the directory as flattened in the provenance model. If the
+        directory is unknown to the model, this method has no effect.
+        """
+ ...
+
def directory_get_date_in_isochrone_frontier(
self, directory: DirectoryEntry
) -> Optional[datetime]:
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -319,7 +319,7 @@
date=date, flat=self.cache["directory_flatten"].get(sha1) or False
)
for sha1, date in self.cache["directory"]["data"].items()
- if sha1 in self.cache["directory"]["added"] and date is not None
+ if self.cache["directory_flatten"].get(sha1) and date is not None
}
if dir_acks:
while not self.storage.directory_add(dir_acks):
@@ -387,6 +387,21 @@
(directory.id, revision.id, path_normalize(path))
)
+ def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]:
+ cache = self.cache["directory_flatten"]
+ if directory.id not in cache:
+ cache.setdefault(directory.id, None)
+ ret = self.storage.directory_get([directory.id])
+ if directory.id in ret:
+ dir = ret[directory.id]
+ cache[directory.id] = dir.flat
+ # date is kept to ensure we have it available when flushing
+ self.cache["directory"]["data"][directory.id] = dir.date
+ return cache.get(directory.id)
+
+ def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None:
+ self.cache["directory_flatten"][directory.id] = True
+
def directory_get_date_in_isochrone_frontier(
self, directory: DirectoryEntry
) -> Optional[datetime]:
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -4,13 +4,13 @@
# See top-level LICENSE file for more information
from datetime import datetime, timezone
-import os
from typing import Generator, Iterable, Iterator, List, Optional, Tuple
from swh.core.statsd import statsd
from swh.model.model import Sha1Git
from .archive import ArchiveInterface
+from .directory import directory_flatten
from .graph import IsochroneNode, build_isochrone_graph
from .interface import ProvenanceInterface
from .model import DirectoryEntry, RevisionEntry
@@ -135,7 +135,7 @@
provenance.directory_add_to_revision(
revision, current.entry, current.path
)
- flatten_directory(
+ directory_flatten(
provenance, archive, current.entry, minsize=minsize
)
else:
@@ -158,28 +158,6 @@
stack.append(child)
-@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten_directory"})
-def flatten_directory(
- provenance: ProvenanceInterface,
- archive: ArchiveInterface,
- directory: DirectoryEntry,
- minsize: int = 0,
-) -> None:
- """Recursively retrieve all the files of 'directory' and insert them in the
- 'provenance' database in the 'content_to_directory' table.
- """
- stack = [(directory, b"")]
- while stack:
- current, prefix = stack.pop()
- current.retrieve_children(archive, minsize=minsize)
- for f_child in current.files:
- # Add content to the directory with the computed prefix.
- provenance.content_add_to_directory(directory, f_child, prefix)
- for d_child in current.dirs:
- # Recursively walk the child directory.
- stack.append((d_child, os.path.join(prefix, d_child.name)))
-
-
def is_new_frontier(
node: IsochroneNode,
revision: RevisionEntry,
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
--- a/swh/provenance/tests/test_cli.py
+++ b/swh/provenance/tests/test_cli.py
@@ -25,6 +25,7 @@
for command in (
"find-all",
"find-first",
+ "iter-frontiers",
"iter-origins",
"iter-revisions",
):
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from datetime import datetime, timezone
+
+from swh.model.hashutil import hash_to_bytes
+from swh.provenance.archive import ArchiveInterface
+from swh.provenance.directory import directory_add
+from swh.provenance.interface import (
+ DirectoryData,
+ ProvenanceInterface,
+ RelationData,
+ RelationType,
+)
+from swh.provenance.model import DirectoryEntry, FileEntry
+from swh.provenance.tests.conftest import fill_storage, load_repo_data
+
+
+def test_directory_add(
+ provenance: ProvenanceInterface,
+ archive: ArchiveInterface,
+) -> None:
+ # read data/README.md for more details on how these datasets are generated
+ data = load_repo_data("cmdbts2")
+ fill_storage(archive.storage, data)
+
+    # just take a directory that is known to exist in cmdbts2
+ directory = DirectoryEntry(
+ id=hash_to_bytes("48007c961cc734d1f63886d0413a6dc605e3e2ea")
+ )
+ content1 = FileEntry(
+ id=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), name=b"a"
+ )
+ content2 = FileEntry(
+ id=hash_to_bytes("50e9cdb03f9719261dd39d7f2920b906db3711a3"), name=b"b"
+ )
+ date = datetime.fromtimestamp(1000000010, timezone.utc)
+
+ # directory_add and the internal directory_flatten require the directory and its
+    # content to be known by the provenance object. Otherwise, they do nothing.
+ provenance.directory_set_date_in_isochrone_frontier(directory, date)
+ provenance.content_set_early_date(content1, date)
+ provenance.content_set_early_date(content2, date)
+ provenance.flush()
+ assert provenance.storage.directory_get([directory.id]) == {
+ directory.id: DirectoryData(date=date, flat=False)
+ }
+ assert provenance.storage.content_get([content1.id, content2.id]) == {
+ content1.id: date,
+ content2.id: date,
+ }
+
+ # this query forces the directory date to be retrieved from the storage and cached
+ # (otherwise, the flush below won't update the directory flatten flag)
+ flattenned = provenance.directory_already_flattenned(directory)
+ assert flattenned is not None and not flattenned
+
+ # flatten the directory and check the expected result
+ directory_add(provenance, archive, [directory])
+ assert provenance.storage.directory_get([directory.id]) == {
+ directory.id: DirectoryData(date=date, flat=True)
+ }
+ assert provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) == {
+ content1.id: {
+ RelationData(dst=directory.id, path=b"a"),
+ RelationData(dst=directory.id, path=b"C/a"),
+ },
+ content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+ }
diff --git a/swh/provenance/tests/test_directory_iterator.py b/swh/provenance/tests/test_directory_iterator.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_directory_iterator.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.provenance.directory import CSVDirectoryIterator
+from swh.provenance.tests.conftest import fill_storage, load_repo_data
+from swh.storage.interface import StorageInterface
+
+
+@pytest.mark.parametrize(
+ "repo",
+ (
+ "cmdbts2",
+ "out-of-order",
+ ),
+)
+def test_revision_iterator(swh_storage: StorageInterface, repo: str) -> None:
+ """Test CSVDirectoryIterator"""
+ data = load_repo_data(repo)
+ fill_storage(swh_storage, data)
+
+ directories_ids = [dir["id"] for dir in data["directory"]]
+ directories = list(CSVDirectoryIterator(directories_ids))
+
+ assert directories
+ assert len(directories) == len(data["directory"])
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -259,12 +259,9 @@
}, synth_rev["msg"]
# check timestamps
for rd in synth_rev["R_D"]:
- assert (
- rev_ts + rd["rel_ts"]
- == provenance.storage.directory_get([rd["dst"]])[
- rd["dst"]
- ].date.timestamp()
- ), synth_rev["msg"]
+ dir_data = provenance.storage.directory_get([rd["dst"]])[rd["dst"]]
+ assert rev_ts + rd["rel_ts"] == dir_data.date.timestamp(), synth_rev["msg"]
+ assert dir_data.flat, synth_rev["msg"]
# ... + a number of rows in the "content_in_dir" table
# for content of the directory.
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 20 2024, 5:45 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217994
Attached To
D6714: Add support to flatten directories in the isochrone frontiers separately
Event Timeline
Log In to Comment