D6714.diff

diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -139,6 +139,43 @@
         atexit.register(exit)


+@cli.command(name="iter-frontiers")
+@click.argument("filename")
+@click.option("-l", "--limit", type=int)
+@click.option("-s", "--min-size", default=0, type=int)
+@click.pass_context
+def iter_frontiers(
+    ctx: click.core.Context,
+    filename: str,
+    limit: Optional[int],
+    min_size: int,
+) -> None:
+    """Process a provided list of directories in the isochrone frontier."""
+    from . import get_archive, get_provenance
+    from .directory import CSVDirectoryIterator, directory_add
+
+    archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
+    directories_provider = generate_directory_ids(filename)
+    directories = CSVDirectoryIterator(directories_provider, limit=limit)
+
+    with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
+        for directory in directories:
+            directory_add(
+                provenance,
+                archive,
+                [directory],
+                minsize=min_size,
+            )
+
+
+def generate_directory_ids(
+    filename: str,
+) -> Generator[Sha1Git, None, None]:
+    for line in open(filename, "r"):
+        if line.strip():
+            yield hash_to_bytes(line.strip())
+
+
 @cli.command(name="iter-revisions")
 @click.argument("filename")
 @click.option("-a", "--track-all", default=True, type=bool)
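
The new iter-frontiers command expects a plain-text file with one directory sha1_git in hexadecimal per line; blank lines are skipped by generate_directory_ids(). A minimal sketch of the same code path driven from Python (the input file name and the limit value are made up for the example):

    # Sketch only: feed directory ids from a text file into the iterator used by the CLI.
    from swh.provenance.cli import generate_directory_ids
    from swh.provenance.directory import CSVDirectoryIterator

    ids = generate_directory_ids("directories.txt")    # hypothetical input file
    for entry in CSVDirectoryIterator(ids, limit=10):  # stop after 10 directories
        print(entry.id.hex())

From the command line, the equivalent invocation should be along the lines of "swh provenance iter-frontiers --min-size 10000 directories.txt", using the options declared above.
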
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/directory.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from typing import Generator, Iterable, Iterator, List, Optional
+
+from swh.core.statsd import statsd
+from swh.model.model import Sha1Git
+
+from .archive import ArchiveInterface
+from .interface import ProvenanceInterface
+from .model import DirectoryEntry
+
+REVISION_DURATION_METRIC = "swh_provenance_directory_duration_seconds"
+
+
+class CSVDirectoryIterator:
+    """Iterator over directories typically present in the given CSV file.
+
+    The input is an iterator that produces ids (sha1_git) of directories
+    """
+
+    def __init__(
+        self,
+        directories: Iterable[Sha1Git],
+        limit: Optional[int] = None,
+    ) -> None:
+        self.directories: Iterator[Sha1Git]
+        if limit is not None:
+            from itertools import islice
+
+            self.directories = islice(directories, limit)
+        else:
+            self.directories = iter(directories)
+
+    def __iter__(self) -> Generator[DirectoryEntry, None, None]:
+        for id in self.directories:
+            yield DirectoryEntry(id)
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
+def directory_add(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    directories: List[DirectoryEntry],
+    minsize: int = 0,
+    commit: bool = True,
+) -> None:
+    for directory in directories:
+        # Only flatten directories that are present in the provenance model, but not
+        # flattenned yet.
+        flattenned = provenance.directory_already_flattenned(directory)
+        if flattenned is not None and not flattenned:
+            directory_flatten(
+                provenance,
+                archive,
+                directory,
+                minsize=minsize,
+            )
+    if commit:
+        provenance.flush()
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten"})
+def directory_flatten(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    directory: DirectoryEntry,
+    minsize: int = 0,
+) -> None:
+    """Recursively retrieve all the files of 'directory' and insert them in the
+    'provenance' database in the 'content_to_directory' table.
+    """
+    stack = [(directory, b"")]
+    while stack:
+        current, prefix = stack.pop()
+        current.retrieve_children(archive, minsize=minsize)
+        for f_child in current.files:
+            # Add content to the directory with the computed prefix.
+            provenance.content_add_to_directory(directory, f_child, prefix)
+        for d_child in current.dirs:
+            # Recursively walk the child directory.
+            stack.append((d_child, os.path.join(prefix, d_child.name)))
+    provenance.directory_flag_as_flattenned(directory)
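
The flattening walk in directory_flatten() is iterative: a stack of (directory, prefix) pairs is processed, and every file found is recorded under the path accumulated so far, relative to the directory being flattened. Paths are handled as bytes throughout. A small sketch of the prefix arithmetic only (values chosen for illustration):

    import os

    prefix = b""                            # root of the directory being flattened
    prefix = os.path.join(prefix, b"C")     # descend into subdirectory "C" -> b"C"
    file_path = os.path.join(prefix, b"b")  # file "b" below it -> b"C/b"
    # b"C/b" is the path passed to content_add_to_directory() for that file
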
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -301,6 +301,18 @@
"""
...
+ def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]:
+ """Check if the directory is already flattenned in the provenance model. If the
+ directory is unknown for the model, the methods returns None.
+ """
+ ...
+
+ def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None:
+ """Mark the directory as flattenned in the provenance model. If the
+ directory is unknown for the model, this method has no effect.
+ """
+ ...
+
def directory_get_date_in_isochrone_frontier(
self, directory: DirectoryEntry
) -> Optional[datetime]:
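
These two methods give callers a tri-state view of a directory: None when the directory is unknown to the provenance model, False when it is known but not yet flattened, and True once flattening is done. A sketch of the intended dispatch, mirroring directory_add() above (the provenance and directory objects are assumed to exist):

    flat = provenance.directory_already_flattenned(directory)
    if flat is None:
        pass    # unknown to the model: directory_add() skips it entirely
    elif flat:
        pass    # already flattened: nothing to do
    else:
        pass    # known but not flattened yet: eligible for directory_flatten()
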
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -319,7 +319,7 @@
                 date=date, flat=self.cache["directory_flatten"].get(sha1) or False
             )
             for sha1, date in self.cache["directory"]["data"].items()
-            if sha1 in self.cache["directory"]["added"] and date is not None
+            if self.cache["directory_flatten"].get(sha1) and date is not None
         }
         if dir_acks:
             while not self.storage.directory_add(dir_acks):
@@ -387,6 +387,21 @@
             (directory.id, revision.id, path_normalize(path))
         )

+    def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]:
+        cache = self.cache["directory_flatten"]
+        if directory.id not in cache:
+            cache.setdefault(directory.id, None)
+            ret = self.storage.directory_get([directory.id])
+            if directory.id in ret:
+                dir = ret[directory.id]
+                cache[directory.id] = dir.flat
+                # date is kept to ensure we have it available when flushing
+                self.cache["directory"]["data"][directory.id] = dir.date
+        return cache.get(directory.id)
+
+    def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None:
+        self.cache["directory_flatten"][directory.id] = True
+
     def directory_get_date_in_isochrone_frontier(
         self, directory: DirectoryEntry
     ) -> Optional[datetime]:
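
With the changed flush() condition, a directory is written back to storage only once its entry in the "directory_flatten" cache is set (and its date is known); previously, any directory added during the run with a known date was written. directory_already_flattenned() is what warms both caches from storage on a miss, including the date, so the later flush can build a complete DirectoryData record. A condensed sketch of the round trip, mirroring the new test below (all objects assumed to be set up as in that test):

    from swh.provenance.directory import directory_add

    flat = provenance.directory_already_flattenned(directory)  # caches date and flat flag
    assert flat is False                                        # known, not yet flattened
    directory_add(provenance, archive, [directory])             # flattens, flags, flushes
    stored = provenance.storage.directory_get([directory.id])[directory.id]
    assert stored.flat and stored.date is not None
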
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -4,13 +4,13 @@
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
-import os
 from typing import Generator, Iterable, Iterator, List, Optional, Tuple

 from swh.core.statsd import statsd
 from swh.model.model import Sha1Git

 from .archive import ArchiveInterface
+from .directory import directory_flatten
 from .graph import IsochroneNode, build_isochrone_graph
 from .interface import ProvenanceInterface
 from .model import DirectoryEntry, RevisionEntry
@@ -135,7 +135,7 @@
                 provenance.directory_add_to_revision(
                     revision, current.entry, current.path
                 )
-                flatten_directory(
+                directory_flatten(
                     provenance, archive, current.entry, minsize=minsize
                 )
             else:
@@ -158,28 +158,6 @@
                     stack.append(child)


-@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten_directory"})
-def flatten_directory(
-    provenance: ProvenanceInterface,
-    archive: ArchiveInterface,
-    directory: DirectoryEntry,
-    minsize: int = 0,
-) -> None:
-    """Recursively retrieve all the files of 'directory' and insert them in the
-    'provenance' database in the 'content_to_directory' table.
-    """
-    stack = [(directory, b"")]
-    while stack:
-        current, prefix = stack.pop()
-        current.retrieve_children(archive, minsize=minsize)
-        for f_child in current.files:
-            # Add content to the directory with the computed prefix.
-            provenance.content_add_to_directory(directory, f_child, prefix)
-        for d_child in current.dirs:
-            # Recursively walk the child directory.
-            stack.append((d_child, os.path.join(prefix, d_child.name)))
-
-
 def is_new_frontier(
     node: IsochroneNode,
     revision: RevisionEntry,
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
--- a/swh/provenance/tests/test_cli.py
+++ b/swh/provenance/tests/test_cli.py
@@ -25,6 +25,7 @@
     for command in (
         "find-all",
         "find-first",
+        "iter-frontiers",
         "iter-origins",
         "iter-revisions",
     ):
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from datetime import datetime, timezone
+
+from swh.model.hashutil import hash_to_bytes
+from swh.provenance.archive import ArchiveInterface
+from swh.provenance.directory import directory_add
+from swh.provenance.interface import (
+    DirectoryData,
+    ProvenanceInterface,
+    RelationData,
+    RelationType,
+)
+from swh.provenance.model import DirectoryEntry, FileEntry
+from swh.provenance.tests.conftest import fill_storage, load_repo_data
+
+
+def test_directory_add(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+) -> None:
+    # read data/README.md for more details on how these datasets are generated
+    data = load_repo_data("cmdbts2")
+    fill_storage(archive.storage, data)
+
+    # just take a directory that is known to exist in cmdbts2
+    directory = DirectoryEntry(
+        id=hash_to_bytes("48007c961cc734d1f63886d0413a6dc605e3e2ea")
+    )
+    content1 = FileEntry(
+        id=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), name=b"a"
+    )
+    content2 = FileEntry(
+        id=hash_to_bytes("50e9cdb03f9719261dd39d7f2920b906db3711a3"), name=b"b"
+    )
+    date = datetime.fromtimestamp(1000000010, timezone.utc)
+
+    # directory_add and the internal directory_flatten require the directory and its
+    # content to be known by the provenance object. Otherwise, they do nothing.
+    provenance.directory_set_date_in_isochrone_frontier(directory, date)
+    provenance.content_set_early_date(content1, date)
+    provenance.content_set_early_date(content2, date)
+    provenance.flush()
+    assert provenance.storage.directory_get([directory.id]) == {
+        directory.id: DirectoryData(date=date, flat=False)
+    }
+    assert provenance.storage.content_get([content1.id, content2.id]) == {
+        content1.id: date,
+        content2.id: date,
+    }
+
+    # this query forces the directory date to be retrieved from the storage and cached
+    # (otherwise, the flush below won't update the directory flatten flag)
+    flattenned = provenance.directory_already_flattenned(directory)
+    assert flattenned is not None and not flattenned
+
+    # flatten the directory and check the expected result
+    directory_add(provenance, archive, [directory])
+    assert provenance.storage.directory_get([directory.id]) == {
+        directory.id: DirectoryData(date=date, flat=True)
+    }
+    assert provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) == {
+        content1.id: {
+            RelationData(dst=directory.id, path=b"a"),
+            RelationData(dst=directory.id, path=b"C/a"),
+        },
+        content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+    }
diff --git a/swh/provenance/tests/test_directory_iterator.py b/swh/provenance/tests/test_directory_iterator.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_directory_iterator.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.provenance.directory import CSVDirectoryIterator
+from swh.provenance.tests.conftest import fill_storage, load_repo_data
+from swh.storage.interface import StorageInterface
+
+
+@pytest.mark.parametrize(
+    "repo",
+    (
+        "cmdbts2",
+        "out-of-order",
+    ),
+)
+def test_directory_iterator(swh_storage: StorageInterface, repo: str) -> None:
+    """Test CSVDirectoryIterator"""
+    data = load_repo_data(repo)
+    fill_storage(swh_storage, data)
+
+    directories_ids = [dir["id"] for dir in data["directory"]]
+    directories = list(CSVDirectoryIterator(directories_ids))
+
+    assert directories
+    assert len(directories) == len(data["directory"])
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -259,12 +259,9 @@
         }, synth_rev["msg"]
         # check timestamps
         for rd in synth_rev["R_D"]:
-            assert (
-                rev_ts + rd["rel_ts"]
-                == provenance.storage.directory_get([rd["dst"]])[
-                    rd["dst"]
-                ].date.timestamp()
-            ), synth_rev["msg"]
+            dir_data = provenance.storage.directory_get([rd["dst"]])[rd["dst"]]
+            assert rev_ts + rd["rel_ts"] == dir_data.date.timestamp(), synth_rev["msg"]
+            assert dir_data.flat, synth_rev["msg"]

         # ... + a number of rows in the "content_in_dir" table
         # for content of the directory.
