diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -139,6 +139,48 @@
         atexit.register(exit)
 
 
+@cli.command(name="iter-frontiers")
+@click.argument("filename")
+@click.option("-l", "--limit", type=int)
+@click.option("-s", "--min-size", default=0, type=int)
+@click.pass_context
+def iter_frontiers(
+    ctx: click.core.Context,
+    filename: str,
+    limit: Optional[int],
+    min_size: int,
+) -> None:
+    """Process a provided list of directories in the isochrone frontier."""
+    from . import get_archive, get_provenance
+    from .directory import CSVDirectoryIterator, directory_add
+
+    archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
+    directories_provider = generate_directory_ids(filename)
+    directories = CSVDirectoryIterator(directories_provider, limit=limit)
+
+    with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
+        for directory in directories:
+            directory_add(
+                provenance,
+                archive,
+                [directory],
+                minsize=min_size,
+            )
+
+
+def generate_directory_ids(
+    filename: str,
+) -> Generator[Sha1Git, None, None]:
+    """Yield `hash_to_bytes` of every non-blank (stripped) line of `filename`."""
+    # Use a context manager so the file handle is released as soon as the
+    # generator is exhausted or closed, instead of being left open.
+    with open(filename, "r") as lines:
+        for line in lines:
+            stripped = line.strip()
+            if stripped:
+                yield hash_to_bytes(stripped)
+
+
 @cli.command(name="iter-revisions")
 @click.argument("filename")
 @click.option("-a", "--track-all", default=True, type=bool)
diff --git a/swh/provenance/directory.py b/swh/provenance/directory.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/directory.py
@@ -0,0 +1,90 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from typing import Generator, Iterable, Iterator, List, Optional
+
+from swh.core.statsd import statsd
+from swh.model.model import Sha1Git
+
+from .archive import ArchiveInterface
+from .interface import ProvenanceInterface
+from .model import DirectoryEntry
+
+REVISION_DURATION_METRIC = "swh_provenance_directory_duration_seconds"
+
+
+class CSVDirectoryIterator:
+    """Iterator over directories typically present in the given CSV file.
+
+    The input is an iterator that produces ids (sha1_git) of directories
+    """
+
+    def __init__(
+        self,
+        directories: Iterable[Sha1Git],
+        limit: Optional[int] = None,
+    ) -> None:
+        self.directories: Iterator[Sha1Git]
+        if limit is not None:
+            from itertools import islice
+
+            self.directories = islice(directories, limit)
+        else:
+            self.directories = iter(directories)
+
+    def __iter__(self) -> Generator[DirectoryEntry, None, None]:
+        for directory_id in self.directories:
+            yield DirectoryEntry(directory_id)
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
+def directory_add(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    directories: List[DirectoryEntry],
+    minsize: int = 0,
+    commit: bool = True,
+) -> None:
+    """Flatten every directory of `directories` that is known to the provenance
+    model but not flattened yet, then flush the model if `commit` is set.
+    """
+    for directory in directories:
+        # Only flatten directories that are present in the provenance model, but not
+        # flattened yet.
+        flattenned = provenance.directory_already_flattenned(directory)
+        if flattenned is not None and not flattenned:
+            directory_flatten(
+                provenance,
+                archive,
+                directory,
+                minsize=minsize,
+            )
+    if commit:
+        provenance.flush()
+
+
+@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten"})
+def directory_flatten(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+    directory: DirectoryEntry,
+    minsize: int = 0,
+) -> None:
+    """Recursively retrieve all the files of 'directory' and insert them in the
+    'provenance' database in the 'content_to_directory' table.
+    """
+    stack = [(directory, b"")]
+    while stack:
+        current, prefix = stack.pop()
+        current.retrieve_children(archive, minsize=minsize)
+        for f_child in current.files:
+            # Add content to the directory with the computed prefix.
+            provenance.content_add_to_directory(directory, f_child, prefix)
+        for d_child in current.dirs:
+            # Recursively walk the child directory.
+            stack.append((d_child, os.path.join(prefix, d_child.name)))
+    # The walk is complete: flag the directory so it is not flattened again.
+    provenance.directory_flag_as_flattenned(directory)
diff --git a/swh/provenance/interface.py b/swh/provenance/interface.py
--- a/swh/provenance/interface.py
+++ b/swh/provenance/interface.py
@@ -301,6 +301,18 @@
         """
         ...
 
+    def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]:
+        """Check if the directory is already flattened in the provenance model. If the
+        directory is unknown for the model, the method returns None.
+        """
+        ...
+
+    def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None:
+        """Mark the directory as flattened in the provenance model. If the
+        directory is unknown for the model, this method has no effect.
+        """
+        ...
+
     def directory_get_date_in_isochrone_frontier(
         self, directory: DirectoryEntry
     ) -> Optional[datetime]:
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -319,7 +319,7 @@
                 date=date, flat=self.cache["directory_flatten"].get(sha1) or False
             )
             for sha1, date in self.cache["directory"]["data"].items()
-            if sha1 in self.cache["directory"]["added"] and date is not None
+            if self.cache["directory_flatten"].get(sha1) and date is not None
         }
         if dir_acks:
             while not self.storage.directory_add(dir_acks):
@@ -387,6 +387,21 @@
             (directory.id, revision.id, path_normalize(path))
         )
 
+    def directory_already_flattenned(self, directory: DirectoryEntry) -> Optional[bool]:
+        cache = self.cache["directory_flatten"]
+        if directory.id not in cache:
+            cache.setdefault(directory.id, None)
+            ret = self.storage.directory_get([directory.id])
+            if directory.id in ret:
+                dir_data = ret[directory.id]
+                cache[directory.id] = dir_data.flat
+                # date is kept to ensure we have it available when flushing
+                self.cache["directory"]["data"][directory.id] = dir_data.date
+        return cache.get(directory.id)
+
+    def directory_flag_as_flattenned(self, directory: DirectoryEntry) -> None:
+        self.cache["directory_flatten"][directory.id] = True
+
     def directory_get_date_in_isochrone_frontier(
         self, directory: DirectoryEntry
     ) -> Optional[datetime]:
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -4,13 +4,13 @@
 # See top-level LICENSE file for more information
 
 from datetime import datetime, timezone
-import os
 from typing import Generator, Iterable, Iterator, List, Optional, Tuple
 
 from swh.core.statsd import statsd
 from swh.model.model import Sha1Git
 
 from .archive import ArchiveInterface
+from .directory import directory_flatten
 from .graph import IsochroneNode, build_isochrone_graph
 from .interface import ProvenanceInterface
 from .model import DirectoryEntry, RevisionEntry
@@ -135,7 +135,7 @@
                     provenance.directory_add_to_revision(
                         revision, current.entry, current.path
                     )
-                    flatten_directory(
+                    directory_flatten(
                         provenance, archive, current.entry, minsize=minsize
                     )
             else:
@@ -158,28 +158,6 @@
                     stack.append(child)
 
 
-@statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "flatten_directory"})
-def flatten_directory(
-    provenance: ProvenanceInterface,
-    archive: ArchiveInterface,
-    directory: DirectoryEntry,
-    minsize: int = 0,
-) -> None:
-    """Recursively retrieve all the files of 'directory' and insert them in the
-    'provenance' database in the 'content_to_directory' table.
-    """
-    stack = [(directory, b"")]
-    while stack:
-        current, prefix = stack.pop()
-        current.retrieve_children(archive, minsize=minsize)
-        for f_child in current.files:
-            # Add content to the directory with the computed prefix.
-            provenance.content_add_to_directory(directory, f_child, prefix)
-        for d_child in current.dirs:
-            # Recursively walk the child directory.
-            stack.append((d_child, os.path.join(prefix, d_child.name)))
-
-
 def is_new_frontier(
     node: IsochroneNode,
     revision: RevisionEntry,
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
--- a/swh/provenance/tests/test_cli.py
+++ b/swh/provenance/tests/test_cli.py
@@ -25,6 +25,7 @@
     for command in (
         "find-all",
         "find-first",
+        "iter-frontiers",
         "iter-origins",
         "iter-revisions",
     ):
diff --git a/swh/provenance/tests/test_directory_flatten.py b/swh/provenance/tests/test_directory_flatten.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_directory_flatten.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from datetime import datetime, timezone
+
+from swh.model.hashutil import hash_to_bytes
+from swh.provenance.archive import ArchiveInterface
+from swh.provenance.directory import directory_add
+from swh.provenance.interface import (
+    DirectoryData,
+    ProvenanceInterface,
+    RelationData,
+    RelationType,
+)
+from swh.provenance.model import DirectoryEntry, FileEntry
+from swh.provenance.tests.conftest import fill_storage, load_repo_data
+
+
+def test_directory_add(
+    provenance: ProvenanceInterface,
+    archive: ArchiveInterface,
+) -> None:
+    # read data/README.md for more details on how these datasets are generated
+    data = load_repo_data("cmdbts2")
+    fill_storage(archive.storage, data)
+
+    # just take a directory that is known to exist in cmdbts2
+    directory = DirectoryEntry(
+        id=hash_to_bytes("48007c961cc734d1f63886d0413a6dc605e3e2ea")
+    )
+    content1 = FileEntry(
+        id=hash_to_bytes("20329687bb9c1231a7e05afe86160343ad49b494"), name=b"a"
+    )
+    content2 = FileEntry(
+        id=hash_to_bytes("50e9cdb03f9719261dd39d7f2920b906db3711a3"), name=b"b"
+    )
+    date = datetime.fromtimestamp(1000000010, timezone.utc)
+
+    # directory_add and the internal directory_flatten require the directory and its
+    # content to be known by the provenance object. Otherwise, they do nothing
+    provenance.directory_set_date_in_isochrone_frontier(directory, date)
+    provenance.content_set_early_date(content1, date)
+    provenance.content_set_early_date(content2, date)
+    provenance.flush()
+    assert provenance.storage.directory_get([directory.id]) == {
+        directory.id: DirectoryData(date=date, flat=False)
+    }
+    assert provenance.storage.content_get([content1.id, content2.id]) == {
+        content1.id: date,
+        content2.id: date,
+    }
+
+    # this query forces the directory date to be retrieved from the storage and cached
+    # (otherwise, the flush below won't update the directory flatten flag)
+    flattenned = provenance.directory_already_flattenned(directory)
+    assert flattenned is not None and not flattenned
+
+    # flatten the directory and check the expected result
+    directory_add(provenance, archive, [directory])
+    assert provenance.storage.directory_get([directory.id]) == {
+        directory.id: DirectoryData(date=date, flat=True)
+    }
+    assert provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) == {
+        content1.id: {
+            RelationData(dst=directory.id, path=b"a"),
+            RelationData(dst=directory.id, path=b"C/a"),
+        },
+        content2.id: {RelationData(dst=directory.id, path=b"C/b")},
+    }
diff --git a/swh/provenance/tests/test_directory_iterator.py b/swh/provenance/tests/test_directory_iterator.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_directory_iterator.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.provenance.directory import CSVDirectoryIterator
+from swh.provenance.tests.conftest import fill_storage, load_repo_data
+from swh.storage.interface import StorageInterface
+
+
+@pytest.mark.parametrize(
+    "repo",
+    (
+        "cmdbts2",
+        "out-of-order",
+    ),
+)
+def test_directory_iterator(swh_storage: StorageInterface, repo: str) -> None:
+    """Test CSVDirectoryIterator"""
+    data = load_repo_data(repo)
+    fill_storage(swh_storage, data)
+
+    directories_ids = [entry["id"] for entry in data["directory"]]
+    directories = list(CSVDirectoryIterator(directories_ids))
+
+    assert directories
+    assert len(directories) == len(data["directory"])
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -259,12 +259,9 @@
         }, synth_rev["msg"]
         # check timestamps
         for rd in synth_rev["R_D"]:
-            assert (
-                rev_ts + rd["rel_ts"]
-                == provenance.storage.directory_get([rd["dst"]])[
-                    rd["dst"]
-                ].date.timestamp()
-            ), synth_rev["msg"]
+            dir_data = provenance.storage.directory_get([rd["dst"]])[rd["dst"]]
+            assert rev_ts + rd["rel_ts"] == dir_data.date.timestamp(), synth_rev["msg"]
+            assert dir_data.flat, synth_rev["msg"]
 
         # ... + a number of rows in the "content_in_dir" table
         # for content of the directory.