diff --git a/swh/provenance/archive.py b/swh/provenance/archive.py --- a/swh/provenance/archive.py +++ b/swh/provenance/archive.py @@ -15,7 +15,7 @@ class ArchiveInterface(Protocol): storage: StorageInterface - def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]: + def directory_ls(self, id: Sha1Git, minsize: int = 0) -> Iterable[Dict[str, Any]]: """List entries for one directory. Args: diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py --- a/swh/provenance/cli.py +++ b/swh/provenance/cli.py @@ -130,6 +130,7 @@ @click.option("-l", "--limit", type=int) @click.option("-m", "--min-depth", default=1, type=int) @click.option("-r", "--reuse", default=True, type=bool) +@click.option("-s", "--min-size", default=0, type=int) @click.pass_context def iter_revisions( ctx: click.core.Context, @@ -138,8 +139,8 @@ limit: Optional[int], min_depth: int, reuse: bool, + min_size: int, ) -> None: - # TODO: add file size filtering """Process a provided list of revisions.""" from . import get_archive, get_provenance from .revision import CSVRevisionIterator, revision_add @@ -157,6 +158,7 @@ trackall=track_all, lower=reuse, mindepth=min_depth, + minsize=min_size, ) diff --git a/swh/provenance/graph.py b/swh/provenance/graph.py --- a/swh/provenance/graph.py +++ b/swh/provenance/graph.py @@ -176,6 +176,7 @@ provenance: ProvenanceInterface, revision: RevisionEntry, directory: DirectoryEntry, + minsize: int = 0, ) -> IsochroneNode: assert revision.date is not None assert revision.root == directory.id @@ -208,7 +209,7 @@ # Pre-query all known dates for directories in the current directory # for the provenance object to have them cached and (potentially) improve # performance. - current.entry.retrieve_children(archive) + current.entry.retrieve_children(archive, minsize=minsize) ddates = provenance.directory_get_dates_in_isochrone_frontier( current.entry.dirs ) diff --git a/swh/provenance/model.py b/swh/provenance/model.py --- a/swh/provenance/model.py +++ b/swh/provenance/model.py @@ -86,11 +86,11 @@ self._files: Optional[List[FileEntry]] = None self._dirs: Optional[List[DirectoryEntry]] = None - def retrieve_children(self, archive: ArchiveInterface) -> None: + def retrieve_children(self, archive: ArchiveInterface, minsize: int = 0) -> None: if self._files is None and self._dirs is None: self._files = [] self._dirs = [] - for child in archive.directory_ls(self.id): + for child in archive.directory_ls(self.id, minsize=minsize): if child["type"] == "dir": self._dirs.append( DirectoryEntry(child["target"], name=child["name"]) diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py --- a/swh/provenance/postgresql/archive.py +++ b/swh/provenance/postgresql/archive.py @@ -22,14 +22,13 @@ ) self.conn = conn - def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]: - entries = self._directory_ls(id) + def directory_ls(self, id: Sha1Git, minsize: int = 0) -> Iterable[Dict[str, Any]]: + entries = self._directory_ls(id, minsize=minsize) yield from entries @lru_cache(maxsize=100000) @statsd.timed(metric=ARCHIVE_DURATION_METRIC, tags={"method": "directory_ls"}) - def _directory_ls(self, id: Sha1Git) -> List[Dict[str, Any]]: - # TODO: add file size filtering + def _directory_ls(self, id: Sha1Git, minsize: int = 0) -> List[Dict[str, Any]]: with self.conn.cursor() as cursor: cursor.execute( """ @@ -49,7 +48,9 @@ c.sha1_git FROM ls_f LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id - INNER JOIN content c ON e.target=c.sha1_git) + INNER JOIN content c ON e.target=c.sha1_git + WHERE c.length >= %s + ) SELECT * FROM known_contents UNION (SELECT 'file'::directory_entry_type AS type, e.target, e.name, @@ -61,10 +62,11 @@ SELECT 1 FROM known_contents WHERE known_contents.sha1_git=e.target ) + AND c.length >= %s ) ) """, - (id,), + (id, minsize, minsize), ) return [ {"type": row[0], "target": row[1], "name": row[2]} for row in cursor @@ -84,7 +86,6 @@ """, (id,), ) - # There should be at most one row anyway yield from (row[0] for row in cursor) @statsd.timed(metric=ARCHIVE_DURATION_METRIC, tags={"method": "snapshot_get_heads"}) diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -59,6 +59,7 @@ trackall: bool = True, lower: bool = True, mindepth: int = 1, + minsize: int = 0, commit: bool = True, ) -> None: for revision in revisions: @@ -72,8 +73,8 @@ provenance, revision, DirectoryEntry(revision.root), + minsize=minsize, ) - # TODO: add file size filtering revision_process_content( archive, provenance, @@ -82,6 +83,7 @@ trackall=trackall, lower=lower, mindepth=mindepth, + minsize=minsize, ) if commit: provenance.flush() @@ -96,6 +98,7 @@ trackall: bool = True, lower: bool = True, mindepth: int = 1, + minsize: int = 0, ) -> None: assert revision.date is not None provenance.revision_add(revision) @@ -132,7 +135,9 @@ provenance.directory_add_to_revision( revision, current.entry, current.path ) - flatten_directory(archive, provenance, current.entry) + flatten_directory( + archive, provenance, current.entry, minsize=minsize + ) else: # If current node is an invalidated frontier, update its date for future # revisions to get the proper value. @@ -158,6 +163,7 @@ archive: ArchiveInterface, provenance: ProvenanceInterface, directory: DirectoryEntry, + minsize: int = 0, ) -> None: """Recursively retrieve all the files of 'directory' and insert them in the 'provenance' database in the 'content_to_directory' table. @@ -165,7 +171,7 @@ stack = [(directory, b"")] while stack: current, prefix = stack.pop() - current.retrieve_children(archive) + current.retrieve_children(archive, minsize=minsize) for f_child in current.files: # Add content to the directory with the computed prefix. provenance.content_add_to_directory(directory, f_child, prefix) diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py --- a/swh/provenance/storage/archive.py +++ b/swh/provenance/storage/archive.py @@ -18,14 +18,16 @@ self.storage = storage @statsd.timed(metric=ARCHIVE_DURATION_METRIC, tags={"method": "directory_ls"}) - def directory_ls(self, id: Sha1Git) -> Iterable[Dict[str, Any]]: - # TODO: add file size filtering + def directory_ls(self, id: Sha1Git, minsize: int = 0) -> Iterable[Dict[str, Any]]: for entry in self.storage.directory_ls(id): - yield { - "name": entry["name"], - "target": entry["target"], - "type": entry["type"], - } + if entry["type"] == "dir" or ( + entry["type"] == "file" and entry["length"] >= minsize + ): + yield { + "name": entry["name"], + "target": entry["target"], + "type": entry["type"], + } @statsd.timed( metric=ARCHIVE_DURATION_METRIC, tags={"method": "revision_get_parents"}