Changeset View
Standalone View
swh/loader/core/discovery.py
- This file was added.
# Copyright (C) 2022 The Software Heritage developers | |||||||||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | |||||||||||||||||||||||
# License: GNU General Public License version 3, or any later version | |||||||||||||||||||||||
# See top-level LICENSE file for more information | |||||||||||||||||||||||
"""Primitives for finding the unknown parts of disk contents efficiently.""" | |||||||||||||||||||||||
from collections import namedtuple | |||||||||||||||||||||||
import itertools | |||||||||||||||||||||||
import logging | |||||||||||||||||||||||
import random | |||||||||||||||||||||||
from typing import Any, Iterable, List, Mapping, NamedTuple, Set, Union | |||||||||||||||||||||||
from swh.model.from_disk import model | |||||||||||||||||||||||
from swh.model.model import Sha1Git | |||||||||||||||||||||||
from swh.storage.interface import StorageInterface | |||||||||||||||||||||||
logger = logging.getLogger(__name__) | |||||||||||||||||||||||
# Maximum amount when sampling from the undecided set of directory entries | |||||||||||||||||||||||
SAMPLE_SIZE = 1000 | |||||||||||||||||||||||
# Sets of sha1 of contents, skipped contents and directories respectively | |||||||||||||||||||||||
vlorentz: Could you use a namedtuple instead?
And use sets instead of list, as they are converted to… | |||||||||||||||||||||||
Sample: NamedTuple = namedtuple( | |||||||||||||||||||||||
"Sample", ["contents", "skipped_contents", "directories"] | |||||||||||||||||||||||
) | |||||||||||||||||||||||
class BaseDiscoveryGraph: | |||||||||||||||||||||||
"""Creates the base structures and methods needed for discovery algorithms. | |||||||||||||||||||||||
Subclasses should override ``get_sample`` to affect how the discovery is made.""" | |||||||||||||||||||||||
def __init__(self, contents, skipped_contents, directories): | |||||||||||||||||||||||
self._all_contents: Mapping[ | |||||||||||||||||||||||
Sha1Git, Union[model.Content, model.SkippedContent] | |||||||||||||||||||||||
] = {} | |||||||||||||||||||||||
self._undecided_directories: Set[Sha1Git] = set() | |||||||||||||||||||||||
self._children: Mapping[Sha1Git, model.DirectoryEntry] = {} | |||||||||||||||||||||||
self._parents: Mapping[model.DirectoryEntry, Sha1Git] = {} | |||||||||||||||||||||||
self.undecided: Set[Sha1Git] = set() | |||||||||||||||||||||||
Not Done Inline ActionsThis is going to handle large numbers of objects, right? It's probably be faster to use self._all.update((content.sha1_git, content) for content in ...) vlorentz: This is going to handle large numbers of objects, right? It's probably be faster to use `self. | |||||||||||||||||||||||
Done Inline ActionsWe're going to be iterating over all contents twice if we do this since we need to populate undecided as well. Re-allocating more space in the dict a few times shouldn't be a big deal in comparison for large inputs. Alphare: We're going to be iterating over all contents twice if we do this since we need to populate… | |||||||||||||||||||||||
for content in itertools.chain(contents, skipped_contents): | |||||||||||||||||||||||
self.undecided.add(content.sha1_git) | |||||||||||||||||||||||
self._all_contents[content.sha1_git] = content | |||||||||||||||||||||||
Not Done Inline ActionsNaive question, can't we optimize a bit all those directories iterations into 1 loop (including the various set initialzations)? That won't be less clear than the current initialization here anyway. Ideally, i'd see only 2 loops, one over content and another over directories (which would init self.{_undecided_directories, _all, undecided} at the same time as well. What do you think? ardumont: Naive question, can't we optimize a bit all those directories iterations into 1 loop (including… | |||||||||||||||||||||||
Done Inline ActionsYep, absolutely. I even have no need for _all for directories, so I'll clean this all up. Alphare: Yep, absolutely. I even have no need for `_all` for directories, so I'll clean this all up. | |||||||||||||||||||||||
for directory in directories: | |||||||||||||||||||||||
Not Done Inline Actions
vlorentz: | |||||||||||||||||||||||
self.undecided.add(directory.id) | |||||||||||||||||||||||
Done Inline Actions
Might as well do the union computation on undecided set on the fly within the same iteration loop. ardumont: Might as well do the union computation on `undecided` set on the fly within the same iteration… | |||||||||||||||||||||||
self._undecided_directories.add(directory.id) | |||||||||||||||||||||||
self._children[directory.id] = {c.target for c in directory.entries} | |||||||||||||||||||||||
for child in directory.entries: | |||||||||||||||||||||||
Not Done Inline Actions
spares a copy vlorentz: spares a copy | |||||||||||||||||||||||
Done Inline Actionshttps://docs.python.org/3/library/stdtypes.html#frozenset.update claims that left_set |= right_set is supposed to be strictly equivalent to left_set.update(right_set). olasd: https://docs.python.org/3/library/stdtypes.html#frozenset.update claims that `left_set |=… | |||||||||||||||||||||||
self._parents.setdefault(child.target, set()).add(directory.id) | |||||||||||||||||||||||
self.undecided |= self._undecided_directories | |||||||||||||||||||||||
self.known: Set[Sha1Git] = set() | |||||||||||||||||||||||
self.unknown: Set[Sha1Git] = set() | |||||||||||||||||||||||
Not Done Inline Actions
ardumont: | |||||||||||||||||||||||
Done Inline ActionsModified this and all other docstrings that use the same conjugation. Alphare: Modified this and all other docstrings that use the same conjugation. | |||||||||||||||||||||||
def mark_known(self, entries: Iterable[Sha1Git]): | |||||||||||||||||||||||
"""Mark ``entries`` and those they imply as known in the SWH archive""" | |||||||||||||||||||||||
Not Done Inline ActionsWhat's a node? Is it a directory or a content or both? Indeed as per you irc comment, a bit of type would help here ;p ardumont: What's a node? Is it a directory or a content or both?
Indeed as per you irc comment, a bit… | |||||||||||||||||||||||
Done Inline ActionsI've added typing information and adjusted docstrings to change "nodes" to "directory entries". Alphare: I've added typing information and adjusted docstrings to change "nodes" to "directory entries". | |||||||||||||||||||||||
self._mark_entries(entries, self._children, self.known) | |||||||||||||||||||||||
def mark_unknown(self, entries: Iterable[Sha1Git]): | |||||||||||||||||||||||
"""Mark ``entries`` and those they imply as unknown in the SWH archive""" | |||||||||||||||||||||||
self._mark_entries(entries, self._parents, self.unknown) | |||||||||||||||||||||||
def _mark_entries( | |||||||||||||||||||||||
self, | |||||||||||||||||||||||
entries: Iterable[Sha1Git], | |||||||||||||||||||||||
transitive_mapping: Mapping[Any, Any], | |||||||||||||||||||||||
target_set: Set[Any], | |||||||||||||||||||||||
): | |||||||||||||||||||||||
"""Use Merkle graph properties to mark a directory entry as known or unknown. | |||||||||||||||||||||||
Done Inline ActionsPlease describe the meaning of all three arguments in the docstring. And would be more natural to have target_set as last argument, as it's the one being mutated, IMO vlorentz: Please describe the meaning of all three arguments in the docstring.
And would be more… | |||||||||||||||||||||||
If an entry is known, then all of its descendants are known. If it's | |||||||||||||||||||||||
unknown, then all of its ancestors are unknown. | |||||||||||||||||||||||
- ``entries``: directory entries to mark along with their ancestors/descendants | |||||||||||||||||||||||
where applicable. | |||||||||||||||||||||||
- ``transitive_mapping``: mapping from an entry to the next entries to mark | |||||||||||||||||||||||
in the hierarchy, if any. | |||||||||||||||||||||||
- ``target_set``: set where marked entries will be added. | |||||||||||||||||||||||
Not Done Inline Actions
vlorentz: | |||||||||||||||||||||||
Done Inline Actionsand to_proceed should probably be called to_process? olasd: and `to_proceed` should probably be called `to_process`? | |||||||||||||||||||||||
""" | |||||||||||||||||||||||
to_process = set(entries) | |||||||||||||||||||||||
while to_process: | |||||||||||||||||||||||
current = to_process.pop() | |||||||||||||||||||||||
target_set.add(current) | |||||||||||||||||||||||
self.undecided.discard(current) | |||||||||||||||||||||||
self._undecided_directories.discard(current) | |||||||||||||||||||||||
Done Inline Actions
please make the return explicit, that avoids asking oneself questions. ardumont: please make the return explicit, that avoids asking oneself questions. | |||||||||||||||||||||||
next_entries = transitive_mapping.get(current, set()) & self.undecided | |||||||||||||||||||||||
to_process.update(next_entries) | |||||||||||||||||||||||
def get_sample( | |||||||||||||||||||||||
self, | |||||||||||||||||||||||
) -> Sample: | |||||||||||||||||||||||
"""Return a three-tuple of samples from the undecided sets of contents, | |||||||||||||||||||||||
skipped contents and directories respectively. | |||||||||||||||||||||||
These samples will be queried against the storage which will tell us | |||||||||||||||||||||||
which are known.""" | |||||||||||||||||||||||
raise NotImplementedError() | |||||||||||||||||||||||
Done Inline Actions
spares a copy vlorentz: spares a copy | |||||||||||||||||||||||
def do_query(self, swh_storage: StorageInterface, sample: Sample) -> None: | |||||||||||||||||||||||
"""Given a three-tuple of samples, ask the storage which are known or | |||||||||||||||||||||||
unknown and mark them as such.""" | |||||||||||||||||||||||
contents_sample, skipped_contents_sample, directories_sample = sample | |||||||||||||||||||||||
# TODO unify those APIs, and make it so only have to manipulate hashes | |||||||||||||||||||||||
if contents_sample: | |||||||||||||||||||||||
known = set(contents_sample) | |||||||||||||||||||||||
unknown = set(swh_storage.content_missing_per_sha1_git(contents_sample)) | |||||||||||||||||||||||
Not Done Inline Actions
vlorentz: | |||||||||||||||||||||||
Done Inline Actionsskipped_contents_sample is a set of ids (after your recommendation, before that a list), not a dict. Alphare: `skipped_contents_sample` is a `set` of ids (after your recommendation, before that a list)… | |||||||||||||||||||||||
known -= unknown | |||||||||||||||||||||||
Not Done Inline ActionsAnother naive question ;), why until only contents are undecided? ardumont: Another naive question ;), why `until only contents are undecided`? | |||||||||||||||||||||||
Done Inline ActionsI've written a more expansive docstring, please tell me if it still isn't clear. Alphare: I've written a more expansive docstring, please tell me if it still isn't clear. | |||||||||||||||||||||||
self.mark_known(known) | |||||||||||||||||||||||
Not Done Inline Actions
vlorentz: | |||||||||||||||||||||||
Done Inline Actions*technically* faster, I'll allow it ;) Alphare: *technically* faster, I'll allow it ;) | |||||||||||||||||||||||
Not Done Inline Actionsright, i missed that, set comprehension!;p ardumont: right, i missed that, set comprehension!;p | |||||||||||||||||||||||
self.mark_unknown(unknown) | |||||||||||||||||||||||
if skipped_contents_sample: | |||||||||||||||||||||||
known = set(skipped_contents_sample) | |||||||||||||||||||||||
as_dicts = [ | |||||||||||||||||||||||
self._all_contents[s].to_dict() for s in skipped_contents_sample | |||||||||||||||||||||||
] | |||||||||||||||||||||||
unknown = { | |||||||||||||||||||||||
d["sha1_git"] for d in swh_storage.skipped_content_missing(as_dicts) | |||||||||||||||||||||||
Done Inline Actions
ditto vlorentz: ditto | |||||||||||||||||||||||
} | |||||||||||||||||||||||
known -= unknown | |||||||||||||||||||||||
self.mark_known(known) | |||||||||||||||||||||||
self.mark_unknown(unknown) | |||||||||||||||||||||||
if directories_sample: | |||||||||||||||||||||||
known = set(directories_sample) | |||||||||||||||||||||||
Done Inline ActionsCaught this mistake also. :) Alphare: Caught this mistake also. :) | |||||||||||||||||||||||
unknown = set(swh_storage.directory_missing(directories_sample)) | |||||||||||||||||||||||
known -= unknown | |||||||||||||||||||||||
self.mark_known(known) | |||||||||||||||||||||||
self.mark_unknown(unknown) | |||||||||||||||||||||||
class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph): | |||||||||||||||||||||||
"""Use a random sampling using only directories. | |||||||||||||||||||||||
This allows us to find a statistically good spread of entries in the graph | |||||||||||||||||||||||
with a smaller population than using all types of entries. When there are | |||||||||||||||||||||||
no more directories, only contents or skipped contents are undecided if any | |||||||||||||||||||||||
are left: we send them directly to the storage since they should be few and | |||||||||||||||||||||||
their structure flat.""" | |||||||||||||||||||||||
def get_sample(self) -> Sample: | |||||||||||||||||||||||
if self._undecided_directories: | |||||||||||||||||||||||
if len(self._undecided_directories) <= SAMPLE_SIZE: | |||||||||||||||||||||||
return Sample( | |||||||||||||||||||||||
contents=set(), | |||||||||||||||||||||||
skipped_contents=set(), | |||||||||||||||||||||||
directories=set(self._undecided_directories), | |||||||||||||||||||||||
) | |||||||||||||||||||||||
sample = random.sample(self._undecided_directories, SAMPLE_SIZE) | |||||||||||||||||||||||
directories = {o for o in sample} | |||||||||||||||||||||||
return Sample( | |||||||||||||||||||||||
contents=set(), skipped_contents=set(), directories=directories | |||||||||||||||||||||||
) | |||||||||||||||||||||||
contents = set() | |||||||||||||||||||||||
Done Inline Actions
let logging do the formatting per log level. ardumont: let logging do the formatting per log level. | |||||||||||||||||||||||
skipped_contents = set() | |||||||||||||||||||||||
for sha1 in self.undecided: | |||||||||||||||||||||||
obj = self._all_contents[sha1] | |||||||||||||||||||||||
obj_type = obj.object_type | |||||||||||||||||||||||
if obj_type == model.Content.object_type: | |||||||||||||||||||||||
contents.add(sha1) | |||||||||||||||||||||||
elif obj_type == model.SkippedContent.object_type: | |||||||||||||||||||||||
skipped_contents.add(sha1) | |||||||||||||||||||||||
else: | |||||||||||||||||||||||
raise TypeError(f"Unexpected object type {obj_type}") | |||||||||||||||||||||||
return Sample( | |||||||||||||||||||||||
contents=contents, skipped_contents=skipped_contents, directories=set() | |||||||||||||||||||||||
) | |||||||||||||||||||||||
def filter_known_objects( | |||||||||||||||||||||||
swh_storage: StorageInterface, | |||||||||||||||||||||||
contents: List[model.Content], | |||||||||||||||||||||||
skipped_contents: List[model.SkippedContent], | |||||||||||||||||||||||
directories: List[model.Directory], | |||||||||||||||||||||||
): | |||||||||||||||||||||||
"""Filter ``contents``, ``skipped_contents`` and ``directories`` to only return | |||||||||||||||||||||||
those that are unknown to the SWH archive using a discovery algorithm.""" | |||||||||||||||||||||||
contents_count = len(contents) | |||||||||||||||||||||||
skipped_contents_count = len(skipped_contents) | |||||||||||||||||||||||
directories_count = len(directories) | |||||||||||||||||||||||
graph = RandomDirSamplingDiscoveryGraph(contents, skipped_contents, directories) | |||||||||||||||||||||||
while graph.undecided: | |||||||||||||||||||||||
sample = graph.get_sample() | |||||||||||||||||||||||
graph.do_query(swh_storage, sample) | |||||||||||||||||||||||
contents = [c for c in contents if c.sha1_git in graph.unknown] | |||||||||||||||||||||||
skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown] | |||||||||||||||||||||||
directories = [c for c in directories if c.id in graph.unknown] | |||||||||||||||||||||||
logger.debug( | |||||||||||||||||||||||
"Filtered out %d contents, %d skipped contents and %d directories", | |||||||||||||||||||||||
contents_count - len(contents), | |||||||||||||||||||||||
skipped_contents_count - len(skipped_contents), | |||||||||||||||||||||||
directories_count - len(directories), | |||||||||||||||||||||||
) | |||||||||||||||||||||||
return (contents, skipped_contents, directories) |
Could you use a namedtuple instead?
And use sets instead of list, as they are converted to sets before use anyway