Changeset View
Standalone View
swh/scanner/data.py
- This file was added.
# Copyright (C) 2021 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
from pathlib import Path | |||||
from typing import Dict, Tuple | |||||
from swh.model.exceptions import ValidationError | |||||
from swh.model.from_disk import Directory | |||||
from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID | |||||
class MerkleNodeInfo(dict): | |||||
zack: naming nitpick: I'd call this MerkleNodeInfo, as it's //extra// data w.r.t. the actual data… | |||||
"""Store additional information about Merkle DAG nodes, using SWHIDs as keys""" | |||||
Not Done Inline Actionsbetter docstring: "store additional information about Merkle DAG nodes, using SWHIDs as keys" zack: better docstring: "store additional information about Merkle DAG nodes, using SWHIDs as keys" | |||||
def __setitem__(self, key, value): | |||||
"""The keys must be valid valid Software Heritage Persistent Identifiers | |||||
Not Done Inline Actionsgiven the constructor isn't doing anything else, I think you can just leave it out entirely and not define __init__ here zack: given the constructor isn't doing anything else, I think you can just leave it out entirely and… | |||||
while values must be dict. | |||||
""" | |||||
if not isinstance(key, CoreSWHID): | |||||
raise ValidationError("keys must be valid SWHID(s)") | |||||
if not isinstance(value, dict): | |||||
raise ValidationError(f"values must be dict, not {type(value)}") | |||||
Not Done Inline Actionswhy merkle node classes as keys? With SWHIDs we will have the ability to keep information about archive objects that maybe we do not have locally available (e.g., in the future, revision objects pointing to submodules that we have not retrieved locally). That seems valuable to me. zack: why merkle node classes as keys?
shouldn't we use actual SWHIDs, which are meant to be IDs? | |||||
Not Done Inline ActionsYeah, the comment and the exception here should be changed, the function is actually checking that the key is a valid SWHID. DanSeraf: Yeah, the comment and the exception here should be changed, the function is actually checking… | |||||
Not Done Inline ActionsAs dictionary keys for additional information we should use CoreSWHID instances, not strings. That will avoid having to re-validate SWHID syntactical correctness every time. Also, it is now trivial to obtain CoreSWHID from from_disk instances, thanks to your change in T3393. So, to recap: the scanner should instantiate from_disk objects from the filesystem, then obtain CoreSWHIDs from each node of it using the new swhid() method, and use those as keys for MerkleNodeData zack: As dictionary keys for additional information we should use CoreSWHID instances, not strings. | |||||
super(MerkleNodeInfo, self).__setitem__(key, value) | |||||
def get_directory_data( | |||||
root_path: str, | |||||
source_tree: Directory, | |||||
nodes_data: MerkleNodeInfo, | |||||
directory_data: Dict = {}, | |||||
) -> Dict[Path, dict]: | |||||
"""Get content information for each directory inside source_tree. | |||||
Returns: | |||||
A dictionary with a directory path as key and the relative | |||||
contents information as values. | |||||
""" | |||||
def _get_directory_data( | |||||
source_tree: Directory, nodes_data: MerkleNodeInfo, directory_data: Dict | |||||
): | |||||
directories = list( | |||||
filter( | |||||
lambda n: n.object_type == DIRECTORY, | |||||
map(lambda n: n[1], source_tree.items()), | |||||
) | |||||
) | |||||
Not Done Inline Actionsis there a constant/enum somewhere that we can use instead of "directory" as a string? zack: is there a constant/enum somewhere that we can use instead of `"directory"` as a string? | |||||
Not Done Inline ActionsThere are some deprecated constants in swh.model.identifiers. Shouldn't the object_type attribute of from_disk.Directory/Content be consistent with the enum swh.model.identifiers.ObjectType? DanSeraf: There are some deprecated constants in `swh.model.identifiers`. Shouldn't the `object_type`… | |||||
for node in directories: | |||||
directory_info = directory_content(node, nodes_data) | |||||
rel_path = Path(node.data["path"].decode()).relative_to(Path(root_path)) | |||||
directory_data[rel_path] = directory_info | |||||
if has_dirs(node): | |||||
_get_directory_data(node, nodes_data, directory_data) | |||||
Not Done Inline Actionsthis decode() looks dangerous, and Path objects work fine with bytes in general, please check if it is avoidable zack: this `decode()` looks dangerous, and Path objects work fine with bytes in general, please check… | |||||
Not Done Inline ActionsI think it is not possible to avoid decode() here, since Path wants strings instead of bytes. DanSeraf: I think it is not possible to avoid `decode()` here, since `Path` wants strings instead of… | |||||
Not Done Inline Actionsouch, you're right (https://bugs.python.org/issue23904) and that sucks oh well zack: ouch, you're right (https://bugs.python.org/issue23904) and that sucks
oh well | |||||
_get_directory_data(source_tree, nodes_data, directory_data) | |||||
return directory_data | |||||
def directory_content(node: Directory, nodes_data: MerkleNodeInfo) -> Tuple[int, int]: | |||||
"""Count known contents inside the given directory. | |||||
Returns: | |||||
A tuple with the total number of contents inside the directory and the number | |||||
of known contents. | |||||
""" | |||||
known_cnt = 0 | |||||
Not Done Inline Actionsis there a constant/enum somewhere that we can use instead of "content" as a string? zack: is there a constant/enum somewhere that we can use instead of `"content"` as a string? | |||||
node_contents = list( | |||||
filter(lambda n: n.object_type == CONTENT, map(lambda n: n[1], node.items())) | |||||
) | |||||
Not Done Inline Actionsremember to bump the dependency on swh.model in requirements-swh.txt to get access to the swhid() method zack: remember to bump the dependency on swh.model in requirements-swh.txt to get access to the… | |||||
for sub_node in node_contents: | |||||
if nodes_data[sub_node.swhid()]["known"]: | |||||
known_cnt += 1 | |||||
return (len(node_contents), known_cnt) | |||||
Not Done Inline Actionshere's an example of why we should use CoreSWHID instances as keys: here you are first rendering an existing CoreSWHID to string, than using it as a dictionary key, which means it will be hashed again. What a waste :) zack: here's an example of why we should use CoreSWHID instances as keys: here you are first… | |||||
def has_dirs(node: Directory) -> bool: | |||||
"""Check if the given directory has other directories inside.""" | |||||
for _, sub_node in node.items(): | |||||
if isinstance(sub_node, Directory): | |||||
return True | |||||
return False | |||||
def get_content_from( | |||||
node_path: bytes, source_tree: Directory, nodes_data: MerkleNodeInfo | |||||
) -> Dict[bytes, dict]: | |||||
"""Get content information from the given directory node.""" | |||||
Not Done Inline Actionsditto: constant for "content", if it exists zack: ditto: constant for `"content"`, if it exists | |||||
# root in model.from_disk.Directory should be accessed with b"" | |||||
directory = source_tree[node_path if node_path != source_tree.data["path"] else b""] | |||||
node_contents = list( | |||||
filter( | |||||
lambda n: n.object_type == CONTENT, map(lambda n: n[1], directory.items()) | |||||
) | |||||
) | |||||
files_data = {} | |||||
for node in node_contents: | |||||
node_info = nodes_data[node.swhid()] | |||||
node_info["swhid"] = str(node.swhid()) | |||||
path_name = "path" if "path" in node.data.keys() else "data" | |||||
files_data[node.data[path_name]] = node_info | |||||
Not Done Inline Actionshere's another example of rendering/reparsing, this time even worse because it will also go through syntactic validation for SWHID correctness, which is pointless given it came from a CoreSWHID object already zack: here's another example of rendering/reparsing, this time even worse because it will also go… | |||||
return files_data | |||||
Not Done Inline ActionsWhy are you decoding the path here? in general, there is no guarantee that we will be able to decode it at all, so it should be remain bytes as long as possible (until final output). I think this function should return a Dict[bytes,dict] instead, delaying decoding. zack: Why are you decoding the path here? in general, there is no guarantee that we will be able to… |
naming nitpick: I'd call this MerkleNodeInfo, as it's extra data w.r.t. the actual data in the node, which is stored in the from_disk structure already