Changeset View
Changeset View
Standalone View
Standalone View
swh/scanner/model.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from __future__ import annotations | from __future__ import annotations | ||||
import sys | import sys | ||||
import json | import json | ||||
from pathlib import PosixPath | from pathlib import PosixPath | ||||
from typing import Any, Dict | from typing import Any, Dict, Tuple | ||||
from enum import Enum | from enum import Enum | ||||
from .plot import sunburst | |||||
from .exceptions import InvalidObjectType | |||||
from swh.model.identifiers import ( | from swh.model.identifiers import ( | ||||
DIRECTORY, CONTENT | DIRECTORY, CONTENT | ||||
) | ) | ||||
class Color(Enum): | class Color(Enum): | ||||
blue = '\033[94m' | blue = '\033[94m' | ||||
green = '\033[92m' | green = '\033[92m' | ||||
Show All 11 Lines | class Tree: | ||||
def __init__(self, path: PosixPath, father: Tree = None): | def __init__(self, path: PosixPath, father: Tree = None): | ||||
self.father = father | self.father = father | ||||
self.path = path | self.path = path | ||||
self.otype = DIRECTORY if path.is_dir() else CONTENT | self.otype = DIRECTORY if path.is_dir() else CONTENT | ||||
self.pid = '' | self.pid = '' | ||||
self.children: Dict[PosixPath, Tree] = {} | self.children: Dict[PosixPath, Tree] = {} | ||||
def addNode(self, path: PosixPath, pid: str = None) -> None: | def addNode(self, path: PosixPath, pid: str = None) -> None: | ||||
"""Recursively add a new node path | """Recursively add a new path. | ||||
""" | """ | ||||
relative_path = path.relative_to(self.path) | relative_path = path.relative_to(self.path) | ||||
if relative_path == PosixPath('.'): | if relative_path == PosixPath('.'): | ||||
if pid is not None: | if pid is not None: | ||||
self.pid = pid | self.pid = pid | ||||
return | return | ||||
new_path = self.path.joinpath(relative_path.parts[0]) | new_path = self.path.joinpath(relative_path.parts[0]) | ||||
if new_path not in self.children: | if new_path not in self.children: | ||||
self.children[new_path] = Tree(new_path, self) | self.children[new_path] = Tree(new_path, self) | ||||
self.children[new_path].addNode(path, pid) | self.children[new_path].addNode(path, pid) | ||||
def show(self, format) -> None: | def show(self, format) -> None: | ||||
"""Print all the tree""" | """Show tree in different formats""" | ||||
if format == 'json': | if format == 'json': | ||||
print(json.dumps(self.getTree(), indent=4, sort_keys=True)) | print(json.dumps(self.getTree(), indent=4, sort_keys=True)) | ||||
elif format == 'text': | elif format == 'text': | ||||
isatty = sys.stdout.isatty() | isatty = sys.stdout.isatty() | ||||
print(colorize(str(self.path), Color.blue) if isatty | print(colorize(str(self.path), Color.blue) if isatty | ||||
else str(self.path)) | else str(self.path)) | ||||
self.printChildren(isatty) | self.printChildren(isatty) | ||||
def printChildren(self, isatty: bool, inc: int = 0) -> None: | elif format == 'sunburst': | ||||
root = self.path | |||||
directories = self.getDirectoriesInfo(root) | |||||
sunburst(directories, root) | |||||
def printChildren(self, isatty: bool, inc: int = 1) -> None: | |||||
for path, node in self.children.items(): | for path, node in self.children.items(): | ||||
self.printNode(node, isatty, inc) | self.printNode(node, isatty, inc) | ||||
if node.children: | if node.children: | ||||
node.printChildren(isatty, inc+1) | node.printChildren(isatty, inc+1) | ||||
def printNode(self, node: Any, isatty: bool, inc: int) -> None: | def printNode(self, node: Any, isatty: bool, inc: int) -> None: | ||||
rel_path = str(node.path.relative_to(self.path)) | rel_path = str(node.path.relative_to(self.path)) | ||||
begin = '│ ' * inc | begin = '│ ' * inc | ||||
Show All 24 Lines | def getTree(self): | ||||
if child_node.pid: | if child_node.pid: | ||||
child_tree[rel_path] = child_node.pid | child_tree[rel_path] = child_node.pid | ||||
else: | else: | ||||
next_tree = child_node.getTree() | next_tree = child_node.getTree() | ||||
if next_tree: | if next_tree: | ||||
child_tree[rel_path] = next_tree | child_tree[rel_path] = next_tree | ||||
return child_tree | return child_tree | ||||
vlorentz: `directories` looks like an accumulator; and it's initialized by the called with the… | |||||
Not Done Inline ActionsShould be a private method now, so prefix its name with an underscore vlorentz: Should be a private method now, so prefix its name with an underscore | |||||
def __getSubDirsInfo(self, root, directories): | |||||
Not Done Inline ActionsThis docstring doesn't give more info than the signature of the function. You should either make it more detailed or remove it. vlorentz: This docstring doesn't give more info than the signature of the function. You should either… | |||||
"""Fills the directories given in input with the contents information | |||||
stored inside the directory child, only if they have contents. | |||||
""" | |||||
for path, child_node in self.children.items(): | |||||
Not Done Inline ActionsCould you document further the format of values? vlorentz: Could you document further the format of values? | |||||
Not Done Inline Actionsmissed this comment vlorentz: missed this comment | |||||
if child_node.otype == DIRECTORY: | |||||
rel_path = path.relative_to(root) | |||||
contents_info = child_node.count_contents() | |||||
# checks the first element of the tuple | |||||
# (the number of contents in a directory) | |||||
# if it is equal to zero it means that there are no contents | |||||
# in that directory. | |||||
Not Done Inline Actionsno need to return it (same reason as in the other comment) vlorentz: no need to return it (same reason as in the other comment) | |||||
if not contents_info[0] == 0: | |||||
directories[rel_path] = contents_info | |||||
if child_node.has_dirs(): | |||||
child_node.__getSubDirsInfo(root, directories) | |||||
def getDirectoriesInfo(self, root: PosixPath | |||||
) -> Dict[PosixPath, Tuple[int, int]]: | |||||
Not Done Inline ActionsAs it's fixed-length, and both values have different meanings, use Tuple[int, int] instead. vlorentz: As it's fixed-length, and both values have different meanings, use `Tuple[int, int]` instead. | |||||
"""Get information about all directories under the given root. | |||||
Returns: | |||||
A dictionary with a directory path as key and the relative | |||||
contents information (the result of count_contents) as values. | |||||
Not Done Inline Actions"A tuple of the total number of contents, and the number of discovered contents." What do you mean by "discovered"? vlorentz: "A tuple of the total number of contents, and the number of discovered contents."
What do you… | |||||
Not Done Inline ActionsIt's the number of contents found in the archive in a specific directory. DanSeraf: It's the number of contents found in the archive in a specific directory. | |||||
Not Done Inline ActionsCould you explain this in both docstrings? vlorentz: Could you explain this in both docstrings? | |||||
Not Done Inline Actionsand this one vlorentz: and this one | |||||
""" | |||||
directories = {root: self.count_contents()} | |||||
self.__getSubDirsInfo(root, directories) | |||||
return directories | |||||
def count_contents(self) -> Tuple[int, int]: | |||||
"""Count how many contents are present inside a directory. | |||||
If a directory has a pid returns as it has all the contents. | |||||
Returns: | |||||
A tuple with the total number of the contents and the number | |||||
of contents known (the ones that have a persistent identifier). | |||||
""" | |||||
contents = 0 | |||||
discovered = 0 | |||||
if not self.otype == DIRECTORY: | |||||
raise InvalidObjectType('Can\'t calculate contents of the ' | |||||
'object type: %s' % self.otype) | |||||
if self.pid: | |||||
# to identify a directory with all files/directories present | |||||
return (1, 1) | |||||
else: | |||||
for _, child_node in self.children.items(): | |||||
if child_node.otype == CONTENT: | |||||
contents += 1 | |||||
if child_node.pid: | |||||
discovered += 1 | |||||
return (contents, discovered) | |||||
def has_dirs(self) -> bool: | |||||
"""Checks if node has directories | |||||
""" | |||||
for _, child_node in self.children.items(): | |||||
if child_node.otype == DIRECTORY: | |||||
return True | |||||
return False |
directories looks like an accumulator; and it's initialized by the called with the content_count, which is the job done in this function.
It's also confusing to take an argument, then modify in place and return it.
Instead, you should turn this function into a private one (by prepending an understanding) without a return type, and add a public function getDirectoriesInfo(root: PosixPath) -> Dict[PosixPath, List].
This way, callers don't have to initialize the dict.