diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,6 +2,9 @@ aioresponses pytest_asyncio pytest_flask +plotly +pandas +numpy swh.core[testing-core] swh.model[testing] swh.storage[testing] diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -35,7 +35,8 @@ metavar='API_URL', show_default=True, help="url for the api request") @click.option('-f', '--format', - type=click.Choice(['text', 'json'], case_sensitive=False), + type=click.Choice(['text', 'json', 'sunburst'], + case_sensitive=False), default='text', help="select the output format") @click.pass_context diff --git a/swh/scanner/model.py b/swh/scanner/model.py --- a/swh/scanner/model.py +++ b/swh/scanner/model.py @@ -7,9 +7,11 @@ import sys import json from pathlib import PosixPath -from typing import Any, Dict +from typing import Any, Dict, List from enum import Enum +from .plot import sunburst + from swh.model.identifiers import ( DIRECTORY, CONTENT ) @@ -37,7 +39,7 @@ self.children: Dict[PosixPath, Tree] = {} def addNode(self, path: PosixPath, pid: str = None) -> None: - """Recursively add a new node path + """Recursively add a new path. 
""" relative_path = path.relative_to(self.path) @@ -53,9 +55,10 @@ self.children[new_path].addNode(path, pid) def show(self, format) -> None: - """Print all the tree""" + """Show tree in different formats""" if format == 'json': print(json.dumps(self.getTree(), indent=4, sort_keys=True)) + elif format == 'text': isatty = sys.stdout.isatty() @@ -63,7 +66,13 @@ else str(self.path)) self.printChildren(isatty) - def printChildren(self, isatty: bool, inc: int = 0) -> None: + elif format == 'sunburst': + root = self.path + directories = {root: self.count_contents()} + directories = self.getDirectoriesInfo(directories, root) + sunburst(directories, root) + + def printChildren(self, isatty: bool, inc: int = 1) -> None: for path, node in self.children.items(): self.printNode(node, isatty, inc) if node.children: @@ -104,3 +113,53 @@ child_tree[rel_path] = next_tree return child_tree + + def getDirectoriesInfo(self, directories, root) -> Dict[PosixPath, List]: + """Get information about all directories stored inside the tree. + + Returns: + A dictionary with the path as key and the contents information + as values. + + """ + for path, child_node in self.children.items(): + if child_node.otype == DIRECTORY: + rel_path = path.relative_to(root) + contents_info = child_node.count_contents() + if not contents_info[0] == 0: + directories[rel_path] = contents_info + if child_node.has_dirs(): + child_node.getDirectoriesInfo(directories, root) + + return directories + + def count_contents(self) -> List[int]: + """Count how many contents are present inside a directory. + If a directory has a pid returns as it has all the contents. + + Returns: + A list with the number of contents / discovered contents. 
+ + """ + contents = 0 + discovered = 0 + + # to identify a directory with all files/directories present + if self.otype == DIRECTORY and self.pid: + return [1, 1] + + for _, child_node in self.children.items(): + if child_node.otype == CONTENT: + contents += 1 + if child_node.pid: + discovered += 1 + + return [contents, discovered] + + def has_dirs(self) -> bool: + """Checks if node has directories + """ + for _, child_node in self.children.items(): + if child_node.otype == DIRECTORY: + return True + return False diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py new file mode 100644 --- /dev/null +++ b/swh/scanner/plot.py @@ -0,0 +1,123 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import plotly.graph_objects as go # type: ignore +import pandas as pd # type: ignore +import numpy as np # type: ignore +from typing import Iterable, List, Dict +from pathlib import PosixPath + + +def get_hierarchical_dataframe( + df: pd.DataFrame, levels: List[str], + color_columns: List[str]) -> pd.DataFrame: + """ + Build a hierarchy of levels for Sunburst or Treemap charts. 
+ """ + complete_df = pd.DataFrame(columns=['id', 'parent', 'value', 'color']) + value_col = color_columns[0] + color_col = color_columns[1] + + for i, level in enumerate(levels): + df_tree = pd.DataFrame(columns=['id', 'parent', 'value', 'color']) + dfg = df.groupby(levels[i:]).sum() + dfg = dfg.reset_index() + df_tree['id'] = dfg[level].copy() + if i < len(levels) - 1: + df_tree['parent'] = dfg[levels[i+1]].copy() + else: + df_tree['parent'] = 'total' + df_tree['value'] = dfg[value_col] + df_tree['color'] = dfg[color_col] / dfg[value_col] * 100 + complete_df = complete_df.append(df_tree, ignore_index=True) + + tot_avg = df[color_col].sum() / df[value_col].sum() * 100 + total = pd.Series(dict(id='total', parent='', + value=df[value_col].sum(), + color=tot_avg)) + + complete_df = complete_df.append(total, ignore_index=True) + + return complete_df + + +def find_levels(df: pd.DataFrame) -> Iterable[str]: + """Discover levels inside the dataframe + """ + for level in reversed(list(df.to_dict().keys())): + if level.startswith('lev'): + yield level + + +def generate_df_columns(dirs, root: PosixPath): + """Generate columns based on the level of directories present + """ + max_lvl = 0 + for dir_path in dirs: + if dir_path == root: + continue + + curr_lvl = len(dir_path.parts) + if curr_lvl > max_lvl: + max_lvl = curr_lvl + + return ['lev'+str(i) for i in range(max_lvl)] + + +def generate_df(dirs, columns: List[str], root: PosixPath, + max_level: int) -> pd.DataFrame: + """Generate a dataframe with the directories given in input + """ + def get_dirs_array(dir_path: PosixPath) -> Iterable[List[str]]: + for dir_path, contents_info in dirs.items(): + levels = max_level - len(dir_path.parts) + + if dir_path == root: + # ignore the root but store contents information + yield ['']*(max_level) + contents_info + else: + path_array = [part for part in dir_path.parts] + yield path_array + ['']*levels + contents_info + + df = pd.DataFrame(np.array( + [dir_array for dir_array in 
get_dirs_array(dirs)]), columns=columns) + + df['size'] = pd.to_numeric(df['size']) + df['discovered'] = pd.to_numeric(df['discovered']) + + return df + + +def sunburst(directories: Dict[PosixPath, List[int]], root: PosixPath) -> None: + columns = ['size', 'discovered'] + + df_columns = generate_df_columns(directories.keys(), root) + columns + max_level = len(df_columns) - 2 + df = generate_df(directories, df_columns, root, max_level) + + # levels used for the hierarchical chart + levels = [e for e in find_levels(df)] + + hierarchical_df = get_hierarchical_dataframe(df, levels, columns) + discovered_avg = df['discovered'].sum() / df['size'].sum() + + fig = go.Figure() + + fig.add_trace(go.Sunburst( + labels=hierarchical_df['id'], + parents=hierarchical_df['parent'], + values=hierarchical_df['value'], + branchvalues='total', + marker=dict( + colors=hierarchical_df['color'], + colorscale='RdBu', + cmid=discovered_avg), + hovertemplate='''%{label} +
Files: %{value} +
Discovered: %{color:.2f}%''', + name='' + )) + + fig.show() diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -46,7 +46,9 @@ root = { subdir: { + subsubdir filesample.txt + filesample2.txt } subdir2 subfile.txt @@ -54,23 +56,32 @@ """ root = tmp_path_factory.getbasetemp() subdir = tmp_path_factory.mktemp('subdir') + subsubdir = subdir.joinpath('subsubdir') + subsubdir.mkdir() subdir2 = tmp_path_factory.mktemp('subdir2') subfile = root / 'subfile.txt' subfile.touch() filesample = subdir / 'filesample.txt' filesample.touch() + filesample2 = subdir / 'filesample2.txt' + filesample2.touch() avail_path = { subdir: pid_of_dir(bytes(subdir)), + subsubdir: pid_of_dir(bytes(subsubdir)), subdir2: pid_of_dir(bytes(subdir2)), subfile: pid_of_file(bytes(subfile)), - filesample: pid_of_file(bytes(filesample)) + filesample: pid_of_file(bytes(filesample)), + filesample2: pid_of_file(bytes(filesample2)) } return { 'root': root, 'paths': avail_path, - 'filesample': filesample + 'filesample': filesample, + 'filesample2': filesample2, + 'subsubdir': subsubdir, + 'subdir': subdir } diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py --- a/swh/scanner/tests/test_model.py +++ b/swh/scanner/tests/test_model.py @@ -65,3 +65,25 @@ assert len(tree_dict) == 1 assert tree_dict['subdir0']['filesample.txt'] + + +def test_get_directories_info(example_tree, temp_folder): + root_path = temp_folder['root'] + filesample_path = temp_folder['filesample'] + filesample2_path = temp_folder['filesample2'] + subdir_path = temp_folder['subdir'].relative_to(root_path) + subsubdir_path = temp_folder['subsubdir'].relative_to(root_path) + + for path, pid in temp_folder['paths'].items(): + if path == filesample_path or path == filesample2_path: + print(path) + example_tree.addNode(path, pid) + else: + example_tree.addNode(path) + + tree_root = example_tree + directories = {tree_root.path: 
tree_root.count_contents()} + directories = tree_root.getDirectoriesInfo(directories, tree_root.path) + + assert subsubdir_path not in directories + assert directories[subdir_path] == [2, 2]