diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -2,6 +2,9 @@
aioresponses
pytest_asyncio
pytest_flask
+plotly
+pandas
+numpy
swh.core[testing-core]
swh.model[testing]
swh.storage[testing]
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -35,7 +35,8 @@
metavar='API_URL', show_default=True,
help="url for the api request")
@click.option('-f', '--format',
- type=click.Choice(['text', 'json'], case_sensitive=False),
+ type=click.Choice(['text', 'json', 'sunburst'],
+ case_sensitive=False),
default='text',
help="select the output format")
@click.pass_context
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -7,9 +7,11 @@
import sys
import json
from pathlib import PosixPath
-from typing import Any, Dict
+from typing import Any, Dict, List
from enum import Enum
+from .plot import sunburst
+
from swh.model.identifiers import (
DIRECTORY, CONTENT
)
@@ -37,7 +39,7 @@
self.children: Dict[PosixPath, Tree] = {}
def addNode(self, path: PosixPath, pid: str = None) -> None:
- """Recursively add a new node path
+ """Recursively add a new path.
"""
relative_path = path.relative_to(self.path)
@@ -53,9 +55,10 @@
self.children[new_path].addNode(path, pid)
def show(self, format) -> None:
- """Print all the tree"""
+ """Show tree in different formats"""
if format == 'json':
print(json.dumps(self.getTree(), indent=4, sort_keys=True))
+
elif format == 'text':
isatty = sys.stdout.isatty()
@@ -63,7 +66,13 @@
else str(self.path))
self.printChildren(isatty)
- def printChildren(self, isatty: bool, inc: int = 0) -> None:
+ elif format == 'sunburst':
+ root = self.path
+ directories = {root: self.count_contents()}
+ directories = self.getDirectoriesInfo(directories, root)
+ sunburst(directories, root)
+
+ def printChildren(self, isatty: bool, inc: int = 1) -> None:
for path, node in self.children.items():
self.printNode(node, isatty, inc)
if node.children:
@@ -104,3 +113,53 @@
child_tree[rel_path] = next_tree
return child_tree
+
+    def getDirectoriesInfo(self, directories, root) -> Dict[PosixPath, List]:
+        """Collect the contents information of every nested directory.
+
+        ``directories`` is filled in place, keyed by the path relative to
+        ``root``, and returned; directories whose contents count is zero
+        are skipped (their sub-directories are still visited).
+
+        """
+        for child_path, child in self.children.items():
+            if child.otype != DIRECTORY:
+                continue
+            rel_path = child_path.relative_to(root)
+            counts = child.count_contents()
+            if counts[0] != 0:
+                directories[rel_path] = counts
+            if child.has_dirs():
+                child.getDirectoriesInfo(directories, root)
+        return directories
+
+    def count_contents(self) -> List[int]:
+        """Count the contents found directly inside this directory.
+
+        Only direct children are counted, nested directories are ignored.
+        A directory that has a pid is reported as ``[1, 1]`` without
+        looking at its children.
+
+        Returns:
+            A two-item list: the number of contents, then how many of
+            them have a pid.
+
+        """
+        # a directory with a pid is treated as one fully discovered entry
+        # instead of being counted content by content
+        if self.otype == DIRECTORY and self.pid:
+            return [1, 1]
+
+        files = [child for child in self.children.values()
+                 if child.otype == CONTENT]
+        discovered = sum(1 for child in files if child.pid)
+
+        return [len(files), discovered]
+
+    def has_dirs(self) -> bool:
+        """Tell whether at least one direct child is a directory.
+        """
+        return any(
+            child.otype == DIRECTORY
+            for child in self.children.values()
+        )
diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/plot.py
@@ -0,0 +1,123 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import plotly.graph_objects as go # type: ignore
+import pandas as pd # type: ignore
+import numpy as np # type: ignore
+from typing import Iterable, List, Dict
+from pathlib import PosixPath
+
+
+def get_hierarchical_dataframe(
+        df: pd.DataFrame, levels: List[str],
+        color_columns: List[str]) -> pd.DataFrame:
+    """Build a hierarchy of levels for Sunburst or Treemap charts.
+
+    Args:
+        df: one row per directory, with the ``levels`` and metric columns
+        levels: level column names, deepest level first
+        color_columns: the value column name, then the color column name
+
+    Returns:
+        A dataframe with one ``id``/``parent``/``value``/``color`` row per
+        node, closed by a ``total`` root row; ``color`` is a percentage.
+    """
+    value_col, color_col = color_columns[0], color_columns[1]
+    frames = []
+    for i, level in enumerate(levels):
+        dfg = df.groupby(levels[i:])[[value_col, color_col]].sum()
+        dfg = dfg.reset_index()
+        frames.append(pd.DataFrame({
+            'id': dfg[level],
+            'parent': dfg[levels[i + 1]] if i + 1 < len(levels) else 'total',
+            'value': dfg[value_col],
+            'color': dfg[color_col] / dfg[value_col] * 100}))
+    total = df[value_col].sum()
+    frames.append(pd.DataFrame({
+        'id': ['total'], 'parent': [''], 'value': [total],
+        'color': [df[color_col].sum() / total * 100]}))
+    # DataFrame.append was removed in pandas 2.0; concat works everywhere
+    return pd.concat(frames, ignore_index=True)
+
+
+
+def find_levels(df: pd.DataFrame) -> Iterable[str]:
+    """Yield the ``lev*`` column names of ``df``, last column first.
+    """
+    column_names = list(df.to_dict())
+    yield from (name for name in reversed(column_names)
+                if name.startswith('lev'))
+
+
+def generate_df_columns(dirs, root: PosixPath):
+    """Build the level column names needed to describe every directory.
+
+    One ``lev<N>`` column is produced per path component of the deepest
+    directory in ``dirs``; the ``root`` entry itself is not measured.
+
+    Returns:
+        The list ``['lev0', 'lev1', ...]``, empty when only ``root`` is given.
+
+    """
+    depths = (len(path.parts) for path in dirs if path != root)
+    deepest = max(depths, default=0)
+    return ['lev%d' % level for level in range(deepest)]
+
+
+def generate_df(dirs, columns: List[str], root: PosixPath,
+                max_level: int) -> pd.DataFrame:
+    """Build a dataframe with one row per directory given in input.
+
+    Every row holds the path parts of the directory, padded with empty
+    strings up to ``max_level`` cells, followed by its contents
+    information. The ``root`` row has empty strings in all level cells.
+
+    """
+    def as_row(path: PosixPath, info: List[int]) -> List:
+        if path == root:
+            # the root has no level of its own, only contents information
+            return [''] * max_level + info
+        parts = list(path.parts)
+        return parts + [''] * (max_level - len(parts)) + info
+
+    rows = [as_row(path, info) for path, info in dirs.items()]
+    df = pd.DataFrame(np.array(rows), columns=columns)
+
+    for numeric_column in ('size', 'discovered'):
+        df[numeric_column] = pd.to_numeric(df[numeric_column])
+    return df
+
+
+def sunburst(directories: Dict[PosixPath, List[int]], root: PosixPath) -> None:
+    """Build a plotly sunburst chart of the directories and display it.
+
+    ``directories`` maps each path to ``[contents, discovered contents]``.
+    """
+    columns = ['size', 'discovered']
+    df_columns = generate_df_columns(directories.keys(), root) + columns
+    max_level = len(df_columns) - len(columns)
+    df = generate_df(directories, df_columns, root, max_level)
+    # levels used for the hierarchical chart
+    levels = list(find_levels(df))
+    hierarchical_df = get_hierarchical_dataframe(df, levels, columns)
+    # colors are percentages, so the scale midpoint must be one as well
+    discovered_avg = df['discovered'].sum() / df['size'].sum() * 100
+
+    fig = go.Figure()
+    fig.add_trace(go.Sunburst(
+        labels=hierarchical_df['id'],
+        parents=hierarchical_df['parent'],
+        values=hierarchical_df['value'],
+        branchvalues='total',
+        marker=dict(
+            colors=hierarchical_df['color'],
+            colorscale='RdBu',
+            cmid=discovered_avg),
+        hovertemplate='''%{label}
+        <br>Files: %{value}
+        <br>Discovered: %{color:.2f}%''',
+        name=''
+    ))
+    fig.show()
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -46,7 +46,9 @@
root = {
subdir: {
+ subsubdir
filesample.txt
+ filesample2.txt
}
subdir2
subfile.txt
@@ -54,23 +56,32 @@
"""
root = tmp_path_factory.getbasetemp()
subdir = tmp_path_factory.mktemp('subdir')
+ subsubdir = subdir.joinpath('subsubdir')
+ subsubdir.mkdir()
subdir2 = tmp_path_factory.mktemp('subdir2')
subfile = root / 'subfile.txt'
subfile.touch()
filesample = subdir / 'filesample.txt'
filesample.touch()
+ filesample2 = subdir / 'filesample2.txt'
+ filesample2.touch()
avail_path = {
subdir: pid_of_dir(bytes(subdir)),
+ subsubdir: pid_of_dir(bytes(subsubdir)),
subdir2: pid_of_dir(bytes(subdir2)),
subfile: pid_of_file(bytes(subfile)),
- filesample: pid_of_file(bytes(filesample))
+ filesample: pid_of_file(bytes(filesample)),
+ filesample2: pid_of_file(bytes(filesample2))
}
return {
'root': root,
'paths': avail_path,
- 'filesample': filesample
+ 'filesample': filesample,
+ 'filesample2': filesample2,
+ 'subsubdir': subsubdir,
+ 'subdir': subdir
}
diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py
--- a/swh/scanner/tests/test_model.py
+++ b/swh/scanner/tests/test_model.py
@@ -65,3 +65,25 @@
assert len(tree_dict) == 1
assert tree_dict['subdir0']['filesample.txt']
+
+
+def test_get_directories_info(example_tree, temp_folder):
+    """Only directories holding contents are reported, with their counts."""
+    root_path = temp_folder['root']
+    known_files = (temp_folder['filesample'], temp_folder['filesample2'])
+    subdir_path = temp_folder['subdir'].relative_to(root_path)
+    subsubdir_path = temp_folder['subsubdir'].relative_to(root_path)
+
+    for path, pid in temp_folder['paths'].items():
+        if path in known_files:
+            example_tree.addNode(path, pid)
+        else:
+            example_tree.addNode(path)
+
+    tree_root = example_tree
+    directories = {tree_root.path: tree_root.count_contents()}
+    directories = tree_root.getDirectoriesInfo(directories, tree_root.path)
+
+    # subsubdir is empty, subdir holds the two discovered files
+    assert subsubdir_path not in directories
+    assert directories[subdir_path] == [2, 2]