diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,7 @@
vcversioner
requests
aiohttp
+plotly
+pandas
+numpy
dulwich
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -35,7 +35,8 @@
metavar='API_URL', show_default=True,
help="url for the api request")
@click.option('-f', '--format',
- type=click.Choice(['text', 'json'], case_sensitive=False),
+ type=click.Choice(['text', 'json', 'sunburst'],
+ case_sensitive=False),
default='text',
help="select the output format")
@click.pass_context
diff --git a/swh/scanner/exceptions.py b/swh/scanner/exceptions.py
--- a/swh/scanner/exceptions.py
+++ b/swh/scanner/exceptions.py
@@ -4,6 +4,10 @@
# See top-level LICENSE file for more information
+class InvalidObjectType(TypeError):
+    """Error for an operation applied to an object of an unsupported type."""
+
+
class APIError(Exception):
def __str__(self):
return '"%s"' % self.args
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -7,9 +7,12 @@
import sys
import json
from pathlib import PosixPath
-from typing import Any, Dict
+from typing import Any, Dict, Tuple
from enum import Enum
+from .plot import sunburst
+from .exceptions import InvalidObjectType
+
from swh.model.identifiers import (
DIRECTORY, CONTENT
)
@@ -37,7 +40,7 @@
self.children: Dict[PosixPath, Tree] = {}
def addNode(self, path: PosixPath, pid: str = None) -> None:
- """Recursively add a new node path
+ """Recursively add a new path.
"""
relative_path = path.relative_to(self.path)
@@ -53,9 +56,10 @@
self.children[new_path].addNode(path, pid)
def show(self, format) -> None:
- """Print all the tree"""
+ """Show tree in different formats"""
if format == 'json':
print(json.dumps(self.getTree(), indent=4, sort_keys=True))
+
elif format == 'text':
isatty = sys.stdout.isatty()
@@ -63,7 +67,12 @@
else str(self.path))
self.printChildren(isatty)
- def printChildren(self, isatty: bool, inc: int = 0) -> None:
+ elif format == 'sunburst':
+ root = self.path
+ directories = self.getDirectoriesInfo(root)
+ sunburst(directories, root)
+
+ def printChildren(self, isatty: bool, inc: int = 1) -> None:
for path, node in self.children.items():
self.printNode(node, isatty, inc)
if node.children:
@@ -104,3 +113,69 @@
child_tree[rel_path] = next_tree
return child_tree
+
+    def __getSubDirsInfo(self, root, directories):
+        """Collect the contents information of every sub-directory.
+
+        Each child of type DIRECTORY is stored in ``directories`` under its
+        path relative to ``root``, with the result of its ``count_contents``
+        as value. Directories that report zero contents are left out, but
+        their own sub-directories are still visited.
+        """
+        for child_path, child in self.children.items():
+            if child.otype != DIRECTORY:
+                continue
+
+            rel_path = child_path.relative_to(root)
+            info = child.count_contents()
+            # info[0] is the number of contents found in the directory:
+            # zero means there is nothing to report for it
+            if info[0] != 0:
+                directories[rel_path] = info
+
+            if child.has_dirs():
+                child.__getSubDirsInfo(root, directories)
+
+    def getDirectoriesInfo(self, root: PosixPath
+                           ) -> Dict[PosixPath, Tuple[int, int]]:
+        """Gather the contents information of this node and its directories.
+
+        Returns:
+            A dictionary whose first entry maps ``root`` to the result of
+            count_contents on this node; the other entries map the path of
+            a sub-directory, relative to ``root``, to the result of
+            count_contents on that sub-directory.
+
+        """
+        info = {}
+        info[root] = self.count_contents()
+        self.__getSubDirsInfo(root, info)
+        return info
+
+    def count_contents(self) -> Tuple[int, int]:
+        """Count the contents that are direct children of this directory.
+
+        A directory that has a pid is reported as a single, fully known
+        content, i.e. (1, 1).
+
+        Returns:
+            A tuple with the total number of contents and the number of
+            known contents (the ones that have a persistent identifier).
+
+        Raises:
+            InvalidObjectType: if this node is not a directory.
+
+        """
+        if self.otype != DIRECTORY:
+            raise InvalidObjectType('Can\'t calculate contents of the '
+                                    'object type: %s' % self.otype)
+
+        if self.pid:
+            # to identify a directory with all files/directories present
+            return (1, 1)
+
+        files = [child for child in self.children.values()
+                 if child.otype == CONTENT]
+        known = sum(1 for child in files if child.pid)
+
+        return (len(files), known)
+
+    def has_dirs(self) -> bool:
+        """Tell whether at least one direct child is a directory.
+        """
+        return any(child.otype == DIRECTORY
+                   for child in self.children.values())
diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/plot.py
@@ -0,0 +1,257 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""
+The purpose of this module is to display and to interact with the result of the
+scanner contained in the model.
+
+The `sunburst` function generates a navigable sunburst chart from the
+directories information retrieved from the model. The chart displays for
+each directory the total number of files and the percentage of files known.
+
+The size of the directory is defined by the total number of contents whereas
+the color gradient is generated relying on the percentage of contents known.
+"""
+
+from typing import List, Dict, Tuple
+from pathlib import PosixPath
+
+from plotly.offline import offline # type: ignore
+import plotly.graph_objects as go # type: ignore
+import pandas as pd # type: ignore
+import numpy as np # type: ignore
+
+
+def build_hierarchical_df(
+        dirs_dataframe: pd.DataFrame, levels: List[str],
+        metrics_columns: List[str]) -> pd.DataFrame:
+    """
+    Build a hierarchy of levels for Sunburst or Treemap charts.
+
+    For each directory the new dataframe will have the following
+    information:
+
+    id: the directory name
+    parent: the parent directory of id
+    contents: the total number of contents of the directory id and
+    the relative subdirectories
+    known: the percentage of contents known relative to computed
+    'contents'
+
+    Example:
+        Given the following dataframe:
+
+        .. code-block:: none
+
+            lev0     lev1       contents  known
+            ''       ''         20        2     //root
+            kernel   subdirker  5         0
+            telnet   subdirtel  10        4
+
+        The output hierarchical dataframe will be like the following:
+
+        .. code-block:: none
+
+            id         parent  contents  known
+                               20        10.00
+            subdirker  kernel  5         0.00
+            subdirtel  telnet  10        40.00
+                       total   20        10.00
+            kernel     total   5         0.00
+            telnet     total   10        40.00
+            total              35        17.14
+
+        To create the hierarchical dataframe we need to iterate through
+        the dataframe given in input relying on the number of levels.
+
+        Based on the previous example we have to do two iterations:
+
+        iteration 1
+        The generated dataframe 'df_tree' will be:
+
+        .. code-block:: none
+
+            id         parent  contents  known
+                               20        10.0
+            subdirker  kernel  5         0.0
+            subdirtel  telnet  10        40.0
+
+        iteration 2
+        The generated dataframe 'df_tree' will be:
+
+        .. code-block:: none
+
+            id         parent  contents  known
+                       total   20        10.0
+            kernel     total   5         0.0
+            telnet     total   10        40.0
+
+        Note that since we have reached the last level, the parent given
+        to the directory id is 'total'.
+
+        The 'total' row holds the sum of the contents of the dataframe
+        given in input and the percentage of known contents over that sum
+        (zero when there are no contents at all).
+
+    Args:
+        dirs_dataframe: one row per directory, with one column per level
+            plus the metrics columns
+        levels: names of the level columns, from the shallowest to the
+            deepest
+        metrics_columns: name of the contents column followed by the name
+            of the known column
+
+    Returns:
+        A dataframe with the columns 'id', 'parent', 'contents' and
+        'known': the rows of each level, deepest level first, followed by
+        the 'total' row.
+
+    """
+    def compute_known_percentage(contents: pd.Series, known: pd.Series
+                                 ) -> pd.Series:
+        """Return the percentage of known contents of each row.
+
+        If there is no contents inside a directory the percentage is zero.
+
+        """
+        # hide the zero denominators so that the division yields NaN for
+        # them, then replace those NaN with the documented zero
+        safe_contents = contents.where(contents != 0)
+        return (known / safe_contents * 100).fillna(0)
+
+    columns = ['id', 'parent', 'contents', 'known']
+    contents_col = metrics_columns[0]
+    known_col = metrics_columns[1]
+    # revert the level order to start from the deepest
+    levels = list(reversed(levels))
+
+    frames = []
+    for i, level in enumerate(levels):
+        # sum the metrics only: the columns of the deeper levels hold
+        # directory names and must not be aggregated
+        dfg = dirs_dataframe.groupby(levels[i:])[[contents_col, known_col]]
+        dfg = dfg.sum().reset_index()
+
+        if i < len(levels) - 1:
+            # the parent directories are one level above
+            parent = dfg[levels[i + 1]]
+        else:
+            # last level reached
+            parent = 'total'
+
+        frames.append(pd.DataFrame({
+            'id': dfg[level],
+            'parent': parent,
+            'contents': dfg[contents_col],
+            # percentage relative to the contents
+            'known': compute_known_percentage(
+                dfg[contents_col], dfg[known_col]),
+        }, columns=columns))
+
+    # create the main parent
+    total_contents = dirs_dataframe[contents_col].sum()
+    total_known = dirs_dataframe[known_col].sum()
+    if total_contents:
+        total_avg = total_known / total_contents * 100
+    else:
+        # same rule as for a single directory: no contents, zero percent
+        total_avg = 0.0
+
+    frames.append(pd.DataFrame(
+        [['total', '', total_contents, total_avg]], columns=columns))
+
+    # DataFrame.append is not available anymore in pandas >= 2.0, hence
+    # all the pieces are joined with a single concat
+    return pd.concat(frames, ignore_index=True)
+
+
+def compute_max_depth(dirs_path: List[PosixPath], root: PosixPath) -> int:
+    """Return the depth of the deepest directory path, root excluded.
+
+    The depth of a path is the number of its components, for example the
+    depth of `var/log/kernel/` is 3. Without any path other than the root
+    the result is 0.
+
+    """
+    depths = (len(path.parts) for path in dirs_path if path != root)
+    return max(depths, default=0)
+
+
+def generate_df_from_dirs(dirs: Dict[PosixPath, Tuple[int, int]],
+                          columns: List[str], root: PosixPath, max_depth: int
+                          ) -> pd.DataFrame:
+    """Turn the directories information into a dataframe, one row each.
+
+    A row holds the components of the directory path, padded with empty
+    strings up to `max_depth` levels, followed by the contents
+    information. The levels of the root row are all empty.
+
+    Example:
+        given the root `/var/log`, a max_depth of 2 and the following
+        directories as input
+
+        .. code-block:: python
+
+            dirs = {
+                PosixPath('/var/log'): (23, 2),
+                PosixPath('kernel'): (5, 0),
+                PosixPath('kernel/debug'): (10, 3)
+            }
+
+        The generated dataframe will be:
+
+        .. code-block:: none
+
+            lev0      lev1     contents  known
+            ''        ''       23        2
+            'kernel'  ''       5         0
+            'kernel'  'debug'  10        3
+
+    """
+    def as_row(dir_path, contents_info):
+        if dir_path == root:
+            # the root has no level of its own, only its contents count
+            level_names = [''] * max_depth
+        else:
+            parts = list(dir_path.parts)
+            level_names = parts + [''] * (max_depth - len(parts))
+        return level_names + list(contents_info)
+
+    rows = [as_row(dir_path, info) for dir_path, info in dirs.items()]
+    df = pd.DataFrame(np.array(rows), columns=columns)
+
+    # the array mixes names and counters: turn the counters back to numbers
+    for metric in ('contents', 'known'):
+        df[metric] = pd.to_numeric(df[metric])
+
+    return df
+
+
+def sunburst(directories: Dict[PosixPath, Tuple[int, int]],
+             root: PosixPath) -> None:
+    """Show the sunburst chart from the directories given in input.
+
+    The size of each sector is the number of contents of the directory,
+    its color is the percentage of known contents. The chart is plotted
+    with plotly's offline mode using the file name `sunburst.html`.
+
+    Args:
+        directories: the contents information (total, known) of each
+            directory, keyed by directory path
+        root: the path used as root in `directories`
+
+    """
+    max_depth = compute_max_depth(list(directories.keys()), root)
+    metrics_columns = ['contents', 'known']
+    levels_columns = ['lev'+str(i) for i in range(max_depth)]
+
+    df_columns = levels_columns + metrics_columns
+    df = generate_df_from_dirs(directories, df_columns, root, max_depth)
+
+    hierarchical_df = build_hierarchical_df(
+        df, levels_columns, metrics_columns)
+
+    # the colors are percentages (0-100): the mid-point of the color scale
+    # must be expressed as a percentage too, not as a 0-1 fraction
+    total_contents = df['contents'].sum()
+    if total_contents:
+        known_avg = df['known'].sum() / total_contents * 100
+    else:
+        # nothing to count: avoid dividing by zero
+        known_avg = 0
+
+    fig = go.Figure()
+    fig.add_trace(go.Sunburst(
+        labels=hierarchical_df['id'],
+        parents=hierarchical_df['parent'],
+        values=hierarchical_df['contents'],
+        branchvalues='total',
+        marker=dict(
+            colors=hierarchical_df['known'],
+            colorscale='RdBu',
+            cmid=known_avg),
+        hovertemplate='''%{label}
+
Files: %{value}
+
Known: %{color:.2f}%''',
+        name=''
+    ))
+
+    offline.plot(fig, filename='sunburst.html')
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -11,6 +11,7 @@
from aioresponses import aioresponses # type: ignore
from swh.model.cli import pid_of_file, pid_of_dir
+from swh.scanner.model import Tree
from .flask_api import create_app
@@ -46,7 +47,9 @@
root = {
subdir: {
+ subsubdir
filesample.txt
+ filesample2.txt
}
subdir2
subfile.txt
@@ -54,31 +57,67 @@
"""
root = tmp_path_factory.getbasetemp()
subdir = tmp_path_factory.mktemp('subdir')
+ subsubdir = subdir.joinpath('subsubdir')
+ subsubdir.mkdir()
subdir2 = tmp_path_factory.mktemp('subdir2')
subfile = root / 'subfile.txt'
subfile.touch()
filesample = subdir / 'filesample.txt'
filesample.touch()
+ filesample2 = subdir / 'filesample2.txt'
+ filesample2.touch()
avail_path = {
subdir: pid_of_dir(bytes(subdir)),
+ subsubdir: pid_of_dir(bytes(subsubdir)),
subdir2: pid_of_dir(bytes(subdir2)),
subfile: pid_of_file(bytes(subfile)),
- filesample: pid_of_file(bytes(filesample))
+ filesample: pid_of_file(bytes(filesample)),
+ filesample2: pid_of_file(bytes(filesample2))
}
return {
'root': root,
'paths': avail_path,
- 'filesample': filesample
+ 'filesample': filesample,
+ 'filesample2': filesample2,
+ 'subsubdir': subsubdir,
+ 'subdir': subdir
}
-@pytest.fixture(scope='session')
-def app():
- """Flask backend API (used by live_server)."""
- app = create_app()
- return app
+@pytest.fixture(scope='function')
+def example_tree(temp_folder):
+    """Provide a new Tree whose root is the root of the session fixture
+    "temp_folder".
+    """
+    root = temp_folder['root']
+    tree = Tree(root)
+    assert tree.path == root
+
+    return tree
+
+
+@pytest.fixture(scope='function')
+def example_dirs(example_tree, temp_folder):
+    """
+    Fill example_tree with the paths of temp_folder and return the
+    directories information of the filled tree.
+
+    Only the two sample files and subsubdir are added with their pid,
+    every other path is added without one.
+
+    """
+    known_paths = [temp_folder[name]
+                   for name in ('filesample', 'filesample2', 'subsubdir')]
+
+    for path, pid in temp_folder['paths'].items():
+        if path in known_paths:
+            example_tree.addNode(path, pid)
+        else:
+            example_tree.addNode(path)
+
+    return example_tree.getDirectoriesInfo(temp_folder['root'])
@pytest.fixture
@@ -88,3 +127,10 @@
tests_data_folder = tests_path.joinpath('data')
assert tests_data_folder.exists()
return tests_data_folder
+
+
+@pytest.fixture(scope='session')
+def app():
+    """Create the Flask backend API (used by live_server)."""
+    return create_app()
diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py
--- a/swh/scanner/tests/test_model.py
+++ b/swh/scanner/tests/test_model.py
@@ -3,21 +3,6 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import pytest
-
-from swh.scanner.model import Tree
-
-
-@pytest.fixture(scope='function')
-def example_tree(temp_folder):
- """Fixture that generate a Tree with the root present in the
- session fixture "temp_folder".
- """
- example_tree = Tree(temp_folder['root'])
- assert example_tree.path == temp_folder['root']
-
- return example_tree
-
def test_tree_add_node(example_tree, temp_folder):
avail_paths = temp_folder['paths'].keys()
@@ -65,3 +50,22 @@
assert len(tree_dict) == 1
assert tree_dict['subdir0']['filesample.txt']
+
+
+def test_get_directories_info(example_tree, temp_folder):
+    root_path = temp_folder['root']
+    known_files = (temp_folder['filesample'], temp_folder['filesample2'])
+
+    # only the two sample files are added with their pid
+    for path, pid in temp_folder['paths'].items():
+        if path in known_files:
+            example_tree.addNode(path, pid)
+        else:
+            example_tree.addNode(path)
+
+    directories = example_tree.getDirectoriesInfo(example_tree.path)
+
+    subdir_path = temp_folder['subdir'].relative_to(root_path)
+    subsubdir_path = temp_folder['subsubdir'].relative_to(root_path)
+
+    # subsubdir has no contents, so it must not be reported
+    assert subsubdir_path not in directories
+    assert directories[subdir_path] == (2, 2)
diff --git a/swh/scanner/tests/test_plot.py b/swh/scanner/tests/test_plot.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/tests/test_plot.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scanner.plot import (
+ compute_max_depth, generate_df_from_dirs, build_hierarchical_df
+)
+
+
+def test_max_depth(temp_folder, example_dirs):
+    # the deepest directory of example_dirs is subdir/subsubdir
+    assert compute_max_depth(example_dirs, temp_folder['root']) == 2
+
+
+def test_generate_df_from_dirs(temp_folder, example_dirs):
+    root = temp_folder['root']
+    max_depth = compute_max_depth(example_dirs, root)
+    levels_columns = ['lev%d' % i for i in range(max_depth)]
+    df_columns = levels_columns + ['contents', 'known']
+
+    actual_df = generate_df_from_dirs(
+        example_dirs, df_columns, root, max_depth)
+
+    # rows follow the order of example_dirs: root, subdir, subsubdir
+    root_row, subdir_row, subsubdir_row = 0, 1, 2
+
+    # assert root is empty
+    for level in ('lev0', 'lev1'):
+        assert actual_df[level][root_row] == ''
+
+    # assert subdir has correct contents information
+    for metric in ('contents', 'known'):
+        assert actual_df[metric][subdir_row] == 2
+
+    # assert subsubdir has correct level information
+    assert actual_df['lev0'][subsubdir_row] == 'subdir0'
+    assert actual_df['lev1'][subsubdir_row] == 'subsubdir'
+
+
+def test_build_hierarchical_df(temp_folder, example_dirs):
+    root = temp_folder['root']
+    max_depth = compute_max_depth(example_dirs, root)
+    metrics_columns = ['contents', 'known']
+    levels_columns = ['lev%d' % i for i in range(max_depth)]
+
+    dirs_df = generate_df_from_dirs(
+        example_dirs, levels_columns + metrics_columns, root, max_depth)
+
+    actual_result = build_hierarchical_df(
+        dirs_df, levels_columns, metrics_columns)
+
+    # second row of the deepest level: the contents held by subdir
+    assert actual_result['parent'][1] == 'subdir0'
+    assert actual_result['contents'][1] == 2
+
+    # the last row is the main parent
+    total_row = 5
+    assert actual_result['id'][total_row] == 'total'
+    assert actual_result['known'][total_row] == 75