diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,7 @@ vcversioner requests aiohttp +plotly +pandas +numpy dulwich diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py --- a/swh/scanner/cli.py +++ b/swh/scanner/cli.py @@ -35,7 +35,8 @@ metavar='API_URL', show_default=True, help="url for the api request") @click.option('-f', '--format', - type=click.Choice(['text', 'json'], case_sensitive=False), + type=click.Choice(['text', 'json', 'sunburst'], + case_sensitive=False), default='text', help="select the output format") @click.pass_context diff --git a/swh/scanner/exceptions.py b/swh/scanner/exceptions.py --- a/swh/scanner/exceptions.py +++ b/swh/scanner/exceptions.py @@ -4,6 +4,10 @@ # See top-level LICENSE file for more information +class InvalidObjectType(TypeError): + pass + + class APIError(Exception): def __str__(self): return '"%s"' % self.args diff --git a/swh/scanner/model.py b/swh/scanner/model.py --- a/swh/scanner/model.py +++ b/swh/scanner/model.py @@ -7,9 +7,12 @@ import sys import json from pathlib import PosixPath -from typing import Any, Dict +from typing import Any, Dict, Tuple from enum import Enum +from .plot import sunburst +from .exceptions import InvalidObjectType + from swh.model.identifiers import ( DIRECTORY, CONTENT ) @@ -37,7 +40,7 @@ self.children: Dict[PosixPath, Tree] = {} def addNode(self, path: PosixPath, pid: str = None) -> None: - """Recursively add a new node path + """Recursively add a new path. 
""" relative_path = path.relative_to(self.path) @@ -53,9 +56,10 @@ self.children[new_path].addNode(path, pid) def show(self, format) -> None: - """Print all the tree""" + """Show tree in different formats""" if format == 'json': print(json.dumps(self.getTree(), indent=4, sort_keys=True)) + elif format == 'text': isatty = sys.stdout.isatty() @@ -63,7 +67,12 @@ else str(self.path)) self.printChildren(isatty) - def printChildren(self, isatty: bool, inc: int = 0) -> None: + elif format == 'sunburst': + root = self.path + directories = self.getDirectoriesInfo(root) + sunburst(directories, root) + + def printChildren(self, isatty: bool, inc: int = 1) -> None: for path, node in self.children.items(): self.printNode(node, isatty, inc) if node.children: @@ -104,3 +113,69 @@ child_tree[rel_path] = next_tree return child_tree + + def __getSubDirsInfo(self, root, directories): + """Fills the directories given in input with the contents information + stored inside the directory child, only if they have contents. + """ + for path, child_node in self.children.items(): + if child_node.otype == DIRECTORY: + rel_path = path.relative_to(root) + contents_info = child_node.count_contents() + # checks the first element of the tuple + # (the number of contents in a directory) + # if it is equal to zero it means that there are no contents + # in that directory. + if not contents_info[0] == 0: + directories[rel_path] = contents_info + if child_node.has_dirs(): + child_node.__getSubDirsInfo(root, directories) + + def getDirectoriesInfo(self, root: PosixPath + ) -> Dict[PosixPath, Tuple[int, int]]: + """Get information about all directories under the given root. + + Returns: + A dictionary with a directory path as key and the relative + contents information (the result of count_contents) as values. 
+ + """ + directories = {root: self.count_contents()} + self.__getSubDirsInfo(root, directories) + return directories + + def count_contents(self) -> Tuple[int, int]: + """Count how many contents are present inside a directory. + A directory that has a pid is reported as fully known: (1, 1). + + Returns: + A tuple with the total number of the contents and the number + of contents known (the ones that have a persistent identifier). + + """ + contents = 0 + discovered = 0 + + if not self.otype == DIRECTORY: + raise InvalidObjectType('Can\'t calculate contents of the ' + 'object type: %s' % self.otype) + + if self.pid: + # to identify a directory with all files/directories present + return (1, 1) + else: + for _, child_node in self.children.items(): + if child_node.otype == CONTENT: + contents += 1 + if child_node.pid: + discovered += 1 + + return (contents, discovered) + + def has_dirs(self) -> bool: + """Checks if node has directories + """ + for _, child_node in self.children.items(): + if child_node.otype == DIRECTORY: + return True + return False diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py new file mode 100644 --- /dev/null +++ b/swh/scanner/plot.py @@ -0,0 +1,264 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +""" +The purpose of this module is to display and to interact with the result of the +scanner contained in the model. + +The `sunburst` function generates a navigable sunburst chart from the +directories information retrieved from the model. The chart displays for +each directory the total number of files and the percentage of files known. + +The size of the directory is defined by the total number of contents whereas +the color gradient is generated relying on the percentage of contents known. 
+""" + +from typing import List, Dict, Tuple +from pathlib import PosixPath + +from plotly.offline import offline # type: ignore +import plotly.graph_objects as go # type: ignore +import pandas as pd # type: ignore +import numpy as np # type: ignore + + +def build_hierarchical_df( + dirs_dataframe: pd.DataFrame, levels: List[str], + metrics_columns: List[str], root_name: str) -> pd.DataFrame: + """ + Build a hierarchy of levels for Sunburst or Treemap charts. + + For each directory the new dataframe will have the following + information: + + id: the directory name + parent: the parent directory of id + contents: the total number of contents of the directory id and + the relative subdirectories + known: the percentage of contents known relative to computed + 'contents' + + Example: + Given the following dataframe: + + .. code-block:: none + + lev0 lev1 contents known + '' '' 20 2 //root + kernel kernel/subdirker 5 0 + telnet telnet/subdirtel 10 4 + + The output hierarchical dataframe will be like the following: + + .. code-block:: none + + id parent contents known + 20 10.00 + kernel/subdirker kernel 5 0.00 + telnet/subdirtel telnet 10 40.00 + total 20 10.00 + kernel total 5 0.00 + telnet total 10 40.00 + total 35 17.14 + + To create the hierarchical dataframe we need to iterate through + the dataframe given in input relying on the number of levels. + + Based on the previous example we have to do two iterations: + + iteration 1 + The generated dataframe 'df_tree' will be: + + .. code-block:: none + + id parent contents known + 20 10.0 + kernel/subdirker kernel 5 0.0 + telnet/subdirtel telnet 10 40.0 + + iteration 2 + The generated dataframe 'df_tree' will be: + + .. code-block:: none + + id parent contents known + total 20 10.0 + kernel total 5 0.0 + telnet total 10 40.0 + + Note that since we have reached the last level, the parent given + to the directory id is the directory root. 
+ + The 'total' row is computed by adding the number of contents of the + dataframe given in input and the average of the contents known on + the total number of contents. + + """ + def compute_known_percentage(contents: pd.Series, known: pd.Series + ) -> pd.Series: + """This function computes the percentage of known contents and generates + the new known column with the percentage values. + + It also ensures that if there are no contents inside a directory + the percentage is zero + + """ + known_values = [] + for idx, content_val in enumerate(contents): + if content_val == 0: + known_values.append(0) + else: + percentage = known[idx] / contents[idx] * 100 + known_values.append(percentage) + + return pd.Series(np.array(known_values)) + + complete_df = pd.DataFrame(columns=['id', 'parent', 'contents', 'known']) + # revert the level order to start from the deepest + levels = [level for level in reversed(levels)] + contents_col = metrics_columns[0] + known_col = metrics_columns[1] + + df_tree_list = [] + for i, level in enumerate(levels): + df_tree = pd.DataFrame(columns=['id', 'parent', 'contents', 'known']) + dfg = dirs_dataframe.groupby(levels[i:]).sum() + dfg = dfg.reset_index() + df_tree['id'] = dfg[level].copy() + if i < len(levels) - 1: + # copy the parent directories (one level above) + df_tree['parent'] = dfg[levels[i+1]].copy() + else: + # last level reached + df_tree['parent'] = root_name + + # copy the contents column + df_tree['contents'] = dfg[contents_col] + # compute the percentage relative to the contents + df_tree['known'] = compute_known_percentage( + dfg[contents_col], dfg[known_col]) + + df_tree_list.append(df_tree) + + complete_df = complete_df.append(df_tree_list, ignore_index=True) + + # create the main parent + total_contents = dirs_dataframe[contents_col].sum() + total_known = dirs_dataframe[known_col].sum() + total_avg = total_known / total_contents * 100 + + total = pd.Series(dict(id=root_name, parent='', + contents=total_contents, 
known=total_avg)) + + complete_df = complete_df.append(total, ignore_index=True) + + return complete_df + + +def compute_max_depth(dirs_path: List[PosixPath], root: PosixPath) -> int: + """Compute the maximum depth level of the given directory paths. + + Example: for `var/log/kernel/` the depth level is 3 + + """ + max_depth = 0 + for dir_path in dirs_path: + if dir_path == root: + continue + + dir_depth = len(dir_path.parts) + if dir_depth > max_depth: + max_depth = dir_depth + + return max_depth + + +def generate_df_from_dirs(dirs: Dict[PosixPath, Tuple[int, int]], + columns: List[str], root: PosixPath, max_depth: int + ) -> pd.DataFrame: + """Generate a dataframe from the directories given in input. + + Example: + given the following directories as input + + .. code-block:: python + + dirs = { + '/var/log/': (23, 2), + '/var/log/kernel': (5, 0), + '/var/log/telnet': (10, 3) + } + + The generated dataframe will be: + + .. code-block:: none + + lev0 lev1 lev2 contents known + 'var' 'var/log' '' 23 2 + 'var' 'var/log' 'var/log/kernel' 5 0 + 'var' 'var/log' 'var/log/telnet' 10 3 + + """ + def get_parents(path: PosixPath): + parts = path.parts[1:] if path.parts[0] == '/' else path.parts + + for i in range(1, len(parts)+1): + yield '/'.join(parts[0:i]) + + def get_dirs_array(): + for dir_path, contents_info in dirs.items(): + empty_lvl = max_depth - len(dir_path.parts) + + if dir_path == root: + # ignore the root but store contents information + yield ['']*(max_depth) + list(contents_info) + else: + yield list(get_parents(dir_path)) + \ + ['']*empty_lvl + \ + list(contents_info) + + df = pd.DataFrame(np.array( + [dir_array for dir_array in get_dirs_array()]), columns=columns) + + df['contents'] = pd.to_numeric(df['contents']) + df['known'] = pd.to_numeric(df['known']) + + return df + + +def sunburst(directories: Dict[PosixPath, Tuple[int, int]], + root: PosixPath) -> None: + """Show the sunburst chart from the directories given in input. 
+ + """ + max_depth = compute_max_depth(list(directories.keys()), root) + metrics_columns = ['contents', 'known'] + levels_columns = ['lev'+str(i) for i in range(max_depth)] + + df_columns = levels_columns + metrics_columns + dirs_df = generate_df_from_dirs(directories, df_columns, root, max_depth) + + hierarchical_df = build_hierarchical_df( + dirs_df, levels_columns, metrics_columns, str(root)) + known_avg = dirs_df['known'].sum() / dirs_df['contents'].sum() + + fig = go.Figure() + fig.add_trace(go.Sunburst( + labels=hierarchical_df['id'], + parents=hierarchical_df['parent'], + values=hierarchical_df['contents'], + branchvalues='total', + marker=dict( + colors=hierarchical_df['known'], + colorscale='RdBu', + cmid=known_avg), + hovertemplate='''%{label} +
Files: %{value} +
Known: %{color:.2f}%''', + name='' + )) + + offline.plot(fig, filename='sunburst.html') diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -11,6 +11,7 @@ from aioresponses import aioresponses # type: ignore from swh.model.cli import pid_of_file, pid_of_dir +from swh.scanner.model import Tree from .flask_api import create_app @@ -46,7 +47,9 @@ root = { subdir: { + subsubdir filesample.txt + filesample2.txt } subdir2 subfile.txt @@ -54,31 +57,67 @@ """ root = tmp_path_factory.getbasetemp() subdir = tmp_path_factory.mktemp('subdir') + subsubdir = subdir.joinpath('subsubdir') + subsubdir.mkdir() subdir2 = tmp_path_factory.mktemp('subdir2') subfile = root / 'subfile.txt' subfile.touch() filesample = subdir / 'filesample.txt' filesample.touch() + filesample2 = subdir / 'filesample2.txt' + filesample2.touch() avail_path = { subdir: pid_of_dir(bytes(subdir)), + subsubdir: pid_of_dir(bytes(subsubdir)), subdir2: pid_of_dir(bytes(subdir2)), subfile: pid_of_file(bytes(subfile)), - filesample: pid_of_file(bytes(filesample)) + filesample: pid_of_file(bytes(filesample)), + filesample2: pid_of_file(bytes(filesample2)) } return { 'root': root, 'paths': avail_path, - 'filesample': filesample + 'filesample': filesample, + 'filesample2': filesample2, + 'subsubdir': subsubdir, + 'subdir': subdir } -@pytest.fixture(scope='session') -def app(): - """Flask backend API (used by live_server).""" - app = create_app() - return app +@pytest.fixture(scope='function') +def example_tree(temp_folder): + """Fixture that generate a Tree with the root present in the + session fixture "temp_folder". 
+ """ + example_tree = Tree(temp_folder['root']) + assert example_tree.path == temp_folder['root'] + + return example_tree + + +@pytest.fixture(scope='function') +def example_dirs(example_tree, temp_folder): + """ + Fixture that fill the fixture example_tree with the values contained in + the fixture temp_folder and returns the directories information of the + filled example_tree. + + """ + root = temp_folder['root'] + filesample_path = temp_folder['filesample'] + filesample2_path = temp_folder['filesample2'] + subsubdir_path = temp_folder['subsubdir'] + known_paths = [filesample_path, filesample2_path, subsubdir_path] + + for path, pid in temp_folder['paths'].items(): + if path in known_paths: + example_tree.addNode(path, pid) + else: + example_tree.addNode(path) + + return example_tree.getDirectoriesInfo(root) @pytest.fixture @@ -88,3 +127,10 @@ tests_data_folder = tests_path.joinpath('data') assert tests_data_folder.exists() return tests_data_folder + + +@pytest.fixture(scope='session') +def app(): + """Flask backend API (used by live_server).""" + app = create_app() + return app diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py --- a/swh/scanner/tests/test_model.py +++ b/swh/scanner/tests/test_model.py @@ -3,21 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import pytest - -from swh.scanner.model import Tree - - -@pytest.fixture(scope='function') -def example_tree(temp_folder): - """Fixture that generate a Tree with the root present in the - session fixture "temp_folder". 
- """ - example_tree = Tree(temp_folder['root']) - assert example_tree.path == temp_folder['root'] - - return example_tree - def test_tree_add_node(example_tree, temp_folder): avail_paths = temp_folder['paths'].keys() @@ -65,3 +50,22 @@ assert len(tree_dict) == 1 assert tree_dict['subdir0']['filesample.txt'] + + +def test_get_directories_info(example_tree, temp_folder): + root_path = temp_folder['root'] + filesample_path = temp_folder['filesample'] + filesample2_path = temp_folder['filesample2'] + subdir_path = temp_folder['subdir'].relative_to(root_path) + subsubdir_path = temp_folder['subsubdir'].relative_to(root_path) + + for path, pid in temp_folder['paths'].items(): + if path == filesample_path or path == filesample2_path: + example_tree.addNode(path, pid) + else: + example_tree.addNode(path) + + directories = example_tree.getDirectoriesInfo(example_tree.path) + + assert subsubdir_path not in directories + assert directories[subdir_path] == (2, 2) diff --git a/swh/scanner/tests/test_plot.py b/swh/scanner/tests/test_plot.py new file mode 100644 --- /dev/null +++ b/swh/scanner/tests/test_plot.py @@ -0,0 +1,56 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.scanner.plot import ( + compute_max_depth, generate_df_from_dirs, build_hierarchical_df +) + + +def test_max_depth(temp_folder, example_dirs): + root = temp_folder['root'] + max_depth = compute_max_depth(example_dirs, root) + assert max_depth == 2 + + +def test_generate_df_from_dirs(temp_folder, example_dirs): + root = temp_folder['root'] + max_depth = compute_max_depth(example_dirs, root) + metrics_columns = ['contents', 'known'] + levels_columns = ['lev'+str(i) for i in range(max_depth)] + df_columns = levels_columns + metrics_columns + + actual_df = generate_df_from_dirs( + example_dirs, 
df_columns, root, max_depth) + + # assert root is empty + assert actual_df['lev0'][0] == '' + assert actual_df['lev1'][0] == '' + + # assert subdir has correct contents information + assert actual_df['contents'][1] == 2 + assert actual_df['known'][1] == 2 + + # assert subsubdir has correct level information + assert actual_df['lev0'][2] == 'subdir0' + assert actual_df['lev1'][2] == 'subdir0/subsubdir' + + +def test_build_hierarchical_df(temp_folder, example_dirs): + root = temp_folder['root'] + max_depth = compute_max_depth(example_dirs, root) + metrics_columns = ['contents', 'known'] + levels_columns = ['lev'+str(i) for i in range(max_depth)] + df_columns = levels_columns + metrics_columns + + actual_df = generate_df_from_dirs( + example_dirs, df_columns, root, max_depth) + + actual_result = build_hierarchical_df( + actual_df, levels_columns, metrics_columns, root) + + assert actual_result['parent'][1] == 'subdir0' + assert actual_result['contents'][1] == 2 + assert actual_result['id'][5] == root + assert actual_result['known'][5] == 75