# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""
The purpose of this module is to display and to interact with the result of the
scanner contained in the model.

The `sunburst` function generates a navigable sunburst chart from the
directories information retrieved from the model. The chart displays for
each directory the total number of files and the percentage of files known.
The size of the directory is defined by the total number of contents whereas
the color gradient is generated relying on the percentage of contents known.
"""
from typing import List, Dict, Tuple
from pathlib import PosixPath

from plotly.offline import offline  # type: ignore
import plotly.graph_objects as go  # type: ignore
import pandas as pd  # type: ignore
import numpy as np  # type: ignore
def build_hierarchical_df(
        dirs_dataframe: pd.DataFrame, levels: List[str],
        metrics_columns: List[str]) -> pd.DataFrame:
    """
    Build a hierarchy of levels for Sunburst or Treemap charts.

    Args:
        dirs_dataframe: one row per directory; it has one column per level
            (e.g. 'lev0', 'lev1', ...) holding the path components, plus
            the two metrics columns.
        levels: names of the level columns, ordered from the shallowest
            to the deepest.
        metrics_columns: pair of column names: the total number of
            contents, then the number of known contents.

    Returns:
        A dataframe where, for each directory level stored, there is a
        row with the following information:

        id: the directory name
        parent: the parent directory of id ('total' at the last level)
        contents: the total number of contents of the directory id and
            the relative subdirectories
        known: the percentage of contents known relative to computed
            'contents'

    Example:
        Given the following dataframe:

        .. code-block:: none

            lev0       lev1       contents  known
            ''         ''         20        2     # root
            kernel     subdirker  5         0
            telnet     subdirtel  10        4

        The output hierarchical dataframe will be like the following:

        .. code-block:: none

            id         parent  contents  known
                                20        10.00
            subdirker  kernel  5         0.00
            subdirtel  telnet  10        40.00
                       total   20        10.00
            kernel     total   5         0.00
            telnet     total   10        40.00
            total              35        17.14

        To create the hierarchical dataframe we need to iterate through
        the dataframe given in input relying on the number of levels,
        starting from the deepest level so children rows are emitted
        before their parents.  The last iteration attaches every
        remaining directory to the synthetic 'total' parent, and a final
        'total' row aggregates the whole input (sum of the contents and
        the overall percentage of known contents).
    """
    def compute_known_percentage(contents: pd.Series, known: pd.Series
                                 ) -> pd.Series:
        """Return the percentage of known contents for each row, forcing
        the percentage to zero when a directory has no contents (this
        avoids a division by zero)."""
        # vectorized equivalent of the per-row division; `where` replaces
        # the inf/NaN produced by the zero-contents rows with 0
        percentages = (known / contents * 100).where(contents != 0, 0)
        return percentages.reset_index(drop=True)

    # revert the level order to start from the deepest
    levels = list(reversed(levels))
    contents_col = metrics_columns[0]
    known_col = metrics_columns[1]

    df_tree_list = []
    for i, level in enumerate(levels):
        # aggregate the metrics of every directory sharing the same
        # path suffix down to this level
        dfg = dirs_dataframe.groupby(levels[i:]).sum()
        dfg = dfg.reset_index()

        df_tree = pd.DataFrame(columns=['id', 'parent', 'contents', 'known'])
        df_tree['id'] = dfg[level].copy()
        if i < len(levels) - 1:
            # copy the parent directories (one level above)
            df_tree['parent'] = dfg[levels[i + 1]].copy()
        else:
            # last level reached: attach everything to the root node
            df_tree['parent'] = 'total'
        # copy the contents column
        df_tree['contents'] = dfg[contents_col]
        # compute the percentage relative to the contents
        df_tree['known'] = compute_known_percentage(
            dfg[contents_col], dfg[known_col])
        df_tree_list.append(df_tree)

    # create the main parent: the sum of all contents, and the average
    # of the known contents over the total number of contents
    total_contents = dirs_dataframe[contents_col].sum()
    total_known = dirs_dataframe[known_col].sum()
    total = pd.DataFrame([dict(id='total', parent='',
                               contents=total_contents,
                               known=total_known / total_contents * 100)])

    # concatenate everything once at the end instead of appending inside
    # the loop (DataFrame.append copied all previous data on every call
    # and is removed in pandas >= 2.0)
    return pd.concat(df_tree_list + [total], ignore_index=True)
def compute_max_depth(dirs_path: List[PosixPath], root: PosixPath) -> int:
    """Compute the maximum depth level of the given directory paths.

    The root path itself does not contribute to the depth.

    Example: for `var/log/kernel/` the depth level is 3
    """
    # depth of a path is its number of components; 0 when the list is
    # empty or contains only the root
    return max(
        (len(path.parts) for path in dirs_path if path != root),
        default=0)
def generate_df_from_dirs(dirs: Dict[PosixPath, Tuple[int, int]],
                          columns: List[str], root: PosixPath, max_depth: int
                          ) -> pd.DataFrame:
    """Generate a dataframe from the directories given in input.

    Args:
        dirs: mapping from a directory path to a pair (total number of
            contents, number of known contents).
        columns: the dataframe column names (one per level, then the two
            metrics columns).
        root: the root directory of the scan (stored with empty level
            columns, keeping only its metrics).
        max_depth: the deepest level among the input directories.

    Example:
        given the following directories as input

        .. code-block:: python

            dirs = {
                '/var/log/': (23, 2),
                '/var/log/kernel': (5, 0),
                '/var/log/telnet': (10, 3)
            }

        The generated dataframe will be:

        .. code-block:: none

            lev0   lev1   lev2      contents  known
            'var'  'log'  ''        23        2
            'var'  'log'  'kernel'  5         0
            'var'  'log'  'telnet'  10        3
    """
    rows = []
    for dir_path, contents_info in dirs.items():
        if dir_path == root:
            # ignore the root path components but keep its metrics
            rows.append([''] * max_depth + list(contents_info))
        else:
            parts = list(dir_path.parts)
            # pad the shallower paths with empty level columns
            padding = [''] * (max_depth - len(parts))
            rows.append(parts + padding + list(contents_info))

    df = pd.DataFrame(np.array(rows), columns=columns)
    # np.array coerced every cell to str: restore the numeric columns
    df['contents'] = pd.to_numeric(df['contents'])
    df['known'] = pd.to_numeric(df['known'])
    return df
def sunburst(directories: Dict[PosixPath, Tuple[int, int]],
             root: PosixPath) -> None:
    """Show the sunburst chart from the directories given in input.

    Args:
        directories: mapping from a directory path to a pair (total
            number of contents, number of known contents).
        root: the root directory of the scan.

    Side effect: writes 'sunburst.html' in the current working directory
    and opens it in the browser.
    """
    max_depth = compute_max_depth(list(directories.keys()), root)
    metrics_columns = ['contents', 'known']
    levels_columns = ['lev' + str(i) for i in range(max_depth)]

    df_columns = levels_columns + metrics_columns
    df = generate_df_from_dirs(directories, df_columns, root, max_depth)

    hierarchical_df = build_hierarchical_df(
        df, levels_columns, metrics_columns)

    # color midpoint: the overall percentage of known contents.  The
    # 'known' values used as colors are percentages (0-100), so the
    # ratio must be scaled by 100 to be in the same units; a plain
    # fraction would pin the midpoint at the bottom of the scale.
    known_avg = df['known'].sum() / df['contents'].sum() * 100

    fig = go.Figure()
    fig.add_trace(go.Sunburst(
        labels=hierarchical_df['id'],
        parents=hierarchical_df['parent'],
        values=hierarchical_df['contents'],
        branchvalues='total',
        marker=dict(
            colors=hierarchical_df['known'],
            colorscale='RdBu',
            cmid=known_avg),
        hovertemplate='''<b>%{label}</b>
            <br>Files: %{value}
            <br>Known: <b>%{color:.2f}%</b>''',
        name=''
    ))

    offline.plot(fig, filename='sunburst.html')