Changeset View
Changeset View
Standalone View
Standalone View
swh/scanner/plot.py
Show All 25 Lines | |||||
def build_hierarchical_df( | def build_hierarchical_df( | ||||
dirs_dataframe: pd.DataFrame, | dirs_dataframe: pd.DataFrame, | ||||
levels: List[str], | levels: List[str], | ||||
metrics_columns: List[str], | metrics_columns: List[str], | ||||
root_name: str, | root_name: str, | ||||
) -> pd.DataFrame: | ) -> pd.DataFrame: | ||||
""" | """ | ||||
Build a hierarchy of levels for Sunburst or Treemap charts. | Build a hierarchy of levels for Sunburst or Treemap charts. | ||||
For each directory the new dataframe will have the following | For each directory the new dataframe will have the following | ||||
information: | information: | ||||
id: the directory name | id: the directory name | ||||
parent: the parent directory of id | parent: the parent directory of id | ||||
contents: the total number of contents of the directory id and | contents: the total number of contents of the directory id and | ||||
the relative subdirectories | the relative subdirectories | ||||
known: the percentage of contents known relative to computed | known: the percentage of contents known relative to computed | ||||
'contents' | 'contents' | ||||
Example: | Example: | ||||
Given the following dataframe: | Given the following dataframe: | ||||
.. code-block:: none | .. code-block:: none | ||||
lev0 lev1 contents known | lev0 lev1 contents known | ||||
'' '' 20 2 //root | '' '' 20 2 //root | ||||
kernel kernel/subdirker 5 0 | kernel kernel/subdirker 5 0 | ||||
telnet telnet/subdirtel 10 4 | telnet telnet/subdirtel 10 4 | ||||
The output hierarchical dataframe will be like the following: | The output hierarchical dataframe will be like the following: | ||||
.. code-block:: none | .. code-block:: none | ||||
id parent contents known | id parent contents known | ||||
20 10.00 | 20 10.00 | ||||
kernel/subdirker kernel 5 0.00 | kernel/subdirker kernel 5 0.00 | ||||
telnet/subdirtel telnet 10 40.00 | telnet/subdirtel telnet 10 40.00 | ||||
total 20 10.00 | total 20 10.00 | ||||
kernel total 5 0.00 | kernel total 5 0.00 | ||||
telnet total 10 40.00 | telnet total 10 40.00 | ||||
total 35 17.14 | total 35 17.14 | ||||
To create the hierarchical dataframe we need to iterate through | To create the hierarchical dataframe we need to iterate through | ||||
the dataframe given in input relying on the number of levels. | the dataframe given in input relying on the number of levels. | ||||
Based on the previous example we have to do two iterations: | Based on the previous example we have to do two iterations: | ||||
iteration 1 | iteration 1 | ||||
The generated dataframe 'df_tree' will be: | The generated dataframe 'df_tree' will be: | ||||
.. code-block:: none | .. code-block:: none | ||||
id parent contents known | id parent contents known | ||||
20 10.0 | 20 10.0 | ||||
kernel/subdirker kernel 5 0.0 | kernel/subdirker kernel 5 0.0 | ||||
telnet/subdirtel telnet 10 40.0 | telnet/subdirtel telnet 10 40.0 | ||||
iteration 2 | iteration 2 | ||||
The generated dataframe 'df_tree' will be: | The generated dataframe 'df_tree' will be: | ||||
.. code-block:: none | .. code-block:: none | ||||
id parent contents known | id parent contents known | ||||
total 20 10.0 | total 20 10.0 | ||||
kernel total 5 0.0 | kernel total 5 0.0 | ||||
telnet total 10 40.0 | telnet total 10 40.0 | ||||
Note that since we have reached the last level, the parent given | Note that since we have reached the last level, the parent given | ||||
to the directory id is the directory root. | to the directory id is the directory root. | ||||
The 'total' row is computed by adding the number of contents of the | The 'total' row is computed by adding the number of contents of the | ||||
dataframe given in input and the average of the contents known on | dataframe given in input and the average of the contents known on | ||||
the total number of contents. | the total number of contents. | ||||
""" | """ | ||||
def compute_known_percentage(contents: pd.Series, known: pd.Series) -> pd.Series: | def compute_known_percentage(contents: pd.Series, known: pd.Series) -> pd.Series: | ||||
"""This function computes the percentage of known contents and generates | """This function computes the percentage of known contents and generates | ||||
the new known column with the percentage values. | the new known column with the percentage values. | ||||
It also ensures that if there are no contents inside a directory | It also ensures that if there are no contents inside a directory | ||||
the percentage is zero | the percentage is zero | ||||
""" | """ | ||||
known_values = [] | known_values = [] | ||||
for idx, content_val in enumerate(contents): | for idx, content_val in enumerate(contents): | ||||
if content_val == 0: | if content_val == 0: | ||||
known_values.append(0) | known_values.append(0) | ||||
else: | else: | ||||
percentage = known[idx] / contents[idx] * 100 | percentage = known[idx] / contents[idx] * 100 | ||||
Show All 38 Lines | total = pd.Series( | ||||
dict(id=root_name, parent="", contents=total_contents, known=total_avg) | dict(id=root_name, parent="", contents=total_contents, known=total_avg) | ||||
) | ) | ||||
complete_df = complete_df.append(total, ignore_index=True) | complete_df = complete_df.append(total, ignore_index=True) | ||||
return complete_df | return complete_df | ||||
def compute_max_depth(dirs_path: List[Path], root: Path) -> int: | def compute_max_depth(dirs_path: List[Path]) -> int: | ||||
"""Compute the maximum depth level of the given directory paths. | """Compute the maximum depth level of the given directory paths. | ||||
Example: for `var/log/kernel/` the depth level is 3 | Example: for `var/log/kernel/` the depth level is 3 | ||||
""" | """ | ||||
max_depth = 0 | max_depth = 0 | ||||
for dir_path in dirs_path: | for dir_path in dirs_path: | ||||
if dir_path == root: | dir_depth = len( | ||||
continue | dir_path.parts[1:] if dir_path.parts[0] == "/" else dir_path.parts | ||||
) | |||||
dir_depth = len(dir_path.parts) | |||||
if dir_depth > max_depth: | if dir_depth > max_depth: | ||||
max_depth = dir_depth | max_depth = dir_depth | ||||
return max_depth | return max_depth | ||||
def generate_df_from_dirs( | def generate_df_from_dirs( | ||||
dirs: Dict[Path, Tuple[int, int]], columns: List[str], root: Path, max_depth: int, | dirs: Dict[Path, Tuple[int, int]], columns: List[str], max_depth: int, | ||||
) -> pd.DataFrame: | ) -> pd.DataFrame: | ||||
"""Generate a dataframe from the directories given in input. | """Generate a dataframe from the directories given in input. | ||||
Example: | Example: | ||||
given the following directories as input | given the following directories as input | ||||
.. code-block:: python | .. code-block:: python | ||||
Show All 19 Lines | def get_parents(path: Path): | ||||
for i in range(1, len(parts) + 1): | for i in range(1, len(parts) + 1): | ||||
yield "/".join(parts[0:i]) | yield "/".join(parts[0:i]) | ||||
def get_dirs_array(): | def get_dirs_array(): | ||||
for dir_path, contents_info in dirs.items(): | for dir_path, contents_info in dirs.items(): | ||||
empty_lvl = max_depth - len(dir_path.parts) | empty_lvl = max_depth - len(dir_path.parts) | ||||
if dir_path == root: | yield list(get_parents(dir_path)) + [""] * empty_lvl + list(contents_info) | ||||
# ignore the root but store contents information | |||||
yield [""] * (max_depth) + list(contents_info) | |||||
else: | |||||
yield list(get_parents(dir_path)) + [""] * empty_lvl + list( | |||||
contents_info | |||||
) | |||||
df = pd.DataFrame( | df = pd.DataFrame( | ||||
np.array([dir_array for dir_array in get_dirs_array()]), columns=columns | np.array([dir_array for dir_array in get_dirs_array()]), columns=columns | ||||
) | ) | ||||
df["contents"] = pd.to_numeric(df["contents"]) | df["contents"] = pd.to_numeric(df["contents"]) | ||||
df["known"] = pd.to_numeric(df["known"]) | df["known"] = pd.to_numeric(df["known"]) | ||||
return df | return df | ||||
def generate_sunburst( | def generate_sunburst( | ||||
directories: Dict[Path, Tuple[int, int]], root: Path | directories: Dict[Path, Tuple[int, int]], root: Path | ||||
) -> go.Sunburst: | ) -> go.Sunburst: | ||||
"""Generate a sunburst chart from the directories given in input. | """Generate a sunburst chart from the directories given in input.""" | ||||
max_depth = compute_max_depth(list(directories.keys())) | |||||
""" | |||||
max_depth = compute_max_depth(list(directories.keys()), root) | |||||
metrics_columns = ["contents", "known"] | metrics_columns = ["contents", "known"] | ||||
levels_columns = ["lev" + str(i) for i in range(max_depth)] | levels_columns = ["lev" + str(i) for i in range(max_depth)] | ||||
df_columns = levels_columns + metrics_columns | df_columns = levels_columns + metrics_columns | ||||
dirs_df = generate_df_from_dirs(directories, df_columns, root, max_depth) | dirs_df = generate_df_from_dirs(directories, df_columns, max_depth) | ||||
hierarchical_df = build_hierarchical_df( | hierarchical_df = build_hierarchical_df( | ||||
dirs_df, levels_columns, metrics_columns, str(root) | dirs_df, levels_columns, metrics_columns, str(root) | ||||
) | ) | ||||
sunburst = go.Sunburst( | sunburst = go.Sunburst( | ||||
labels=hierarchical_df["id"], | labels=hierarchical_df["id"], | ||||
parents=hierarchical_df["parent"], | parents=hierarchical_df["parent"], | ||||
Show All 10 Lines | sunburst = go.Sunburst( | ||||
<br>Known: <b>%{color:.2f}%</b>""", | <br>Known: <b>%{color:.2f}%</b>""", | ||||
name="", | name="", | ||||
) | ) | ||||
return sunburst | return sunburst | ||||
def offline_plot(graph_object: go): | def offline_plot(graph_object: go): | ||||
"""Plot a graph object to an html file | """Plot a graph object to an html file""" | ||||
""" | |||||
fig = go.Figure() | fig = go.Figure() | ||||
fig.add_trace(graph_object) | fig.add_trace(graph_object) | ||||
offline.plot(fig, filename="chart.html") | offline.plot(fig, filename="chart.html") |