diff --git a/swh/scanner/model.py b/swh/scanner/model.py --- a/swh/scanner/model.py +++ b/swh/scanner/model.py @@ -7,7 +7,7 @@ import sys import json from pathlib import PosixPath -from typing import Any, Dict, Tuple +from typing import Any, Dict, Tuple, Iterable from enum import Enum from .plot import sunburst @@ -35,24 +35,25 @@ self.father = father self.path = path self.otype = DIRECTORY if path.is_dir() else CONTENT - self.pid = "" + self.swhid = "" + self.known = False self.children: Dict[PosixPath, Tree] = {} - def addNode(self, path: PosixPath, pid: str = None) -> None: + def addNode(self, path: PosixPath, swhid: str, known: bool) -> None: """Recursively add a new path. """ relative_path = path.relative_to(self.path) if relative_path == PosixPath("."): - if pid is not None: - self.pid = pid + self.swhid = swhid + self.known = known return new_path = self.path.joinpath(relative_path.parts[0]) if new_path not in self.children: self.children[new_path] = Tree(new_path, self) - self.children[new_path].addNode(path, pid) + self.children[new_path].addNode(path, swhid, known) def show(self, format) -> None: """Show tree in different formats""" @@ -82,7 +83,7 @@ end = "/" if node.otype == DIRECTORY else "" if isatty: - if not node.pid: + if not node.known: rel_path = colorize(rel_path, Color.red) elif node.otype == DIRECTORY: rel_path = colorize(rel_path, Color.blue) @@ -91,6 +92,43 @@ print(f"{begin}{rel_path}{end}") + @property + def info(self): + """ + Get information about the current path + + Returns: + a dictionary containing a path with its known/unknown status and the + Software Heritage persistent identifier + + """ + node_info = {} + node_info["swhid"] = self.swhid + node_info["known"] = self.known + return {str(self.path): node_info} + + def toDict(self, dict_nodes={}) -> Dict[str, Dict[str, Dict]]: + """ + Groups each child inside a dictionary + """ + for node_dict in self.iterate(): + dict_nodes.update(node_dict) + return dict_nodes + + def iterate(self) -> Iterable[Dict[str, Dict]]: + """ + Recursively iterate through the children of the current node + + Yields: + a dictionary containing a path with its known/unknown status and the + Software Heritage persistent identifier + + """ + for _, child_node in self.children.items(): + yield child_node.info + if child_node.otype == DIRECTORY: + yield from child_node.iterate() + def getTree(self): """Walk through the tree to discover content or directory that have a persistent identifier. If a persistent identifier is found it saves @@ -103,8 +141,8 @@ child_tree = {} for path, child_node in self.children.items(): rel_path = str(child_node.path.relative_to(self.path)) - if child_node.pid: - child_tree[rel_path] = child_node.pid + if child_node.swhid: + child_tree[rel_path] = child_node.swhid else: next_tree = child_node.getTree() if next_tree: @@ -158,14 +196,14 @@ "Can't calculate contents of the " "object type: %s" % self.otype ) - if self.pid: + if self.known: # to identify a directory with all files/directories present return (1, 1) else: for _, child_node in self.children.items(): if child_node.otype == CONTENT: contents += 1 - if child_node.pid: + if child_node.known: discovered += 1 return (contents, discovered) diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -164,18 +164,16 @@ """ async def _scan(root, session, api_url, source_tree, exclude_patterns): - for path, pid, found in await parse_path( + for path, pid, known in await parse_path( root, session, api_url, exclude_patterns ): obj_type = parse_persistent_identifier(pid).object_type if obj_type == CONTENT: - source_tree.addNode(path, pid if found else None) + source_tree.addNode(path, pid, known) elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns): - if found: - source_tree.addNode(path, pid) - else: - source_tree.addNode(path) + source_tree.addNode(path, pid, known) + if not known: await _scan(path, session, api_url, source_tree, exclude_patterns) async with aiohttp.ClientSession() as session: diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -89,7 +89,7 @@ @pytest.fixture(scope="function") def example_tree(temp_folder): """Fixture that generate a Tree with the root present in the - session fixture "temp_folder". + session fixture "temp_folder". """ example_tree = Tree(temp_folder["root"]) assert example_tree.path == temp_folder["root"] @@ -113,9 +113,9 @@ for path, pid in temp_folder["paths"].items(): if path in known_paths: - example_tree.addNode(path, pid) + example_tree.addNode(path, pid, True) else: - example_tree.addNode(path) + example_tree.addNode(path, pid, False) return example_tree.getDirectoriesInfo(root) diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -10,9 +10,12 @@ } # present pids inside /data/sample-folder -present_pids = [ +present_swhids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/ "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/ ] + + +to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326" diff --git a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json deleted file mode 100644 --- a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "foo": { - "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" - }, - "bar": { - "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93" - }, - "link-to-foo": { - "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" - }, - "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb" -} diff --git a/swh/scanner/tests/data/sample-folder-result.json b/swh/scanner/tests/data/sample-folder-result.json deleted file mode 100644 --- a/swh/scanner/tests/data/sample-folder-result.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "foo": { - "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" - }, - "bar": { - "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93" - }, - "link-to-foo": { - "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" - }, - "toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", - "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb" -} diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py --- a/swh/scanner/tests/flask_api.py +++ b/swh/scanner/tests/flask_api.py @@ -5,7 +5,7 @@ from flask import Flask, request -from .data import present_pids +from .data import present_swhids from swh.web.common.exc import LargePayloadExc @@ -15,17 +15,17 @@ @app.route("/known/", methods=["POST"]) def known(): - pids = request.get_json() + swhids = request.get_json() - if len(pids) > 900: + if len(swhids) > 900: raise LargePayloadExc( "The maximum number of PIDs this endpoint " "can receive is 900" ) - res = {pid: {"known": False} for pid in pids} - for pid in pids: - if pid in present_pids: - res[pid]["known"] = True + res = {swhid: {"known": False} for swhid in swhids} + for swhid in swhids: + if swhid in present_swhids: + res[swhid]["known"] = True return res diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py --- a/swh/scanner/tests/test_model.py +++ b/swh/scanner/tests/test_model.py @@ -8,7 +8,7 @@ avail_paths = temp_folder["paths"].keys() for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid) + example_tree.addNode(path, pid, False) for path, node in example_tree.children.items(): assert path in avail_paths @@ -17,39 +17,41 @@ assert subpath in avail_paths -def test_get_json_tree_all_not_present(example_tree, temp_folder): +def test_to_json_no_one_present(example_tree, temp_folder): for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path) + example_tree.addNode(path, pid, False) - json_tree = example_tree.getTree() + result = example_tree.toDict() - assert len(json_tree) == 0 + assert len(result) == 6 + + for _, node_info in result.items(): + assert node_info["known"] is False def test_get_json_tree_all_present(example_tree, temp_folder): for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid) + example_tree.addNode(path, pid, True) + + result = example_tree.toDict() - tree_dict = example_tree.getTree() + assert len(result) == 6 - assert len(tree_dict) == 3 - # since subdir have a pid, it can't have a children path - assert tree_dict["subdir0"] is not dict + for _, node_info in result.items(): + assert node_info["known"] is True def test_get_json_tree_only_one_present(example_tree, temp_folder): + root = temp_folder["root"] filesample_path = temp_folder["filesample"] for path, pid in temp_folder["paths"].items(): - if path == filesample_path: - example_tree.addNode(path, pid) - else: - example_tree.addNode(path) + example_tree.addNode(path, pid, True if path == filesample_path else False) - tree_dict = example_tree.getTree() + result = example_tree.toDict() - assert len(tree_dict) == 1 - assert tree_dict["subdir0"]["filesample.txt"] + assert len(result) == 6 + assert result[str(root) + "/subdir0/filesample.txt"]["known"] is True def test_get_directories_info(example_tree, temp_folder): @@ -61,9 +63,9 @@ for path, pid in temp_folder["paths"].items(): if path == filesample_path or path == filesample2_path: - example_tree.addNode(path, pid) + example_tree.addNode(path, pid, True) else: - example_tree.addNode(path) + example_tree.addNode(path, pid, False) directories = example_tree.getDirectoriesInfo(example_tree.path) diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py --- a/swh/scanner/tests/test_scanner.py +++ b/swh/scanner/tests/test_scanner.py @@ -7,7 +7,7 @@ import json from pathlib import PosixPath -from .data import correct_api_response +from .data import correct_api_response, present_swhids, to_exclude_swhid from swh.scanner.scanner import pids_discovery, get_subpaths, run from swh.scanner.model import Tree @@ -73,30 +73,24 @@ def test_scanner_result(live_server, event_loop, test_folder): api_url = live_server.url() + "/" - result_path = test_folder.joinpath(PosixPath("sample-folder-result.json")) - with open(result_path, "r") as json_file: - expected_result = json.loads(json_file.read()) - sample_folder = test_folder.joinpath(PosixPath("sample-folder")) source_tree = Tree(sample_folder) - event_loop.run_until_complete(run(sample_folder, api_url, source_tree, tuple())) - - actual_result = source_tree.getTree() + event_loop.run_until_complete(run(sample_folder, api_url, source_tree, set())) - assert actual_result == expected_result + for node_dict in source_tree.iterate(): + node_info = list(node_dict.values())[0] + if node_info["swhid"] in present_swhids: + assert node_info["known"] is True + else: + assert node_info["known"] is False def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder): api_url = live_server.url() + "/" - result_path = test_folder.joinpath( - PosixPath("sample-folder-result-no-toexclude.json") - ) - with open(result_path, "r") as json_file: - expected_result = json.loads(json_file.read()) - sample_folder = test_folder.joinpath(PosixPath("sample-folder")) + patterns = (str(sample_folder) + "/toexclude",) exclude_pattern = { reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns) @@ -107,6 +101,6 @@ run(sample_folder, api_url, source_tree, exclude_pattern) ) - actual_result = source_tree.getTree() - - assert actual_result == expected_result + for node_dict in source_tree.iterate(): + node_info = list(node_dict.values())[0] + assert node_info["swhid"] != to_exclude_swhid