diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@
 plotly
 pandas
 numpy
+ndjson
 dulwich
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -76,7 +76,7 @@
 @click.option(
     "-f",
     "--format",
-    type=click.Choice(["text", "json", "sunburst"], case_sensitive=False),
+    type=click.Choice(["text", "json", "ndjson", "sunburst"], case_sensitive=False),
     default="text",
     help="select the output format",
 )
@@ -95,7 +95,7 @@
     loop = asyncio.get_event_loop()
     loop.run_until_complete(run(root_path, api_url, source_tree, sre_patterns))
 
-    source_tree.show(format)
+    source_tree.output(format)
 
 
 if __name__ == "__main__":
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -7,9 +7,11 @@
 import sys
 import json
 from pathlib import PosixPath
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Tuple, Iterable
 from enum import Enum
 
+import ndjson  # type: ignore
+
 from .plot import sunburst
 from .exceptions import InvalidObjectType
 
@@ -36,28 +38,32 @@
         self.path = path
         self.otype = DIRECTORY if path.is_dir() else CONTENT
         self.pid = ""
+        self.known = False
         self.children: Dict[PosixPath, Tree] = {}
 
-    def addNode(self, path: PosixPath, pid: str = None) -> None:
+    def addNode(self, path: PosixPath, pid: str, known: bool) -> None:
         """Recursively add a new path.
         """
         relative_path = path.relative_to(self.path)
 
         if relative_path == PosixPath("."):
-            if pid is not None:
-                self.pid = pid
+            self.pid = pid
+            self.known = known
             return
 
         new_path = self.path.joinpath(relative_path.parts[0])
         if new_path not in self.children:
             self.children[new_path] = Tree(new_path, self)
 
-        self.children[new_path].addNode(path, pid)
+        self.children[new_path].addNode(path, pid, known)
 
-    def show(self, format) -> None:
-        """Show tree in different formats"""
+    def output(self, format) -> None:
+        """Display the model with the specified format"""
         if format == "json":
-            print(json.dumps(self.getTree(), indent=4, sort_keys=True))
+            print(json.dumps(self.toDict(), indent=4, sort_keys=True))
+
+        elif format == "ndjson":
+            print(ndjson.dumps(self.iterate()))
 
         elif format == "text":
             isatty = sys.stdout.isatty()
@@ -82,7 +88,7 @@
         end = "/" if node.otype == DIRECTORY else ""
 
         if isatty:
-            if not node.pid:
+            if not node.known:
                 rel_path = colorize(rel_path, Color.red)
             elif node.otype == DIRECTORY:
                 rel_path = colorize(rel_path, Color.blue)
@@ -91,26 +97,42 @@
 
         print(f"{begin}{rel_path}{end}")
 
-    def getTree(self):
-        """Walk through the tree to discover content or directory that have
-        a persistent identifier. If a persistent identifier is found it saves
-        the path with the relative PID.
+    @property
+    def info(self):
+        """
+        Get information about the current path
 
         Returns:
-            child_tree: the tree with the content/directory found
+            a dictionary containing a path with its known/unknown status and the
+            Software Heritage persistent identifier
 
         """
-        child_tree = {}
-        for path, child_node in self.children.items():
-            rel_path = str(child_node.path.relative_to(self.path))
-            if child_node.pid:
-                child_tree[rel_path] = child_node.pid
-            else:
-                next_tree = child_node.getTree()
-                if next_tree:
-                    child_tree[rel_path] = next_tree
+        node_info = {}
+        node_info["swhid"] = self.pid
+        node_info["known"] = self.known
+        return {str(self.path): node_info}
+
+    def toDict(self, dict_nodes=None) -> Dict[str, Dict[str, Dict]]:
+        """Merge the info of every descendant into dict_nodes and return it"""
+        # a fresh dict per call: a `{}` default would be shared between calls
+        dict_nodes = {} if dict_nodes is None else dict_nodes
+        for node_dict in self.iterate():
+            dict_nodes.update(node_dict)
+        return dict_nodes
 
-        return child_tree
+    def iterate(self) -> Iterable[Dict[str, Dict]]:
+        """
+        Recursively iterate through the children of the current node
+
+        Yields:
+            a dictionary containing a path with its known/unknown status and the
+            Software Heritage persistent identifier
+
+        """
+        for child_node in self.children.values():
+            yield child_node.info
+            if child_node.otype == DIRECTORY:
+                yield from child_node.iterate()
 
     def __getSubDirsInfo(self, root, directories):
         """Fills the directories given in input with the contents information
@@ -158,14 +180,14 @@
                 "Can't calculate contents of the " "object type: %s" % self.otype
             )
 
-        if self.pid:
+        if self.known:
             # to identify a directory with all files/directories present
             return (1, 1)
         else:
             for _, child_node in self.children.items():
                 if child_node.otype == CONTENT:
                     contents += 1
-                    if child_node.pid:
+                    if child_node.known:
                         discovered += 1
 
         return (contents, discovered)
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -164,18 +164,16 @@
     """
 
     async def _scan(root, session, api_url, source_tree, exclude_patterns):
-        for path, pid, found in await parse_path(
+        for path, pid, known in await parse_path(
             root, session, api_url, exclude_patterns
         ):
             obj_type = parse_persistent_identifier(pid).object_type
 
             if obj_type == CONTENT:
-                source_tree.addNode(path, pid if found else None)
+                source_tree.addNode(path, pid, known)
             elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns):
-                if found:
-                    source_tree.addNode(path, pid)
-                else:
-                    source_tree.addNode(path)
+                source_tree.addNode(path, pid, known)
+                if not known:
                     await _scan(path, session, api_url, source_tree, exclude_patterns)
 
     async with aiohttp.ClientSession() as session:
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -89,7 +89,7 @@
 @pytest.fixture(scope="function")
 def example_tree(temp_folder):
     """Fixture that generate a Tree with the root present in the
-       session fixture "temp_folder".
+    session fixture "temp_folder".
     """
     example_tree = Tree(temp_folder["root"])
     assert example_tree.path == temp_folder["root"]
@@ -113,9 +113,9 @@
 
     for path, pid in temp_folder["paths"].items():
         if path in known_paths:
-            example_tree.addNode(path, pid)
+            example_tree.addNode(path, pid, True)
         else:
-            example_tree.addNode(path)
+            example_tree.addNode(path, pid, False)
 
     return example_tree.getDirectoriesInfo(root)
 
diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py
--- a/swh/scanner/tests/data.py
+++ b/swh/scanner/tests/data.py
@@ -10,9 +10,12 @@
 }
 
 # present pids inside /data/sample-folder
-present_pids = [
+present_swhids = [
     "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a",  # quotes.md
     "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb",  # some-binary
     "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93",  # barfoo2/
     "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326",  # toexclude/
 ]
+
+
+to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326"
diff --git 
a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "foo": {
-        "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
-    },
-    "bar": {
-        "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
-    },
-    "link-to-foo": {
-        "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
-    },
-    "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/data/sample-folder-result.json b/swh/scanner/tests/data/sample-folder-result.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-    "foo": {
-        "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
-    },
-    "bar": {
-        "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
-    },
-    "link-to-foo": {
-        "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
-    },
-    "toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326",
-    "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py
--- a/swh/scanner/tests/flask_api.py
+++ b/swh/scanner/tests/flask_api.py
@@ -5,7 +5,7 @@
 
 from flask import Flask, request
 
-from .data import present_pids
+from .data import present_swhids
 
 from swh.web.common.exc import LargePayloadExc
 
@@ -15,17 +15,19 @@
 
     @app.route("/known/", methods=["POST"])
     def known():
-        pids = request.get_json()
+        swhids = request.get_json()
+        max_requests = 100
 
-        if len(pids) > 900:
+        if len(swhids) > max_requests:
             raise LargePayloadExc(
-                "The maximum number of PIDs this endpoint " "can receive is 900"
+                "The maximum number of PIDs this endpoint "
+                f"can receive is {max_requests}"
             )
 
-        res = {pid: {"known": False} for pid in pids}
-        for pid in pids:
-            if pid in present_pids:
-                res[pid]["known"] = True
+        res = {swhid: {"known": False} for swhid in swhids}
+        for swhid in swhids:
+            if swhid in present_swhids:
+                res[swhid]["known"] = True
 
         return res
 
diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py
--- a/swh/scanner/tests/test_model.py
+++ b/swh/scanner/tests/test_model.py
@@ -8,7 +8,7 @@
     avail_paths = temp_folder["paths"].keys()
 
     for path, pid in temp_folder["paths"].items():
-        example_tree.addNode(path, pid)
+        example_tree.addNode(path, pid, False)
 
     for path, node in example_tree.children.items():
         assert path in avail_paths
@@ -17,39 +17,41 @@
                 assert subpath in avail_paths
 
 
-def test_get_json_tree_all_not_present(example_tree, temp_folder):
+def test_to_json_no_one_present(example_tree, temp_folder):
     for path, pid in temp_folder["paths"].items():
-        example_tree.addNode(path)
+        example_tree.addNode(path, pid, False)
 
-    json_tree = example_tree.getTree()
+    result = example_tree.toDict()
 
-    assert len(json_tree) == 0
+    assert len(result) == 6
+
+    for _, node_info in result.items():
+        assert node_info["known"] is False
 
 
 def test_get_json_tree_all_present(example_tree, temp_folder):
     for path, pid in temp_folder["paths"].items():
-        example_tree.addNode(path, pid)
+        example_tree.addNode(path, pid, True)
+
+    result = example_tree.toDict()
 
-    tree_dict = example_tree.getTree()
+    assert len(result) == 6
 
-    assert len(tree_dict) == 3
-    # since subdir have a pid, it can't have a children path
-    assert tree_dict["subdir0"] is not dict
+    for _, node_info in result.items():
+        assert node_info["known"] is True
 
 
 def test_get_json_tree_only_one_present(example_tree, temp_folder):
+    root = temp_folder["root"]
     filesample_path = temp_folder["filesample"]
 
     for path, pid in temp_folder["paths"].items():
-        if path == filesample_path:
-            example_tree.addNode(path, pid)
-        else:
-            example_tree.addNode(path)
+        example_tree.addNode(path, pid, path == filesample_path)
 
-    tree_dict = example_tree.getTree()
+    result = example_tree.toDict()
 
-    assert len(tree_dict) == 1
-    assert tree_dict["subdir0"]["filesample.txt"]
+    assert len(result) == 6
+    assert result[str(root) + "/subdir0/filesample.txt"]["known"] is True
 
 
 def test_get_directories_info(example_tree, temp_folder):
@@ -61,9 +63,9 @@
 
     for path, pid in temp_folder["paths"].items():
         if path == filesample_path or path == filesample2_path:
-            example_tree.addNode(path, pid)
+            example_tree.addNode(path, pid, True)
         else:
-            example_tree.addNode(path)
+            example_tree.addNode(path, pid, False)
 
     directories = example_tree.getDirectoriesInfo(example_tree.path)
 
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -7,7 +7,7 @@
 import json
 from pathlib import PosixPath
 
-from .data import correct_api_response
+from .data import correct_api_response, present_swhids, to_exclude_swhid
 
 from swh.scanner.scanner import pids_discovery, get_subpaths, run
 from swh.scanner.model import Tree
@@ -45,8 +45,8 @@
     api_url = live_server.url() + "/"
 
     request = [
-        "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901)
-    ]  # /known/ is limited at 900
+        "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(101)
+    ]  # /known/ maximum accepted requests 100
 
     with pytest.raises(APIError):
         event_loop.run_until_complete(pids_discovery(request, aiosession, api_url))
@@ -73,30 +73,24 @@
 def test_scanner_result(live_server, event_loop, test_folder):
     api_url = live_server.url() + "/"
 
-    result_path = test_folder.joinpath(PosixPath("sample-folder-result.json"))
-    with open(result_path, "r") as json_file:
-        expected_result = json.loads(json_file.read())
-
     sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
 
     source_tree = Tree(sample_folder)
-    event_loop.run_until_complete(run(sample_folder, api_url, source_tree, tuple()))
-
-    actual_result = source_tree.getTree()
+    event_loop.run_until_complete(run(sample_folder, api_url, source_tree, set()))
 
-    assert actual_result == expected_result
+    for node_dict in source_tree.iterate():
+        node_info = list(node_dict.values())[0]
+        if node_info["swhid"] in present_swhids:
+            assert node_info["known"] is True
+        else:
+            assert node_info["known"] is False
 
 
 def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder):
     api_url = live_server.url() + "/"
 
-    result_path = test_folder.joinpath(
-        PosixPath("sample-folder-result-no-toexclude.json")
-    )
-    with open(result_path, "r") as json_file:
-        expected_result = json.loads(json_file.read())
-
     sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
+
     patterns = (str(sample_folder) + "/toexclude",)
     exclude_pattern = {
         reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns)
@@ -107,6 +101,6 @@
         run(sample_folder, api_url, source_tree, exclude_pattern)
     )
 
-    actual_result = source_tree.getTree()
-
-    assert actual_result == expected_result
+    for node_dict in source_tree.iterate():
+        node_info = list(node_dict.values())[0]
+        assert node_info["swhid"] != to_exclude_swhid