Page Menu | Home | Software Heritage

D3069.id10908.diff
No One | Temporary

D3069.id10908.diff

diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@
plotly
pandas
numpy
+ndjson
dulwich
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -76,7 +76,7 @@
@click.option(
"-f",
"--format",
- type=click.Choice(["text", "json", "sunburst"], case_sensitive=False),
+ type=click.Choice(["text", "json", "ndjson", "sunburst"], case_sensitive=False),
default="text",
help="select the output format",
)
@@ -95,7 +95,7 @@
loop = asyncio.get_event_loop()
loop.run_until_complete(run(root_path, api_url, source_tree, sre_patterns))
- source_tree.show(format)
+ source_tree.output(format)
if __name__ == "__main__":
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -7,9 +7,11 @@
import sys
import json
from pathlib import PosixPath
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Tuple, Iterable
from enum import Enum
+import ndjson # type: ignore
+
from .plot import sunburst
from .exceptions import InvalidObjectType
@@ -36,28 +38,32 @@
self.path = path
self.otype = DIRECTORY if path.is_dir() else CONTENT
self.pid = ""
+ self.known = False
self.children: Dict[PosixPath, Tree] = {}
- def addNode(self, path: PosixPath, pid: str = None) -> None:
+ def addNode(self, path: PosixPath, pid: str, known: bool) -> None:
"""Recursively add a new path.
"""
relative_path = path.relative_to(self.path)
if relative_path == PosixPath("."):
- if pid is not None:
- self.pid = pid
+ self.pid = pid
+ self.known = known
return
new_path = self.path.joinpath(relative_path.parts[0])
if new_path not in self.children:
self.children[new_path] = Tree(new_path, self)
- self.children[new_path].addNode(path, pid)
+ self.children[new_path].addNode(path, pid, known)
- def show(self, format) -> None:
- """Show tree in different formats"""
+ def output(self, format) -> None:
+ """Display the model with the specified format"""
if format == "json":
- print(json.dumps(self.getTree(), indent=4, sort_keys=True))
+ print(json.dumps(self.toDict(), indent=4, sort_keys=True))
+
+ elif format == "ndjson":
+ print(ndjson.dumps(dict_path for dict_path in self.iterate()))
elif format == "text":
isatty = sys.stdout.isatty()
@@ -82,7 +88,7 @@
end = "/" if node.otype == DIRECTORY else ""
if isatty:
- if not node.pid:
+ if not node.known:
rel_path = colorize(rel_path, Color.red)
elif node.otype == DIRECTORY:
rel_path = colorize(rel_path, Color.blue)
@@ -91,26 +97,42 @@
print(f"{begin}{rel_path}{end}")
- def getTree(self):
- """Walk through the tree to discover content or directory that have
- a persistent identifier. If a persistent identifier is found it saves
- the path with the relative PID.
+ @property
+ def info(self):
+ """
+ Get information about the current path
Returns:
- child_tree: the tree with the content/directory found
+ a dictionary containing a path with its known/unknown status and the
+ Software Heritage persistent identifier
"""
- child_tree = {}
- for path, child_node in self.children.items():
- rel_path = str(child_node.path.relative_to(self.path))
- if child_node.pid:
- child_tree[rel_path] = child_node.pid
- else:
- next_tree = child_node.getTree()
- if next_tree:
- child_tree[rel_path] = next_tree
+ node_info = {}
+ node_info["swhid"] = self.pid
+ node_info["known"] = self.known
+ return {str(self.path): node_info}
+
+ def toDict(self, dict_nodes={}) -> Dict[str, Dict[str, Dict]]:
+ """
+ Merge the info of every node yielded by iterate() into a single dictionary keyed by path
+ """
+ for node_dict in self.iterate():
+ dict_nodes.update(node_dict)
+ return dict_nodes
- return child_tree
+ def iterate(self) -> Iterable[Dict[str, Dict]]:
+ """
+ Recursively iterate through the children of the current node
+
+ Yields:
+ a dictionary containing a path with its known/unknown status and the
+ Software Heritage persistent identifier
+
+ """
+ for _, child_node in self.children.items():
+ yield child_node.info
+ if child_node.otype == DIRECTORY:
+ yield from child_node.iterate()
def __getSubDirsInfo(self, root, directories):
"""Fills the directories given in input with the contents information
@@ -158,14 +180,14 @@
"Can't calculate contents of the " "object type: %s" % self.otype
)
- if self.pid:
+ if self.known:
# to identify a directory with all files/directories present
return (1, 1)
else:
for _, child_node in self.children.items():
if child_node.otype == CONTENT:
contents += 1
- if child_node.pid:
+ if child_node.known:
discovered += 1
return (contents, discovered)
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -164,18 +164,16 @@
"""
async def _scan(root, session, api_url, source_tree, exclude_patterns):
- for path, pid, found in await parse_path(
+ for path, pid, known in await parse_path(
root, session, api_url, exclude_patterns
):
obj_type = parse_persistent_identifier(pid).object_type
if obj_type == CONTENT:
- source_tree.addNode(path, pid if found else None)
+ source_tree.addNode(path, pid, known)
elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns):
- if found:
- source_tree.addNode(path, pid)
- else:
- source_tree.addNode(path)
+ source_tree.addNode(path, pid, known)
+ if not known:
await _scan(path, session, api_url, source_tree, exclude_patterns)
async with aiohttp.ClientSession() as session:
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -89,7 +89,7 @@
@pytest.fixture(scope="function")
def example_tree(temp_folder):
"""Fixture that generate a Tree with the root present in the
- session fixture "temp_folder".
+ session fixture "temp_folder".
"""
example_tree = Tree(temp_folder["root"])
assert example_tree.path == temp_folder["root"]
@@ -113,9 +113,9 @@
for path, pid in temp_folder["paths"].items():
if path in known_paths:
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
return example_tree.getDirectoriesInfo(root)
diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py
--- a/swh/scanner/tests/data.py
+++ b/swh/scanner/tests/data.py
@@ -10,9 +10,12 @@
}
# present pids inside /data/sample-folder
-present_pids = [
+present_swhids = [
"swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md
"swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary
"swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/
"swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/
]
+
+
+to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326"
diff --git a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "bar": {
- "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
- },
- "link-to-foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/data/sample-folder-result.json b/swh/scanner/tests/data/sample-folder-result.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
- "foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "bar": {
- "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
- },
- "link-to-foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326",
- "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py
--- a/swh/scanner/tests/flask_api.py
+++ b/swh/scanner/tests/flask_api.py
@@ -5,7 +5,7 @@
from flask import Flask, request
-from .data import present_pids
+from .data import present_swhids
from swh.web.common.exc import LargePayloadExc
@@ -15,17 +15,18 @@
@app.route("/known/", methods=["POST"])
def known():
- pids = request.get_json()
+ swhids = request.get_json()
+ max_requests = 100
- if len(pids) > 900:
+ if len(swhids) > max_requests:
raise LargePayloadExc(
- "The maximum number of PIDs this endpoint " "can receive is 900"
+ "The maximum number of PIDs this endpoint " "can receive is 100"
)
- res = {pid: {"known": False} for pid in pids}
- for pid in pids:
- if pid in present_pids:
- res[pid]["known"] = True
+ res = {swhid: {"known": False} for swhid in swhids}
+ for swhid in swhids:
+ if swhid in present_swhids:
+ res[swhid]["known"] = True
return res
diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py
--- a/swh/scanner/tests/test_model.py
+++ b/swh/scanner/tests/test_model.py
@@ -8,7 +8,7 @@
avail_paths = temp_folder["paths"].keys()
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, False)
for path, node in example_tree.children.items():
assert path in avail_paths
@@ -17,39 +17,41 @@
assert subpath in avail_paths
-def test_get_json_tree_all_not_present(example_tree, temp_folder):
+def test_to_json_no_one_present(example_tree, temp_folder):
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
- json_tree = example_tree.getTree()
+ result = example_tree.toDict()
- assert len(json_tree) == 0
+ assert len(result) == 6
+
+ for _, node_info in result.items():
+ assert node_info["known"] is False
def test_get_json_tree_all_present(example_tree, temp_folder):
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
+
+ result = example_tree.toDict()
- tree_dict = example_tree.getTree()
+ assert len(result) == 6
- assert len(tree_dict) == 3
- # since subdir have a pid, it can't have a children path
- assert tree_dict["subdir0"] is not dict
+ for _, node_info in result.items():
+ assert node_info["known"] is True
def test_get_json_tree_only_one_present(example_tree, temp_folder):
+ root = temp_folder["root"]
filesample_path = temp_folder["filesample"]
for path, pid in temp_folder["paths"].items():
- if path == filesample_path:
- example_tree.addNode(path, pid)
- else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, True if path == filesample_path else False)
- tree_dict = example_tree.getTree()
+ result = example_tree.toDict()
- assert len(tree_dict) == 1
- assert tree_dict["subdir0"]["filesample.txt"]
+ assert len(result) == 6
+ assert result[str(root) + "/subdir0/filesample.txt"]["known"] is True
def test_get_directories_info(example_tree, temp_folder):
@@ -61,9 +63,9 @@
for path, pid in temp_folder["paths"].items():
if path == filesample_path or path == filesample2_path:
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
directories = example_tree.getDirectoriesInfo(example_tree.path)
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -7,7 +7,7 @@
import json
from pathlib import PosixPath
-from .data import correct_api_response
+from .data import correct_api_response, present_swhids, to_exclude_swhid
from swh.scanner.scanner import pids_discovery, get_subpaths, run
from swh.scanner.model import Tree
@@ -45,8 +45,8 @@
api_url = live_server.url() + "/"
request = [
- "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901)
- ] # /known/ is limited at 900
+ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(101)
+ ] # /known/ accepts at most 100 SWHIDs per request
with pytest.raises(APIError):
event_loop.run_until_complete(pids_discovery(request, aiosession, api_url))
@@ -73,30 +73,24 @@
def test_scanner_result(live_server, event_loop, test_folder):
api_url = live_server.url() + "/"
- result_path = test_folder.joinpath(PosixPath("sample-folder-result.json"))
- with open(result_path, "r") as json_file:
- expected_result = json.loads(json_file.read())
-
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
source_tree = Tree(sample_folder)
- event_loop.run_until_complete(run(sample_folder, api_url, source_tree, tuple()))
-
- actual_result = source_tree.getTree()
+ event_loop.run_until_complete(run(sample_folder, api_url, source_tree, set()))
- assert actual_result == expected_result
+ for node_dict in source_tree.iterate():
+ node_info = list(node_dict.values())[0]
+ if node_info["swhid"] in present_swhids:
+ assert node_info["known"] is True
+ else:
+ assert node_info["known"] is False
def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder):
api_url = live_server.url() + "/"
- result_path = test_folder.joinpath(
- PosixPath("sample-folder-result-no-toexclude.json")
- )
- with open(result_path, "r") as json_file:
- expected_result = json.loads(json_file.read())
-
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
+
patterns = (str(sample_folder) + "/toexclude",)
exclude_pattern = {
reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns)
@@ -107,6 +101,6 @@
run(sample_folder, api_url, source_tree, exclude_pattern)
)
- actual_result = source_tree.getTree()
-
- assert actual_result == expected_result
+ for node_dict in source_tree.iterate():
+ node_info = list(node_dict.values())[0]
+ assert node_info["swhid"] != to_exclude_swhid

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 3:38 PM (6 h, 53 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230738

Event Timeline