Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163790
D3069.id10908.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
15 KB
Subscribers
None
D3069.id10908.diff
View Options
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@
plotly
pandas
numpy
+ndjson
dulwich
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -76,7 +76,7 @@
@click.option(
"-f",
"--format",
- type=click.Choice(["text", "json", "sunburst"], case_sensitive=False),
+ type=click.Choice(["text", "json", "ndjson", "sunburst"], case_sensitive=False),
default="text",
help="select the output format",
)
@@ -95,7 +95,7 @@
loop = asyncio.get_event_loop()
loop.run_until_complete(run(root_path, api_url, source_tree, sre_patterns))
- source_tree.show(format)
+ source_tree.output(format)
if __name__ == "__main__":
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -7,9 +7,11 @@
import sys
import json
from pathlib import PosixPath
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Tuple, Iterable
from enum import Enum
+import ndjson # type: ignore
+
from .plot import sunburst
from .exceptions import InvalidObjectType
@@ -36,28 +38,32 @@
self.path = path
self.otype = DIRECTORY if path.is_dir() else CONTENT
self.pid = ""
+ self.known = False
self.children: Dict[PosixPath, Tree] = {}
- def addNode(self, path: PosixPath, pid: str = None) -> None:
+ def addNode(self, path: PosixPath, pid: str, known: bool) -> None:
"""Recursively add a new path.
"""
relative_path = path.relative_to(self.path)
if relative_path == PosixPath("."):
- if pid is not None:
- self.pid = pid
+ self.pid = pid
+ self.known = known
return
new_path = self.path.joinpath(relative_path.parts[0])
if new_path not in self.children:
self.children[new_path] = Tree(new_path, self)
- self.children[new_path].addNode(path, pid)
+ self.children[new_path].addNode(path, pid, known)
- def show(self, format) -> None:
- """Show tree in different formats"""
+ def output(self, format) -> None:
+ """Display the model in the specified format"""
if format == "json":
- print(json.dumps(self.getTree(), indent=4, sort_keys=True))
+ print(json.dumps(self.toDict(), indent=4, sort_keys=True))
+
+ elif format == "ndjson":
+ print(ndjson.dumps(dict_path for dict_path in self.iterate()))
elif format == "text":
isatty = sys.stdout.isatty()
@@ -82,7 +88,7 @@
end = "/" if node.otype == DIRECTORY else ""
if isatty:
- if not node.pid:
+ if not node.known:
rel_path = colorize(rel_path, Color.red)
elif node.otype == DIRECTORY:
rel_path = colorize(rel_path, Color.blue)
@@ -91,26 +97,42 @@
print(f"{begin}{rel_path}{end}")
- def getTree(self):
- """Walk through the tree to discover content or directory that have
- a persistent identifier. If a persistent identifier is found it saves
- the path with the relative PID.
+ @property
+ def info(self):
+ """
+ Get information about the current path
Returns:
- child_tree: the tree with the content/directory found
+ a dictionary mapping the path of this node to its known/unknown status
+ and its Software Heritage persistent identifier
"""
- child_tree = {}
- for path, child_node in self.children.items():
- rel_path = str(child_node.path.relative_to(self.path))
- if child_node.pid:
- child_tree[rel_path] = child_node.pid
- else:
- next_tree = child_node.getTree()
- if next_tree:
- child_tree[rel_path] = next_tree
+ node_info = {}
+ node_info["swhid"] = self.pid
+ node_info["known"] = self.known
+ return {str(self.path): node_info}
+
+ def toDict(self, dict_nodes={}) -> Dict[str, Dict[str, Dict]]:
+ """
+ Collect the info of each node yielded by iterate() into a single dictionary
+ """
+ for node_dict in self.iterate():
+ dict_nodes.update(node_dict)
+ return dict_nodes
- return child_tree
+ def iterate(self) -> Iterable[Dict[str, Dict]]:
+ """
+ Recursively iterate through the children of the current node
+
+ Yields:
+ for each node found, a dictionary mapping its path to its known/unknown
+ status and its Software Heritage persistent identifier
+
+ """
+ for _, child_node in self.children.items():
+ yield child_node.info
+ if child_node.otype == DIRECTORY:
+ yield from child_node.iterate()
def __getSubDirsInfo(self, root, directories):
"""Fills the directories given in input with the contents information
@@ -158,14 +180,14 @@
"Can't calculate contents of the " "object type: %s" % self.otype
)
- if self.pid:
+ if self.known:
# to identify a directory with all files/directories present
return (1, 1)
else:
for _, child_node in self.children.items():
if child_node.otype == CONTENT:
contents += 1
- if child_node.pid:
+ if child_node.known:
discovered += 1
return (contents, discovered)
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -164,18 +164,16 @@
"""
async def _scan(root, session, api_url, source_tree, exclude_patterns):
- for path, pid, found in await parse_path(
+ for path, pid, known in await parse_path(
root, session, api_url, exclude_patterns
):
obj_type = parse_persistent_identifier(pid).object_type
if obj_type == CONTENT:
- source_tree.addNode(path, pid if found else None)
+ source_tree.addNode(path, pid, known)
elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns):
- if found:
- source_tree.addNode(path, pid)
- else:
- source_tree.addNode(path)
+ source_tree.addNode(path, pid, known)
+ if not known:
await _scan(path, session, api_url, source_tree, exclude_patterns)
async with aiohttp.ClientSession() as session:
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -89,7 +89,7 @@
@pytest.fixture(scope="function")
def example_tree(temp_folder):
"""Fixture that generate a Tree with the root present in the
- session fixture "temp_folder".
+ session fixture "temp_folder".
"""
example_tree = Tree(temp_folder["root"])
assert example_tree.path == temp_folder["root"]
@@ -113,9 +113,9 @@
for path, pid in temp_folder["paths"].items():
if path in known_paths:
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
return example_tree.getDirectoriesInfo(root)
diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py
--- a/swh/scanner/tests/data.py
+++ b/swh/scanner/tests/data.py
@@ -10,9 +10,12 @@
}
# present pids inside /data/sample-folder
-present_pids = [
+present_swhids = [
"swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md
"swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary
"swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/
"swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/
]
+
+
+to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326"
diff --git a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "bar": {
- "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
- },
- "link-to-foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/data/sample-folder-result.json b/swh/scanner/tests/data/sample-folder-result.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
- "foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "bar": {
- "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
- },
- "link-to-foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326",
- "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py
--- a/swh/scanner/tests/flask_api.py
+++ b/swh/scanner/tests/flask_api.py
@@ -5,7 +5,7 @@
from flask import Flask, request
-from .data import present_pids
+from .data import present_swhids
from swh.web.common.exc import LargePayloadExc
@@ -15,17 +15,18 @@
@app.route("/known/", methods=["POST"])
def known():
- pids = request.get_json()
+ swhids = request.get_json()
+ max_requests = 100
- if len(pids) > 900:
+ if len(swhids) > max_requests:
raise LargePayloadExc(
- "The maximum number of PIDs this endpoint " "can receive is 900"
+ "The maximum number of PIDs this endpoint " "can receive is 100"
)
- res = {pid: {"known": False} for pid in pids}
- for pid in pids:
- if pid in present_pids:
- res[pid]["known"] = True
+ res = {swhid: {"known": False} for swhid in swhids}
+ for swhid in swhids:
+ if swhid in present_swhids:
+ res[swhid]["known"] = True
return res
diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py
--- a/swh/scanner/tests/test_model.py
+++ b/swh/scanner/tests/test_model.py
@@ -8,7 +8,7 @@
avail_paths = temp_folder["paths"].keys()
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, False)
for path, node in example_tree.children.items():
assert path in avail_paths
@@ -17,39 +17,41 @@
assert subpath in avail_paths
-def test_get_json_tree_all_not_present(example_tree, temp_folder):
+def test_to_json_no_one_present(example_tree, temp_folder):
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
- json_tree = example_tree.getTree()
+ result = example_tree.toDict()
- assert len(json_tree) == 0
+ assert len(result) == 6
+
+ for _, node_info in result.items():
+ assert node_info["known"] is False
def test_get_json_tree_all_present(example_tree, temp_folder):
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
+
+ result = example_tree.toDict()
- tree_dict = example_tree.getTree()
+ assert len(result) == 6
- assert len(tree_dict) == 3
- # since subdir have a pid, it can't have a children path
- assert tree_dict["subdir0"] is not dict
+ for _, node_info in result.items():
+ assert node_info["known"] is True
def test_get_json_tree_only_one_present(example_tree, temp_folder):
+ root = temp_folder["root"]
filesample_path = temp_folder["filesample"]
for path, pid in temp_folder["paths"].items():
- if path == filesample_path:
- example_tree.addNode(path, pid)
- else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, True if path == filesample_path else False)
- tree_dict = example_tree.getTree()
+ result = example_tree.toDict()
- assert len(tree_dict) == 1
- assert tree_dict["subdir0"]["filesample.txt"]
+ assert len(result) == 6
+ assert result[str(root) + "/subdir0/filesample.txt"]["known"] is True
def test_get_directories_info(example_tree, temp_folder):
@@ -61,9 +63,9 @@
for path, pid in temp_folder["paths"].items():
if path == filesample_path or path == filesample2_path:
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
directories = example_tree.getDirectoriesInfo(example_tree.path)
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -7,7 +7,7 @@
import json
from pathlib import PosixPath
-from .data import correct_api_response
+from .data import correct_api_response, present_swhids, to_exclude_swhid
from swh.scanner.scanner import pids_discovery, get_subpaths, run
from swh.scanner.model import Tree
@@ -45,8 +45,8 @@
api_url = live_server.url() + "/"
request = [
- "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901)
- ] # /known/ is limited at 900
+ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(101)
+ ] # /known/ accepts at most 100 SWHIDs per request
with pytest.raises(APIError):
event_loop.run_until_complete(pids_discovery(request, aiosession, api_url))
@@ -73,30 +73,24 @@
def test_scanner_result(live_server, event_loop, test_folder):
api_url = live_server.url() + "/"
- result_path = test_folder.joinpath(PosixPath("sample-folder-result.json"))
- with open(result_path, "r") as json_file:
- expected_result = json.loads(json_file.read())
-
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
source_tree = Tree(sample_folder)
- event_loop.run_until_complete(run(sample_folder, api_url, source_tree, tuple()))
-
- actual_result = source_tree.getTree()
+ event_loop.run_until_complete(run(sample_folder, api_url, source_tree, set()))
- assert actual_result == expected_result
+ for node_dict in source_tree.iterate():
+ node_info = list(node_dict.values())[0]
+ if node_info["swhid"] in present_swhids:
+ assert node_info["known"] is True
+ else:
+ assert node_info["known"] is False
def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder):
api_url = live_server.url() + "/"
- result_path = test_folder.joinpath(
- PosixPath("sample-folder-result-no-toexclude.json")
- )
- with open(result_path, "r") as json_file:
- expected_result = json.loads(json_file.read())
-
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
+
patterns = (str(sample_folder) + "/toexclude",)
exclude_pattern = {
reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns)
@@ -107,6 +101,6 @@
run(sample_folder, api_url, source_tree, exclude_pattern)
)
- actual_result = source_tree.getTree()
-
- assert actual_result == expected_result
+ for node_dict in source_tree.iterate():
+ node_info = list(node_dict.values())[0]
+ assert node_info["swhid"] != to_exclude_swhid
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 3:38 PM (6 h, 53 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230738
Attached To
D3069: scanner: json output format
Event Timeline
Log In to Comment