Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163792
D3070.id10921.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
13 KB
Subscribers
None
D3070.id10921.diff
View Options
diff --git a/swh/scanner/model.py b/swh/scanner/model.py
--- a/swh/scanner/model.py
+++ b/swh/scanner/model.py
@@ -7,7 +7,7 @@
import sys
import json
from pathlib import PosixPath
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Tuple, Iterable
from enum import Enum
from .plot import sunburst
@@ -35,24 +35,25 @@
self.father = father
self.path = path
self.otype = DIRECTORY if path.is_dir() else CONTENT
- self.pid = ""
+ self.swhid = ""
+ self.known = False
self.children: Dict[PosixPath, Tree] = {}
- def addNode(self, path: PosixPath, pid: str = None) -> None:
+ def addNode(self, path: PosixPath, swhid: str, known: bool) -> None:
"""Recursively add a new path.
"""
relative_path = path.relative_to(self.path)
if relative_path == PosixPath("."):
- if pid is not None:
- self.pid = pid
+ self.swhid = swhid
+ self.known = known
return
new_path = self.path.joinpath(relative_path.parts[0])
if new_path not in self.children:
self.children[new_path] = Tree(new_path, self)
- self.children[new_path].addNode(path, pid)
+ self.children[new_path].addNode(path, swhid, known)
def show(self, format) -> None:
"""Show tree in different formats"""
@@ -82,7 +83,7 @@
end = "/" if node.otype == DIRECTORY else ""
if isatty:
- if not node.pid:
+ if not node.known:
rel_path = colorize(rel_path, Color.red)
elif node.otype == DIRECTORY:
rel_path = colorize(rel_path, Color.blue)
@@ -91,6 +92,43 @@
print(f"{begin}{rel_path}{end}")
+ @property
+ def info(self):
+ """
+ Get information about the current path
+
+ Returns:
+ a dictionary containing a path with its known/unknown status and the
+ Software Heritage persistent identifier
+
+ """
+ node_info = {}
+ node_info["swhid"] = self.swhid
+ node_info["known"] = self.known
+ return {str(self.path): node_info}
+
+ def toDict(self, dict_nodes={}) -> Dict[str, Dict[str, Dict]]:
+ """
+ Groups each child inside a dictionary
+ """
+ for node_dict in self.iterate():
+ dict_nodes.update(node_dict)
+ return dict_nodes
+
+ def iterate(self) -> Iterable[Dict[str, Dict]]:
+ """
+ Recursively iterate through the children of the current node
+
+ Yields:
+ a dictionary containing a path with its known/unknown status and the
+ Software Heritage persistent identifier
+
+ """
+ for _, child_node in self.children.items():
+ yield child_node.info
+ if child_node.otype == DIRECTORY:
+ yield from child_node.iterate()
+
def getTree(self):
"""Walk through the tree to discover content or directory that have
a persistent identifier. If a persistent identifier is found it saves
@@ -103,8 +141,8 @@
child_tree = {}
for path, child_node in self.children.items():
rel_path = str(child_node.path.relative_to(self.path))
- if child_node.pid:
- child_tree[rel_path] = child_node.pid
+ if child_node.swhid:
+ child_tree[rel_path] = child_node.swhid
else:
next_tree = child_node.getTree()
if next_tree:
@@ -158,14 +196,14 @@
"Can't calculate contents of the " "object type: %s" % self.otype
)
- if self.pid:
+ if self.known:
# to identify a directory with all files/directories present
return (1, 1)
else:
for _, child_node in self.children.items():
if child_node.otype == CONTENT:
contents += 1
- if child_node.pid:
+ if child_node.known:
discovered += 1
return (contents, discovered)
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -164,18 +164,16 @@
"""
async def _scan(root, session, api_url, source_tree, exclude_patterns):
- for path, pid, found in await parse_path(
+ for path, pid, known in await parse_path(
root, session, api_url, exclude_patterns
):
obj_type = parse_persistent_identifier(pid).object_type
if obj_type == CONTENT:
- source_tree.addNode(path, pid if found else None)
+ source_tree.addNode(path, pid, known)
elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns):
- if found:
- source_tree.addNode(path, pid)
- else:
- source_tree.addNode(path)
+ source_tree.addNode(path, pid, known)
+ if not known:
await _scan(path, session, api_url, source_tree, exclude_patterns)
async with aiohttp.ClientSession() as session:
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -89,7 +89,7 @@
@pytest.fixture(scope="function")
def example_tree(temp_folder):
"""Fixture that generate a Tree with the root present in the
- session fixture "temp_folder".
+ session fixture "temp_folder".
"""
example_tree = Tree(temp_folder["root"])
assert example_tree.path == temp_folder["root"]
@@ -113,9 +113,9 @@
for path, pid in temp_folder["paths"].items():
if path in known_paths:
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
return example_tree.getDirectoriesInfo(root)
diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py
--- a/swh/scanner/tests/data.py
+++ b/swh/scanner/tests/data.py
@@ -10,9 +10,12 @@
}
# present pids inside /data/sample-folder
-present_pids = [
+present_swhids = [
"swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md
"swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary
"swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93", # barfoo2/
"swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326", # toexclude/
]
+
+
+to_exclude_swhid = "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326"
diff --git a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json b/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result-no-toexclude.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "bar": {
- "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
- },
- "link-to-foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/data/sample-folder-result.json b/swh/scanner/tests/data/sample-folder-result.json
deleted file mode 100644
--- a/swh/scanner/tests/data/sample-folder-result.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
- "foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "bar": {
- "barfoo2": "swh:1:dir:9619a28687b2462efbb5be816bc1185b95753d93"
- },
- "link-to-foo": {
- "quotes.md": "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
- },
- "toexclude": "swh:1:dir:07d4d9ec5c406632d203dbd4631e7863612a0326",
- "some-binary": "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb"
-}
diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py
--- a/swh/scanner/tests/flask_api.py
+++ b/swh/scanner/tests/flask_api.py
@@ -5,7 +5,7 @@
from flask import Flask, request
-from .data import present_pids
+from .data import present_swhids
from swh.web.common.exc import LargePayloadExc
@@ -15,17 +15,17 @@
@app.route("/known/", methods=["POST"])
def known():
- pids = request.get_json()
+ swhids = request.get_json()
- if len(pids) > 900:
+ if len(swhids) > 900:
raise LargePayloadExc(
"The maximum number of PIDs this endpoint " "can receive is 900"
)
- res = {pid: {"known": False} for pid in pids}
- for pid in pids:
- if pid in present_pids:
- res[pid]["known"] = True
+ res = {swhid: {"known": False} for swhid in swhids}
+ for swhid in swhids:
+ if swhid in present_swhids:
+ res[swhid]["known"] = True
return res
diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py
--- a/swh/scanner/tests/test_model.py
+++ b/swh/scanner/tests/test_model.py
@@ -8,7 +8,7 @@
avail_paths = temp_folder["paths"].keys()
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, False)
for path, node in example_tree.children.items():
assert path in avail_paths
@@ -17,39 +17,41 @@
assert subpath in avail_paths
-def test_get_json_tree_all_not_present(example_tree, temp_folder):
+def test_to_json_no_one_present(example_tree, temp_folder):
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
- json_tree = example_tree.getTree()
+ result = example_tree.toDict()
- assert len(json_tree) == 0
+ assert len(result) == 6
+
+ for _, node_info in result.items():
+ assert node_info["known"] is False
def test_get_json_tree_all_present(example_tree, temp_folder):
for path, pid in temp_folder["paths"].items():
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
+
+ result = example_tree.toDict()
- tree_dict = example_tree.getTree()
+ assert len(result) == 6
- assert len(tree_dict) == 3
- # since subdir have a pid, it can't have a children path
- assert tree_dict["subdir0"] is not dict
+ for _, node_info in result.items():
+ assert node_info["known"] is True
def test_get_json_tree_only_one_present(example_tree, temp_folder):
+ root = temp_folder["root"]
filesample_path = temp_folder["filesample"]
for path, pid in temp_folder["paths"].items():
- if path == filesample_path:
- example_tree.addNode(path, pid)
- else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, True if path == filesample_path else False)
- tree_dict = example_tree.getTree()
+ result = example_tree.toDict()
- assert len(tree_dict) == 1
- assert tree_dict["subdir0"]["filesample.txt"]
+ assert len(result) == 6
+ assert result[str(root) + "/subdir0/filesample.txt"]["known"] is True
def test_get_directories_info(example_tree, temp_folder):
@@ -61,9 +63,9 @@
for path, pid in temp_folder["paths"].items():
if path == filesample_path or path == filesample2_path:
- example_tree.addNode(path, pid)
+ example_tree.addNode(path, pid, True)
else:
- example_tree.addNode(path)
+ example_tree.addNode(path, pid, False)
directories = example_tree.getDirectoriesInfo(example_tree.path)
diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py
--- a/swh/scanner/tests/test_scanner.py
+++ b/swh/scanner/tests/test_scanner.py
@@ -7,7 +7,7 @@
import json
from pathlib import PosixPath
-from .data import correct_api_response
+from .data import correct_api_response, present_swhids, to_exclude_swhid
from swh.scanner.scanner import pids_discovery, get_subpaths, run
from swh.scanner.model import Tree
@@ -73,30 +73,24 @@
def test_scanner_result(live_server, event_loop, test_folder):
api_url = live_server.url() + "/"
- result_path = test_folder.joinpath(PosixPath("sample-folder-result.json"))
- with open(result_path, "r") as json_file:
- expected_result = json.loads(json_file.read())
-
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
source_tree = Tree(sample_folder)
- event_loop.run_until_complete(run(sample_folder, api_url, source_tree, tuple()))
-
- actual_result = source_tree.getTree()
+ event_loop.run_until_complete(run(sample_folder, api_url, source_tree, set()))
- assert actual_result == expected_result
+ for node_dict in source_tree.iterate():
+ node_info = list(node_dict.values())[0]
+ if node_info["swhid"] in present_swhids:
+ assert node_info["known"] is True
+ else:
+ assert node_info["known"] is False
def test_scanner_result_with_exclude_patterns(live_server, event_loop, test_folder):
api_url = live_server.url() + "/"
- result_path = test_folder.joinpath(
- PosixPath("sample-folder-result-no-toexclude.json")
- )
- with open(result_path, "r") as json_file:
- expected_result = json.loads(json_file.read())
-
sample_folder = test_folder.joinpath(PosixPath("sample-folder"))
+
patterns = (str(sample_folder) + "/toexclude",)
exclude_pattern = {
reg_obj for reg_obj in extract_regex_objs(sample_folder, patterns)
@@ -107,6 +101,6 @@
run(sample_folder, api_url, source_tree, exclude_pattern)
)
- actual_result = source_tree.getTree()
-
- assert actual_result == expected_result
+ for node_dict in source_tree.iterate():
+ node_info = list(node_dict.values())[0]
+ assert node_info["swhid"] != to_exclude_swhid
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 3:38 PM (1 h, 35 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221396
Attached To
D3070: model: known attribute in Tree structure
Event Timeline
Log In to Comment