D6065: swh-scanner: add policy greedybfs
D6065.id21947.diff
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -139,7 +139,7 @@
"--policy",
default="auto",
show_default=True,
- type=click.Choice(["auto", "bfs", "filepriority", "dirpriority"]),
+ type=click.Choice(["auto", "bfs", "greedybfs", "filepriority", "dirpriority"]),
help="The scan policy.",
)
@click.pass_context
@@ -155,6 +155,9 @@
    bfs: scan the source code in the BFS order, checking unknown directories only.
+    greedybfs: scan the source code files/dirs in chunks, in order to fill the Web API
+    rate limit.
+
    filepriority: scan all the source code file contents, checking only unset
    directories. (useful if the codebase contains a lot of source files)
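
As a side note on the CLI change above: the new value only has to be added to the click.Choice list, because click validates the option before the command body runs. Below is a minimal, self-contained sketch of that mechanism (a hypothetical standalone command, not the real swh/scanner/cli.py, which takes more options and delegates to the scanner):

import click


@click.command()
@click.option(
    "--policy",
    default="auto",
    show_default=True,
    type=click.Choice(["auto", "bfs", "greedybfs", "filepriority", "dirpriority"]),
    help="The scan policy.",
)
def scan(policy):
    # click.Choice rejects any value outside the list at argument-parsing time,
    # so a typo such as --policy greedy never reaches the scanner.
    click.echo(f"selected policy: {policy}")


if __name__ == "__main__":
    scan()
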
diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py
--- a/swh/scanner/policy.py
+++ b/swh/scanner/policy.py
@@ -67,6 +67,10 @@
        return await make_request(swhids)
+def source_size(source_tree: Directory):
+    return len([n for n in source_tree.iter_tree(dedup=False)])
+
+
class Policy(metaclass=abc.ABCMeta):
    data: MerkleNodeInfo
@@ -120,6 +124,53 @@
                            self.data[sub_node.swhid()]["known"] = True  # type: ignore
+class GreedyBFS(Policy):
+    """Query chunks of nodes (to fill the Web API rate limit) and set the downstream
+    contents of known directories.
+    """
+
+    async def run(
+        self, session: aiohttp.ClientSession, api_url: str,
+    ):
+        ssize = source_size(self.source_tree)
+        seen = []
+
+        async for nodes_chunk in self.get_nodes_chunks(session, api_url, ssize):
+            for node in nodes_chunk:
+                seen.append(node)
+                if len(seen) == ssize:
+                    return
+                if node.object_type == DIRECTORY and self.data[node.swhid()]["known"]:
+                    sub_nodes = [n for n in node.iter_tree(dedup=False)]
+                    sub_nodes.remove(node)  # remove root node
+                    for sub_node in sub_nodes:
+                        seen.append(sub_node)
+                        self.data[sub_node.swhid()]["known"] = True
+
+    @no_type_check
+    async def get_nodes_chunks(
+        self, session: aiohttp.ClientSession, api_url: str, ssize: int
+    ):
+ """Query chunks of 1000 nodes at once in order to fill the Web API rate limit.
+ It query all the nodes in the case the source code contains less than 1000
+ nodes.
+ """
+        nodes = self.source_tree.iter_tree(dedup=False)
+        chunk_size = QUERY_LIMIT
+        n_chunks = int(ssize / QUERY_LIMIT) + 1
+        for n in range(0, n_chunks):
+            if n == n_chunks - 1:
+                chunk_size = ssize - (QUERY_LIMIT * n)
+            nodes_chunks = [node for node in itertools.islice(nodes, chunk_size)]
+            swhids = [str(node.swhid()) for node in nodes_chunks]
+            swhids_res = await swhids_discovery(swhids, session, api_url)
+            for node in nodes_chunks:
+                self.data[node.swhid()]["known"] = swhids_res[str(node.swhid())][
+                    "known"
+                ]
+            yield nodes_chunks
+
+
class FilePriority(Policy):
"""Check the Merkle tree querying all the file contents and set all the upstream
directories to unknown in the case a file content is unknown.
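
The chunking arithmetic used by GreedyBFS.get_nodes_chunks above can be illustrated in isolation. The sketch below is hypothetical (chunk_sizes is not part of the scanner, and QUERY_LIMIT is assumed to be the 1000-node constant from swh.scanner.policy): it reproduces how the method sizes each itertools.islice call, taking QUERY_LIMIT nodes per chunk and only the remainder in the last one.

QUERY_LIMIT = 1000  # assumption: mirrors swh.scanner.policy.QUERY_LIMIT


def chunk_sizes(ssize: int, limit: int = QUERY_LIMIT):
    """Yield the size of each chunk for a source tree of `ssize` nodes."""
    chunk_size = limit
    n_chunks = int(ssize / limit) + 1
    for n in range(0, n_chunks):
        if n == n_chunks - 1:
            # last chunk: only the nodes that are left
            chunk_size = ssize - (limit * n)
        yield chunk_size


print(list(chunk_sizes(2500)))  # [1000, 1000, 500]
print(list(chunk_sizes(42)))    # [42] -- small trees are queried in one request
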
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -13,7 +13,15 @@
from .data import MerkleNodeInfo
from .output import Output
-from .policy import QUERY_LIMIT, DirectoryPriority, FilePriority, LazyBFS, QueryAll
+from .policy import (
+    QUERY_LIMIT,
+    DirectoryPriority,
+    FilePriority,
+    GreedyBFS,
+    LazyBFS,
+    QueryAll,
+    source_size,
+)
async def run(config: Dict[str, Any], policy) -> None:
@@ -35,10 +43,6 @@
        await policy.run(session, api_url)
-def source_size(source_tree: Directory):
-    return len([n for n in source_tree.iter_tree(dedup=False)])
-
-
def get_policy_obj(source_tree: Directory, nodes_data: MerkleNodeInfo, policy: str):
    if policy == "auto":
        return (
@@ -48,6 +52,8 @@
        )
    elif policy == "bfs":
        return LazyBFS(source_tree, nodes_data)
+    elif policy == "greedybfs":
+        return GreedyBFS(source_tree, nodes_data)
    elif policy == "filepriority":
        return FilePriority(source_tree, nodes_data)
    elif policy == "dirpriority":
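
Outside the CLI, the pieces touched by this hunk can be wired together directly. A hedged sketch using only names that appear in this diff (scan_with_greedybfs is hypothetical, and the archive API URL is an assumption, not something this diff defines):

import asyncio

import aiohttp

from swh.model.from_disk import Directory
from swh.scanner.data import MerkleNodeInfo
from swh.scanner.policy import GreedyBFS


async def scan_with_greedybfs(path: bytes, api_url: str) -> MerkleNodeInfo:
    # Build the on-disk Merkle tree, then let the policy mark each node's
    # SWHID as known or unknown in nodes_data.
    source_tree = Directory.from_disk(path=path)
    nodes_data = MerkleNodeInfo()
    policy = GreedyBFS(source_tree, nodes_data)
    async with aiohttp.ClientSession() as session:
        await policy.run(session, api_url)
    return nodes_data


# nodes_data = asyncio.run(
#     scan_with_greedybfs(b"/path/to/project", "https://archive.softwareheritage.org/api/1/")
# )
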
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -7,6 +7,7 @@
import os
from pathlib import Path
import shutil
+import sys
import aiohttp
from aioresponses import aioresponses # type: ignore
@@ -78,13 +79,15 @@
"""Generate a model.from_disk.Directory from a "big" temporary directory
(more than 1000 nodes)
"""
+ sys.setrecursionlimit(1100)
dir_ = tmp_path / "big-directory"
- dir_.mkdir()
+ sub_dirs = dir_
for i in range(0, QUERY_LIMIT + 1):
- file_ = dir_ / f"file_{i}.org"
- file_.touch()
+ sub_dirs = sub_dirs / "dir"
+ sub_dirs.mkdir(parents=True, exist_ok=True)
+ file_ = sub_dirs / "file.org"
+ file_.touch()
dir_obj = model_of_dir(str(dir_).encode())
- assert len(dir_obj) > QUERY_LIMIT
return dir_obj
diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py
--- a/swh/scanner/tests/flask_api.py
+++ b/swh/scanner/tests/flask_api.py
@@ -6,6 +6,7 @@
from flask import Flask, request
from swh.scanner.exceptions import LargePayloadExc
+from swh.scanner.policy import QUERY_LIMIT
from .data import unknown_swhids
@@ -24,9 +25,10 @@
            for swhid in swhids:
                f.write(swhid + "\n")
-        if len(swhids) > 900:
+        if len(swhids) > QUERY_LIMIT:
            raise LargePayloadExc(
-                "The maximum number of SWHIDs this endpoint can receive is 900"
+                f"The maximum number of SWHIDs this endpoint can receive is "
+                f"{QUERY_LIMIT}"
            )
        res = {swhid: {"known": False} for swhid in swhids}
diff --git a/swh/scanner/tests/test_policy.py b/swh/scanner/tests/test_policy.py
--- a/swh/scanner/tests/test_policy.py
+++ b/swh/scanner/tests/test_policy.py
@@ -8,13 +8,15 @@
from flask import url_for
import pytest
-from swh.model.identifiers import CoreSWHID, ObjectType
+from swh.model.identifiers import CONTENT, CoreSWHID, ObjectType
from swh.scanner.data import MerkleNodeInfo
from swh.scanner.exceptions import APIError
from swh.scanner.policy import (
    DirectoryPriority,
    FilePriority,
+    GreedyBFS,
    LazyBFS,
+    source_size,
    swhids_discovery,
)
@@ -47,17 +49,6 @@
)
-def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_server):
-
-    api_url = url_for("index", _external=True)
-    request = [
-        "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901)
-    ]  # /known/ is limited at 900
-
-    with pytest.raises(APIError):
-        event_loop.run_until_complete(swhids_discovery(request, aiosession, api_url))
-
-
def test_scanner_directory_priority_has_contents(source_tree):
    nodes_data = MerkleNodeInfo()
    policy = DirectoryPriority(source_tree, nodes_data)
@@ -143,3 +134,35 @@
    for swhid in backend_swhids_requests[5:]:
        assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY
+
+
+def test_greedy_bfs_policy(
+    live_server, event_loop, aiosession, big_source_tree, tmp_requests
+):
+    open(tmp_requests, "w").close()
+    api_url = url_for("index", _external=True)
+
+    nodes_data = MerkleNodeInfo()
+    policy = GreedyBFS(big_source_tree, nodes_data)
+    event_loop.run_until_complete(policy.run(aiosession, api_url))
+
+    backend_swhids_requests = get_backend_swhids_order(tmp_requests)
+
+    last_swhid = backend_swhids_requests[-1]
+    assert CoreSWHID.from_string(last_swhid).object_type == ObjectType.CONTENT
+
+
+@pytest.mark.asyncio
+async def test_greedy_bfs_get_nodes_chunks(live_server, aiosession, big_source_tree):
+    api_url = url_for("index", _external=True)
+
+    nodes_data = MerkleNodeInfo()
+    policy = GreedyBFS(big_source_tree, nodes_data)
+    chunks = [
+        n_chunk
+        async for n_chunk in policy.get_nodes_chunks(
+            aiosession, api_url, source_size(big_source_tree)
+        )
+    ]
+    assert len(chunks) == 2
+    assert chunks[1][-1].object_type == CONTENT
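
A small follow-up sketch to the tests above (reusing big_source_tree and the nodes_data populated by the run in test_greedy_bfs_policy): once the policy finishes, every node of the tree can be looked up by its SWHID in MerkleNodeInfo, whether it was queried in a chunk or inherited the flag from a known parent directory. The extra assertion below is hypothetical, not part of the submitted test:

    # hypothetical extra assertion inside test_greedy_bfs_policy
    for node in big_source_tree.iter_tree(dedup=False):
        assert isinstance(nodes_data[node.swhid()]["known"], bool)
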