D6065: swh-scanner: add policy greedybfs
D6065.diff (raw diff, 8 KB)
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
--- a/swh/scanner/cli.py
+++ b/swh/scanner/cli.py
@@ -139,7 +139,7 @@
"--policy",
default="auto",
show_default=True,
- type=click.Choice(["auto", "bfs", "filepriority", "dirpriority"]),
+ type=click.Choice(["auto", "bfs", "greedybfs", "filepriority", "dirpriority"]),
help="The scan policy.",
)
@click.pass_context
@@ -155,6 +155,9 @@
    bfs: scan the source code in the BFS order, checking unknown directories only.

+    greedybfs: same as "bfs" policy, but looks up the status of source code artifacts
+    in chunks, to minimize the number of Web API round-trips with the archive.
+
    filepriority: scan all the source code file contents, checking only unset
    directories. (useful if the codebase contains a lot of source files)
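
Note: the cli.py hunk above only registers the new choice with click. Assuming the scanner's usual "swh scanner scan" entry point (and an illustrative project path), the new policy would be selected as:

    swh scanner scan --policy greedybfs /path/to/project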
diff --git a/swh/scanner/policy.py b/swh/scanner/policy.py
--- a/swh/scanner/policy.py
+++ b/swh/scanner/policy.py
@@ -10,6 +10,7 @@
import aiohttp
+from swh.core.utils import grouper
from swh.model.from_disk import Directory
from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID
@@ -67,6 +68,12 @@
        return await make_request(swhids)


+def source_size(source_tree: Directory):
+    """Return the size of a source tree as the number of nodes it contains
+    """
+    return sum(1 for n in source_tree.iter_tree(dedup=False))
+
+
class Policy(metaclass=abc.ABCMeta):
    data: MerkleNodeInfo
@@ -120,6 +127,48 @@
                        self.data[sub_node.swhid()]["known"] = True  # type: ignore


+class GreedyBFS(Policy):
+    """Query graph nodes in chunks (to maximize the Web API rate limit use) and set the
+    downstream contents of known directories to known.
+    """
+
+    async def run(
+        self, session: aiohttp.ClientSession, api_url: str,
+    ):
+        ssize = source_size(self.source_tree)
+        seen = []
+
+        async for nodes_chunk in self.get_nodes_chunks(session, api_url, ssize):
+            for node in nodes_chunk:
+                seen.append(node)
+                if len(seen) == ssize:
+                    return
+                if node.object_type == DIRECTORY and self.data[node.swhid()]["known"]:
+                    sub_nodes = [n for n in node.iter_tree(dedup=False)]
+                    sub_nodes.remove(node)  # remove root node
+                    for sub_node in sub_nodes:
+                        seen.append(sub_node)
+                        self.data[sub_node.swhid()]["known"] = True
+
+    @no_type_check
+    async def get_nodes_chunks(
+        self, session: aiohttp.ClientSession, api_url: str, ssize: int
+    ):
+        """Query chunks of QUERY_LIMIT nodes at once in order to fill the Web API
+        rate limit. It queries all the nodes at once when the source code contains
+        fewer than QUERY_LIMIT nodes.
+        """
+        nodes = self.source_tree.iter_tree(dedup=False)
+        for nodes_chunk in grouper(nodes, QUERY_LIMIT):
+            nodes_chunk = [n for n in nodes_chunk]
+            swhids = [node.swhid() for node in nodes_chunk]
+            swhids_res = await swhids_discovery(swhids, session, api_url)
+            for node in nodes_chunk:
+                swhid = node.swhid()
+                self.data[swhid]["known"] = swhids_res[str(swhid)]["known"]
+            yield nodes_chunk
+
+
class FilePriority(Policy):
    """Check the Merkle tree querying all the file contents and set all the upstream
    directories to unknown in the case a file content is unknown.
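
Note: as a minimal sketch of the chunking idea behind get_nodes_chunks() above, assuming QUERY_LIMIT is 1000 (the constant defined in swh/scanner/policy.py) and with a hypothetical check_known() callable standing in for the swhids_discovery() Web API call; grouper() is the real swh.core.utils helper imported by this diff:

    from swh.core.utils import grouper

    QUERY_LIMIT = 1000  # assumed value of the swh/scanner/policy.py constant

    def discover_in_chunks(swhids, check_known):
        """Resolve SWHIDs in batches, one Web API round-trip per batch."""
        known = {}
        for chunk in grouper(swhids, QUERY_LIMIT):
            batch = list(chunk)  # grouper yields lazy groups; materialize each one
            known.update(check_known(batch))  # maps each SWHID to {"known": bool}
        return known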
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -13,7 +13,15 @@
from .data import MerkleNodeInfo
from .output import Output
-from .policy import QUERY_LIMIT, DirectoryPriority, FilePriority, LazyBFS, QueryAll
+from .policy import (
+    QUERY_LIMIT,
+    DirectoryPriority,
+    FilePriority,
+    GreedyBFS,
+    LazyBFS,
+    QueryAll,
+    source_size,
+)

async def run(config: Dict[str, Any], policy) -> None:
@@ -35,10 +43,6 @@
        await policy.run(session, api_url)


-def source_size(source_tree: Directory):
-    return len([n for n in source_tree.iter_tree(dedup=False)])
-
-
def get_policy_obj(source_tree: Directory, nodes_data: MerkleNodeInfo, policy: str):
    if policy == "auto":
        return (
@@ -48,6 +52,8 @@
        )
    elif policy == "bfs":
        return LazyBFS(source_tree, nodes_data)
+    elif policy == "greedybfs":
+        return GreedyBFS(source_tree, nodes_data)
    elif policy == "filepriority":
        return FilePriority(source_tree, nodes_data)
    elif policy == "dirpriority":
diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py
--- a/swh/scanner/tests/conftest.py
+++ b/swh/scanner/tests/conftest.py
@@ -7,6 +7,7 @@
import os
from pathlib import Path
import shutil
+import sys
import aiohttp
from aioresponses import aioresponses # type: ignore
@@ -78,13 +79,17 @@
"""Generate a model.from_disk.Directory from a "big" temporary directory
(more than 1000 nodes)
"""
+ # workaround to avoid a RecursionError that could be generated while creating
+ # a large number of directories
+ sys.setrecursionlimit(1100)
dir_ = tmp_path / "big-directory"
- dir_.mkdir()
+ sub_dirs = dir_
for i in range(0, QUERY_LIMIT + 1):
- file_ = dir_ / f"file_{i}.org"
- file_.touch()
+ sub_dirs = sub_dirs / "dir"
+ sub_dirs.mkdir(parents=True, exist_ok=True)
+ file_ = sub_dirs / "file.org"
+ file_.touch()
dir_obj = model_of_dir(str(dir_).encode())
- assert len(dir_obj) > QUERY_LIMIT
return dir_obj
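
Note: as a sanity check on the fixture above (assuming QUERY_LIMIT is 1000): the loop builds a chain of 1001 nested directories under big-directory with a single file.org at the bottom, so iter_tree(dedup=False) visits 1 root + 1001 directories + 1 file = 1003 nodes. That is just over one chunk, which is why test_greedy_bfs_get_nodes_chunks later in this diff expects exactly two chunks.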
diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py
--- a/swh/scanner/tests/flask_api.py
+++ b/swh/scanner/tests/flask_api.py
@@ -6,6 +6,7 @@
from flask import Flask, request
from swh.scanner.exceptions import LargePayloadExc
+from swh.scanner.policy import QUERY_LIMIT
from .data import unknown_swhids
@@ -24,9 +25,10 @@
        for swhid in swhids:
            f.write(swhid + "\n")

-        if len(swhids) > 900:
+        if len(swhids) > QUERY_LIMIT:
            raise LargePayloadExc(
-                "The maximum number of SWHIDs this endpoint can receive is 900"
+                f"The maximum number of SWHIDs this endpoint can receive is "
+                f"{QUERY_LIMIT}"
            )

        res = {swhid: {"known": False} for swhid in swhids}
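
Note: for orientation, a sketch of the /known/ contract this stub mimics; the URL and the use of the requests library are illustrative only (the scanner itself talks to the endpoint through aiohttp):

    import requests

    res = requests.post(
        "http://127.0.0.1:5000/known/",  # wherever the stub is listening
        json=["swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"],
    )
    # the stub maps every SWHID to {"known": False} and rejects payloads
    # holding more than QUERY_LIMIT identifiers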
diff --git a/swh/scanner/tests/test_policy.py b/swh/scanner/tests/test_policy.py
--- a/swh/scanner/tests/test_policy.py
+++ b/swh/scanner/tests/test_policy.py
@@ -8,13 +8,15 @@
from flask import url_for
import pytest
-from swh.model.identifiers import CoreSWHID, ObjectType
+from swh.model.identifiers import CONTENT, CoreSWHID, ObjectType
from swh.scanner.data import MerkleNodeInfo
from swh.scanner.exceptions import APIError
from swh.scanner.policy import (
    DirectoryPriority,
    FilePriority,
+    GreedyBFS,
    LazyBFS,
+    source_size,
    swhids_discovery,
)
@@ -47,17 +49,6 @@
    )


-def test_scanner_raise_apierror_input_size_limit(event_loop, aiosession, live_server):
-
-    api_url = url_for("index", _external=True)
-    request = [
-        "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a" for i in range(901)
-    ]  # /known/ is limited at 900
-
-    with pytest.raises(APIError):
-        event_loop.run_until_complete(swhids_discovery(request, aiosession, api_url))
-
-
def test_scanner_directory_priority_has_contents(source_tree):
    nodes_data = MerkleNodeInfo()
    policy = DirectoryPriority(source_tree, nodes_data)
@@ -143,3 +134,35 @@
    for swhid in backend_swhids_requests[5:]:
        assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY
+
+
+def test_greedy_bfs_policy(
+    live_server, event_loop, aiosession, big_source_tree, tmp_requests
+):
+    open(tmp_requests, "w").close()
+    api_url = url_for("index", _external=True)
+
+    nodes_data = MerkleNodeInfo()
+    policy = GreedyBFS(big_source_tree, nodes_data)
+    event_loop.run_until_complete(policy.run(aiosession, api_url))
+
+    backend_swhids_requests = get_backend_swhids_order(tmp_requests)
+
+    last_swhid = backend_swhids_requests[-1]
+    assert CoreSWHID.from_string(last_swhid).object_type == ObjectType.CONTENT
+
+
+@pytest.mark.asyncio
+async def test_greedy_bfs_get_nodes_chunks(live_server, aiosession, big_source_tree):
+    api_url = url_for("index", _external=True)
+
+    nodes_data = MerkleNodeInfo()
+    policy = GreedyBFS(big_source_tree, nodes_data)
+    chunks = [
+        n_chunk
+        async for n_chunk in policy.get_nodes_chunks(
+            aiosession, api_url, source_size(big_source_tree)
+        )
+    ]
+    assert len(chunks) == 2
+    assert chunks[1][-1].object_type == CONTENT
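
Note: assuming a standard swh-scanner development checkout, both new tests can be selected with an ordinary pytest filter:

    pytest swh/scanner/tests/test_policy.py -k greedy_bfs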