Changeset View
Changeset View
Standalone View
Standalone View
swh/scanner/policy.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021 The Software Heritage developers | ||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||||||
import abc | import abc | ||||||||||||||
import asyncio | import asyncio | ||||||||||||||
import itertools | import itertools | ||||||||||||||
from typing import Dict, List, no_type_check | from typing import Dict, List, no_type_check | ||||||||||||||
import aiohttp | import aiohttp | ||||||||||||||
from swh.core.utils import grouper | |||||||||||||||
from swh.model.from_disk import Directory | from swh.model.from_disk import Directory | ||||||||||||||
from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID | from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID | ||||||||||||||
from .data import MerkleNodeInfo | from .data import MerkleNodeInfo | ||||||||||||||
from .exceptions import error_response | from .exceptions import error_response | ||||||||||||||
# Maximum number of SWHIDs that can be requested by a single call to the | # Maximum number of SWHIDs that can be requested by a single call to the | ||||||||||||||
# Web API endpoint /known/ | # Web API endpoint /known/ | ||||||||||||||
Show All 40 Lines | if len(swhids) > QUERY_LIMIT: | ||||||||||||||
requests.append(asyncio.create_task(make_request(swhids_chunk))) | requests.append(asyncio.create_task(make_request(swhids_chunk))) | ||||||||||||||
res = await asyncio.gather(*requests) | res = await asyncio.gather(*requests) | ||||||||||||||
# concatenate list of dictionaries | # concatenate list of dictionaries | ||||||||||||||
return dict(itertools.chain.from_iterable(e.items() for e in res)) | return dict(itertools.chain.from_iterable(e.items() for e in res)) | ||||||||||||||
else: | else: | ||||||||||||||
return await make_request(swhids) | return await make_request(swhids) | ||||||||||||||
zack: docstring please, e.g.: "return the size of a source tree as the number of nodes it contains" | |||||||||||||||
def source_size(source_tree: "Directory") -> int:
    """Return the size of a source tree as the number of nodes it contains.

    Duplicated nodes are counted separately (``dedup=False``), so the result
    matches a full, non-deduplicated traversal of the tree.

    Args:
        source_tree: representation of a source code project directory

    Returns:
        the total number of nodes, the root included
    """
    # Count lazily instead of materializing the node list in memory.
    return sum(1 for _ in source_tree.iter_tree(dedup=False))
class Policy(metaclass=abc.ABCMeta): | class Policy(metaclass=abc.ABCMeta): | ||||||||||||||
data: MerkleNodeInfo | data: MerkleNodeInfo | ||||||||||||||
"""information about contents and directories of the merkle tree""" | """information about contents and directories of the merkle tree""" | ||||||||||||||
source_tree: Directory | source_tree: Directory | ||||||||||||||
"""representation of a source code project directory in the merkle tree""" | """representation of a source code project directory in the merkle tree""" | ||||||||||||||
Show All 37 Lines | ): | ||||||||||||||
queue.extend(children) | queue.extend(children) | ||||||||||||||
else: | else: | ||||||||||||||
for sub_node in node.iter_tree(): | for sub_node in node.iter_tree(): | ||||||||||||||
if sub_node == node: | if sub_node == node: | ||||||||||||||
continue | continue | ||||||||||||||
self.data[sub_node.swhid()]["known"] = True # type: ignore | self.data[sub_node.swhid()]["known"] = True # type: ignore | ||||||||||||||
class GreedyBFS(Policy):
    """Query graph nodes in chunks (to maximize the Web API rate limit use) and set the
    downstream contents of known directories to known.
    """

    async def run(
        self, session: aiohttp.ClientSession, api_url: str,
    ):
        """Walk the source tree chunk by chunk, recording each node's status and
        propagating "known" to the whole subtree of every known directory.

        Returns as soon as as many nodes as the tree contains have been
        accounted for, to avoid consuming further chunks needlessly.
        """
        total_nodes = source_size(self.source_tree)
        visited = []
        async for chunk in self.get_nodes_chunks(session, api_url, total_nodes):
            for node in chunk:
                visited.append(node)
                if len(visited) == total_nodes:
                    return
                # Only a *known* directory lets us skip ahead: everything
                # below it is necessarily known too.
                if node.object_type != DIRECTORY:
                    continue
                if not self.data[node.swhid()]["known"]:
                    continue
                for sub_node in node.iter_tree(dedup=False):
                    if sub_node == node:
                        # iter_tree() yields the subtree root itself; skip it.
                        continue
                    visited.append(sub_node)
                    self.data[sub_node.swhid()]["known"] = True

    @no_type_check
    async def get_nodes_chunks(
        self, session: aiohttp.ClientSession, api_url: str, ssize: int
    ):
        """Yield batches of at most QUERY_LIMIT nodes, after querying the Web
        API for their SWHIDs and storing the per-node result in ``self.data``
        (fills the rate limit; a single request is made when the source tree
        holds fewer than QUERY_LIMIT nodes).
        """
        all_nodes = self.source_tree.iter_tree(dedup=False)
        for batch in grouper(all_nodes, QUERY_LIMIT):
            batch = list(batch)
            response = await swhids_discovery(
                [node.swhid() for node in batch], session, api_url
            )
            for node in batch:
                swhid = node.swhid()
                self.data[swhid]["known"] = response[str(swhid)]["known"]
            yield batch
class FilePriority(Policy): | class FilePriority(Policy): | ||||||||||||||
Not Done Inline Actions
purely aesthetic (to prevent black from reformatting it with the weird newline placement) vlorentz: purely aesthetic (to prevent black from reformatting it with the weird newline placement) | |||||||||||||||
Done Inline ActionsOk, thanks for the suggestion! DanSeraf: Ok, thanks for the suggestion! | |||||||||||||||
"""Check the Merkle tree querying all the file contents and set all the upstream | """Check the Merkle tree querying all the file contents and set all the upstream | ||||||||||||||
directories to unknown in the case a file content is unknown. | directories to unknown in the case a file content is unknown. | ||||||||||||||
Finally check all the directories which status is still unknown and set all the | Finally check all the directories which status is still unknown and set all the | ||||||||||||||
sub-directories of known directories to known. | sub-directories of known directories to known. | ||||||||||||||
""" | """ | ||||||||||||||
@no_type_check | @no_type_check | ||||||||||||||
async def run( | async def run( | ||||||||||||||
▲ Show 20 Lines • Show All 152 Lines • Show Last 20 Lines |
docstring please, e.g.: "return the size of a source tree as the number of nodes it contains"