diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py
index d163658..293a485 100644
--- a/swh/scanner/benchmark_algos.py
+++ b/swh/scanner/benchmark_algos.py
@@ -1,386 +1,392 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import collections
import itertools
import json
import os
from pathlib import Path
import random
from typing import Dict, Iterable, List, Optional

import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry

from swh.model.from_disk import Content, Directory, accept_all_directories
from swh.model.identifiers import CONTENT, DIRECTORY, swhid

from .exceptions import APIError
from .model import Status, Tree
from .scanner import directory_filter, extract_regex_objs
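
+# share a single HTTP session so connections are reused, and retry
+# transient failures up to 5 times with exponential backoff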
+session = requests.Session()
+retries_rule = Retry(total=5, backoff_factor=1)
+session.mount("http://", HTTPAdapter(max_retries=retries_rule))
+session.mount("https://", HTTPAdapter(max_retries=retries_rule))
+
def query_swhids(
swhids: List[Tree], api_url: str, counter: Optional[collections.Counter] = None
) -> Dict[str, Dict[str, bool]]:
"""
Returns:
A dictionary with:
key(str): persistent identifier
value(dict):
value['known'] = True if pid is found
value['known'] = False if pid is not found
"""
endpoint = api_url + "known/"
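    # swhids are sent to the known/ endpoint in chunks; "queries" counts
    # the swhids asked about, "api_calls" the HTTP requests actually made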
chunk_size = 1000
if counter:
counter["queries"] += len(swhids)
def make_request(swhids):
swhids = [swhid.swhid for swhid in swhids]
- req = requests.post(endpoint, json=swhids)
+ req = session.post(endpoint, json=swhids)
if req.status_code != 200:
error_message = "%s with given values %s" % (req.text, str(swhids))
raise APIError(error_message)
if counter:
counter["api_calls"] += 1
        return req.json()
def get_chunk(swhids):
for i in range(0, len(swhids), chunk_size):
yield swhids[i : i + chunk_size]
if len(swhids) > chunk_size:
return dict(
itertools.chain.from_iterable(
make_request(swhids_chunk).items() for swhids_chunk in get_chunk(swhids)
)
)
else:
return make_request(swhids)


def stopngo(source_tree: Tree, api_url: str, counter: collections.Counter):
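    """
    Walk the tree level by level, starting from the root: query every node
    of the current level, then descend only into directories that are not
    known in the archive, since a known directory implies that its whole
    subtree is known.
    """
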
def set_children_known(node):
for child_node in node.iterate():
child_node.known = True
nodes = []
nodes.append(source_tree)
while len(nodes) > 0:
parsed_nodes = query_swhids(nodes, api_url, counter)
for node in nodes.copy():
nodes.remove(node)
node.known = parsed_nodes[node.swhid]["known"]
node.status = Status.queried
if node.otype == DIRECTORY:
if not node.known:
nodes.extend(list(node.children.values()))
else:
set_children_known(node)


def set_father_status(node, known):
    """
    Recursively set the known status of the ancestors of a given node,
    stopping at the first ancestor that has already been visited.
    """
parent = node.father
if parent is None:
return
if parent.status != Status.unset:
return
parent.known = known
set_father_status(parent, known)


def set_children_status(node, node_type, known, status: Status = Status.unset):
    """
    Recursively set the known status of every descendant of the given type
    whose current status matches ``status``.
    """
for child_node in node.iterate():
if child_node.otype == node_type and child_node.status == status:
child_node.known = known


def file_priority(source_tree: Tree, api_url: str, counter: collections.Counter):
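    """
    Query all file contents first, deepest first: an unknown content marks
    all its ancestor directories as unknown, so only the directories still
    unset afterwards have to be queried individually.
    """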
# get all the files
all_contents = list(
filter(lambda node: node.otype == CONTENT, source_tree.iterate_bfs())
)
    all_contents.reverse()  # check the deepest nodes first
# query the backend to get all file contents status
parsed_contents = query_swhids(all_contents, api_url, counter)
# set all the file contents status
for cnt in all_contents:
cnt.known = parsed_contents[cnt.swhid]["known"]
cnt.status = Status.queried
# set all the upstream directories of unknown file contents to unknown
if not cnt.known:
set_father_status(cnt, False)
# get all unset directories and check their status
# (update children directories accordingly)
unset_dirs = list(
filter(
lambda node: node.otype == DIRECTORY and node.status == Status.unset,
source_tree.iterate(),
)
)
if source_tree.status == Status.unset:
unset_dirs.append(source_tree)
# check unset directories
for dir_ in unset_dirs:
if dir_.status == Status.unset:
# update directory status
dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
dir_.status = Status.queried
set_children_status(dir_, DIRECTORY, dir_.known)


def directory_priority(source_tree: Tree, api_url: str, counter: collections.Counter):
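    """
    Query directories holding file contents first: a known directory marks
    all its file contents as known, an unknown one marks its ancestors as
    unknown. Directories without contents and the remaining unset file
    contents are then queried in bulk.
    """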
# get all directory contents that have at least one file content
unset_dirs = list(
filter(
lambda dir_: dir_.otype == DIRECTORY and dir_.has_contents,
source_tree.iterate_bfs(),
)
)
unset_dirs.reverse()
    # insert the root if it directly holds file contents
if source_tree.has_contents:
unset_dirs.append(source_tree)
for dir_ in unset_dirs:
# if the directory is known set all the downstream file contents to known
if dir_.status == Status.unset:
dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
dir_.status = Status.queried
if dir_.known:
set_children_status(dir_, CONTENT, True)
else:
set_father_status(dir_, False)
# get remaining directories that have no file contents
unset_dirs_no_cnts = list(
filter(
lambda node: node.otype == DIRECTORY and not node.has_contents,
source_tree.iterate_bfs(),
)
)
parsed_dirs_no_cnts = query_swhids(unset_dirs_no_cnts, api_url, counter)
# update status of directories that have no file contents
for dir_ in unset_dirs_no_cnts:
dir_.known = parsed_dirs_no_cnts[dir_.swhid]["known"]
dir_.status = Status.queried
# check unknown file contents
unset_files = list(
filter(
lambda node: node.otype == CONTENT and node.status == Status.unset,
source_tree.iterate(),
)
)
parsed_unset_files = query_swhids(unset_files, api_url, counter)
for file_ in unset_files:
file_.known = parsed_unset_files[file_.swhid]["known"]
file_.status = Status.queried


def random_(
source_tree: Tree,
api_url: str,
counter: collections.Counter,
seed: Optional[int] = None,
):
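    """
    Query nodes in a random order: a known directory marks its whole
    subtree as known, while an unknown file content marks its ancestor
    directories as unknown.
    """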
    if seed is not None:
random.seed(seed)
# get all directory/file contents
all_nodes = [node for node in source_tree.iterate()] + [source_tree]
# shuffle contents
random.shuffle(all_nodes)
while len(all_nodes):
node = all_nodes.pop()
if node.status != Status.unset:
continue
node.known = query_swhids([node], api_url, counter)[node.swhid]["known"]
node.status = Status.queried
if node.otype == DIRECTORY and node.known:
for child_node in node.iterate():
child_node.known = True
elif node.otype == CONTENT and not node.known:
set_father_status(node, False)


def algo_min(source_tree: Tree, api_url: str):
    """
    Compute the minimal number of queries needed when the known/unknown
    status of every node is given in advance.
    """
def remove_parents(node, nodes):
parent = node.father
if parent is None or parent not in nodes:
return
else:
nodes.remove(parent)
remove_parents(parent, nodes)
    def remove_children(node, nodes):
        for child_node in node.iterate():
            if child_node in nodes:
                nodes.remove(child_node)
all_nodes = [node for node in source_tree.iterate()]
all_nodes.insert(0, source_tree)
parsed_nodes = query_swhids(all_nodes, api_url)
for node in all_nodes:
node.known = parsed_nodes[node.swhid]["known"]
all_nodes_copy = all_nodes.copy()
for node in all_nodes:
if node.otype == CONTENT and not node.known:
all_nodes_copy.remove(node)
remove_parents(node, all_nodes_copy)
    for node in all_nodes:
        if node.otype == DIRECTORY and node.known and node in all_nodes_copy:
            remove_children(node, all_nodes_copy)
return len(all_nodes_copy)


def get_swhids(paths: Iterable[Path], exclude_patterns):
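    """
    Yield a (path, swhid) pair for each given path, computing identifiers
    from disk and honouring the exclude patterns for directories.
    """
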
def swhid_of(path):
if path.is_dir():
if exclude_patterns:
def dir_filter(dirpath, *args):
return directory_filter(dirpath, exclude_patterns)
else:
dir_filter = accept_all_directories
obj = Directory.from_disk(
path=bytes(path), dir_filter=dir_filter
).get_data()
return swhid(DIRECTORY, obj)
else:
obj = Content.from_file(path=bytes(path)).get_data()
return swhid(CONTENT, obj)
for path in paths:
yield str(path), swhid_of(path)


def load_source(root, sre_patterns):
    """
    Load the source code into the Tree data structure.
    """
def _scan(root_path, source_tree, sre_patterns):
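        # walk a single directory level here; deeper levels are handled by
        # the explicit recursive _scan call below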
dirpath, dnames, fnames = next(os.walk(root_path, followlinks=False))
dirpath = Path(dirpath)
if fnames:
files = [dirpath.joinpath(fname) for fname in fnames]
parsed_file_swhids = dict(get_swhids(files, sre_patterns))
for path, swhid_ in parsed_file_swhids.items():
source_tree.add_node(Path(path), swhid_)
if dnames:
dirs = [dirpath.joinpath(dname) for dname in dnames]
parsed_dirs_swhids = dict(get_swhids(dirs, sre_patterns))
for path, swhid_ in parsed_dirs_swhids.items():
if not directory_filter(path, sre_patterns):
continue
source_tree.add_node(Path(path), swhid_)
_scan(path, source_tree, sre_patterns)
source_tree = Tree(root)
root_swhid = dict(get_swhids([root], sre_patterns))
source_tree.swhid = root_swhid[str(root)]
_scan(root, source_tree, sre_patterns)
return source_tree


def run(
root: str,
api_url: str,
backend_name: str,
exclude_patterns: Iterable[str],
algo: str,
origin: str,
commit: str,
seed: Optional[int] = None,
):
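    """
    Scan the repository at ``root``, run the chosen algorithm against
    ``api_url`` and print one CSV row: repo_id, origin, commit, backend,
    number of nodes, algorithm, API calls, queries (API calls are reported
    as -1 for algo_min).
    """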
sre_patterns = set()
if exclude_patterns:
sre_patterns = {
reg_obj for reg_obj in extract_regex_objs(Path(root), exclude_patterns)
}
    # the repository id is the prefix of the temporary directory name
    # (the part before the first "_")
repo_id = Path(root).parts[-1].split("_")[0]
counter: collections.Counter = collections.Counter()
counter["api_calls"] = 0
counter["queries"] = 0
source_tree = load_source(Path(root), sre_patterns)
if algo == "random":
if seed:
random_(source_tree, api_url, counter, seed)
else:
random_(source_tree, api_url, counter)
elif algo == "algo_min":
min_queries = algo_min(source_tree, api_url)
min_result = (
repo_id,
origin,
commit,
backend_name,
len(source_tree),
algo,
-1,
min_queries,
)
print(*min_result, sep=",")
return
elif algo == "stopngo":
stopngo(source_tree, api_url, counter)
elif algo == "file_priority":
file_priority(source_tree, api_url, counter)
elif algo == "directory_priority":
directory_priority(source_tree, api_url, counter)
else:
raise Exception(f'Algorithm "{algo}" not found')
result = (
repo_id,
origin,
commit,
backend_name,
len(source_tree),
algo,
counter["api_calls"],
counter["queries"],
)
print(*result, sep=",")