
diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py
index d163658..293a485 100644
--- a/swh/scanner/benchmark_algos.py
+++ b/swh/scanner/benchmark_algos.py
@@ -1,386 +1,392 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import collections
import itertools
import json
import os
from pathlib import Path
import random
from typing import Dict, Iterable, List, Optional

import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry

from swh.model.from_disk import Content, Directory, accept_all_directories
from swh.model.identifiers import CONTENT, DIRECTORY, swhid

from .exceptions import APIError
from .model import Status, Tree
from .scanner import directory_filter, extract_regex_objs
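
+# Shared HTTP session: retry transient request failures (up to 5 retries
+# with exponential backoff) instead of aborting a whole benchmark run.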
+session = requests.Session()
+retries_rule = Retry(total=5, backoff_factor=1)
+session.mount("http://", HTTPAdapter(max_retries=retries_rule))
+

def query_swhids(
    swhids: List[Tree], api_url: str, counter: Optional[collections.Counter] = None
) -> Dict[str, Dict[str, bool]]:
    """
    Returns:
        A dictionary with:
            key (str): persistent identifier (SWHID)
            value (dict):
                value['known'] = True if the SWHID is found
                value['known'] = False if the SWHID is not found
    """
    endpoint = api_url + "known/"
    chunk_size = 1000

    if counter:
        counter["queries"] += len(swhids)

    def make_request(swhids):
        swhids = [swhid.swhid for swhid in swhids]
-        req = requests.post(endpoint, json=swhids)
+        req = session.post(endpoint, json=swhids)
        if req.status_code != 200:
            error_message = "%s with given values %s" % (req.text, str(swhids))
            raise APIError(error_message)
        if counter:
            counter["api_calls"] += 1
        resp = req.text
        return json.loads(resp)

    def get_chunk(swhids):
        for i in range(0, len(swhids), chunk_size):
            yield swhids[i : i + chunk_size]

    if len(swhids) > chunk_size:
        return dict(
            itertools.chain.from_iterable(
                make_request(swhids_chunk).items() for swhids_chunk in get_chunk(swhids)
            )
        )
    else:
        return make_request(swhids)
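

# "Stop and go": walk the tree top-down, querying one level per batch.
# A known directory marks its whole subtree as known; an unknown one is
# expanded, and its children are queried at the next iteration.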
def stopngo(source_tree: Tree, api_url: str, counter: collections.Counter):
    def set_children_known(node):
        for child_node in node.iterate():
            child_node.known = True

    nodes = []
    nodes.append(source_tree)

    while len(nodes) > 0:
        parsed_nodes = query_swhids(nodes, api_url, counter)
        for node in nodes.copy():
            nodes.remove(node)
            node.known = parsed_nodes[node.swhid]["known"]
            node.status = Status.queried
            if node.otype == DIRECTORY:
                if not node.known:
                    nodes.extend(list(node.children.values()))
                else:
                    set_children_known(node)


def set_father_status(node, known):
    """
    Recursively set the known status of the ancestors of a given node,
    stopping at the first ancestor whose status is already set
    """
    parent = node.father
    if parent is None:
        return
    if parent.status != Status.unset:
        return
    parent.known = known
    set_father_status(parent, known)


def set_children_status(node, node_type, known, status: Status = Status.unset):
    """
    Recursively set the known status of the children of a given node
    that match the given node type and current status
    """
    for child_node in node.iterate():
        if child_node.otype == node_type and child_node.status == status:
            child_node.known = known
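

# File-first strategy: query every file content in one batch, propagate
# "unknown" upwards (a directory containing an unknown file cannot be
# known), then query the directories whose status is still undecided.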
def file_priority(source_tree: Tree, api_url: str, counter: collections.Counter):
    # get all the files
    all_contents = list(
        filter(lambda node: node.otype == CONTENT, source_tree.iterate_bfs())
    )
    all_contents.reverse()  # we check nodes from the deepest

    # query the backend to get all file contents status
    parsed_contents = query_swhids(all_contents, api_url, counter)

    # set all the file contents status
    for cnt in all_contents:
        cnt.known = parsed_contents[cnt.swhid]["known"]
        cnt.status = Status.queried
        # set all the upstream directories of unknown file contents to unknown
        if not cnt.known:
            set_father_status(cnt, False)

    # get all unset directories and check their status
    # (update children directories accordingly)
    unset_dirs = list(
        filter(
            lambda node: node.otype == DIRECTORY and node.status == Status.unset,
            source_tree.iterate(),
        )
    )

    if source_tree.status == Status.unset:
        unset_dirs.append(source_tree)

    # check unset directories
    for dir_ in unset_dirs:
        if dir_.status == Status.unset:
            # update directory status
            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
            dir_.status = Status.queried
            set_children_status(dir_, DIRECTORY, dir_.known)
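

# Directory-first strategy: query the directories that directly contain
# file contents, deepest first. A known directory implies all the files
# below it are known; an unknown one propagates "unknown" to its ancestors.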
def directory_priority(source_tree: Tree, api_url: str, counter: collections.Counter):
    # get all the directories that have at least one file content
    unset_dirs = list(
        filter(
            lambda dir_: dir_.otype == DIRECTORY and dir_.has_contents,
            source_tree.iterate_bfs(),
        )
    )
    unset_dirs.reverse()

    # insert the root if it has file contents
    if source_tree.has_contents:
        unset_dirs.append(source_tree)

    for dir_ in unset_dirs:
        # if the directory is known set all the downstream file contents to known
        if dir_.status == Status.unset:
            dir_.known = query_swhids([dir_], api_url, counter)[dir_.swhid]["known"]
            dir_.status = Status.queried
            if dir_.known:
                set_children_status(dir_, CONTENT, True)
            else:
                set_father_status(dir_, False)

    # get the remaining directories, those that have no file contents
    unset_dirs_no_cnts = list(
        filter(
            lambda node: node.otype == DIRECTORY and not node.has_contents,
            source_tree.iterate_bfs(),
        )
    )
    parsed_dirs_no_cnts = query_swhids(unset_dirs_no_cnts, api_url, counter)

    # update the status of the directories that have no file contents
    for dir_ in unset_dirs_no_cnts:
        dir_.known = parsed_dirs_no_cnts[dir_.swhid]["known"]
        dir_.status = Status.queried

    # check the remaining unset file contents
    unset_files = list(
        filter(
            lambda node: node.otype == CONTENT and node.status == Status.unset,
            source_tree.iterate(),
        )
    )
    parsed_unset_files = query_swhids(unset_files, api_url, counter)
    for file_ in unset_files:
        file_.known = parsed_unset_files[file_.swhid]["known"]
        file_.status = Status.queried
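

# Random strategy: query nodes one by one in random order, skipping any
# node whose status was already settled by an earlier answer (a known
# directory settles its subtree, an unknown file settles its ancestors).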
def random_(
    source_tree: Tree,
    api_url: str,
    counter: collections.Counter,
    seed: Optional[int] = None,
):
    if seed:
        random.seed(seed)

    # get all directory/file contents
    all_nodes = [node for node in source_tree.iterate()] + [source_tree]
    # shuffle contents
    random.shuffle(all_nodes)

    while len(all_nodes):
        node = all_nodes.pop()
        if node.status != Status.unset:
            continue
        node.known = query_swhids([node], api_url, counter)[node.swhid]["known"]
        node.status = Status.queried
        if node.otype == DIRECTORY and node.known:
            for child_node in node.iterate():
                child_node.known = True
        elif node.otype == CONTENT and not node.known:
            set_father_status(node, False)
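

# Estimate the minimal number of queries needed when the known/unknown
# status of every node is already available: fetch everything in one
# uncounted bulk query, then discard the nodes whose status is derivable
# from other answers (unknown file contents with their ancestors, and
# everything below a known directory) and count what remains.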
def algo_min(source_tree: Tree, api_url: str):
    """
    The minimal number of queries knowing the known/unknown status of every node
    """

    def remove_parents(node, nodes):
        parent = node.father
        if parent is None or parent not in nodes:
            return
        else:
            nodes.remove(parent)
            remove_parents(parent, nodes)

    def remove_children(node, nodes):
        for child_node in node.iterate():
            nodes.remove(child_node)

    all_nodes = [node for node in source_tree.iterate()]
    all_nodes.insert(0, source_tree)

    parsed_nodes = query_swhids(all_nodes, api_url)
    for node in all_nodes:
        node.known = parsed_nodes[node.swhid]["known"]

    all_nodes_copy = all_nodes.copy()

    for node in all_nodes:
        if node.otype == CONTENT and not node.known:
            all_nodes_copy.remove(node)
            remove_parents(node, all_nodes_copy)

    for node in all_nodes_copy:
        if node.otype == DIRECTORY and node.known:
            remove_children(node, all_nodes_copy)

    return len(all_nodes_copy)
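

# Compute the SWHID of each given path: directories are hashed from disk
# with the exclusion patterns applied, file contents are hashed directly.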
def get_swhids(paths: Iterable[Path], exclude_patterns):
    def swhid_of(path):
        if path.is_dir():
            if exclude_patterns:

                def dir_filter(dirpath, *args):
                    return directory_filter(dirpath, exclude_patterns)

            else:
                dir_filter = accept_all_directories

            obj = Directory.from_disk(
                path=bytes(path), dir_filter=dir_filter
            ).get_data()
            return swhid(DIRECTORY, obj)
        else:
            obj = Content.from_file(path=bytes(path)).get_data()
            return swhid(CONTENT, obj)

    for path in paths:
        yield str(path), swhid_of(path)
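

# Build the in-memory Tree of the repository: walk the filesystem one
# directory at a time, computing SWHIDs and skipping excluded paths.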
def load_source(root, sre_patterns):
    """
    Load the source code inside the Tree data structure
    """

    def _scan(root_path, source_tree, sre_patterns):
        dirpath, dnames, fnames = next(os.walk(root_path, followlinks=False))
        dirpath = Path(dirpath)

        if fnames:
            files = [dirpath.joinpath(fname) for fname in fnames]
            parsed_file_swhids = dict(get_swhids(files, sre_patterns))
            for path, swhid_ in parsed_file_swhids.items():
                source_tree.add_node(Path(path), swhid_)

        if dnames:
            dirs = [dirpath.joinpath(dname) for dname in dnames]
            parsed_dirs_swhids = dict(get_swhids(dirs, sre_patterns))
            for path, swhid_ in parsed_dirs_swhids.items():
                if not directory_filter(path, sre_patterns):
                    continue
                source_tree.add_node(Path(path), swhid_)
                _scan(path, source_tree, sre_patterns)

    source_tree = Tree(root)
    root_swhid = dict(get_swhids([root], sre_patterns))
    source_tree.swhid = root_swhid[str(root)]
    _scan(root, source_tree, sre_patterns)
    return source_tree
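

# Entry point: load the tree, run the selected algorithm and print one
# CSV line: repo_id, origin, commit, backend, tree size, algorithm,
# API calls, queried SWHIDs ("algo_min" reports -1 API calls, since its
# single bulk query is not counted). A hypothetical invocation could be:
#   run("./repos/linux_abc123", "http://localhost:5000/api/1/", "pg",
#       [], "stopngo", "https://example.org/repo.git", "abc123")
# where the backend name, origin URL and commit are illustrative values.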
def run(
    root: str,
    api_url: str,
    backend_name: str,
    exclude_patterns: Iterable[str],
    algo: str,
    origin: str,
    commit: str,
    seed: Optional[int] = None,
):
    sre_patterns = set()
    if exclude_patterns:
        sre_patterns = {
            reg_obj for reg_obj in extract_regex_objs(Path(root), exclude_patterns)
        }

    # temporary directory prefix
    repo_id = Path(root).parts[-1].split("_")[0]

    counter: collections.Counter = collections.Counter()
    counter["api_calls"] = 0
    counter["queries"] = 0

    source_tree = load_source(Path(root), sre_patterns)

    if algo == "random":
        if seed:
            random_(source_tree, api_url, counter, seed)
        else:
            random_(source_tree, api_url, counter)
    elif algo == "algo_min":
        min_queries = algo_min(source_tree, api_url)
        min_result = (
            repo_id,
            origin,
            commit,
            backend_name,
            len(source_tree),
            algo,
            -1,
            min_queries,
        )
        print(*min_result, sep=",")
        return
    elif algo == "stopngo":
        stopngo(source_tree, api_url, counter)
    elif algo == "file_priority":
        file_priority(source_tree, api_url, counter)
    elif algo == "directory_priority":
        directory_priority(source_tree, api_url, counter)
    else:
        raise Exception(f'Algorithm "{algo}" not found')

    result = (
        repo_id,
        origin,
        commit,
        backend_name,
        len(source_tree),
        algo,
        counter["api_calls"],
        counter["queries"],
    )
    print(*result, sep=",")
