Changeset View
Changeset View
Standalone View
Standalone View
swh/scanner/benchmark_algos.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import collections | import collections | ||||
import itertools | import itertools | ||||
import json | import json | ||||
import os | import os | ||||
from pathlib import Path | from pathlib import Path | ||||
import random | import random | ||||
from typing import Dict, Iterable, List, Optional | from typing import Dict, Iterable, List, Optional | ||||
import requests | import requests | ||||
from requests.adapters import HTTPAdapter | |||||
from requests.packages.urllib3.util.retry import Retry | |||||
from swh.model.from_disk import Content, Directory, accept_all_directories | from swh.model.from_disk import Content, Directory, accept_all_directories | ||||
from swh.model.identifiers import CONTENT, DIRECTORY, swhid | from swh.model.identifiers import CONTENT, DIRECTORY, swhid | ||||
from .exceptions import APIError | from .exceptions import APIError | ||||
from .model import Status, Tree | from .model import Status, Tree | ||||
from .scanner import directory_filter, extract_regex_objs | from .scanner import directory_filter, extract_regex_objs | ||||
# Shared HTTP session used for the archive API calls so that connections are
# reused and failed requests are retried instead of aborting a benchmark run.
session = requests.Session()
# Up to 5 retries per request, with a backoff factor of 1 between attempts.
retries_rule = Retry(total=5, backoff_factor=1)
# requests picks the transport adapter by URL prefix, so the retry policy has
# to be mounted for both schemes; with only "http://" mounted, any "https://"
# api_url would go through the default adapter and bypass retries_rule.
session.mount("http://", HTTPAdapter(max_retries=retries_rule))
session.mount("https://", HTTPAdapter(max_retries=retries_rule))
def query_swhids( | def query_swhids( | ||||
swhids: List[Tree], api_url: str, counter: Optional[collections.Counter] = None | swhids: List[Tree], api_url: str, counter: Optional[collections.Counter] = None | ||||
) -> Dict[str, Dict[str, bool]]: | ) -> Dict[str, Dict[str, bool]]: | ||||
""" | """ | ||||
Returns: | Returns: | ||||
A dictionary with: | A dictionary with: | ||||
key(str): persistent identifier | key(str): persistent identifier | ||||
value(dict): | value(dict): | ||||
value['known'] = True if pid is found | value['known'] = True if pid is found | ||||
value['known'] = False if pid is not found | value['known'] = False if pid is not found | ||||
""" | """ | ||||
endpoint = api_url + "known/" | endpoint = api_url + "known/" | ||||
chunk_size = 1000 | chunk_size = 1000 | ||||
if counter: | if counter: | ||||
counter["queries"] += len(swhids) | counter["queries"] += len(swhids) | ||||
def make_request(swhids): | def make_request(swhids): | ||||
swhids = [swhid.swhid for swhid in swhids] | swhids = [swhid.swhid for swhid in swhids] | ||||
req = requests.post(endpoint, json=swhids) | req = session.post(endpoint, json=swhids) | ||||
if req.status_code != 200: | if req.status_code != 200: | ||||
error_message = "%s with given values %s" % (req.text, str(swhids)) | error_message = "%s with given values %s" % (req.text, str(swhids)) | ||||
raise APIError(error_message) | raise APIError(error_message) | ||||
if counter: | if counter: | ||||
counter["api_calls"] += 1 | counter["api_calls"] += 1 | ||||
resp = req.text | resp = req.text | ||||
return json.loads(resp) | return json.loads(resp) | ||||
▲ Show 20 Lines • Show All 197 Lines • ▼ Show 20 Lines | def algo_min(source_tree: Tree, api_url: str): | ||||
parsed_nodes = query_swhids(all_nodes, api_url) | parsed_nodes = query_swhids(all_nodes, api_url) | ||||
for node in all_nodes: | for node in all_nodes: | ||||
node.known = parsed_nodes[node.swhid]["known"] | node.known = parsed_nodes[node.swhid]["known"] | ||||
all_nodes_copy = all_nodes.copy() | all_nodes_copy = all_nodes.copy() | ||||
for node in all_nodes: | for node in all_nodes: | ||||
if node.otype == CONTENT and not node.known: | if node.otype == CONTENT and not node.known: | ||||
all_nodes_copy.remove(node) | |||||
remove_parents(node, all_nodes_copy) | remove_parents(node, all_nodes_copy) | ||||
for node in all_nodes_copy: | for node in all_nodes_copy: | ||||
if node.otype == DIRECTORY and node.known: | if node.otype == DIRECTORY and node.known: | ||||
remove_children(node, all_nodes_copy) | remove_children(node, all_nodes_copy) | ||||
return len(all_nodes_copy) | return len(all_nodes_copy) | ||||
▲ Show 20 Lines • Show All 121 Lines • Show Last 20 Lines |