diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ # Add here internal Software Heritage dependencies, one per line. swh.core >= 0.3 -swh.model >= 0.3.8 +swh.model >= 2.3.0 diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py --- a/swh/scanner/benchmark_algos.py +++ b/swh/scanner/benchmark_algos.py @@ -10,6 +10,7 @@ import os from pathlib import Path import random +import time from typing import Dict, Iterable, List, Optional import requests @@ -17,7 +18,7 @@ from requests.packages.urllib3.util.retry import Retry from swh.model.from_disk import Content, Directory, accept_all_directories -from swh.model.identifiers import CONTENT, DIRECTORY, swhid +from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID, ObjectType from .exceptions import APIError from .model import Status, Tree @@ -229,7 +230,7 @@ set_father_status(node, False) -def algo_min(source_tree: Tree, api_url: str): +def algo_min(source_tree: Tree, api_url: str, counter: collections.Counter): """ The minimal number of queries knowing the known/unknown status of every node """ @@ -278,7 +279,8 @@ filter(lambda node: node.status == Status.unset, source_tree.iterate_bfs()) ) - return len(source_tree) - len(unset_cnts) + counter["api_calls"] = -1 + counter["queries"] = len(source_tree) - len(unset_cnts) def get_swhids(paths: Iterable[Path], exclude_patterns): @@ -296,10 +298,12 @@ path=bytes(path), dir_filter=dir_filter ).get_data() - return swhid(DIRECTORY, obj) + return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"])) else: obj = Content.from_file(path=bytes(path)).get_data() - return swhid(CONTENT, obj) + return str( + CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"]) + ) for path in paths: yield str(path), swhid_of(path) @@ -369,6 +373,7 @@ f'started processing repo "{repo_id}" with algorithm ' f'"{algo}" and knowledge base "{backend_name}"' ) + tstart = time.time() if algo == "random": if seed: @@ -376,19 +381,7 @@ else: random_(source_tree, api_url, counter) elif algo == "algo_min": - min_queries = algo_min(source_tree, api_url) - min_result = ( - repo_id, - origin, - commit, - backend_name, - len(source_tree), - algo, - -1, - min_queries, - ) - print(*min_result, sep=",") - return + algo_min(source_tree, api_url, counter) elif algo == "stopngo": stopngo(source_tree, api_url, counter) elif algo == "file_priority": @@ -398,6 +391,7 @@ else: raise Exception(f'Algorithm "{algo}" not found') + tend = time.time() result = ( repo_id, origin, @@ -407,6 +401,7 @@ algo, counter["api_calls"], counter["queries"], + tend - tstart, ) logging.info( diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py --- a/swh/scanner/plot.py +++ b/swh/scanner/plot.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Dict, List, Tuple -import numpy as np # type: ignore +import numpy as np import pandas as pd # type: ignore import plotly.graph_objects as go from plotly.offline import offline diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -15,7 +15,7 @@ import aiohttp from swh.model.from_disk import Content, Directory, accept_all_directories -from swh.model.identifiers import CONTENT, DIRECTORY, parse_swhid, swhid +from swh.model.identifiers import CoreSWHID, ObjectType from .dashboard.dashboard import run_app from .exceptions import InvalidDirectoryPath, error_response @@ -114,10 +114,12 @@ path=bytes(path), dir_filter=dir_filter ).get_data() - return swhid(DIRECTORY, obj) + return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"])) else: obj = Content.from_file(path=bytes(path)).get_data() - return swhid(CONTENT, obj) + return str( + CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"]) + ) dirpath, dnames, fnames = next(os.walk(path)) for node in itertools.chain(dnames, fnames): @@ -176,11 +178,13 @@ for path, obj_swhid, known in await parse_path( root, session, api_url, exclude_patterns ): - obj_type = parse_swhid(obj_swhid).object_type + obj_type = CoreSWHID.from_string(obj_swhid).object_type - if obj_type == CONTENT: + if obj_type == ObjectType.CONTENT: source_tree.add_node(path, obj_swhid, known) - elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns): + elif obj_type == ObjectType.DIRECTORY and directory_filter( + path, exclude_patterns + ): source_tree.add_node(path, obj_swhid, known) if not known: await _scan(path, session, api_url, source_tree, exclude_patterns)