D5644.diff
D5644: scanner-benchmark: add algorithms timings in results

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,3 +1,3 @@
# Add here internal Software Heritage dependencies, one per line.
swh.core >= 0.3
-swh.model >= 0.3.8
+swh.model >= 2.3.0
diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py
--- a/swh/scanner/benchmark_algos.py
+++ b/swh/scanner/benchmark_algos.py
@@ -10,6 +10,7 @@
import os
from pathlib import Path
import random
+import time
from typing import Dict, Iterable, List, Optional
import requests
@@ -17,7 +18,7 @@
from requests.packages.urllib3.util.retry import Retry
from swh.model.from_disk import Content, Directory, accept_all_directories
-from swh.model.identifiers import CONTENT, DIRECTORY, swhid
+from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID, ObjectType
from .exceptions import APIError
from .model import Status, Tree
@@ -229,7 +230,7 @@
set_father_status(node, False)
-def algo_min(source_tree: Tree, api_url: str):
+def algo_min(source_tree: Tree, api_url: str, counter: collections.Counter):
"""
The minimal number of queries knowing the known/unknown status of every node
"""
@@ -278,7 +279,8 @@
filter(lambda node: node.status == Status.unset, source_tree.iterate_bfs())
)
- return len(source_tree) - len(unset_cnts)
+ counter["api_calls"] = -1
+ counter["queries"] = len(source_tree) - len(unset_cnts)
def get_swhids(paths: Iterable[Path], exclude_patterns):
@@ -296,10 +298,12 @@
path=bytes(path), dir_filter=dir_filter
).get_data()
- return swhid(DIRECTORY, obj)
+ return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"]))
else:
obj = Content.from_file(path=bytes(path)).get_data()
- return swhid(CONTENT, obj)
+ return str(
+ CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"])
+ )
for path in paths:
yield str(path), swhid_of(path)
@@ -369,6 +373,7 @@
f'started processing repo "{repo_id}" with algorithm '
f'"{algo}" and knowledge base "{backend_name}"'
)
+ tstart = time.time()
if algo == "random":
if seed:
@@ -376,19 +381,7 @@
else:
random_(source_tree, api_url, counter)
elif algo == "algo_min":
- min_queries = algo_min(source_tree, api_url)
- min_result = (
- repo_id,
- origin,
- commit,
- backend_name,
- len(source_tree),
- algo,
- -1,
- min_queries,
- )
- print(*min_result, sep=",")
- return
+ algo_min(source_tree, api_url, counter)
elif algo == "stopngo":
stopngo(source_tree, api_url, counter)
elif algo == "file_priority":
@@ -398,6 +391,7 @@
else:
raise Exception(f'Algorithm "{algo}" not found')
+ tend = time.time()
result = (
repo_id,
origin,
@@ -407,6 +401,7 @@
algo,
counter["api_calls"],
counter["queries"],
+ tend - tstart,
)
logging.info(
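
Note: the timing change above boils down to wrapping the algorithm dispatch in a pair of time.time() calls and appending the elapsed wall-clock seconds to the CSV result row; algo_min now reports through the shared counter (with api_calls pinned to -1, since it answers from the precomputed node statuses without querying the API) instead of printing its own row and returning early. A minimal sketch of that pattern, using a hypothetical timed() helper rather than the actual driver code:

    import collections
    import time

    def timed(algo_fn, source_tree, api_url):
        # Hypothetical helper (not part of this diff): run one benchmark
        # algorithm, let it fill the shared counter, and return the three
        # values appended to the CSV result row.
        counter: collections.Counter = collections.Counter()
        tstart = time.time()
        algo_fn(source_tree, api_url, counter)  # e.g. algo_min, stopngo, ...
        tend = time.time()
        return counter["api_calls"], counter["queries"], tend - tstart
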
diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py
--- a/swh/scanner/plot.py
+++ b/swh/scanner/plot.py
@@ -18,7 +18,7 @@
from pathlib import Path
from typing import Dict, List, Tuple
-import numpy as np # type: ignore
+import numpy as np
import pandas as pd # type: ignore
import plotly.graph_objects as go
from plotly.offline import offline
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -15,7 +15,7 @@
import aiohttp
from swh.model.from_disk import Content, Directory, accept_all_directories
-from swh.model.identifiers import CONTENT, DIRECTORY, parse_swhid, swhid
+from swh.model.identifiers import CoreSWHID, ObjectType
from .dashboard.dashboard import run_app
from .exceptions import InvalidDirectoryPath, error_response
@@ -114,10 +114,12 @@
path=bytes(path), dir_filter=dir_filter
).get_data()
- return swhid(DIRECTORY, obj)
+ return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"]))
else:
obj = Content.from_file(path=bytes(path)).get_data()
- return swhid(CONTENT, obj)
+ return str(
+ CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"])
+ )
dirpath, dnames, fnames = next(os.walk(path))
for node in itertools.chain(dnames, fnames):
@@ -176,11 +178,13 @@
for path, obj_swhid, known in await parse_path(
root, session, api_url, exclude_patterns
):
- obj_type = parse_swhid(obj_swhid).object_type
+ obj_type = CoreSWHID.from_string(obj_swhid).object_type
- if obj_type == CONTENT:
+ if obj_type == ObjectType.CONTENT:
source_tree.add_node(path, obj_swhid, known)
- elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns):
+ elif obj_type == ObjectType.DIRECTORY and directory_filter(
+ path, exclude_patterns
+ ):
source_tree.add_node(path, obj_swhid, known)
if not known:
await _scan(path, session, api_url, source_tree, exclude_patterns)
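
Note: benchmark_algos.py and scanner.py perform the same identifier-API migration, which the bump to swh.model >= 2.3.0 in requirements-swh.txt enables: the swhid()/parse_swhid() helpers and the CONTENT/DIRECTORY constants are replaced by the CoreSWHID dataclass and the ObjectType enum. A condensed before/after sketch (the helper names here are illustrative, not from the patch):

    from swh.model.from_disk import Content, Directory, accept_all_directories
    from swh.model.identifiers import CoreSWHID, ObjectType

    def directory_swhid(path: bytes) -> str:
        # before: swhid(DIRECTORY, obj)
        obj = Directory.from_disk(
            path=path, dir_filter=accept_all_directories
        ).get_data()
        return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"]))

    def content_swhid(path: bytes) -> str:
        # before: swhid(CONTENT, obj)
        obj = Content.from_file(path=path).get_data()
        return str(CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"]))

    # before: parse_swhid(obj_swhid).object_type == CONTENT
    # after:  CoreSWHID.from_string(obj_swhid).object_type == ObjectType.CONTENT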