Page MenuHomeSoftware Heritage

D5644.diff
No OneTemporary

D5644.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,3 +1,3 @@
# Add here internal Software Heritage dependencies, one per line.
swh.core >= 0.3
-swh.model >= 0.3.8
+swh.model >= 2.3.0
diff --git a/swh/scanner/benchmark_algos.py b/swh/scanner/benchmark_algos.py
--- a/swh/scanner/benchmark_algos.py
+++ b/swh/scanner/benchmark_algos.py
@@ -10,6 +10,7 @@
import os
from pathlib import Path
import random
+import time
from typing import Dict, Iterable, List, Optional
import requests
@@ -17,7 +18,7 @@
from requests.packages.urllib3.util.retry import Retry
from swh.model.from_disk import Content, Directory, accept_all_directories
-from swh.model.identifiers import CONTENT, DIRECTORY, swhid
+from swh.model.identifiers import CONTENT, DIRECTORY, CoreSWHID, ObjectType
from .exceptions import APIError
from .model import Status, Tree
@@ -229,7 +230,7 @@
set_father_status(node, False)
-def algo_min(source_tree: Tree, api_url: str):
+def algo_min(source_tree: Tree, api_url: str, counter: collections.Counter):
"""
The minimal number of queries knowing the known/unknown status of every node
"""
@@ -278,7 +279,8 @@
filter(lambda node: node.status == Status.unset, source_tree.iterate_bfs())
)
- return len(source_tree) - len(unset_cnts)
+ counter["api_calls"] = -1
+ counter["queries"] = len(source_tree) - len(unset_cnts)
def get_swhids(paths: Iterable[Path], exclude_patterns):
@@ -296,10 +298,12 @@
path=bytes(path), dir_filter=dir_filter
).get_data()
- return swhid(DIRECTORY, obj)
+ return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"]))
else:
obj = Content.from_file(path=bytes(path)).get_data()
- return swhid(CONTENT, obj)
+ return str(
+ CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"])
+ )
for path in paths:
yield str(path), swhid_of(path)
@@ -369,6 +373,7 @@
f'started processing repo "{repo_id}" with algorithm '
f'"{algo}" and knowledge base "{backend_name}"'
)
+ tstart = time.time()
if algo == "random":
if seed:
@@ -376,19 +381,7 @@
else:
random_(source_tree, api_url, counter)
elif algo == "algo_min":
- min_queries = algo_min(source_tree, api_url)
- min_result = (
- repo_id,
- origin,
- commit,
- backend_name,
- len(source_tree),
- algo,
- -1,
- min_queries,
- )
- print(*min_result, sep=",")
- return
+ algo_min(source_tree, api_url, counter)
elif algo == "stopngo":
stopngo(source_tree, api_url, counter)
elif algo == "file_priority":
@@ -398,6 +391,7 @@
else:
raise Exception(f'Algorithm "{algo}" not found')
+ tend = time.time()
result = (
repo_id,
origin,
@@ -407,6 +401,7 @@
algo,
counter["api_calls"],
counter["queries"],
+ tend - tstart,
)
logging.info(
diff --git a/swh/scanner/plot.py b/swh/scanner/plot.py
--- a/swh/scanner/plot.py
+++ b/swh/scanner/plot.py
@@ -18,7 +18,7 @@
from pathlib import Path
from typing import Dict, List, Tuple
-import numpy as np # type: ignore
+import numpy as np
import pandas as pd # type: ignore
import plotly.graph_objects as go
from plotly.offline import offline
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
--- a/swh/scanner/scanner.py
+++ b/swh/scanner/scanner.py
@@ -15,7 +15,7 @@
import aiohttp
from swh.model.from_disk import Content, Directory, accept_all_directories
-from swh.model.identifiers import CONTENT, DIRECTORY, parse_swhid, swhid
+from swh.model.identifiers import CoreSWHID, ObjectType
from .dashboard.dashboard import run_app
from .exceptions import InvalidDirectoryPath, error_response
@@ -114,10 +114,12 @@
path=bytes(path), dir_filter=dir_filter
).get_data()
- return swhid(DIRECTORY, obj)
+ return str(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj["id"]))
else:
obj = Content.from_file(path=bytes(path)).get_data()
- return swhid(CONTENT, obj)
+ return str(
+ CoreSWHID(object_type=ObjectType.CONTENT, object_id=obj["sha1_git"])
+ )
dirpath, dnames, fnames = next(os.walk(path))
for node in itertools.chain(dnames, fnames):
@@ -176,11 +178,13 @@
for path, obj_swhid, known in await parse_path(
root, session, api_url, exclude_patterns
):
- obj_type = parse_swhid(obj_swhid).object_type
+ obj_type = CoreSWHID.from_string(obj_swhid).object_type
- if obj_type == CONTENT:
+ if obj_type == ObjectType.CONTENT:
source_tree.add_node(path, obj_swhid, known)
- elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns):
+ elif obj_type == ObjectType.DIRECTORY and directory_filter(
+ path, exclude_patterns
+ ):
source_tree.add_node(path, obj_swhid, known)
if not known:
await _scan(path, session, api_url, source_tree, exclude_patterns)

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 12:03 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226548

Event Timeline