`_.
diff --git a/swh.graph.egg-info/SOURCES.txt b/swh.graph.egg-info/SOURCES.txt
index dac7f07..cca3c22 100644
--- a/swh.graph.egg-info/SOURCES.txt
+++ b/swh.graph.egg-info/SOURCES.txt
@@ -1,193 +1,193 @@
.gitignore
.pre-commit-config.yaml
AUTHORS
CODE_OF_CONDUCT.md
CONTRIBUTORS
LICENSE
MANIFEST.in
Makefile
Makefile.local
README.rst
mypy.ini
pyproject.toml
pytest.ini
requirements-swh.txt
requirements-test.txt
requirements.txt
setup.cfg
setup.py
tox.ini
docker/Dockerfile
docker/build.sh
docker/run.sh
docs/.gitignore
docs/Makefile
docs/Makefile.local
docs/README.rst
docs/api.rst
docs/cli.rst
docs/compression.rst
docs/conf.py
docs/docker.rst
docs/git2graph.md
docs/index.rst
docs/quickstart.rst
docs/use-cases.rst
docs/_static/.placeholder
docs/_templates/.placeholder
docs/images/.gitignore
docs/images/Makefile
docs/images/compression_steps.dot
java/.coding-style.xml
java/.gitignore
java/AUTHORS
java/LICENSE
java/README.md
java/pom.xml
java/.mvn/jvm.config
java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
java/src/main/java/org/softwareheritage/graph/AllowedNodes.java
java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java
java/src/main/java/org/softwareheritage/graph/Entry.java
java/src/main/java/org/softwareheritage/graph/Graph.java
java/src/main/java/org/softwareheritage/graph/Node.java
java/src/main/java/org/softwareheritage/graph/NodesFiltering.java
java/src/main/java/org/softwareheritage/graph/SWHID.java
java/src/main/java/org/softwareheritage/graph/Stats.java
java/src/main/java/org/softwareheritage/graph/Subgraph.java
java/src/main/java/org/softwareheritage/graph/SwhPath.java
java/src/main/java/org/softwareheritage/graph/Traversal.java
java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java
java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java
java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java
java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java
java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java
java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java
java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java
java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java
java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java
java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java
java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java
java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java
java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java
java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java
java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java
java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java
java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java
java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java
java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
java/src/main/java/org/softwareheritage/graph/maps/MapFile.java
java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java
java/src/main/java/org/softwareheritage/graph/server/App.java
java/src/main/java/org/softwareheritage/graph/server/Endpoint.java
java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java
java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java
java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java
java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java
java/src/test/java/org/softwareheritage/graph/GraphTest.java
java/src/test/java/org/softwareheritage/graph/LeavesTest.java
java/src/test/java/org/softwareheritage/graph/NeighborsTest.java
java/src/test/java/org/softwareheritage/graph/SubgraphTest.java
java/src/test/java/org/softwareheritage/graph/VisitTest.java
java/src/test/java/org/softwareheritage/graph/WalkTest.java
-java/target/swh-graph-0.5.1.jar
+java/target/swh-graph-0.5.2.jar
reports/.gitignore
reports/benchmarks/Makefile
reports/benchmarks/benchmarks.tex
reports/experiments/Makefile
reports/experiments/experiments.tex
reports/linux_log/LinuxLog.java
reports/linux_log/Makefile
reports/linux_log/linux_log.tex
reports/node_mapping/Makefile
reports/node_mapping/NodeIdMapHaloDB.java
reports/node_mapping/NodeIdMapRocksDB.java
reports/node_mapping/node_mapping.tex
swh/__init__.py
swh.graph.egg-info/PKG-INFO
swh.graph.egg-info/SOURCES.txt
swh.graph.egg-info/dependency_links.txt
swh.graph.egg-info/entry_points.txt
swh.graph.egg-info/requires.txt
swh.graph.egg-info/top_level.txt
swh/graph/__init__.py
swh/graph/backend.py
swh/graph/cli.py
swh/graph/client.py
swh/graph/config.py
swh/graph/dot.py
swh/graph/naive_client.py
swh/graph/py.typed
swh/graph/swhid.py
swh/graph/webgraph.py
swh/graph/server/__init__.py
swh/graph/server/app.py
swh/graph/tests/__init__.py
swh/graph/tests/conftest.py
swh/graph/tests/test_api_client.py
swh/graph/tests/test_cli.py
swh/graph/tests/test_swhid.py
swh/graph/tests/dataset/.gitignore
swh/graph/tests/dataset/example.edges.csv
swh/graph/tests/dataset/example.edges.csv.zst
swh/graph/tests/dataset/example.nodes.csv
swh/graph/tests/dataset/example.nodes.csv.zst
swh/graph/tests/dataset/generate_graph.sh
swh/graph/tests/dataset/img/.gitignore
swh/graph/tests/dataset/img/Makefile
swh/graph/tests/dataset/img/example.dot
swh/graph/tests/dataset/output/example-transposed.graph
swh/graph/tests/dataset/output/example-transposed.obl
swh/graph/tests/dataset/output/example-transposed.offsets
swh/graph/tests/dataset/output/example-transposed.properties
swh/graph/tests/dataset/output/example.graph
swh/graph/tests/dataset/output/example.indegree
swh/graph/tests/dataset/output/example.mph
swh/graph/tests/dataset/output/example.node2swhid.bin
swh/graph/tests/dataset/output/example.node2type.map
swh/graph/tests/dataset/output/example.obl
swh/graph/tests/dataset/output/example.offsets
swh/graph/tests/dataset/output/example.order
swh/graph/tests/dataset/output/example.outdegree
swh/graph/tests/dataset/output/example.properties
swh/graph/tests/dataset/output/example.stats
tools/dir2graph
tools/swhid2int2int2swhid.sh
tools/git2graph/.gitignore
tools/git2graph/Makefile
tools/git2graph/README.md
tools/git2graph/git2graph.c
tools/git2graph/tests/edge-filters.bats
tools/git2graph/tests/full-graph.bats
tools/git2graph/tests/node-filters.bats
tools/git2graph/tests/repo_helper.bash
tools/git2graph/tests/data/sample-repo.tgz
tools/git2graph/tests/data/graphs/dir-nodes/edges.csv
tools/git2graph/tests/data/graphs/dir-nodes/nodes.csv
tools/git2graph/tests/data/graphs/from-dir-edges/edges.csv
tools/git2graph/tests/data/graphs/from-dir-edges/nodes.csv
tools/git2graph/tests/data/graphs/from-rel-edges/edges.csv
tools/git2graph/tests/data/graphs/from-rel-edges/nodes.csv
tools/git2graph/tests/data/graphs/fs-nodes/edges.csv
tools/git2graph/tests/data/graphs/fs-nodes/nodes.csv
tools/git2graph/tests/data/graphs/full/edges.csv
tools/git2graph/tests/data/graphs/full/nodes.csv
tools/git2graph/tests/data/graphs/rev-edges/edges.csv
tools/git2graph/tests/data/graphs/rev-edges/nodes.csv
tools/git2graph/tests/data/graphs/rev-nodes/edges.csv
tools/git2graph/tests/data/graphs/rev-nodes/nodes.csv
tools/git2graph/tests/data/graphs/to-rev-edges/edges.csv
tools/git2graph/tests/data/graphs/to-rev-edges/nodes.csv
\ No newline at end of file
diff --git a/swh.graph.egg-info/entry_points.txt b/swh.graph.egg-info/entry_points.txt
index cfdaffe..ad77351 100644
--- a/swh.graph.egg-info/entry_points.txt
+++ b/swh.graph.egg-info/entry_points.txt
@@ -1,6 +1,5 @@
+[console_scripts]
+swh-graph = swh.graph.cli:main
- [console_scripts]
- swh-graph=swh.graph.cli:main
- [swh.cli.subcommands]
- graph=swh.graph.cli
-
\ No newline at end of file
+[swh.cli.subcommands]
+graph = swh.graph.cli
diff --git a/swh/graph/naive_client.py b/swh/graph/naive_client.py
index 9e08d65..680df11 100644
--- a/swh/graph/naive_client.py
+++ b/swh/graph/naive_client.py
@@ -1,369 +1,396 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import functools
import inspect
import re
import statistics
from typing import (
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Set,
Tuple,
TypeVar,
+ Union,
)
-from swh.model.swhids import ExtendedSWHID, ValidationError
+from swh.model.swhids import CoreSWHID, ExtendedSWHID, ValidationError
from .client import GraphArgumentException
_NODE_TYPES = "ori|snp|rel|rev|dir|cnt"
NODES_RE = re.compile(fr"(\*|{_NODE_TYPES})")
EDGES_RE = re.compile(fr"(\*|{_NODE_TYPES}):(\*|{_NODE_TYPES})")
T = TypeVar("T", bound=Callable)
+SWHIDlike = Union[CoreSWHID, ExtendedSWHID, str]
def check_arguments(f: T) -> T:
"""Decorator for generic argument checking for methods of NaiveClient.
Checks ``src`` is a valid and known SWHID, and ``edges`` has the right format."""
signature = inspect.signature(f)
@functools.wraps(f)
def newf(*args, **kwargs):
__tracebackhide__ = True # for pytest
try:
bound_args = signature.bind(*args, **kwargs)
except TypeError as e:
# rethrow the exception from here so pytest doesn't flood the terminal
# with signature.bind's call stack.
raise TypeError(*e.args) from None
self = bound_args.arguments["self"]
src = bound_args.arguments.get("src")
if src:
self._check_swhid(src)
edges = bound_args.arguments.get("edges")
if edges:
if edges != "*" and not EDGES_RE.match(edges):
raise GraphArgumentException(f"invalid edge restriction: {edges}")
return_types = bound_args.arguments.get("return_types")
if return_types:
if not NODES_RE.match(return_types):
raise GraphArgumentException(
f"invalid return_types restriction: {return_types}"
)
return f(*args, **kwargs)
return newf # type: ignore
def filter_node_types(node_types: str, nodes: Iterable[str]) -> Iterator[str]:
if node_types == "*":
yield from nodes
else:
prefixes = tuple(f"swh:1:{type_}:" for type_ in node_types.split(","))
for node in nodes:
if node.startswith(prefixes):
yield node
class NaiveClient:
"""An alternative implementation of :class:`swh.graph.backend.Backend`,
written in pure-python and meant for simulating it in other components' test
- cases.
+ cases; constructed from a list of nodes and (directed) edges, both
+ represented as SWHIDs.
It is NOT meant to be efficient in any way; only to be a very simple
- implementation that provides the same behavior."""
-
- def __init__(self, *, nodes: List[str], edges: List[Tuple[str, str]]):
+ implementation that provides the same behavior.
+
+ >>> nodes = [
+ ... "swh:1:rev:1111111111111111111111111111111111111111",
+ ... "swh:1:rev:2222222222222222222222222222222222222222",
+ ... "swh:1:rev:3333333333333333333333333333333333333333",
+ ... ]
+ >>> edges = [
+ ... (
+ ... "swh:1:rev:1111111111111111111111111111111111111111",
+ ... "swh:1:rev:2222222222222222222222222222222222222222",
+ ... ),
+ ... (
+ ... "swh:1:rev:2222222222222222222222222222222222222222",
+ ... "swh:1:rev:3333333333333333333333333333333333333333",
+ ... ),
+ ... ]
+ >>> c = NaiveClient(nodes=nodes, edges=edges)
+ >>> list(c.leaves("swh:1:rev:1111111111111111111111111111111111111111"))
+ ['swh:1:rev:3333333333333333333333333333333333333333']
+ """
+
+ def __init__(
+ self, *, nodes: List[SWHIDlike], edges: List[Tuple[SWHIDlike, SWHIDlike]]
+ ):
self.graph = Graph(nodes, edges)
def _check_swhid(self, swhid):
try:
ExtendedSWHID.from_string(swhid)
except ValidationError as e:
raise GraphArgumentException(*e.args) from None
if swhid not in self.graph.nodes:
raise GraphArgumentException(f"SWHID not found: {swhid}")
def stats(self) -> Dict:
return {
"counts": {
"nodes": len(self.graph.nodes),
"edges": sum(map(len, self.graph.forward_edges.values())),
},
"ratios": {
"compression": 1.0,
"bits_per_edge": 100.0,
"bits_per_node": 100.0,
"avg_locality": 0.0,
},
"indegree": {
"min": min(map(len, self.graph.backward_edges.values())),
"max": max(map(len, self.graph.backward_edges.values())),
"avg": statistics.mean(map(len, self.graph.backward_edges.values())),
},
"outdegree": {
"min": min(map(len, self.graph.forward_edges.values())),
"max": max(map(len, self.graph.forward_edges.values())),
"avg": statistics.mean(map(len, self.graph.forward_edges.values())),
},
}
@check_arguments
def leaves(
self,
src: str,
edges: str = "*",
direction: str = "forward",
max_edges: int = 0,
return_types: str = "*",
) -> Iterator[str]:
# TODO: max_edges
yield from filter_node_types(
return_types,
[
node
for node in self.graph.get_subgraph(src, edges, direction)
if not self.graph.get_filtered_neighbors(node, edges, direction)
],
)
@check_arguments
def neighbors(
self,
src: str,
edges: str = "*",
direction: str = "forward",
max_edges: int = 0,
return_types: str = "*",
) -> Iterator[str]:
# TODO: max_edges
yield from filter_node_types(
return_types, self.graph.get_filtered_neighbors(src, edges, direction)
)
@check_arguments
def visit_nodes(
self,
src: str,
edges: str = "*",
direction: str = "forward",
max_edges: int = 0,
return_types: str = "*",
) -> Iterator[str]:
# TODO: max_edges
yield from filter_node_types(
return_types, self.graph.get_subgraph(src, edges, direction)
)
@check_arguments
def visit_edges(
self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0
) -> Iterator[Tuple[str, str]]:
if max_edges == 0:
max_edges = None # type: ignore
else:
max_edges -= 1
yield from list(self.graph.iter_edges_dfs(direction, edges, src))[:max_edges]
@check_arguments
def visit_paths(
self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0
) -> Iterator[List[str]]:
# TODO: max_edges
for path in self.graph.iter_paths_dfs(direction, edges, src):
if path[-1] in self.leaves(src, edges, direction):
yield list(path)
@check_arguments
def walk(
self,
src: str,
dst: str,
edges: str = "*",
traversal: str = "dfs",
direction: str = "forward",
limit: Optional[int] = None,
) -> Iterator[str]:
# TODO: implement algo="bfs"
# TODO: limit
match_path: Callable[[str], bool]
if ":" in dst:
match_path = dst.__eq__
self._check_swhid(dst)
else:
match_path = lambda node: node.startswith(f"swh:1:{dst}:") # noqa
for path in self.graph.iter_paths_dfs(direction, edges, src):
if match_path(path[-1]):
if not limit:
# 0 or None
yield from path
elif limit > 0:
yield from path[0:limit]
else:
yield from path[limit:]
@check_arguments
def random_walk(
self,
src: str,
dst: str,
edges: str = "*",
direction: str = "forward",
limit: Optional[int] = None,
):
# TODO: limit
yield from self.walk(src, dst, edges, "dfs", direction, limit)
@check_arguments
def count_leaves(
self, src: str, edges: str = "*", direction: str = "forward"
) -> int:
return len(list(self.leaves(src, edges, direction)))
@check_arguments
def count_neighbors(
self, src: str, edges: str = "*", direction: str = "forward"
) -> int:
return len(self.graph.get_filtered_neighbors(src, edges, direction))
@check_arguments
def count_visit_nodes(
self, src: str, edges: str = "*", direction: str = "forward"
) -> int:
return len(self.graph.get_subgraph(src, edges, direction))
class Graph:
- def __init__(self, nodes: List[str], edges: List[Tuple[str, str]]):
- self.nodes = nodes
+ def __init__(
+ self, nodes: List[SWHIDlike], edges: List[Tuple[SWHIDlike, SWHIDlike]]
+ ):
+ self.nodes = [str(node) for node in nodes]
self.forward_edges: Dict[str, List[str]] = {}
self.backward_edges: Dict[str, List[str]] = {}
for node in nodes:
- self.forward_edges[node] = []
- self.backward_edges[node] = []
+ self.forward_edges[str(node)] = []
+ self.backward_edges[str(node)] = []
for (src, dst) in edges:
- self.forward_edges[src].append(dst)
- self.backward_edges[dst].append(src)
+ self.forward_edges[str(src)].append(str(dst))
+ self.backward_edges[str(dst)].append(str(src))
def get_filtered_neighbors(
self, src: str, edges_fmt: str, direction: str,
) -> Set[str]:
if direction == "forward":
edges = self.forward_edges
elif direction == "backward":
edges = self.backward_edges
else:
raise GraphArgumentException(f"invalid direction: {direction}")
neighbors = edges.get(src, [])
if edges_fmt == "*":
return set(neighbors)
else:
filtered_neighbors: Set[str] = set()
for edges_fmt_item in edges_fmt.split(","):
(src_fmt, dst_fmt) = edges_fmt_item.split(":")
if src_fmt != "*" and not src.startswith(f"swh:1:{src_fmt}:"):
continue
if dst_fmt == "*":
filtered_neighbors.update(neighbors)
else:
prefix = f"swh:1:{dst_fmt}:"
filtered_neighbors.update(
n for n in neighbors if n.startswith(prefix)
)
return filtered_neighbors
def get_subgraph(self, src: str, edges_fmt: str, direction: str) -> Set[str]:
seen = set()
to_visit = {src}
while to_visit:
node = to_visit.pop()
seen.add(node)
neighbors = set(self.get_filtered_neighbors(node, edges_fmt, direction))
new_nodes = neighbors - seen
to_visit.update(new_nodes)
return seen
def iter_paths_dfs(
self, direction: str, edges_fmt: str, src: str
) -> Iterator[Tuple[str, ...]]:
for (path, node) in DfsSubgraphIterator(self, direction, edges_fmt, src):
yield path + (node,)
def iter_edges_dfs(
self, direction: str, edges_fmt: str, src: str
) -> Iterator[Tuple[str, str]]:
for (path, node) in DfsSubgraphIterator(self, direction, edges_fmt, src):
if len(path) > 0:
yield (path[-1], node)
class SubgraphIterator(Iterator[Tuple[Tuple[str, ...], str]]):
def __init__(self, graph: Graph, direction: str, edges_fmt: str, src: str):
self.graph = graph
self.direction = direction
self.edges_fmt = edges_fmt
self.seen: Set[str] = set()
self.src = src
def more_work(self) -> bool:
raise NotImplementedError()
def pop(self) -> Tuple[Tuple[str, ...], str]:
raise NotImplementedError()
def push(self, new_path: Tuple[str, ...], neighbor: str) -> None:
raise NotImplementedError()
def __next__(self) -> Tuple[Tuple[str, ...], str]:
# Stores (path, next_node)
if not self.more_work():
raise StopIteration()
(path, node) = self.pop()
new_path = path + (node,)
if node not in self.seen:
neighbors = self.graph.get_filtered_neighbors(
node, self.edges_fmt, self.direction
)
# We want to visit the first neighbor first, and to_visit is a stack;
# so we need to reversed() the list of neighbors to get it on top
# of the stack.
for neighbor in reversed(list(neighbors)):
self.push(new_path, neighbor)
self.seen.add(node)
return (path, node)
class DfsSubgraphIterator(SubgraphIterator):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.to_visit: List[Tuple[Tuple[str, ...], str]] = [((), self.src)]
def more_work(self) -> bool:
return bool(self.to_visit)
def pop(self) -> Tuple[Tuple[str, ...], str]:
return self.to_visit.pop()
def push(self, new_path: Tuple[str, ...], neighbor: str) -> None:
self.to_visit.append((new_path, neighbor))
diff --git a/swh/graph/server/app.py b/swh/graph/server/app.py
index 0128f2a..3a883e9 100644
--- a/swh/graph/server/app.py
+++ b/swh/graph/server/app.py
@@ -1,373 +1,373 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
A proxy HTTP server for swh-graph, talking to the Java code via py4j, and using
FIFO as a transport to stream integers between the two languages.
"""
import asyncio
from collections import deque
import os
from typing import Optional
import aiohttp.web
from swh.core.api.asynchronous import RPCServerApp
from swh.core.config import read as config_read
from swh.graph.backend import Backend
from swh.model.swhids import EXTENDED_SWHID_TYPES
try:
from contextlib import asynccontextmanager
except ImportError:
# Compatibility with 3.6 backport
from async_generator import asynccontextmanager # type: ignore
# maximum number of retries for random walks
-RANDOM_RETRIES = 5 # TODO make this configurable via rpc-serve configuration
+RANDOM_RETRIES = 10 # TODO make this configurable via rpc-serve configuration
class GraphServerApp(RPCServerApp):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.on_startup.append(self._start_gateway)
self.on_shutdown.append(self._stop_gateway)
@staticmethod
async def _start_gateway(app):
# Equivalent to entering `with app["backend"]:`
app["backend"].start_gateway()
@staticmethod
async def _stop_gateway(app):
# Equivalent to exiting `with app["backend"]:` with no error
app["backend"].stop_gateway()
async def index(request):
return aiohttp.web.Response(
content_type="text/html",
body="""
Software Heritage graph server
You have reached the
Software Heritage graph API server.
See its
API
documentation for more information.
""",
)
class GraphView(aiohttp.web.View):
"""Base class for views working on the graph, with utility functions"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.backend = self.request.app["backend"]
def get_direction(self):
"""Validate HTTP query parameter `direction`"""
s = self.request.query.get("direction", "forward")
if s not in ("forward", "backward"):
raise aiohttp.web.HTTPBadRequest(text=f"invalid direction: {s}")
return s
def get_edges(self):
"""Validate HTTP query parameter `edges`, i.e., edge restrictions"""
s = self.request.query.get("edges", "*")
if any(
[
node_type != "*" and node_type not in EXTENDED_SWHID_TYPES
for edge in s.split(":")
for node_type in edge.split(",", maxsplit=1)
]
):
raise aiohttp.web.HTTPBadRequest(text=f"invalid edge restriction: {s}")
return s
def get_return_types(self):
"""Validate HTTP query parameter 'return types', i.e,
a set of types which we will filter the query results with"""
s = self.request.query.get("return_types", "*")
if any(
node_type != "*" and node_type not in EXTENDED_SWHID_TYPES
for node_type in s.split(",")
):
raise aiohttp.web.HTTPBadRequest(
text=f"invalid type for filtering res: {s}"
)
# if the user puts a star,
# then we filter nothing, we don't need the other information
if "*" in s:
return "*"
else:
return s
def get_traversal(self):
"""Validate HTTP query parameter `traversal`, i.e., visit order"""
s = self.request.query.get("traversal", "dfs")
if s not in ("bfs", "dfs"):
raise aiohttp.web.HTTPBadRequest(text=f"invalid traversal order: {s}")
return s
def get_limit(self):
"""Validate HTTP query parameter `limit`, i.e., number of results"""
s = self.request.query.get("limit", "0")
try:
return int(s)
except ValueError:
raise aiohttp.web.HTTPBadRequest(text=f"invalid limit value: {s}")
def get_max_edges(self):
"""Validate HTTP query parameter 'max_edges', i.e.,
the limit of the number of edges that can be visited"""
s = self.request.query.get("max_edges", "0")
try:
return int(s)
except ValueError:
raise aiohttp.web.HTTPBadRequest(text=f"invalid max_edges value: {s}")
def check_swhid(self, swhid):
"""Validate that the given SWHID exists in the graph"""
try:
self.backend.check_swhid(swhid)
except (NameError, ValueError) as e:
raise aiohttp.web.HTTPBadRequest(text=str(e))
class StreamingGraphView(GraphView):
"""Base class for views streaming their response line by line."""
content_type = "text/plain"
@asynccontextmanager
async def response_streamer(self, *args, **kwargs):
"""Context manager to prepare then close a StreamResponse"""
response = aiohttp.web.StreamResponse(*args, **kwargs)
response.content_type = self.content_type
await response.prepare(self.request)
yield response
await response.write_eof()
async def get(self):
await self.prepare_response()
async with self.response_streamer() as self.response_stream:
self._buf = []
try:
await self.stream_response()
finally:
await self._flush_buffer()
return self.response_stream
async def prepare_response(self):
"""This can be overridden with some setup to be run before the response
actually starts streaming.
"""
pass
async def stream_response(self):
"""Override this to perform the response streaming. Implementations of
this should await self.stream_line(line) to write each line.
"""
raise NotImplementedError
async def stream_line(self, line):
"""Write a line in the response stream."""
self._buf.append(line)
if len(self._buf) > 100:
await self._flush_buffer()
async def _flush_buffer(self):
await self.response_stream.write("\n".join(self._buf).encode() + b"\n")
self._buf = []
class StatsView(GraphView):
"""View showing some statistics on the graph"""
async def get(self):
stats = self.backend.stats()
return aiohttp.web.Response(body=stats, content_type="application/json")
class SimpleTraversalView(StreamingGraphView):
"""Base class for views of simple traversals"""
simple_traversal_type: Optional[str] = None
async def prepare_response(self):
self.src = self.request.match_info["src"]
self.edges = self.get_edges()
self.direction = self.get_direction()
self.max_edges = self.get_max_edges()
self.return_types = self.get_return_types()
self.check_swhid(self.src)
async def stream_response(self):
async for res_line in self.backend.traversal(
self.simple_traversal_type,
self.direction,
self.edges,
self.src,
self.max_edges,
self.return_types,
):
await self.stream_line(res_line)
class LeavesView(SimpleTraversalView):
simple_traversal_type = "leaves"
class NeighborsView(SimpleTraversalView):
simple_traversal_type = "neighbors"
class VisitNodesView(SimpleTraversalView):
simple_traversal_type = "visit_nodes"
class VisitEdgesView(SimpleTraversalView):
simple_traversal_type = "visit_edges"
class WalkView(StreamingGraphView):
async def prepare_response(self):
self.src = self.request.match_info["src"]
self.dst = self.request.match_info["dst"]
self.edges = self.get_edges()
self.direction = self.get_direction()
self.algo = self.get_traversal()
self.limit = self.get_limit()
self.max_edges = self.get_max_edges()
self.return_types = self.get_return_types()
self.check_swhid(self.src)
if self.dst not in EXTENDED_SWHID_TYPES:
self.check_swhid(self.dst)
async def get_walk_iterator(self):
return self.backend.traversal(
"walk",
self.direction,
self.edges,
self.algo,
self.src,
self.dst,
self.max_edges,
self.return_types,
)
async def stream_response(self):
it = self.get_walk_iterator()
if self.limit < 0:
queue = deque(maxlen=-self.limit)
async for res_swhid in it:
queue.append(res_swhid)
while queue:
await self.stream_line(queue.popleft())
else:
count = 0
async for res_swhid in it:
if self.limit == 0 or count < self.limit:
await self.stream_line(res_swhid)
count += 1
else:
break
class RandomWalkView(WalkView):
def get_walk_iterator(self):
return self.backend.traversal(
"random_walk",
self.direction,
self.edges,
RANDOM_RETRIES,
self.src,
self.dst,
self.max_edges,
self.return_types,
)
class CountView(GraphView):
"""Base class for counting views."""
count_type: Optional[str] = None
async def get(self):
self.src = self.request.match_info["src"]
self.check_swhid(self.src)
self.edges = self.get_edges()
self.direction = self.get_direction()
self.max_edges = self.get_max_edges()
loop = asyncio.get_event_loop()
cnt = await loop.run_in_executor(
None,
self.backend.count,
self.count_type,
self.direction,
self.edges,
self.src,
self.max_edges,
)
return aiohttp.web.Response(body=str(cnt), content_type="application/json")
class CountNeighborsView(CountView):
count_type = "neighbors"
class CountLeavesView(CountView):
count_type = "leaves"
class CountVisitNodesView(CountView):
count_type = "visit_nodes"
def make_app(config=None, backend=None, **kwargs):
if (config is None) == (backend is None):
raise ValueError("make_app() expects exactly one of 'config' or 'backend'")
if backend is None:
backend = Backend(graph_path=config["graph"]["path"], config=config["graph"])
app = GraphServerApp(**kwargs)
app.add_routes(
[
aiohttp.web.get("/", index),
aiohttp.web.get("/graph", index),
aiohttp.web.view("/graph/stats", StatsView),
aiohttp.web.view("/graph/leaves/{src}", LeavesView),
aiohttp.web.view("/graph/neighbors/{src}", NeighborsView),
aiohttp.web.view("/graph/visit/nodes/{src}", VisitNodesView),
aiohttp.web.view("/graph/visit/edges/{src}", VisitEdgesView),
# temporarily disabled in wait of a proper fix for T1969
# aiohttp.web.view("/graph/walk/{src}/{dst}", WalkView)
aiohttp.web.view("/graph/randomwalk/{src}/{dst}", RandomWalkView),
aiohttp.web.view("/graph/neighbors/count/{src}", CountNeighborsView),
aiohttp.web.view("/graph/leaves/count/{src}", CountLeavesView),
aiohttp.web.view("/graph/visit/nodes/count/{src}", CountVisitNodesView),
]
)
app["backend"] = backend
return app
def make_app_from_configfile():
"""Load configuration and then build application to run
"""
config_file = os.environ.get("SWH_CONFIG_FILENAME")
config = config_read(config_file)
return make_app(config=config)
diff --git a/swh/graph/tests/test_api_client.py b/swh/graph/tests/test_api_client.py
index 46e0227..79b4d86 100644
--- a/swh/graph/tests/test_api_client.py
+++ b/swh/graph/tests/test_api_client.py
@@ -1,379 +1,379 @@
import pytest
from pytest import raises
from swh.core.api import RemoteException
from swh.graph.client import GraphArgumentException
def test_stats(graph_client):
stats = graph_client.stats()
assert set(stats.keys()) == {"counts", "ratios", "indegree", "outdegree"}
assert set(stats["counts"].keys()) == {"nodes", "edges"}
assert set(stats["ratios"].keys()) == {
"compression",
"bits_per_node",
"bits_per_edge",
"avg_locality",
}
assert set(stats["indegree"].keys()) == {"min", "max", "avg"}
assert set(stats["outdegree"].keys()) == {"min", "max", "avg"}
assert stats["counts"]["nodes"] == 21
assert stats["counts"]["edges"] == 23
assert isinstance(stats["ratios"]["compression"], float)
assert isinstance(stats["ratios"]["bits_per_node"], float)
assert isinstance(stats["ratios"]["bits_per_edge"], float)
assert isinstance(stats["ratios"]["avg_locality"], float)
assert stats["indegree"]["min"] == 0
assert stats["indegree"]["max"] == 3
assert isinstance(stats["indegree"]["avg"], float)
assert stats["outdegree"]["min"] == 0
assert stats["outdegree"]["max"] == 3
assert isinstance(stats["outdegree"]["avg"], float)
def test_leaves(graph_client):
actual = list(
graph_client.leaves("swh:1:ori:0000000000000000000000000000000000000021")
)
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
assert set(actual) == set(expected)
def test_neighbors(graph_client):
actual = list(
graph_client.neighbors(
"swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
)
)
expected = [
"swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000013",
]
assert set(actual) == set(expected)
def test_visit_nodes(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev",
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010", return_types="dir",
)
)
expected = [
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered_star(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010", return_types="*",
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
]
assert set(actual) == set(expected)
def test_visit_edges(graph_client):
actual = list(
graph_client.visit_edges(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev,rev:dir",
)
)
expected = [
(
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
),
(
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
),
]
assert set(actual) == set(expected)
def test_visit_edges_limited(graph_client):
actual = list(
graph_client.visit_edges(
"swh:1:rel:0000000000000000000000000000000000000010",
max_edges=4,
edges="rel:rev,rev:rev,rev:dir",
)
)
expected = [
(
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
),
(
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
),
]
# As there are four valid answers (up to reordering), we cannot check for
# equality. Instead, we check the client returned all edges but one.
assert set(actual).issubset(set(expected))
assert len(actual) == 3
def test_visit_edges_diamond_pattern(graph_client):
actual = list(
graph_client.visit_edges(
"swh:1:rev:0000000000000000000000000000000000000009", edges="*",
)
)
expected = [
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
),
(
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
),
(
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001",
),
(
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000001",
),
(
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007",
),
(
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
),
(
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004",
),
(
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000005",
),
]
assert set(actual) == set(expected)
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_walk(graph_client):
args = ("swh:1:dir:0000000000000000000000000000000000000016", "rel")
kwargs = {
"edges": "dir:dir,dir:rev,rev:*",
"direction": "backward",
"traversal": "bfs",
}
actual = list(graph_client.walk(*args, **kwargs))
expected = [
"swh:1:dir:0000000000000000000000000000000000000016",
"swh:1:dir:0000000000000000000000000000000000000017",
"swh:1:rev:0000000000000000000000000000000000000018",
"swh:1:rel:0000000000000000000000000000000000000019",
]
assert set(actual) == set(expected)
kwargs2 = kwargs.copy()
kwargs2["limit"] = -1
actual = list(graph_client.walk(*args, **kwargs2))
expected = ["swh:1:rel:0000000000000000000000000000000000000019"]
assert set(actual) == set(expected)
kwargs2 = kwargs.copy()
kwargs2["limit"] = 2
actual = list(graph_client.walk(*args, **kwargs2))
expected = [
"swh:1:dir:0000000000000000000000000000000000000016",
"swh:1:dir:0000000000000000000000000000000000000017",
]
assert set(actual) == set(expected)
def test_random_walk_dst_is_type(graph_client):
- """as the walk is random, we test a visit from a cnt node to the only
- origin in the dataset, and only check the final node of the path
- (i.e., the origin)
+ """as the walk is random, we test a visit from a cnt node to a release
+ reachable from every single path in the backward graph, and only check the
+ final node of the path (i.e., the release)
"""
- args = ("swh:1:cnt:0000000000000000000000000000000000000001", "ori")
+ args = ("swh:1:cnt:0000000000000000000000000000000000000015", "rel")
kwargs = {"direction": "backward"}
- expected_root = "swh:1:ori:0000000000000000000000000000000000000021"
+ expected_root = "swh:1:rel:0000000000000000000000000000000000000019"
actual = list(graph_client.random_walk(*args, **kwargs))
- assert len(actual) > 1 # no origin directly links to a content
+ assert len(actual) > 1 # no release directly links to a content
assert actual[0] == args[0]
assert actual[-1] == expected_root
kwargs2 = kwargs.copy()
kwargs2["limit"] = -1
actual = list(graph_client.random_walk(*args, **kwargs2))
assert actual == [expected_root]
kwargs2["limit"] = -2
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 2
assert actual[-1] == expected_root
kwargs2["limit"] = 3
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 3
def test_random_walk_dst_is_node(graph_client):
- """Same as test_random_walk_dst_is_type, but we target the specific origin
+ """Same as test_random_walk_dst_is_type, but we target the specific release
node instead of a type
"""
args = (
- "swh:1:cnt:0000000000000000000000000000000000000001",
- "swh:1:ori:0000000000000000000000000000000000000021",
+ "swh:1:cnt:0000000000000000000000000000000000000015",
+ "swh:1:rel:0000000000000000000000000000000000000019",
)
kwargs = {"direction": "backward"}
- expected_root = "swh:1:ori:0000000000000000000000000000000000000021"
+ expected_root = "swh:1:rel:0000000000000000000000000000000000000019"
actual = list(graph_client.random_walk(*args, **kwargs))
assert len(actual) > 1 # no origin directly links to a content
assert actual[0] == args[0]
assert actual[-1] == expected_root
kwargs2 = kwargs.copy()
kwargs2["limit"] = -1
actual = list(graph_client.random_walk(*args, **kwargs2))
assert actual == [expected_root]
kwargs2["limit"] = -2
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 2
assert actual[-1] == expected_root
kwargs2["limit"] = 3
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 3
def test_count(graph_client):
actual = graph_client.count_leaves(
"swh:1:ori:0000000000000000000000000000000000000021"
)
assert actual == 4
actual = graph_client.count_visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev"
)
assert actual == 3
actual = graph_client.count_neighbors(
"swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
)
assert actual == 3
def test_param_validation(graph_client):
with raises(GraphArgumentException) as exc_info: # SWHID not found
list(graph_client.leaves("swh:1:ori:fff0000000000000000000000000000000000021"))
if exc_info.value.response:
assert exc_info.value.response.status_code == 404
with raises(GraphArgumentException) as exc_info: # malformed SWHID
list(
graph_client.neighbors("swh:1:ori:fff000000zzzzzz0000000000000000000000021")
)
if exc_info.value.response:
assert exc_info.value.response.status_code == 400
with raises(GraphArgumentException) as exc_info: # malformed edge specificaiton
list(
graph_client.visit_nodes(
"swh:1:dir:0000000000000000000000000000000000000016",
edges="dir:notanodetype,dir:rev,rev:*",
direction="backward",
)
)
if exc_info.value.response:
assert exc_info.value.response.status_code == 400
with raises(GraphArgumentException) as exc_info: # malformed direction
list(
graph_client.visit_nodes(
"swh:1:dir:0000000000000000000000000000000000000016",
edges="dir:dir,dir:rev,rev:*",
direction="notadirection",
)
)
if exc_info.value.response:
assert exc_info.value.response.status_code == 400
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_param_validation_walk(graph_client):
"""test validation of walk-specific parameters only"""
with raises(RemoteException) as exc_info: # malformed traversal order
list(
graph_client.walk(
"swh:1:dir:0000000000000000000000000000000000000016",
"rel",
edges="dir:dir,dir:rev,rev:*",
direction="backward",
traversal="notatraversalorder",
)
)
assert exc_info.value.response.status_code == 400
diff --git a/tox.ini b/tox.ini
index 959bbb8..96313db 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,76 +1,77 @@
[tox]
envlist=black,flake8,mypy,py3
[testenv]
extras =
testing
deps =
pytest-cov
whitelist_externals =
mvn
sh
commands =
sh -c 'if ! [ -d {envdir}/share/swh-graph ]; then mvn -f java/pom.xml compile assembly:single; mkdir {envdir}/share/swh-graph; cp java/target/*.jar {envdir}/share/swh-graph; fi'
pytest --cov={envsitepackagesdir}/swh/graph \
{envsitepackagesdir}/swh/graph \
+ --doctest-modules \
--cov-branch {posargs}
[testenv:black]
skip_install = true
deps =
black==19.10b0
commands =
{envpython} -m black --check swh
[testenv:flake8]
skip_install = true
deps =
flake8
commands =
{envpython} -m flake8
[testenv:mypy]
extras =
testing
deps =
mypy==0.920
commands =
mypy swh
# build documentation outside swh-environment using the current
# git HEAD of swh-docs, is executed on CI for each diff to prevent
# breaking doc build
[testenv:sphinx]
whitelist_externals = make
usedevelop = true
extras =
testing
deps =
# fetch and install swh-docs in develop mode
-e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs
setenv =
SWH_PACKAGE_DOC_TOX_BUILD = 1
# turn warnings into errors
SPHINXOPTS = -W
commands =
make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs
# build documentation only inside swh-environment using local state
# of swh-docs package
[testenv:sphinx-dev]
whitelist_externals = make
usedevelop = true
extras =
testing
deps =
# install swh-docs in develop mode
-e ../swh-docs
setenv =
SWH_PACKAGE_DOC_TOX_BUILD = 1
# turn warnings into errors
SPHINXOPTS = -W
commands =
make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs