Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9339983
cli.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
8 KB
Subscribers
None
cli.py
View Options
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
aiohttp
import
click
import
sys
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
Tuple
import
swh.model.exceptions
from
swh.core
import
config
from
swh.core.cli
import
CONTEXT_SETTINGS
,
AliasedGroup
from
swh.graph
import
client
,
webgraph
from
swh.graph.backend
import
NODE2PID_EXT
,
PID2NODE_EXT
from
swh.graph.pid
import
PidToNodeMap
,
NodeToPidMap
from
swh.graph.server.app
import
make_app
from
swh.graph.backend
import
Backend
from
swh.model.identifiers
import
parse_persistent_identifier
class PathlibPath(click.Path):
    """Click parameter type that yields ``pathlib.Path`` objects.

    Conversion is delegated to ``click.Path.convert``; the value it returns
    is then wrapped in a ``pathlib.Path`` instead of being handed back as is.
    """

    def convert(self, value, param, ctx):
        """Convert ``value`` via ``click.Path`` and wrap it in a ``Path``."""
        checked = super().convert(value, param, ctx)
        return Path(checked)
# Default configuration passed to config.read() by the `cli` group: a single
# "graph" stanza, declared as a dict and empty by default.
DEFAULT_CONFIG: Dict[str, Tuple[str, Any]] = {
    'graph': ('dict', {}),
}
@click.group(
    name='graph',
    context_settings=CONTEXT_SETTINGS,
    cls=AliasedGroup,
)
@click.option(
    '--config-file', '-C',
    default=None,
    type=click.Path(exists=True, dir_okay=False),
    help='YAML configuration file',
)
@click.pass_context
def cli(ctx, config_file):
    """Software Heritage graph tools."""
    # Make sure ctx.obj is a dict so subcommands can read ctx.obj['config'].
    ctx.ensure_object(dict)

    settings = config.read(config_file, DEFAULT_CONFIG)
    if 'graph' in settings:
        ctx.obj['config'] = settings
    else:
        raise ValueError(
            'no "graph" stanza found in configuration file %s' % config_file
        )
@cli.command('api-client')
@click.option('--host', default='localhost', show_default=True,
              help='Graph server host')
# The port is parsed as an integer (same type and default as `rpc-serve`), so
# a malformed value is rejected by click instead of ending up in the URL.
@click.option('--port', default=5009, type=click.INT, show_default=True,
              help='Graph server port')
@click.pass_context
def api_client(ctx, host, port):
    """client for the graph REST service"""
    url = 'http://{}:{}'.format(host, port)
    app = client.RemoteGraphClient(url)

    # TODO: run web app
    # For now the command only prints what the remote stats() call returns.
    print(app.stats())
@cli.group('map')
@click.pass_context
def map(ctx):
    """Manage swh-graph on-disk maps"""
    # Pure command group: the subcommands registered with @map.command do the
    # actual work, so there is nothing to execute here.
def dump_pid2node(filename):
    """Print the PID->node map stored at ``filename`` as text.

    Each record yielded by ``PidToNodeMap(filename)`` is written to stdout as
    ``<pid><TAB><node>``, one record per line.
    """
    for pid, node in PidToNodeMap(filename):
        print(f'{pid}\t{node}')
def dump_node2pid(filename):
    """Print the node->PID map stored at ``filename`` as text.

    Each record yielded by ``NodeToPidMap(filename)`` is written to stdout as
    ``<node><TAB><pid>``, one record per line.
    """
    for node, pid in NodeToPidMap(filename):
        print(f'{node}\t{pid}')
def restore_pid2node(filename):
    """Build a binary PID->node map at ``filename`` from text on stdin.

    Every stdin line must consist of exactly two whitespace-separated fields,
    a PID followed by an integer; each pair is appended to the output file
    with ``PidToNodeMap.write_record``.
    """
    with open(filename, 'wb') as out:
        for record in sys.stdin:
            # Unpacking raises ValueError unless the line has two fields.
            pid_field, node_field = record.split()
            PidToNodeMap.write_record(out, pid_field, int(node_field))
def restore_node2pid(filename, length):
    """read a textual int->PID map from stdin and write its binary version to
    filename

    Args:
        filename: path of the binary node->PID map to write
        length: value forwarded to ``NodeToPidMap`` as its ``length`` argument
            (the CLI describes it as the map size in logical records)

    Raises:
        ValueError: if a stdin line does not consist of exactly two
            whitespace-separated fields, or its first field is not an integer

    """
    node2pid = NodeToPidMap(filename, mode='wb', length=length)
    try:
        for line in sys.stdin:
            (str_int, str_pid) = line.split()
            node2pid[int(str_int)] = str_pid
    finally:
        # Close the map even when a malformed line aborts the loop, so the
        # underlying resource is always released.
        node2pid.close()
@map.command('dump')
@click.option(
    '--type', '-t', 'map_type',
    required=True,
    type=click.Choice(['pid2node', 'node2pid']),
    help='type of map to dump',
)
@click.argument('filename', required=True, type=click.Path(exists=True))
@click.pass_context
def dump_map(ctx, map_type, filename):
    """Dump a binary PID<->node map to textual format."""
    # One dumper per supported map type.
    dumpers = {
        'pid2node': dump_pid2node,
        'node2pid': dump_node2pid,
    }
    dumper = dumpers.get(map_type)
    if dumper is None:
        raise ValueError('invalid map type: ' + map_type)
    dumper(filename)
@map.command('restore')
@click.option('--type', '-t', 'map_type', required=True,
              type=click.Choice(['pid2node', 'node2pid']),
              help='type of map to restore')
@click.option('--length', '-l', type=int,
              help='''map size in number of logical records
              (required for node2pid maps)''')
@click.argument('filename', required=True, type=click.Path())
@click.pass_context
def restore_map(ctx, map_type, length, filename):
    """Restore a binary PID<->node map from textual format."""
    if map_type == 'pid2node':
        restore_pid2node(filename)
    elif map_type == 'node2pid':
        # --length is optional at the click level because pid2node does not
        # use it; node2pid maps cannot be restored without it.
        if length is None:
            raise click.UsageError(
                'map length is required when restoring {} maps'
                .format(map_type), ctx)
        restore_node2pid(filename, length)
    else:
        # Defensive: click.Choice above should already reject other values.
        raise ValueError('invalid map type: ' + map_type)
@map.command('write')
@click.option(
    '--type', '-t', 'map_type',
    required=True,
    type=click.Choice(['pid2node', 'node2pid']),
    help='type of map to write',
)
@click.argument('filename', required=True, type=click.Path())
@click.pass_context
def write(ctx, map_type, filename):
    """Write a map to disk sequentially.

    read from stdin a textual PID->node mapping (for pid2node, or a simple
    sequence of PIDs for node2pid) and write it to disk in the requested binary
    map format

    note that no sorting is applied, so the input should already be sorted as
    required by the chosen map type (by PID for pid2node, by int for node2pid)

    """
    # The output file is opened (and truncated) before the map type is
    # checked, so it exists even when the type turns out to be invalid.
    with open(filename, 'wb') as out:
        if map_type not in ('pid2node', 'node2pid'):
            raise ValueError('invalid map type: ' + map_type)

        # Lazily strip trailing whitespace from each stdin line.
        stripped_lines = (raw.rstrip() for raw in sys.stdin)

        if map_type == 'pid2node':
            for text in stripped_lines:
                # Expect "<pid> <int>"; unpacking fails on anything else.
                pid, node_field = text.split(maxsplit=1)
                PidToNodeMap.write_record(out, pid, int(node_field))
        else:
            for pid in stripped_lines:
                NodeToPidMap.write_record(out, pid)
@map.command('lookup')
@click.option(
    '--graph', '-g',
    required=True,
    metavar='GRAPH',
    help='compressed graph basename',
)
@click.argument('identifier', required=True)
def map_lookup(graph, identifier):
    """Lookup an identifier using on-disk maps.

    Depending on the identifier type lookup either a PID into a PID->node (and
    return the node integer identifier) or, vice-versa, lookup a node integer
    identifier into a node->PID (and return the PID). The desired behavior is
    chosen depending on the syntax of the given identifier.

    """
    # node_id stays None when the identifier is a PID rather than an integer.
    node_id = None
    try:
        node_id = int(identifier)
    except ValueError:
        # Not an integer: it must then parse as a persistent identifier.
        try:
            parse_persistent_identifier(identifier)
        except swh.model.exceptions.ValidationError:
            raise ValueError(f'invalid identifier: {identifier}')

    if node_id is None:
        pid2node = PidToNodeMap(f'{graph}.{PID2NODE_EXT}')
        print(pid2node[identifier])
    else:
        node2pid = NodeToPidMap(f'{graph}.{NODE2PID_EXT}')
        print(node2pid[node_id])
@cli.command(name='rpc-serve')
@click.option('--host', '-h', default='0.0.0.0',
              metavar='IP', show_default=True,
              help='host IP address to bind the server on')
@click.option('--port', '-p', default=5009, type=click.INT,
              metavar='PORT', show_default=True,
              help='port to bind the server on')
@click.option('--graph', '-g', required=True, metavar='GRAPH',
              help='compressed graph basename')
@click.pass_context
def serve(ctx, host, port, graph):
    """run the graph REST service"""
    # Import the submodule explicitly: a bare `import aiohttp` does not
    # guarantee that `aiohttp.web` is available as an attribute, so relying on
    # it only works when some other module has imported it first.
    from aiohttp import web

    backend = Backend(graph_path=graph)
    app = make_app(backend=backend)

    # The backend is used as a context manager for the whole lifetime of the
    # server; run_app() is called inside it.
    with backend:
        web.run_app(app, host=host, port=port)
@cli.command()
@click.option(
    '--graph', '-g',
    required=True,
    metavar='GRAPH',
    type=PathlibPath(),
    help='input graph basename',
)
@click.option(
    '--outdir', '-o', 'out_dir',
    required=True,
    metavar='DIR',
    type=PathlibPath(),
    help='directory where to store compressed graph',
)
@click.option(
    '--steps', '-s',
    metavar='STEPS',
    type=webgraph.StepOption(),
    help='run only these compression steps (default: all steps)',
)
@click.pass_context
def compress(ctx, graph, out_dir, steps):
    """Compress a graph using WebGraph

    Input: a pair of files g.nodes.csv.gz, g.edges.csv.gz

    Output: a directory containing a WebGraph compressed graph

    Compression steps are: (1) mph, (2) bv, (3) bv_obl, (4) bfs, (5) permute,
    (6) permute_obl, (7) stats, (8) transpose, (9) transpose_obl, (10) maps,
    (11) clean_tmp. Compression steps can be selected by name or number using
    --steps, separating them with commas; step ranges (e.g., 3-9, 6-, etc.) are
    also supported.

    """
    try:
        compress_conf = ctx.obj['config']['graph']['compress']
    except KeyError:
        compress_conf = {}  # use defaults

    # The graph basename is split into its final component (the graph name)
    # and its parent directory (the input directory).
    webgraph.compress(graph.name, graph.parent, out_dir, steps, compress_conf)
def main():
    """Run the ``cli`` group with the ``SWH_GRAPH`` environment prefix.

    Returns whatever the ``cli`` invocation returns.
    """
    outcome = cli(auto_envvar_prefix='SWH_GRAPH')
    return outcome


# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Jul 4 2025, 10:06 AM (5 w, 6 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3371241
Attached To
rDGRPH Compressed graph representation
Event Timeline
Log In to Comment