Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7450728
LabelMapBuilder.java
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
LabelMapBuilder.java
View Options
package
org.softwareheritage.graph.maps
;
import
com.martiansoftware.jsap.*
;
import
it.unimi.dsi.big.webgraph.LazyLongIterator
;
import
it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph
;
import
it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph
;
import
it.unimi.dsi.fastutil.BigArrays
;
import
it.unimi.dsi.fastutil.Size64
;
import
it.unimi.dsi.fastutil.bytes.ByteArrays
;
import
it.unimi.dsi.fastutil.io.BinIO
;
import
it.unimi.dsi.fastutil.io.FastBufferedInputStream
;
import
it.unimi.dsi.fastutil.longs.LongBigArrays
;
import
it.unimi.dsi.fastutil.objects.Object2LongFunction
;
import
it.unimi.dsi.io.OutputBitStream
;
import
it.unimi.dsi.logging.ProgressLogger
;
import
it.unimi.dsi.big.webgraph.BVGraph
;
import
it.unimi.dsi.big.webgraph.ImmutableGraph
;
import
it.unimi.dsi.big.webgraph.NodeIterator
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.softwareheritage.graph.labels.DirEntry
;
import
org.softwareheritage.graph.labels.SwhLabel
;
import
java.io.*
;
import
java.lang.reflect.Array
;
import
java.nio.charset.StandardCharsets
;
import
java.util.*
;
import
java.util.concurrent.TimeUnit
;
public
class
LabelMapBuilder
{
final
static
String
SORT_BUFFER_SIZE
=
"40%"
;
final
static
Logger
logger
=
LoggerFactory
.
getLogger
(
LabelMapBuilder
.
class
);
String
graphPath
;
String
debugPath
;
String
tmpDir
;
ImmutableGraph
graph
;
Object2LongFunction
<
byte
[]>
swhIdMph
;
long
[][]
orderMap
;
Object2LongFunction
<
byte
[]>
filenameMph
;
long
numFilenames
;
int
totalLabelWidth
;
public
LabelMapBuilder
(
String
graphPath
,
String
debugPath
,
String
tmpDir
)
{
this
.
graphPath
=
graphPath
;
this
.
debugPath
=
debugPath
;
this
.
tmpDir
=
tmpDir
;
}
private
static
JSAPResult
parse_args
(
String
[]
args
)
{
JSAPResult
config
=
null
;
try
{
SimpleJSAP
jsap
=
new
SimpleJSAP
(
LabelMapBuilder
.
class
.
getName
(),
""
,
new
Parameter
[]{
new
FlaggedOption
(
"graphPath"
,
JSAP
.
STRING_PARSER
,
JSAP
.
NO_DEFAULT
,
JSAP
.
REQUIRED
,
'g'
,
"graph"
,
"Basename of the compressed graph"
),
new
FlaggedOption
(
"debugPath"
,
JSAP
.
STRING_PARSER
,
JSAP
.
NO_DEFAULT
,
JSAP
.
NOT_REQUIRED
,
'd'
,
"debug-path"
,
"Store the intermediate representation here for debug"
),
new
FlaggedOption
(
"tmpDir"
,
JSAP
.
STRING_PARSER
,
"tmp"
,
JSAP
.
NOT_REQUIRED
,
't'
,
"tmp"
,
"Temporary directory path"
),});
config
=
jsap
.
parse
(
args
);
if
(
jsap
.
messagePrinted
())
{
System
.
exit
(
1
);
}
}
catch
(
JSAPException
e
)
{
e
.
printStackTrace
();
}
return
config
;
}
public
static
void
main
(
String
[]
args
)
throws
IOException
{
JSAPResult
config
=
parse_args
(
args
);
String
graphPath
=
config
.
getString
(
"graphPath"
);
String
tmpDir
=
config
.
getString
(
"tmpDir"
);
String
debugPath
=
config
.
getString
(
"debugPath"
);
LabelMapBuilder
builder
=
new
LabelMapBuilder
(
graphPath
,
debugPath
,
tmpDir
);
builder
.
computeLabelMap
();
}
@SuppressWarnings
(
"unchecked"
)
// Suppress warning for Object2LongFunction cast
static
Object2LongFunction
<
byte
[]>
loadMPH
(
String
mphBasename
)
throws
IOException
{
Object2LongFunction
<
byte
[]>
mphMap
=
null
;
try
{
mphMap
=
(
Object2LongFunction
<
byte
[]>)
BinIO
.
loadObject
(
mphBasename
+
".mph"
);
}
catch
(
ClassNotFoundException
e
)
{
logger
.
error
(
"unknown class object in .mph file: "
+
e
);
System
.
exit
(
2
);
}
return
mphMap
;
}
static
long
getMPHSize
(
Object2LongFunction
<
byte
[]>
mph
)
{
return
(
mph
instanceof
Size64
)
?
((
Size64
)
mph
).
size64
()
:
mph
.
size
();
}
void
computeLabelMap
()
throws
IOException
{
/*
* Pass the intermediate representation to sort(1) so that we see the labels in the order they will
* appear in the label file.
*/
logger
.
info
(
"Loading graph and MPH functions..."
);
loadGraph
();
logger
.
info
(
"Hashing the input labels..."
);
ProcessBuilder
processBuilder
=
new
ProcessBuilder
();
processBuilder
.
command
(
"sort"
,
"-k1,1n"
,
"-k2,2n"
,
// Numerical sort
"--numeric-sort"
,
"--buffer-size"
,
SORT_BUFFER_SIZE
,
"--temporary-directory"
,
tmpDir
);
Process
sort
=
processBuilder
.
start
();
BufferedOutputStream
sort_stdin
=
new
BufferedOutputStream
(
sort
.
getOutputStream
());
BufferedInputStream
sort_stdout
=
new
BufferedInputStream
(
sort
.
getInputStream
());
final
FastBufferedInputStream
fbis
=
new
FastBufferedInputStream
(
System
.
in
);
hashLabelStream
(
fbis
,
sort_stdin
);
sort_stdin
.
close
();
logger
.
info
(
"Writing label map to file..."
);
writeLabels
(
sort_stdout
);
logger
.
info
(
"Done"
);
}
void
loadGraph
()
throws
IOException
{
graph
=
BVGraph
.
loadMapped
(
graphPath
);
swhIdMph
=
loadMPH
(
graphPath
);
orderMap
=
LongBigArrays
.
newBigArray
(
getMPHSize
(
swhIdMph
));
BinIO
.
loadLongs
(
graphPath
+
".order"
,
orderMap
);
filenameMph
=
loadMPH
(
graphPath
+
"-labels"
);
numFilenames
=
getMPHSize
(
filenameMph
);
totalLabelWidth
=
DirEntry
.
labelWidth
(
numFilenames
);
}
void
hashLabelStream
(
FastBufferedInputStream
input
,
BufferedOutputStream
output
)
throws
IOException
{
// Compute intermediate representation and write it on :
// "<src node id> <dst node id> <label ids>\n"
ProgressLogger
plInter
=
new
ProgressLogger
(
logger
,
10
,
TimeUnit
.
SECONDS
);
plInter
.
itemsName
=
"edges"
;
plInter
.
expectedUpdates
=
graph
.
numArcs
();
plInter
.
start
(
"Hashing the label stream"
);
var
charset
=
StandardCharsets
.
US_ASCII
;
byte
[]
array
=
new
byte
[
1024
];
for
(
long
line
=
0
;;
line
++)
{
int
start
=
0
,
len
;
while
((
len
=
input
.
readLine
(
array
,
start
,
array
.
length
-
start
,
FastBufferedInputStream
.
ALL_TERMINATORS
))
==
array
.
length
-
start
)
{
start
+=
len
;
array
=
ByteArrays
.
grow
(
array
,
array
.
length
+
1
);
}
if
(
len
==
-
1
)
break
;
// EOF
final
int
lineLength
=
start
+
len
;
// Skip whitespace at the start of the line.
int
offset
=
0
;
while
(
offset
<
lineLength
&&
array
[
offset
]
>=
0
&&
array
[
offset
]
<=
' '
)
offset
++;
if
(
offset
==
lineLength
)
{
continue
;
}
if
(
array
[
0
]
==
'#'
)
continue
;
// Scan source id.
start
=
offset
;
while
(
offset
<
lineLength
&&
(
array
[
offset
]
<
0
||
array
[
offset
]
>
' '
))
offset
++;
final
byte
[]
ss
=
Arrays
.
copyOfRange
(
array
,
start
,
offset
);
// Skip whitespace between identifiers.
while
(
offset
<
lineLength
&&
array
[
offset
]
>=
0
&&
array
[
offset
]
<=
' '
)
offset
++;
if
(
offset
==
lineLength
)
{
logger
.
error
(
"Error at line "
+
line
+
": no target"
);
continue
;
}
// Scan target ID
start
=
offset
;
while
(
offset
<
lineLength
&&
(
array
[
offset
]
<
0
||
array
[
offset
]
>
' '
))
offset
++;
final
byte
[]
ts
=
Arrays
.
copyOfRange
(
array
,
start
,
offset
);
// Skip whitespace between identifiers.
while
(
offset
<
lineLength
&&
array
[
offset
]
>=
0
&&
array
[
offset
]
<=
' '
)
offset
++;
if
(
offset
==
lineLength
)
continue
;
// No label, skip
// Scan label
start
=
offset
;
while
(
offset
<
lineLength
&&
(
array
[
offset
]
<
0
||
array
[
offset
]
>
' '
))
offset
++;
final
byte
[]
ls
=
Arrays
.
copyOfRange
(
array
,
start
,
offset
);
// Skip whitespace between identifiers.
while
(
offset
<
lineLength
&&
array
[
offset
]
>=
0
&&
array
[
offset
]
<=
' '
)
offset
++;
// Scan permission
int
permission
=
0
;
if
(
offset
<
lineLength
)
{
start
=
offset
;
while
(
offset
<
lineLength
&&
(
array
[
offset
]
<
0
||
array
[
offset
]
>
' '
))
offset
++;
permission
=
Integer
.
parseInt
(
new
String
(
array
,
start
,
offset
-
start
,
charset
));
}
// System.err.format("DEBUG: read %s %s %s %d\n", ss, ts, ls, permission);
long
s
=
swhIdMph
.
getLong
(
ss
);
long
t
=
swhIdMph
.
getLong
(
ts
);
long
filenameId
=
filenameMph
.
getLong
(
ls
);
long
srcNode
=
BigArrays
.
get
(
orderMap
,
s
);
long
dstNode
=
BigArrays
.
get
(
orderMap
,
t
);
output
.
write
((
srcNode
+
"\t"
+
dstNode
+
"\t"
+
filenameId
+
"\t"
+
permission
+
"\n"
)
.
getBytes
(
StandardCharsets
.
US_ASCII
));
plInter
.
lightUpdate
();
}
plInter
.
done
();
}
void
writeLabels
(
InputStream
sortedMapping
)
throws
IOException
{
// Get the sorted output and write the labels and label offsets
ProgressLogger
plLabels
=
new
ProgressLogger
(
logger
,
10
,
TimeUnit
.
SECONDS
);
plLabels
.
itemsName
=
"nodes"
;
plLabels
.
expectedUpdates
=
graph
.
numNodes
();
FileWriter
debugFile
=
null
;
if
(
debugPath
!=
null
)
{
debugFile
=
new
FileWriter
(
debugPath
);
}
OutputBitStream
labels
=
new
OutputBitStream
(
new
File
(
graphPath
+
"-labelled"
+
BitStreamArcLabelledImmutableGraph
.
LABELS_EXTENSION
));
OutputBitStream
offsets
=
new
OutputBitStream
(
new
File
(
graphPath
+
"-labelled"
+
BitStreamArcLabelledImmutableGraph
.
LABEL_OFFSETS_EXTENSION
));
offsets
.
writeGamma
(
0
);
Scanner
mapLines
=
new
Scanner
(
sortedMapping
,
StandardCharsets
.
US_ASCII
);
NodeIterator
it
=
graph
.
nodeIterator
();
long
labelSrcNode
=
-
1
;
long
labelDstNode
=
-
1
;
long
labelFilenameId
=
-
1
;
int
labelPermission
=
-
1
;
ArrayList
<
DirEntry
>
labelBuffer
=
new
ArrayList
<>(
128
);
while
(
it
.
hasNext
())
{
long
srcNode
=
it
.
nextLong
();
int
bits
=
0
;
LazyLongIterator
s
=
it
.
successors
();
long
dstNode
;
while
((
dstNode
=
s
.
nextLong
())
>=
0
)
{
while
(
labelSrcNode
<=
srcNode
&&
labelDstNode
<=
dstNode
)
{
if
(
labelSrcNode
==
srcNode
&&
labelDstNode
==
dstNode
)
{
labelBuffer
.
add
(
new
DirEntry
(
labelFilenameId
,
labelPermission
));
if
(
debugFile
!=
null
)
{
debugFile
.
write
(
labelSrcNode
+
" "
+
labelDstNode
+
" "
+
labelFilenameId
+
" "
+
labelPermission
+
"\n"
);
}
}
if
(!
mapLines
.
hasNext
())
break
;
String
line
=
mapLines
.
nextLine
();
String
[]
parts
=
line
.
split
(
"\\t"
);
labelSrcNode
=
Long
.
parseLong
(
parts
[
0
]);
labelDstNode
=
Long
.
parseLong
(
parts
[
1
]);
labelFilenameId
=
Long
.
parseLong
(
parts
[
2
]);
labelPermission
=
Integer
.
parseInt
(
parts
[
3
]);
}
SwhLabel
l
=
new
SwhLabel
(
"edgelabel"
,
totalLabelWidth
,
labelBuffer
.
toArray
(
new
DirEntry
[
0
]));
labelBuffer
.
clear
();
bits
+=
l
.
toBitStream
(
labels
,
-
1
);
}
offsets
.
writeGamma
(
bits
);
plLabels
.
lightUpdate
();
}
labels
.
close
();
offsets
.
close
();
plLabels
.
done
();
if
(
debugFile
!=
null
)
{
debugFile
.
close
();
}
PrintWriter
pw
=
new
PrintWriter
(
new
FileWriter
((
new
File
(
graphPath
)).
getName
()
+
"-labelled.properties"
));
pw
.
println
(
ImmutableGraph
.
GRAPHCLASS_PROPERTY_KEY
+
" = "
+
BitStreamArcLabelledImmutableGraph
.
class
.
getName
());
pw
.
println
(
BitStreamArcLabelledImmutableGraph
.
LABELSPEC_PROPERTY_KEY
+
" = "
+
SwhLabel
.
class
.
getName
()
+
"(DirEntry,"
+
totalLabelWidth
+
")"
);
pw
.
println
(
ArcLabelledImmutableGraph
.
UNDERLYINGGRAPH_PROPERTY_KEY
+
" = "
+
graphPath
);
pw
.
close
();
}
}
File Metadata
Details
Attached
Mime Type
text/x-c
Expires
Thu, Apr 17, 8:31 AM (3 d, 23 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3287705
Attached To
rDGRPH Compressed graph representation
Event Timeline
Log In to Comment