Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9344202
utils.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
4 KB
Subscribers
None
utils.py
View Options
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
sqlite3
import
subprocess
try
:
# Plyvel shouldn't be a hard dependency if we want to use sqlite instead
import
plyvel
except
ImportError
:
plyvel
=
None
class ZSTFile:
    """
    File-like wrapper around a Zstandard-compressed file.

    The actual (de)compression is delegated to a subprocess: "zstd" when
    writing, "zstdcat" when reading. The subprocess is started when the
    object is entered as a context manager and reaped when it is exited, so
    read() and write() are only usable inside a ``with`` block.
    """

    def __init__(self, path: str, mode: str = "r"):
        """
        Args:
            path: path of the compressed file to read from or write to.
            mode: one of "r", "rb", "w" or "wb"; modes without "b" exchange
                text (str) with the subprocess, the others exchange bytes.

        Raises:
            ValueError: if mode is not one of the four supported values.
        """
        if mode not in ("r", "rb", "w", "wb"):
            raise ValueError(f"ZSTFile mode {mode} is invalid.")
        self.path = path
        self.mode = mode

    def __enter__(self) -> "ZSTFile":
        """Start the zstd/zstdcat subprocess and return self."""
        is_text = self.mode not in ("rb", "wb")
        writing = self.mode in ("w", "wb")
        if writing:
            # Compress whatever is written to stdin into self.path.
            cmd = ["zstd", "-q", "-o", self.path]
        else:
            # Decompress self.path to stdout.
            cmd = ["zstdcat", self.path]
        self.process = subprocess.Popen(
            cmd, text=is_text, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        )
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """
        Close both pipes and wait for the subprocess to terminate.

        Each step runs even if the previous one raised (e.g. a broken pipe
        while flushing stdin), so the pipes are not leaked and the child is
        always reaped. The first error still propagates to the caller.
        """
        try:
            self.process.stdin.close()
        finally:
            try:
                self.process.stdout.close()
            finally:
                self.process.wait()

    def read(self, *args):
        """Read from the subprocess' stdout; arguments are passed through."""
        return self.process.stdout.read(*args)

    def write(self, buf):
        """Write buf to the subprocess' stdin."""
        self.process.stdin.write(buf)
class SQLiteSet:
    """
    Disk-backed set of hashes that relies on an SQLite table for indexing.
    Meant for deduplicating objects while processing large queues that
    contain duplicates. Use it as a context manager: the database is opened
    on entry, and committed then closed on exit.
    """

    def __init__(self, db_path):
        # Only remember the location; the connection is opened in __enter__.
        self.db_path = db_path

    def __enter__(self):
        conn = sqlite3.connect(str(self.db_path))
        self.db = conn
        # The primary key on `val` is what enforces uniqueness in add().
        conn.execute(
            "CREATE TABLE IF NOT EXISTS"
            " tmpset (val TEXT NOT NULL PRIMARY KEY)"
            " WITHOUT ROWID"
        )
        # Disable synchronous writes and the rollback journal.
        for pragma in ("PRAGMA synchronous = OFF", "PRAGMA journal_mode = OFF"):
            conn.execute(pragma)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        conn = self.db
        conn.commit()
        conn.close()

    def add(self, v: bytes) -> bool:
        """
        Insert a value into the set.

        Args:
            v: The value to insert; it is stored as its hexadecimal string.

        Returns:
            True when the value was newly inserted, False when it was
            already in the set.
        """
        hex_value = v.hex()
        try:
            self.db.execute("INSERT INTO tmpset(val) VALUES (?)", (hex_value,))
        except sqlite3.IntegrityError:
            # Primary key violation: the value is already stored.
            return False
        return True
class LevelDBSet:
    """
    Disk-backed set of hashes that relies on a LevelDB database (through the
    plyvel library) for indexing. Meant for deduplicating objects while
    processing large queues that contain duplicates. Use it as a context
    manager: the database is opened on entry and closed on exit.
    """

    def __init__(self, db_path):
        self.db_path = db_path
        # plyvel is an optional import; fail here rather than in __enter__.
        if plyvel is None:
            raise ImportError("Plyvel library not found, required for LevelDBSet")

    def __enter__(self):
        location = str(self.db_path)
        self.db = plyvel.DB(location, create_if_missing=True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.db.close()

    def add(self, v: bytes) -> bool:
        """
        Insert a value into the set.

        Args:
            v: The value to insert; it is used directly as the database key.

        Returns:
            True when the value was newly inserted, False when it was
            already in the set.
        """
        database = self.db
        if database.get(v):
            # A truthy stored marker means the key was inserted before.
            return False
        database.put(v, b"T")
        return True
def remove_pull_requests(snapshot):
    """
    Heuristic to filter out pull requests in snapshots: remove all branches
    that start with refs/ but do not start with refs/heads or refs/tags.

    Alias branches are followed to the branch they ultimately point to, and
    the decision is made on the name of that final branch; the entry that is
    removed is the alias itself. Branches that are dangling (None), aliases
    that lead to a missing branch, and aliases that form a cycle are left
    untouched.

    Args:
        snapshot: a dict with a "branches" entry mapping branch names (bytes)
            to branch dicts (with "target_type" and "target" keys) or None.
            The "branches" mapping is modified in place.

    Returns:
        None.
    """
    branches = snapshot["branches"]
    # Resolve aliases against a frozen copy: this both allows removing items
    # while iterating and keeps the outcome independent of iteration order
    # (an alias to an already-removed pull request must still be removed).
    original_branches = dict(branches)
    kept_prefixes = (b"refs/heads", b"refs/tags")

    for original_branch_name, branch in original_branches.items():
        branch_name = original_branch_name
        visited = {branch_name}
        while branch and branch.get("target_type") == "alias":
            branch_name = branch["target"]
            if branch_name in visited:
                # Alias cycle: there is no final target, treat it as dangling
                # instead of looping forever.
                branch = None
                break
            visited.add(branch_name)
            branch = original_branches.get(branch_name)

        if branch is None or not branch_name:
            continue

        if branch_name.startswith(b"refs/") and not branch_name.startswith(
            kept_prefixes
        ):
            branches.pop(original_branch_name)
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Fri, Jul 4, 2:12 PM (23 h, 16 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3295136
Attached To
rDDATASET Datasets
Event Timeline
Log In to Comment