Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9338492
converters.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
converters.py
View Options
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Convert dulwich objects to dictionaries suitable for swh.storage"""
from
swh.model
import
hashutil
# Hash algorithms computed from the raw blob data. 'sha1_git' is left out
# because dulwich_blob_to_content_id takes it directly from the dulwich
# object (blob.sha().digest()) instead of hashing the data again.
HASH_ALGORITHMS = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
def origin_url_to_origin(origin_url):
    """Build the swh.storage origin dictionary for a git repository URL.

    Args:
        origin_url: URL of the git repository.

    Returns:
        dict: an origin with 'type' set to 'git' and 'url' set to
        ``origin_url``.
    """
    return dict(type='git', url=origin_url)
def dulwich_blob_to_content_id(blob):
    """Compute the Software Heritage content identifiers of a dulwich blob.

    Args:
        blob: dulwich object; anything whose ``type_name`` is not
            ``b'blob'`` is ignored.

    Returns:
        None when ``blob`` is not a blob. Otherwise a dict holding
        'sha1_git' (digest reported by the dulwich object), 'length'
        (``blob.raw_length()``) and whatever ``hashutil.hash_data`` returns
        for the raw data with HASH_ALGORITHMS.
    """
    if blob.type_name != b'blob':
        return None

    length = blob.raw_length()
    sha1_git = blob.sha().digest()
    # The remaining checksums are computed from the raw object contents.
    other_hashes = hashutil.hash_data(blob.as_raw_string(), HASH_ALGORITHMS)

    content_id = {'sha1_git': sha1_git, 'length': length}
    content_id.update(other_hashes)
    return content_id
def dulwich_blob_to_content(blob, log=None, max_content_size=None,
                            origin_id=None):
    """Turn a dulwich blob into a Software Heritage content dictionary.

    Args:
        blob: dulwich object; anything whose ``type_name`` is not
            ``b'blob'`` is ignored.
        log: optional logger, used to report skipped contents.
        max_content_size: optional size limit; a falsy value disables it.
        origin_id: value stored under 'origin' for skipped contents.

    Returns:
        None when ``blob`` is not a blob. Otherwise the identifiers from
        dulwich_blob_to_content_id, completed either with 'data' and
        status 'visible', or, when the blob exceeds ``max_content_size``,
        with status 'absent', a 'reason' and the 'origin' (no 'data').
    """
    if blob.type_name != b'blob':
        return None

    content = dulwich_blob_to_content_id(blob)
    length = content['length']

    if max_content_size and length > max_content_size:
        hex_id = hashutil.hash_to_hex(content['sha1_git'])
        if log:
            log.info('Skipping content %s, too large (%s > %s)' %
                     (hex_id, length, max_content_size),
                     extra={
                         'swh_type': 'loader_git_content_skip',
                         'swh_id': hex_id,
                         'swh_size': length,
                     })
        # Oversized contents are recorded without their data.
        content['status'] = 'absent'
        content['reason'] = 'Content too large'
        content['origin'] = origin_id
        return content

    content['data'] = blob.as_raw_string()
    content['status'] = 'visible'
    return content
def dulwich_tree_to_directory(tree, log=None):
    """Turn a dulwich tree into a Software Heritage directory dictionary.

    Args:
        tree: dulwich object; anything whose ``type_name`` is not
            ``b'tree'`` is ignored.
        log: unused by this function.

    Returns:
        None when ``tree`` is not a tree. Otherwise a dict with the tree
        digest under 'id' and, under 'entries', one dict per tree entry
        with keys 'type', 'perms', 'name' and 'target'.
    """
    if tree.type_name != b'tree':
        return None

    # Entry type for each known mode; any other mode is treated as 'file'.
    type_by_mode = {
        0o040000: 'dir',
        0o160000: 'rev',
        0o100644: 'file',
        0o100755: 'file',
        0o120000: 'file',
    }

    directory_id = tree.sha().digest()
    entries = [
        {
            'type': type_by_mode.get(item.mode, 'file'),
            'perms': item.mode,
            'name': item.path,
            # item.sha is the hex identifier as ASCII bytes.
            'target': hashutil.hash_to_bytes(item.sha.decode('ascii')),
        }
        for item in tree.iteritems()
    ]
    return {'id': directory_id, 'entries': entries}
def parse_author(name_email):
    """Split a raw author line into name, email and full name.

    Args:
        name_email: author line as bytes (``b'Name <email>'``), or None.

    Returns:
        None when ``name_email`` is None. Otherwise a dict with:

        - 'name': the bytes before the first ``<``, minus one trailing
          space if present; None when that part is empty or when there is
          no ``<`` at all;
        - 'email': the bytes between the first ``<`` and the following
          ``>``; None when either bracket is missing;
        - 'fullname': ``name_email`` unchanged.
    """
    if name_email is None:
        return None

    name = email = None
    before, opening, after = name_email.partition(b'<')
    if opening:
        if before:
            # Drop only the single space separating the name from '<'.
            name = before[:-1] if before.endswith(b' ') else before
        inside, closing, _ = after.partition(b'>')
        if closing:
            email = inside

    return {
        'name': name,
        'email': email,
        'fullname': name_email,
    }
def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc):
    """Pack dulwich timestamp information into a Software Heritage dict.

    Args:
        timestamp: stored unchanged under 'timestamp'.
        timezone: offset, floor-divided by 60 to produce 'offset'
            (presumably seconds converted to minutes -- confirm against
            dulwich).
        timezone_neg_utc: flag kept only when ``timezone`` equals 0.

    Returns:
        dict with keys 'timestamp', 'offset' and 'negative_utc'; the
        latter is ``timezone_neg_utc`` for a zero offset, None otherwise.
    """
    if timezone == 0:
        negative_utc = timezone_neg_utc
    else:
        negative_utc = None

    return {
        'timestamp': timestamp,
        'offset': timezone // 60,
        'negative_utc': negative_utc,
    }
def dulwich_commit_to_revision(commit, log=None):
    """Turn a dulwich commit into a Software Heritage revision dictionary.

    Args:
        commit: dulwich object; anything whose ``type_name`` is not
            ``b'commit'`` is ignored.
        log: unused by this function.

    Returns:
        None when ``commit`` is not a commit. Otherwise a dict describing
        the revision. Its 'metadata' is None unless the commit carries an
        encoding, merge tags, extra headers or a GPG signature, in which
        case it is ``{'extra_headers': [[name, value], ...]}``.
    """
    if commit.type_name != b'commit':
        return None

    revision_id = commit.sha().digest()
    author = parse_author(commit.author)
    author_date = dulwich_tsinfo_to_timestamp(
        commit.author_time,
        commit.author_timezone,
        commit._author_timezone_neg_utc,
    )
    committer = parse_author(commit.committer)
    committer_date = dulwich_tsinfo_to_timestamp(
        commit.commit_time,
        commit.commit_timezone,
        commit._commit_timezone_neg_utc,
    )
    # Tree and parent identifiers are hex-encoded bytes.
    directory = bytes.fromhex(commit.tree.decode())
    message = commit.message
    parents = [bytes.fromhex(parent.decode()) for parent in commit.parents]

    revision = {
        'id': revision_id,
        'author': author,
        'date': author_date,
        'committer': committer,
        'committer_date': committer_date,
        'type': 'git',
        'directory': directory,
        'message': message,
        'metadata': None,
        'synthetic': False,
        'parents': parents,
    }

    extra_headers = []

    if commit.encoding is not None:
        extra_headers.append(['encoding', commit.encoding])

    for tag in commit.mergetag or ():
        raw_tag = tag.as_raw_string()
        # The trailing newline is stripped from the stored merge tag.
        assert raw_tag.endswith(b'\n')
        extra_headers.append(['mergetag', raw_tag[:-1]])

    if commit.extra:
        extra_headers += [
            [key.decode('utf-8'), value] for key, value in commit.extra
        ]

    if commit.gpgsig:
        extra_headers.append(['gpgsig', commit.gpgsig])

    if extra_headers:
        revision['metadata'] = {'extra_headers': extra_headers}

    return revision
# Maps a dulwich object type name (bytes) to the matching Software Heritage
# object type; dulwich_tag_to_release uses it to fill in 'target_type'.
DULWICH_TYPES = {
    b'blob': 'content',
    b'tree': 'directory',
    b'commit': 'revision',
    b'tag': 'release',
}
def dulwich_tag_to_release(tag, log=None):
    """Turn a dulwich tag into a Software Heritage release dictionary.

    Args:
        tag: dulwich object; anything whose ``type_name`` is not
            ``b'tag'`` is ignored.
        log: unused by this function.

    Returns:
        None when ``tag`` is not a tag. Otherwise a dict describing the
        release; 'author' and 'date' are both None when the tag has no
        tagger.
    """
    if tag.type_name != b'tag':
        return None

    # tag.object is a pair: something exposing ``type_name`` for the
    # target, and the target's hex identifier as bytes.
    target_kind, target_hex = tag.object

    release = {
        'id': tag.sha().digest(),
        'name': tag.name,
        'target': bytes.fromhex(target_hex.decode()),
        'target_type': DULWICH_TYPES[target_kind.type_name],
        'message': tag._message,
        'metadata': None,
        'synthetic': False,
    }

    author = date = None
    if tag.tagger:
        author = parse_author(tag.tagger)
        date = dulwich_tsinfo_to_timestamp(
            tag.tag_time,
            tag.tag_timezone,
            tag._tag_timezone_neg_utc,
        )
    release['author'] = author
    release['date'] = date

    return release
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Jul 4 2025, 8:51 AM (6 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3278705
Attached To
rDLDG Git loader
Event Timeline
Log In to Comment