# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import glob
import logging
import os
import subprocess
import time

import pygit2

from collections import defaultdict
from datetime import datetime
from pygit2 import GIT_REF_OID, Oid
from pygit2 import GIT_OBJ_BLOB, GIT_OBJ_TREE, GIT_OBJ_COMMIT, GIT_OBJ_TAG, \
    GIT_SORT_TOPOLOGICAL
from enum import Enum

from swh.core import hashutil
from swh.loader.git.data import swhrepo
from swh.storage.storage import Storage
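
# Assumed import: the code below references `storage.Type` constants
# (storage.Type.content, storage.Type.person, ...) that this file never
# imports; a `storage` module is presumed to live in this package.
from swh.loader.git import storage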


class DirectoryTypeEntry(Enum):
    """Types of git objects."""
    file = 'file'
    directory = 'directory'


def date_format(d):
    """d is expected to be a datetime object."""
    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", d.timetuple())


def now():
    """Cheat time values."""
    return date_format(datetime.utcnow())


def timestamp_to_string(timestamp):
    """Convert a timestamp to a string."""
    return date_format(datetime.utcfromtimestamp(timestamp))
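

# A quick sanity check of the helpers above (doctest-style; assumes an
# English locale for %a/%b; epoch 0 falls on a Thursday):
#
#     >>> timestamp_to_string(0)
#     'Thu, 01 Jan 1970 00:00:00 +0000'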


def list_objects_from_packfile_index(packfile_index):
    """List the objects indexed by this packfile, in packfile offset
    order.
    """
    input_file = open(packfile_index, 'rb')

    with subprocess.Popen(
        ['/usr/bin/git', 'show-index'],
        stdin=input_file,
        stdout=subprocess.PIPE,
    ) as process:

        data = []

        for line in process.stdout.readlines():
            # git show-index returns the line as:
            # <packfile offset> <object_id> (<object CRC>)
            line_components = line.split()
            offset = int(line_components[0])
            object_id = line_components[1]

            data.append((offset, object_id))

        yield from (Oid(hex=object_id.decode('ascii'))
                    for _, object_id in sorted(data))

    input_file.close()
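

# Illustrative `git show-index` output: each line looks like
# `<offset> <sha1> (<crc32>)` (the values below are made up) and is
# parsed above into an (offset, object_id) pair before sorting by offset:
#
#     12 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 (3955702521)
#     446 2eb5fbad69cfb9fbc48fa3abc55b5afb94a95f78 (3045094114)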


def simple_list_objects(repo):
    """List the objects in a given repository. Watch out for duplicates!"""
    objects_dir = os.path.join(repo.path, 'objects')
    # Git hashes are 40 characters long: loose objects are stored as
    # objects/<first two hex digits>/<remaining 38 hex digits>
    objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38)

    packfile_dir = os.path.join(objects_dir, 'pack')

    if os.path.isdir(packfile_dir):
        for packfile_index in os.listdir(packfile_dir):
            if not packfile_index.endswith('.idx'):
                # Not an index file
                continue
            packfile_index_path = os.path.join(packfile_dir, packfile_index)

            yield from list_objects_from_packfile_index(packfile_index_path)

    for object_file in glob.glob(objects_glob):
        # Rebuild the object id as the last two components of the path
        yield Oid(hex=''.join(object_file.split(os.path.sep)[-2:]))


def list_objects(repo):
    """List the objects in a given repository, removing duplicates"""
    seen = set()
    for oid in simple_list_objects(repo):
        if oid not in seen:
            yield oid
            seen.add(oid)
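

# Deduplication matters here: the same object can exist both loose and in
# one or more packfiles, so simple_list_objects may yield the same id
# several times.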


def get_objects_per_object_type(repo):
    """Get all the (pygit2-parsed) objects from repo per object type"""
    objects_per_object_type = defaultdict(list)

    for object_id in list_objects(repo):
        object = repo[object_id]
        objects_per_object_type[object.type].append(object_id)

    return objects_per_object_type


HASH_ALGORITHMS = ['sha1', 'sha256']


def send_in_packets(repo, source_list, formatter, sender, packet_size,
                    extra_data=None):
    """Send objects from `source_list`, passed through `formatter` (being
    passed the `repo` and `extra_data`), by the `sender`, in packets
    of `packet_size` objects
    """
    if extra_data is None:
        extra_data = {}

    objects = []
    for id in source_list:
        objects.append(formatter(repo, id, **extra_data))
        if len(objects) >= packet_size:
            sender(objects)
            objects = []

    sender(objects)
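

# For example, with packet_size=2 and five source ids, `sender` receives
# batches of 2, 2 and 1 formatted objects; the trailing call always
# happens, even with an empty list when the count divides evenly.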


def send_contents(content_list):
    """Actually send properly formatted contents to the database"""
    logging.info("Sending %d contents" % len(content_list))
    s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test')
    s.content_add(content_list)
    logging.info("Done sending %d contents" % len(content_list))


def send_directories(directory_list):
    """Actually send properly formatted directories to the database"""
    # TODO: send directories
    logging.info("Sending %d directories" % len(directory_list))
    s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test')
    s.directory_add(directory_list)
    logging.info("Done sending %d directories" % len(directory_list))


def send_revisions(revision_list):
    """Actually send properly formatted revisions to the database"""
    logging.info("Sending %d revisions" % len(revision_list))
    # TODO: send revisions
    logging.info("Done sending %d revisions" % len(revision_list))


def send_releases(release_list):
    """Actually send properly formatted releases to the database"""
    logging.info("Sending %d releases" % len(release_list))
    # TODO: send releases
    logging.info("Done sending %d releases" % len(release_list))


def blob_to_content(repo, id):
    """Format a blob as a content"""
    blob = repo[id]
    data = blob.data
    hashes = hashutil.hashdata(data, HASH_ALGORITHMS)

    return {
        'sha1_git': id.raw,
        'sha1': hashes['sha1'],
        'sha256': hashes['sha256'],
        'data': data,
        'length': blob.size,
    }
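

# Note: 'sha1_git' is the git object id (the SHA-1 over the object header
# plus payload), while 'sha1'/'sha256' are hashes of the raw blob data
# alone, as computed by hashutil.hashdata above.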


def tree_to_directory(repo, id):
    """Format a tree as a directory"""
    ret = {
        'id': id.raw,
    }
    entries = []
    ret['entries'] = entries

    entry_type_map = {
        'tree': 'dir',
        'blob': 'file',
        'commit': 'rev',
    }

    for entry in repo[id]:
        entries.append({
            'type': entry_type_map[entry.type],
            'perms': entry.filemode,
            'name': entry.name,
            'target': entry.id.raw,
            'atime': None,
            'mtime': None,
            'ctime': None,
        })

    return ret
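

# 'perms' carries the entry's raw git filemode: 0o100644 for a regular
# file, 0o100755 for an executable, 0o120000 for a symlink, 0o040000 for
# a tree, 0o160000 for a submodule (gitlink).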


def commit_to_revision(repo, id):
    """Format a commit as a revision"""
    # TODO: format commits
    return {
        'id': id,
    }


def annotated_tag_to_release(repo, id):
    """Format an annotated tag as a release"""
    # TODO: format annotated tags
    return {
        'id': id,
    }


def unannotated_tag_to_release(repo, id, commits=None):
    """Format an unannotated tag as a release"""
    # `commits` receives the mapping passed as `extra_data` by
    # bulk_send_unannotated_tags below; the original signature lacked it
    # and would raise a TypeError when invoked through send_in_packets.
    # TODO: format unannotated tags
    return {
        'id': id,
    }


def bulk_send_blobs(repo, blob_dict):
    """Format blobs as swh contents and send them to the database in bulks
    of maximum `threshold` objects
    """
    # TODO: move to config file
    content_packet_size = 100000

    send_in_packets(repo, blob_dict, blob_to_content, send_contents,
                    content_packet_size)


def bulk_send_trees(repo, tree_dict):
    """Format trees as swh directories and send them to the database
    """
    # TODO: move to config file
    directory_packet_size = 25000

    send_in_packets(repo, tree_dict, tree_to_directory, send_directories,
                    directory_packet_size)


def bulk_send_commits(repo, commit_dict):
    """Format commits as swh revisions and send them to the database
    """
    # TODO: move to config file
    revision_packet_size = 10000

    send_in_packets(repo, commit_dict, commit_to_revision, send_revisions,
                    revision_packet_size)


def bulk_send_annotated_tags(repo, tag_dict):
    """Format annotated tags (pygit2.Tag objects) as swh releases and send
    them to the database
    """
    # TODO: move to config file
    release_packet_size = 10000

    send_in_packets(repo, tag_dict, annotated_tag_to_release, send_releases,
                    release_packet_size)


def bulk_send_unannotated_tags(repo, tag_dict, commit_dict):
    """Format unannotated tags (strings) as swh releases and send
    them to the database
    """
    # TODO: move to config file
    release_packet_size = 10000
    extra_data = {
        'commits': commit_dict,
    }

    send_in_packets(repo, tag_dict, unannotated_tag_to_release, send_releases,
                    release_packet_size, extra_data)


def parse_via_object_list(repo_path):
    """List every object of the repository at repo_path, bulk-send them,
    and return the objects per type together with their counts."""
    logging.info("Started loading %s" % repo_path)
    repo = pygit2.Repository(repo_path)
    objects_per_object_type = get_objects_per_object_type(repo)
    logging.info("Done listing the objects in %s: will load %d contents, "
                 "%d directories, %d revisions, %d releases" % (
                     repo_path,
                     len(objects_per_object_type[GIT_OBJ_BLOB]),
                     len(objects_per_object_type[GIT_OBJ_TREE]),
                     len(objects_per_object_type[GIT_OBJ_COMMIT]),
                     len(objects_per_object_type[GIT_OBJ_TAG])))

    # bulk_send_blobs(repo, objects_per_object_type[GIT_OBJ_BLOB])
    bulk_send_trees(repo, objects_per_object_type[GIT_OBJ_TREE])
    bulk_send_commits(repo, objects_per_object_type[GIT_OBJ_COMMIT])
    bulk_send_annotated_tags(repo, objects_per_object_type[GIT_OBJ_TAG])
    # TODO: send unannotated tags
    bulk_send_unannotated_tags(repo, [],
                               objects_per_object_type[GIT_OBJ_COMMIT])

    return objects_per_object_type, {
        type: len(list) for type, list in objects_per_object_type.items()}
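

# Usage sketch (hypothetical path): list and bulk-send every object of a
# local bare repository, getting back the objects and per-type counts:
#
#     objects, counts = parse_via_object_list('/srv/git/example.git')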


def parse(repo_path):
    """Given a repository path, parse and return a memory model of such
    repository."""
    def read_signature(signature):
        return '%s <%s>' % (signature.name, signature.email)

    def treewalk(repo, tree):
        """Walk a tree with the same implementation as `os.path`.
        Yields: tree, dir_entries, trees, blobs
        """
        trees, blobs, dir_entries = [], [], []
        for tree_entry in tree:
            if swh_repo.already_visited(tree_entry.hex):
                logging.debug('tree_entry %s already visited, skipped'
                              % tree_entry.hex)
                continue

            obj = repo.get(tree_entry.oid)
            if obj is None:  # or obj.type == GIT_OBJ_COMMIT:
                logging.warning('skip submodule-commit %s' % tree_entry.hex)
                continue  # submodule!

            if obj.type == GIT_OBJ_TREE:
                logging.debug('found tree %s' % tree_entry.hex)
                nature = DirectoryTypeEntry.directory.value
                trees.append(tree_entry)
            else:
                logging.debug('found content %s' % tree_entry.hex)
                data = obj.data
                nature = DirectoryTypeEntry.file.value
                hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
                blobs.append({'id': obj.hex,
                              'type': storage.Type.content,
                              'content-sha1': hashes['sha1'],
                              'content-sha256': hashes['sha256'],
                              'content': data,
                              'size': obj.size})

            dir_entries.append({'name': tree_entry.name,
                                'type': storage.Type.directory_entry,
                                'target-sha1': obj.hex,
                                'nature': nature,
                                'perms': tree_entry.filemode,
                                'atime': None,
                                'mtime': None,
                                'ctime': None,
                                'parent': tree.hex})

        yield tree, dir_entries, trees, blobs
        for tree_entry in trees:
            for x in treewalk(repo, repo[tree_entry.oid]):
                yield x

    def walk_tree(repo, swh_repo, rev):
        """Walk the rev revision's directories.
        """
        if swh_repo.already_visited(rev.hex):
            logging.debug('commit %s already visited, skipped' % rev.hex)
            return swh_repo

        for dir_root, dir_entries, _, contents_ref in treewalk(repo,
                                                               rev.tree):
            for content_ref in contents_ref:
                swh_repo.add_content(content_ref)

            swh_repo.add_directory({'id': dir_root.hex,
                                    'type': storage.Type.directory,
                                    'entries': dir_entries})

        revision_parent_sha1s = list(map(str, rev.parent_ids))

        author = {'name': rev.author.name,
                  'email': rev.author.email,
                  'type': storage.Type.person}
        committer = {'name': rev.committer.name,
                     'email': rev.committer.email,
                     'type': storage.Type.person}

        swh_repo.add_revision({'id': rev.hex,
                               'type': storage.Type.revision,
                               'date': timestamp_to_string(rev.commit_time),
                               'directory': rev.tree.hex,
                               'message': rev.message,
                               'committer': committer,
                               'author': author,
                               'parent-sha1s': revision_parent_sha1s})

        swh_repo.add_person(read_signature(rev.author), author)
        swh_repo.add_person(read_signature(rev.committer), committer)

        return swh_repo

    def walk_revision_from(repo, swh_repo, head_rev):
        """Walk the rev history log from head_rev.
        - repo is the current repository
        - head_rev is the latest rev to start from.
        """
        for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL):
            swh_repo = walk_tree(repo, swh_repo, rev)

        return swh_repo

    repo = pygit2.Repository(repo_path)

    # memory model
    swh_repo = swhrepo.SWHRepo()

    # add origin
    origin = {'type': 'git',
              'url': 'file://' + repo.path}
    swh_repo.add_origin(origin)

    # add references and crawl them
    for ref_name in repo.listall_references():
        logging.info('walk reference %s' % ref_name)
        ref = repo.lookup_reference(ref_name)

        head_rev = repo[ref.target] \
            if ref.type is GIT_REF_OID \
            else ref.peel(GIT_OBJ_COMMIT)  # noqa

        if isinstance(head_rev, pygit2.Tag):
            head_start = head_rev.get_object()
            taggerSig = head_rev.tagger
            author = {'name': taggerSig.name,
                      'email': taggerSig.email,
                      'type': storage.Type.person}
            release = {'id': head_rev.hex,
                       'type': storage.Type.release,
                       'revision': head_rev.target.hex,
                       'name': ref_name,
                       'date': now(),  # FIXME: find the tag's date
                       'author': author,
                       'comment': head_rev.message}

            swh_repo.add_release(release)
            swh_repo.add_person(read_signature(taggerSig), author)
        else:
            swh_repo.add_occurrence({'id': head_rev.hex,
                                     'revision': head_rev.hex,
                                     'reference': ref_name,
                                     'url-origin': origin['url'],
                                     'type': storage.Type.occurrence})
            head_start = head_rev

        # crawl commits and trees
        walk_revision_from(repo, swh_repo, head_start)

    return swh_repo
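

# Usage sketch (hypothetical path): build the in-memory model of a local
# repository, ready to be handed to a loader:
#
#     swh_repo = parse('/srv/git/example.git')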