dumb.py
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from __future__ import annotations

from collections import defaultdict
import logging
import stat
import struct
from tempfile import SpooledTemporaryFile
from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Set, cast
import urllib.parse

from dulwich.errors import NotGitRepository
from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree
from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file
import requests

from swh.loader.git.utils import HexBytes

if TYPE_CHECKING:
    from .loader import RepoRepresentation

logger = logging.getLogger(__name__)

HEADERS = {"User-Agent": "Software Heritage dumb Git loader"}


def check_protocol(repo_url: str) -> bool:
    """Checks if a git repository can be cloned using the dumb protocol.

    Args:
        repo_url: Base URL of a git repository

    Returns:
        Whether the dumb protocol is supported.
    """
    if not repo_url.startswith("http"):
        return False
    url = urllib.parse.urljoin(
        repo_url.rstrip("/") + "/", "info/refs?service=git-upload-pack/"
    )
    logger.debug("Fetching %s", url)
    response = requests.get(url, headers=HEADERS)
    content_type = response.headers.get("Content-Type")
    return (
        response.status_code in (200, 304)
        # header is not mandatory in protocol specification
        and (content_type is None or not content_type.startswith("application/x-git-"))
    )
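
# Informative note: a server speaking the smart protocol answers
# info/refs?service=git-upload-pack with the content type
# "application/x-git-upload-pack-advertisement", whereas a dumb-only server
# returns the static info/refs file, typically as text/plain or with no
# Content-Type header at all; check_protocol() relies on that difference.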


class GitObjectsFetcher:
    """Git objects fetcher using dumb HTTP protocol.

    Fetches a set of git objects for a repository according to its archival
    state by Software Heritage and provides iterators on them.

    Args:
        repo_url: Base URL of a git repository
        base_repo: State of repository archived by Software Heritage
    """

    def __init__(self, repo_url: str, base_repo: RepoRepresentation):
        self._session = requests.Session()
        self.repo_url = repo_url
        self.base_repo = base_repo
        self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
        self.refs = self._get_refs()
        self.head = self._get_head() if self.refs else {}
        self.packs = self._get_packs()

    def fetch_object_ids(self) -> None:
        """Fetches identifiers of git objects to load into the archive."""
        wants = self.base_repo.determine_wants(self.refs)

        # process refs
        commit_objects = []
        for ref in wants:
            ref_object = self._get_git_object(ref)
            if ref_object.get_type() == Commit.type_num:
                commit_objects.append(cast(Commit, ref_object))
                self.objects[b"commit"].add(ref)
            else:
                self.objects[b"tag"].add(ref)

        # perform DFS on commits graph
        while commit_objects:
            commit = commit_objects.pop()
            # fetch tree and blob ids recursively
            self._fetch_tree_objects(commit.tree)
            for parent in commit.parents:
                if (
                    # commit not already seen in the current load
                    parent not in self.objects[b"commit"]
                    # commit not already archived by a previous load
                    and parent not in self.base_repo.heads
                ):
                    commit_objects.append(cast(Commit, self._get_git_object(parent)))
                    self.objects[b"commit"].add(parent)

    def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]:
        """Returns a generator on fetched git objects per type.

        Args:
            object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"

        Returns:
            A generator fetching git objects on the fly.
        """
        return map(self._get_git_object, self.objects[object_type])

    def _http_get(self, path: str) -> SpooledTemporaryFile:
        url = urllib.parse.urljoin(self.repo_url.rstrip("/") + "/", path)
        logger.debug("Fetching %s", url)
        response = self._session.get(url, headers=HEADERS)
        buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
        for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
            buffer.write(chunk)
        buffer.flush()
        buffer.seek(0)
        return buffer

    def _get_refs(self) -> Dict[bytes, HexBytes]:
        refs = {}
        refs_resp_bytes = self._http_get("info/refs")
        for ref_line in refs_resp_bytes.readlines():
            ref_target, ref_name = ref_line.replace(b"\n", b"").split(b"\t")
            refs[ref_name] = ref_target
        return refs

    def _get_head(self) -> Dict[bytes, HexBytes]:
        head_resp_bytes = self._http_get("HEAD")
        _, head_target = head_resp_bytes.readline().replace(b"\n", b"").split(b" ")
        return {b"HEAD": head_target}
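
    # Informative note on the formats parsed above: in the dumb protocol,
    # info/refs is a static file with one b"<sha>\t<refname>" entry per line,
    # and HEAD usually holds a symbolic ref line such as
    # b"ref: refs/heads/master\n", so splitting on b" " keeps the target ref
    # name as head_target.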

    def _get_pack_data(self, pack_name: str) -> Callable[[], PackData]:
        def _pack_data() -> PackData:
            pack_data_bytes = self._http_get(f"objects/pack/{pack_name}")
            return PackData(pack_name, file=pack_data_bytes)

        return _pack_data

    def _get_pack_idx(self, pack_idx_name: str) -> Callable[[], PackIndex]:
        def _pack_idx() -> PackIndex:
            pack_idx_bytes = self._http_get(f"objects/pack/{pack_idx_name}")
            return load_pack_index_file(pack_idx_name, pack_idx_bytes)

        return _pack_idx

    def _get_packs(self) -> List[Pack]:
        packs = []
        packs_info_bytes = self._http_get("objects/info/packs")
        packs_info = packs_info_bytes.read().decode()
        for pack_info in packs_info.split("\n"):
            if pack_info:
                pack_name = pack_info.split(" ")[1]
                pack_idx_name = pack_name.replace(".pack", ".idx")
                # pack index and data file will be lazily fetched when required
                packs.append(
                    Pack.from_lazy_objects(
                        self._get_pack_data(pack_name),
                        self._get_pack_idx(pack_idx_name),
                    )
                )
        return packs
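
    # Informative note: each non-empty line of objects/info/packs has the form
    # "P <pack-file-name>", e.g. "P pack-<sha>.pack", hence taking the second
    # space-separated field as the pack name above.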

    def _get_git_object(self, sha: bytes) -> ShaFile:
        # try to get the object from a pack file first to avoid flooding
        # git server with numerous HTTP requests
        for pack in list(self.packs):
            try:
                if sha in pack:
                    return pack[sha]
            except (NotGitRepository, struct.error):
                # missing (dulwich http client raises NotGitRepository on 404)
                # or invalid pack index/content, remove it from global packs list
                logger.debug("A pack file is missing or its content is invalid")
                self.packs.remove(pack)
        # fetch it from objects/ directory otherwise
        sha_hex = sha.decode()
        object_path = f"objects/{sha_hex[:2]}/{sha_hex[2:]}"
        return ShaFile.from_file(self._http_get(object_path))

    def _fetch_tree_objects(self, sha: bytes) -> None:
        if sha not in self.objects[b"tree"]:
            tree = cast(Tree, self._get_git_object(sha))
            self.objects[b"tree"].add(sha)
            for item in tree.items():
                if item.mode == S_IFGITLINK:
                    # skip submodules as objects are not stored in repository
                    continue
                if item.mode & stat.S_IFDIR:
                    self._fetch_tree_objects(item.sha)
                else:
                    self.objects[b"blob"].add(item.sha)
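
A minimal driving sketch, under stated assumptions: the repository URL is hypothetical, and FakeBaseRepo below is a simplified stand-in for the real RepoRepresentation from .loader, exposing only the two members GitObjectsFetcher uses (determine_wants() and heads). It fetches all object identifiers of a not-yet-archived repository and prints a per-type count.

    # Hypothetical stand-in, NOT the real RepoRepresentation: models an empty
    # archival state (nothing archived yet), so every advertised ref is wanted.
    class FakeBaseRepo:
        heads: Set[bytes] = set()

        def determine_wants(self, refs: Dict[bytes, HexBytes]) -> Set[bytes]:
            # want every ref target advertised by the server
            return set(refs.values())


    if __name__ == "__main__":
        repo_url = "https://example.org/git/project.git"  # hypothetical URL
        if check_protocol(repo_url):
            fetcher = GitObjectsFetcher(repo_url, FakeBaseRepo())
            fetcher.fetch_object_ids()
            for object_type in (b"commit", b"tag", b"tree", b"blob"):
                print(object_type.decode(), len(fetcher.objects[object_type]))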