Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Paste
P596
bitbucket dump curl
Active
Public
Actions
Authored by
vlorentz
on Feb 11 2020, 5:54 PM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Subscribers
None
import
datetime
import
gc
import
gzip
import
hashlib
import
multiprocessing
import
os
import
re
import
subprocess
import
time
import
traceback
# Root directory of the dump: it holds the gzipped URL list read by main(),
# the per-URL destination directories, and the processed-URL log.
BASE_PATH = '/var/lib/bitbucket-dump'

# Matches a Bitbucket repository URL and captures its owner and repository
# name. NOTE(review): not referenced by any function visible in this file.
url_pattern = re.compile('https://bitbucket.org/(?P<owner>[^/]*)/(?P<reponame>[^/]*)')
def get_dest_path_for_url(url):
    """Return the directory in which the dump of ``url`` is stored.

    The directory is ``BASE_PATH/<first two hex digits>/<sha1 hex digest>``,
    where the digest is the SHA-1 of the URL. The two-character prefix level
    spreads the entries over up to 256 sub-directories.

    Args:
        url: The repository URL, as ``bytes`` or ``str``. A ``str`` is
            encoded as UTF-8 before hashing, so both forms of the same URL
            map to the same directory.

    Returns:
        The destination directory path as a ``str``.
    """
    if isinstance(url, str):
        # hashlib only accepts bytes-like input.
        url = url.encode()
    hash_ = hashlib.sha1(url).hexdigest()
    return f'{BASE_PATH}/{hash_[0:2]}/{hash_}'
def handle_url(url):
    """Make sure the bundle for one repository URL has been downloaded.

    The first line of ``<dest>/status.txt`` is the status of the previous
    attempt. The URL is skipped only when that status is ``success`` and one
    of the following lines starts with ``md5sum=``; in every other case the
    download is (re)run through ``load_url``.

    Args:
        url: The repository URL as ``bytes``.

    Returns:
        The URL decoded to ``str``, or ``None`` if no destination path could
        be computed for it.
    """
    target_dir = get_dest_path_for_url(url)
    if not target_dir:
        return

    url = url.decode()
    os.makedirs(target_dir, exist_ok=True)
    status_path = f'{target_dir}/status.txt'

    try:
        with open(status_path) as status_file:
            content = status_file.read()
    except FileNotFoundError:
        status = 'not_loaded'
    else:
        first_line, _, remainder = content.partition('\n')
        status = first_line.strip()
        has_checksum = False
        for entry in remainder.split('\n'):
            if entry.startswith('md5sum='):
                has_checksum = True
                break
        if not has_checksum:
            # old version of this script didn't store checksums
            status = 'redownload'

    if status != 'success':
        load_url(url, target_dir, status_path)
        return url

    print(f'Skipping {url}, already loaded.')
    return url
def load_url(url, dest_path, status_path):
    """Download the bundle of ``url`` and record the outcome in ``status_path``.

    On success the status file contains four lines: ``success``, the current
    local time in ISO format, the URL, and ``md5sum=<digest>``. On failure it
    contains ``error`` followed by the formatted traceback, which is also
    printed.

    Args:
        url: The repository URL as ``str``.
        dest_path: Directory passed to ``run_curl`` for storing the bundle.
        status_path: Path of the status file to (over)write.
    """
    print(f'Loading {url}')
    try:
        md5sum = run_curl(url, dest_path)
    except Exception:
        # Deliberately broad: a failure on one URL is recorded in its status
        # file instead of being propagated to the caller.
        print(f'Error loading {url}:')
        traceback.print_exc()
        with open(status_path, 'w') as fd:
            fd.write('error\n')
            fd.write(traceback.format_exc() + '\n')
    else:
        with open(status_path, 'w') as fd:
            fd.write('success\n')
            fd.write(datetime.datetime.now().isoformat() + '\n')
            fd.write(url + '\n')
            fd.write(f'md5sum={md5sum}\n')
def run_curl(url, dest_path):
    """Download ``<url>/?cmd=getbundle`` with curl and return the MD5 of the result.

    The response body is written to ``<dest_path>/bundle``. The request
    headers identify the client as Mercurial 4.8.2.

    curl is invoked directly with an argument list rather than through a
    shell, so characters in ``url`` are never interpreted by a shell, and its
    exit status is checked so that a failed transfer is not reported as a
    successful download.

    Args:
        url: The repository URL as ``str``.
        dest_path: Existing directory in which the ``bundle`` file is written.

    Returns:
        The hexadecimal MD5 digest of the downloaded file.

    Raises:
        subprocess.CalledProcessError: If curl exits with a non-zero status.
            A partially written ``bundle`` file may be left in ``dest_path``.
        OSError: If curl cannot be started or the bundle file cannot be
            opened.
    """
    headers = [
        'user-agent: mercurial/proto-1.0 (Mercurial 4.8.2)',
        'Accept-Encoding: identity',
        'x-hgproto-1: 0.1 0.2 comp=zstd,zlib,none,bzip2 partial-pull',
        'accept: application/mercurial-0.1',
        'vary: X-HgArg-1,X-HgProto-1',
        'x-hgarg-1: bookmarks=1&bundlecaps=HG20%2Cbundle2%3DHG20%250Abookmarks%250Achangegroup%253D01%252C02%250Adigests%253Dmd5%252Csha1%252Csha512%250Aerror%253Dabort%252Cunsupportedcontent%252Cpushraced%252Cpushkey%250Ahgtagsfnodes%250Alistkeys%250Aphases%253Dheads%250Apushkey%250Arev-branch-cache%250Astream%253Dv2&cbattempted=1&cg=1&common=0000000000000000000000000000000000000000&listkeys=bookmarks&phases=1',
    ]
    curl_cmd = ['curl', f'{url}/?cmd=getbundle']
    for header in headers:
        curl_cmd += ['-H', header]

    bundle_path = f'{dest_path}/bundle'
    with open(bundle_path, 'wb') as bundle_fd:
        # stderr is captured so curl's progress meter does not clutter the
        # output of the calling process.
        subprocess.run(
            curl_cmd, stdout=bundle_fd, stderr=subprocess.PIPE, check=True)

    # Hash the file in chunks so large bundles are never held in memory.
    digest = hashlib.md5()
    with open(bundle_path, 'rb') as bundle_fd:
        for chunk in iter(lambda: bundle_fd.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()
# Number of handled URLs between two progress reports printed by main().
PROGRESS_INTERVAL = 1000
def main():
    """Download every non-git repository listed in the URL file.

    Reads ``BASE_PATH/bitbucket_urls.gz`` (one URL per line), drops empty
    lines and URLs ending in ``.git``, and runs ``handle_url`` on the rest in
    a pool of 4 worker processes. Each handled URL is appended to
    ``BASE_PATH/processed_urls`` (the file is truncated at start-up), and a
    progress line with the throughput of the last batch is printed after
    every ``PROGRESS_INTERVAL`` handled URLs.
    """
    with multiprocessing.Pool(4) as pool:
        with gzip.open(f'{BASE_PATH}/bitbucket_urls.gz', 'rb') as fd:
            urls = [line.strip() for line in fd]
        urls = [url for url in urls if url and not url.endswith(b'.git')]
        total = len(urls)

        # Monotonic clock: the interval must not be affected by system clock
        # adjustments during a long run.
        last_time = time.monotonic()
        with open(f'{BASE_PATH}/processed_urls', 'wt') as log_fd:
            results = pool.imap_unordered(handle_url, urls)
            # Count from 1 so a report is printed once a full batch is done,
            # not right after the very first result.
            for (done, url) in enumerate(results, start=1):
                log_fd.write(url + '\n')
                if done % PROGRESS_INTERVAL == 0:
                    now = time.monotonic()
                    elapsed = now - last_time
                    # Guard against a zero interval on coarse clocks.
                    if elapsed > 0:
                        rate = PROGRESS_INTERVAL / elapsed * 60
                    else:
                        rate = float('inf')
                    print(f'{done*100/total:.2f}% {done}/{total} ({rate:.0f}/min)')
                    last_time = now
# Run the dump only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Event Timeline
vlorentz
created this paste.
Feb 11 2020, 5:54 PM
2020-02-11 17:54:29 (UTC+1)
vlorentz
created this object with visibility "All Users".
vlorentz
changed the visibility from "All Users" to "Public (No Login Required)".
Feb 11 2020, 6:04 PM
2020-02-11 18:04:29 (UTC+1)
Log In to Comment