Page Menu | Home | Software Heritage
Paste P596

bitbucket dump curl
Active, Public

Authored by vlorentz on Feb 11 2020, 5:54 PM.
import datetime
import gc
import gzip
import hashlib
import multiprocessing
import os
import re
import subprocess
import time
import traceback
# Root directory of the dump: the functions below read the URL list from it
# and write the progress log and the per-repository directories under it.
BASE_PATH = '/var/lib/bitbucket-dump'

# Matches a Bitbucket repository URL, capturing the ``owner`` and
# ``reponame`` path components.
url_pattern = re.compile(
    r'https://bitbucket.org/(?P<owner>[^/]*)/(?P<reponame>[^/]*)'
)
def get_dest_path_for_url(url):
    """Return the directory under ``BASE_PATH`` that holds the dump of *url*.

    *url* is passed as-is to ``hashlib.sha1``. The directory is named after
    the resulting hex digest and nested inside a directory named after the
    digest's first two characters.
    """
    digest = hashlib.sha1(url).hexdigest()
    shard = digest[:2]
    return '/'.join((BASE_PATH, shard, digest))
def handle_url(url):
    """Download the bundle for *url* unless a valid one is already recorded.

    *url* is the repository URL as ``bytes``; it is hashed in that form to
    find its destination directory, then decoded for everything else.

    The first line of ``status.txt`` in that directory is the status of the
    previous attempt. The download is skipped only when that status is
    ``success`` and a later line starts with ``md5sum=``.

    Returns:
        The URL decoded to ``str``, whether or not it was downloaded.
    """
    # The destination is derived from the raw bytes, so hash before decoding.
    dest_path = get_dest_path_for_url(url)
    url = url.decode()
    os.makedirs(dest_path, exist_ok=True)
    status_path = f'{dest_path}/status.txt'

    try:
        with open(status_path) as fd:
            status = fd.readline().strip()
            # Scan the remaining lines lazily instead of loading them all.
            has_checksum = any(line.startswith('md5sum=') for line in fd)
    except FileNotFoundError:
        status = 'not_loaded'
    else:
        if not has_checksum:
            # old version of this script didn't store checksums
            status = 'redownload'

    if status == 'success':
        print(f'Skipping {url}, already loaded.')
    else:
        load_url(url, dest_path, status_path)
    return url
def load_url(url, dest_path, status_path):
    """Download the bundle of *url* into *dest_path* and record the outcome.

    *status_path* is overwritten with one of:

    * ``success``, then the completion time (ISO format), the URL and a
      ``md5sum=<digest>`` line, when the download returned normally;
    * ``error``, then the formatted traceback, when the download raised.

    Exceptions raised by the download are reported on the console and in the
    status file; they are not propagated to the caller.
    """
    print(f'Loading {url}')
    try:
        md5sum = run_curl(url, dest_path)
    except Exception:
        # Format once while the exception is still being handled, so the
        # console and the status file carry the same traceback.
        error_details = traceback.format_exc()
        print(f'Error loading {url}:')
        traceback.print_exc()
        with open(status_path, 'w') as fd:
            fd.write('error\n')
            fd.write(error_details + '\n')
    else:
        with open(status_path, 'w') as fd:
            fd.write('success\n')
            fd.write(datetime.datetime.now().isoformat() + '\n')
            fd.write(url + '\n')
            fd.write(f'md5sum={md5sum}\n')
def run_curl(url, dest_path):
    """Fetch the Mercurial bundle of *url* with curl into ``{dest_path}/bundle``.

    curl is started directly (no shell), so characters in *url* are never
    interpreted by a shell. Its output is streamed to the bundle file and
    hashed on the fly.

    Returns:
        The hexadecimal MD5 digest of the bytes written to the bundle file.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
        OSError: if curl cannot be started or the bundle file cannot be
            written.
    """
    curl_cmd = [
        'curl',
        # Drop the progress meter so stderr carries only error messages and
        # stays small enough to be read after stdout is drained.
        '--silent', '--show-error',
        f'{url}/?cmd=getbundle',
        '-H', 'user-agent: mercurial/proto-1.0 (Mercurial 4.8.2)',
        '-H', 'Accept-Encoding: identity',
        '-H', 'x-hgproto-1: 0.1 0.2 comp=zstd,zlib,none,bzip2 partial-pull',
        '-H', 'accept: application/mercurial-0.1',
        '-H', 'user-agent: mercurial/proto-1.0 (Mercurial 4.8.2)',
        '-H', 'x-hgproto-1: 0.1 0.2 comp=zstd,zlib,none,bzip2 partial-pull',
        '-H', 'vary: X-HgArg-1,X-HgProto-1',
        '-H', 'x-hgarg-1: bookmarks=1&bundlecaps=HG20%2Cbundle2%3DHG20%250Abookmarks%250Achangegroup%253D01%252C02%250Adigests%253Dmd5%252Csha1%252Csha512%250Aerror%253Dabort%252Cunsupportedcontent%252Cpushraced%252Cpushkey%250Ahgtagsfnodes%250Alistkeys%250Aphases%253Dheads%250Apushkey%250Arev-branch-cache%250Astream%253Dv2&cbattempted=1&cg=1&common=0000000000000000000000000000000000000000&listkeys=bookmarks&phases=1',
    ]
    # NOTE(review): curl is run without --fail, so an HTTP error response
    # presumably still exits 0 and its body is stored as the bundle --
    # confirm whether that is wanted before adding the flag.

    chunk_size = 1 << 16
    md5 = hashlib.md5()
    with open(f'{dest_path}/bundle', 'wb') as bundle_fd:
        with subprocess.Popen(
            curl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        ) as proc:
            for chunk in iter(lambda: proc.stdout.read(chunk_size), b''):
                bundle_fd.write(chunk)
                md5.update(chunk)
            stderr = proc.stderr.read()
        # Leaving the Popen context waits for curl, so returncode is set.
        returncode = proc.returncode

    if returncode != 0:
        # A failed transfer must not be reported as a successful download.
        raise subprocess.CalledProcessError(
            returncode, curl_cmd, stderr=stderr,
        )
    return md5.hexdigest()
# Number of processed URLs between two progress lines printed by main().
PROGRESS_INTERVAL = 1_000
def main():
    """Process every non-git URL listed in ``bitbucket_urls.gz``.

    The gzip-compressed list under ``BASE_PATH`` holds one URL per line;
    blank lines and URLs ending in ``.git`` are ignored. The remaining URLs
    are handed to ``handle_url`` on a pool of 4 worker processes. Each
    handled URL is appended to ``processed_urls`` under ``BASE_PATH`` (the
    file is truncated at start), and a progress line is printed after every
    ``PROGRESS_INTERVAL`` handled URLs.
    """
    with gzip.open(f'{BASE_PATH}/bitbucket_urls.gz', 'rb') as fd:
        # URLs stay as bytes here; handle_url hashes them in that form.
        urls = [
            url
            for url in (line.strip() for line in fd)
            if url and not url.endswith(b'.git')
        ]
    total = len(urls)

    with multiprocessing.Pool(4) as pool, \
            open(f'{BASE_PATH}/processed_urls', 'wt') as log_fd:
        # Monotonic clock: interval lengths must not be affected by
        # wall-clock adjustments.
        last_time = time.monotonic()
        results = pool.imap_unordered(handle_url, urls)
        # Count from 1 so a report is only made once a full interval has
        # been processed and the rate covers PROGRESS_INTERVAL URLs.
        for done, url in enumerate(results, start=1):
            log_fd.write(url + '\n')
            if done % PROGRESS_INTERVAL == 0:
                now = time.monotonic()
                rate = PROGRESS_INTERVAL / (now - last_time) * 60
                print(f'{done*100/total:.2f}% {done}/{total} ({rate:.0f}/min)')
                last_time = now
# Start the dump only when this file is executed as a script, not when it is
# imported.
if __name__ == '__main__':
    main()

Event Timeline

vlorentz created this object with visibility "All Users".
vlorentz changed the visibility from "All Users" to "Public (No Login Required)". Feb 11 2020, 6:04 PM