"""Bulk-download Mercurial bundles for a list of Bitbucket repositories.

For every URL in bitbucket_urls.gz the script fetches the repository as a
bundle via the Mercurial wire protocol, stores it under a sha1-derived
directory, and records the outcome (plus an md5 checksum) in status.txt.
"""
import datetime
import gzip
import hashlib
import multiprocessing
import os
import re
import subprocess
import time
import traceback

BASE_PATH = '/var/lib/bitbucket-dump'

# The group names were stripped in the original (likely by HTML escaping);
# <user> and <repo> are reconstructed from the URL structure. The pattern is
# not used below but is kept for reference.
url_pattern = re.compile('https://bitbucket.org/(?P<user>[^/]*)/(?P<repo>[^/]*)')


def get_dest_path_for_url(url):
    # url is still bytes at this point, which is what sha1() expects.
    hash_ = hashlib.sha1(url).hexdigest()
    return f'{BASE_PATH}/{hash_[0:2]}/{hash_}'


def handle_url(url):
    dest_path = get_dest_path_for_url(url)
    url = url.decode()
    os.makedirs(dest_path, exist_ok=True)
    status_path = f'{dest_path}/status.txt'
    try:
        with open(status_path) as fd:
            status = fd.readline().strip()
            lines = fd.read().split('\n')
    except FileNotFoundError:
        status = 'not_loaded'
    else:
        if not any(line.startswith('md5sum=') for line in lines):
            # old version of this script didn't store checksums
            status = 'redownload'
    if status == 'success':
        print(f'Skipping {url}, already loaded.')
    else:
        load_url(url, dest_path, status_path)
    return url


def load_url(url, dest_path, status_path):
    print(f'Loading {url}')
    try:
        md5sum = run_curl(url, dest_path)
    except Exception:
        print(f'Error loading {url}:')
        traceback.print_exc()
        with open(status_path, 'w') as fd:
            fd.write('error\n')
            fd.write(traceback.format_exc() + '\n')
    else:
        with open(status_path, 'w') as fd:
            fd.write('success\n')
            fd.write(datetime.datetime.now().isoformat() + '\n')
            fd.write(url + '\n')
            fd.write(f'md5sum={md5sum}\n')


def run_curl(url, dest_path):
    # Impersonate a Mercurial client requesting a full bundle of the
    # repository; x-hgarg-1 carries the URL-encoded wire-protocol arguments.
    # (The original repeated the user-agent and x-hgproto-1 headers; the
    # duplicates are dropped here.)
    curl_cmd_parts = [
        'curl',
        f'{url}/?cmd=getbundle',
        '-H', 'user-agent: mercurial/proto-1.0 (Mercurial 4.8.2)',
        '-H', 'Accept-Encoding: identity',
        '-H', 'x-hgproto-1: 0.1 0.2 comp=zstd,zlib,none,bzip2 partial-pull',
        '-H', 'accept: application/mercurial-0.1',
        '-H', 'vary: X-HgArg-1,X-HgProto-1',
        '-H', 'x-hgarg-1: bookmarks=1&bundlecaps=HG20%2Cbundle2%3DHG20%250Abookmarks%250Achangegroup%253D01%252C02%250Adigests%253Dmd5%252Csha1%252Csha512%250Aerror%253Dabort%252Cunsupportedcontent%252Cpushraced%252Cpushkey%250Ahgtagsfnodes%250Alistkeys%250Aphases%253Dheads%250Apushkey%250Arev-branch-cache%250Astream%253Dv2&cbattempted=1&cg=1&common=0000000000000000000000000000000000000000&listkeys=bookmarks&phases=1',
    ]
    curl_cmd = ' '.join('"%s"' % x for x in curl_cmd_parts)
    # Stream the bundle to disk while computing its checksum in a single pass.
    curl_cmd += f' | tee "{dest_path}/bundle" | md5sum -b'
    p = subprocess.run(['bash', '-c', curl_cmd], capture_output=True)
    md5sum = p.stdout.decode('ascii').split()[0].strip()
    return md5sum


PROGRESS_INTERVAL = 1000


def main():
    with multiprocessing.Pool(4) as pool:
        with gzip.open(f'{BASE_PATH}/bitbucket_urls.gz', 'rb') as fd:
            urls = [line.strip() for line in fd]
        urls = [url for url in urls if url and not url.endswith(b'.git')]
        last_time = time.time()
        with open(f'{BASE_PATH}/processed_urls', 'wt') as log_fd:
            for i, url in enumerate(pool.imap_unordered(handle_url, urls)):
                log_fd.write(url + '\n')
                if i and i % PROGRESS_INTERVAL == 0:
                    now = time.time()
                    rate = PROGRESS_INTERVAL / (now - last_time) * 60
                    print(f'{i * 100 / len(urls):.2f}% {i}/{len(urls)} ({rate:.0f}/min)')
                    last_time = now


if __name__ == '__main__':
    main()
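# Usage sketch (not part of the original script; the filename and example URL
# are assumptions): create BASE_PATH, populate it with a gzip'd list of
# repository URLs (one per line, no trailing '.git'), then run the script
# directly. Each repository ends up under a two-level sha1 directory as
# 'bundle' plus 'status.txt'; re-running skips anything already marked
# 'success'.
#
#   $ mkdir -p /var/lib/bitbucket-dump
#   $ printf 'https://bitbucket.org/someuser/somerepo\n' | gzip \
#       > /var/lib/bitbucket-dump/bitbucket_urls.gz
#   $ python3 fetch_bundles.py   # hypothetical name for this file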