import datetime
import gzip
import hashlib
import multiprocessing
import os
import re
import subprocess
import time
import traceback

BASE_PATH = '/var/lib/bitbucket-clone2'

# Group names are assumed here; the original pattern's named groups were
# lost to angle-bracket stripping, but '(?P...)' is invalid without them.
url_pattern = re.compile('https://bitbucket.org/(?P<user>[^/]*)/(?P<repo>[^/]*)')


def get_dest_path_for_url(url):
    # Shard destinations by the first two hex digits of the URL's SHA-1
    # so no single directory accumulates too many entries.
    hash_ = hashlib.sha1(url).hexdigest()
    return f'{BASE_PATH}/{hash_[0:2]}/{hash_}'


def handle_url(url):
    # url arrives as bytes (read from the gzip file in binary mode);
    # hash it as bytes, then decode for logging and the status file.
    dest_path = get_dest_path_for_url(url)
    url = url.decode()
    os.makedirs(dest_path, exist_ok=True)
    status_path = f'{dest_path}/status.txt'
    try:
        with open(status_path) as fd:
            status = fd.readline().strip()
    except FileNotFoundError:
        status = 'not_loaded'
    if status == 'success':
        print(f'Skipping {url}, already loaded.')
    else:
        load_url(url, dest_path, status_path)
    return url


def load_url(url, dest_path, status_path):
    print(f'Loading {url}')
    try:
        run_hg(url, dest_path)
    except Exception:
        print(f'Error loading {url}:')
        traceback.print_exc()
        with open(status_path, 'w') as fd:
            fd.write('error\n')
            fd.write(traceback.format_exc() + '\n')
    else:
        with open(status_path, 'w') as fd:
            fd.write('success\n')
            fd.write(datetime.datetime.now().isoformat() + '\n')
            fd.write(url + '\n')


def run_hg(url, dest_path):
    # check=True makes a non-zero hg exit status raise CalledProcessError;
    # without it, failed clones would be recorded as successes, since
    # load_url relies on an exception to write the error status.
    subprocess.run([
        'hg', 'clone', '--noninteractive', '--noupdate',
        url, f'{dest_path}/repo-{int(time.time())}'
    ], capture_output=True, check=True)


PROGRESS_INTERVAL = 1000


def main():
    with multiprocessing.Pool(10) as pool:
        with gzip.open(f'{BASE_PATH}/bitbucket_urls.gz', 'rb') as fd:
            urls = [line.strip() for line in fd]
        # Skip blank lines and git URLs; only Mercurial repos are cloned.
        urls = [url for url in urls if url and not url.endswith(b'.git')]
        last_time = time.time()
        with open(f'{BASE_PATH}/processed_urls', 'wt') as log_fd:
            for (i, url) in enumerate(pool.imap_unordered(handle_url, urls)):
                log_fd.write(url + '\n')
                # Skip i == 0, where almost no time has elapsed and the
                # computed rate would be meaningless.
                if i and i % PROGRESS_INTERVAL == 0:
                    now = time.time()
                    rate = PROGRESS_INTERVAL / (now - last_time) * 60
                    print(f'{i*100/len(urls):.2f}% {i}/{len(urls)} ({rate:.0f}/min)')
                    last_time = now


if __name__ == '__main__':
    main()