import datetime
import gzip
import hashlib
import multiprocessing
import os
import re
import subprocess
import time
import traceback

BASE_PATH = '/var/lib/bitbucket-clone2'

# Group names are assumed here; the original pattern's named groups were
# lost to angle-bracket stripping, but '(?P...)' is invalid without them.
url_pattern = re.compile('https://bitbucket.org/(?P<user>[^/]*)/(?P<repo>[^/]*)')


def get_dest_path_for_url(url):
    # Shard destinations by the first two hex digits of the URL's SHA-1
    # so no single directory accumulates too many entries.
    hash_ = hashlib.sha1(url).hexdigest()
    return f'{BASE_PATH}/{hash_[0:2]}/{hash_}'


def handle_url(url):
    # url arrives as bytes (read from the gzip file in binary mode);
    # hash it as bytes, then decode for logging and the status file.
    dest_path = get_dest_path_for_url(url)
    url = url.decode()
    os.makedirs(dest_path, exist_ok=True)
    status_path = f'{dest_path}/status.txt'
    try:
        with open(status_path) as fd:
            status = fd.readline().strip()
    except FileNotFoundError:
        status = 'not_loaded'
    if status == 'success':
        print(f'Skipping {url}, already loaded.')
    else:
        load_url(url, dest_path, status_path)
    return url


def load_url(url, dest_path, status_path):
    print(f'Loading {url}')
    try:
        run_hg(url, dest_path)
    except Exception:
        print(f'Error loading {url}:')
        traceback.print_exc()
        with open(status_path, 'w') as fd:
            fd.write('error\n')
            fd.write(traceback.format_exc() + '\n')
    else:
        with open(status_path, 'w') as fd:
            fd.write('success\n')
            fd.write(datetime.datetime.now().isoformat() + '\n')
            fd.write(url + '\n')


def run_hg(url, dest_path):
    # check=True makes a non-zero hg exit status raise CalledProcessError;
    # without it, failed clones would be recorded as successes, since
    # load_url relies on an exception to write the error status.
    subprocess.run([
        'hg', 'clone', '--noninteractive', '--noupdate',
        url, f'{dest_path}/repo-{int(time.time())}'
    ], capture_output=True, check=True)


PROGRESS_INTERVAL = 1000


def main():
    with multiprocessing.Pool(10) as pool:
        with gzip.open(f'{BASE_PATH}/bitbucket_urls.gz', 'rb') as fd:
            urls = [line.strip() for line in fd]
        # Skip blank lines and git URLs; only Mercurial repos are cloned.
        urls = [url for url in urls if url and not url.endswith(b'.git')]
        last_time = time.time()
        with open(f'{BASE_PATH}/processed_urls', 'wt') as log_fd:
            for (i, url) in enumerate(pool.imap_unordered(handle_url, urls)):
                log_fd.write(url + '\n')
                # Skip i == 0, where almost no time has elapsed and the
                # computed rate would be meaningless.
                if i and i % PROGRESS_INTERVAL == 0:
                    now = time.time()
                    rate = PROGRESS_INTERVAL / (now - last_time) * 60
                    print(f'{i*100/len(urls):.2f}% {i}/{len(urls)} ({rate:.0f}/min)')
                    last_time = now


if __name__ == '__main__':
    main()