Page MenuHomeSoftware Heritage
Paste P597

bitbucket dump clone
ActivePublic

Authored by vlorentz on Feb 11 2020, 5:55 PM.
import datetime
import gzip
import hashlib
import multiprocessing
import os
import re
import subprocess
import time
import traceback
# Root directory of the run: main() reads `bitbucket_urls.gz` from it and
# writes `processed_urls` into it, and get_dest_path_for_url() places every
# per-URL directory underneath it.
BASE_PATH = '/var/lib/bitbucket-clone2'
# Pattern for a Bitbucket repository URL with named groups `owner` and
# `reponame`.  It is not referenced by any other code visible in this file.
# NOTE(review): the '.' in "bitbucket.org" is unescaped, so it matches any
# character, and the pattern is not a raw string.
url_pattern = re.compile('https://bitbucket.org/(?P<owner>[^/]*)/(?P<reponame>[^/]*)')
def get_dest_path_for_url(url):
    """Return the directory under BASE_PATH that belongs to ``url``.

    The directory is named after the hexadecimal SHA-1 digest of ``url`` and
    sits inside a subdirectory named after the first two characters of that
    digest.

    Args:
        url: the repository URL; it is handed to ``hashlib.sha1`` unchanged,
            so it has to be a bytes-like object.

    Returns:
        The destination path as a ``str``.
    """
    digest = hashlib.sha1(url).hexdigest()
    shard = digest[:2]
    return f'{BASE_PATH}/{shard}/{digest}'
def handle_url(url):
    """Clone the repository at ``url`` unless its status file says it is done.

    The status is the first line of ``status.txt`` in the URL's destination
    directory.  A missing file counts as ``'not_loaded'``.  Any status other
    than ``'success'`` (including a previously recorded ``'error'``) triggers
    a new call to ``load_url``.

    Args:
        url: the repository URL as ``bytes``; it is hashed in that form and
            decoded to ``str`` afterwards.

    Returns:
        The decoded URL, or ``None`` if no destination path was obtained.
    """
    dest_path = get_dest_path_for_url(url)
    if not dest_path:
        return
    url = url.decode()
    os.makedirs(dest_path, exist_ok=True)
    status_path = f'{dest_path}/status.txt'
    try:
        with open(status_path) as fd:
            # Only the first line carries the status.  The remainder of the
            # file (timestamp and URL, or a traceback) is not needed here,
            # so it is no longer read.
            status = fd.readline().strip()
    except FileNotFoundError:
        status = 'not_loaded'
    if status == 'success':
        print(f'Skipping {url}, already loaded.')
    else:
        load_url(url, dest_path, status_path)
    return url
def load_url(url, dest_path, status_path):
    """Clone ``url`` and record the outcome in the status file.

    On success the status file holds three lines: ``success``, the current
    local time in ISO format, and the URL.  If ``run_hg`` raises, the status
    file holds ``error`` followed by the formatted traceback, and the
    traceback is also printed.  The exception is not propagated.

    Args:
        url: the repository URL as ``str``.
        dest_path: directory passed on to ``run_hg``.
        status_path: path of the status file to (over)write.
    """
    print(f'Loading {url}')
    try:
        run_hg(url, dest_path)
    except Exception:
        # Boundary for a single URL: a failed clone is recorded so that the
        # remaining URLs can still be processed.
        details = traceback.format_exc()
        # This message used to lack the f prefix and printed "{url}" verbatim.
        print(f'Error loading {url}:')
        traceback.print_exc()
        with open(status_path, 'w') as fd:
            fd.write('error\n')
            fd.write(details + '\n')
    else:
        with open(status_path, 'w') as fd:
            fd.write('success\n')
            fd.write(datetime.datetime.now().isoformat() + '\n')
            fd.write(url + '\n')
def run_hg(url, dest_path):
    """Clone ``url`` with Mercurial into a new directory below ``dest_path``.

    The clone target is ``{dest_path}/repo-<current unix time>`` and the
    clone is made with ``--noupdate``.  The output of ``hg`` is captured
    rather than shown.

    Args:
        url: the repository URL to clone.
        dest_path: existing directory that receives the clone directory.

    Raises:
        RuntimeError: if ``hg`` exits with a non-zero status.  The message
            contains the captured standard error output.
        OSError: if the ``hg`` executable cannot be started.
    """
    target = f'{dest_path}/repo-{int(time.time())}'
    result = subprocess.run(
        [
            'hg', 'clone', '--noninteractive', '--noupdate',
            # '--' ends option parsing, so a URL that starts with '-' is
            # treated as a positional argument and not as an hg option.
            '--', url, target,
        ],
        capture_output=True,
    )
    # Without this check a failed clone went unnoticed and the caller
    # recorded the URL as successfully loaded.
    if result.returncode != 0:
        stderr = result.stderr.decode(errors='replace').strip()
        raise RuntimeError(
            f'hg clone of {url} exited with status {result.returncode}: '
            f'{stderr}'
        )
# Number of results between two progress lines printed by main(); also the
# numerator main() uses to compute the per-minute rate.
PROGRESS_INTERVAL = 1000
def main():
    """Process every non-git URL of the dump with a pool of 10 workers.

    URLs are read from ``{BASE_PATH}/bitbucket_urls.gz`` (one per line).
    Empty lines and URLs ending in ``.git`` are dropped.  Each URL returned
    by ``handle_url`` is appended to ``{BASE_PATH}/processed_urls``, which is
    truncated at the start of the run.  A progress line with the completion
    percentage and the rate per minute is printed after every
    ``PROGRESS_INTERVAL`` results.
    """
    with multiprocessing.Pool(10) as pool:
        with gzip.open(f'{BASE_PATH}/bitbucket_urls.gz', 'rb') as fd:
            urls = [line.strip() for line in fd]
        urls = [url for url in urls if url and not url.endswith(b'.git')]
        total = len(urls)
        last_time = time.time()
        with open(f'{BASE_PATH}/processed_urls', 'wt') as log_fd:
            results = pool.imap_unordered(handle_url, urls)
            # Counting from 1 makes `done` the number of finished URLs, so a
            # progress line is printed only once a full interval has really
            # been processed (previously the first line appeared after a
            # single result and reported a meaningless rate).
            for done, url in enumerate(results, start=1):
                # handle_url returns None on its early-exit path.
                if url is not None:
                    log_fd.write(url + '\n')
                if done % PROGRESS_INTERVAL == 0:
                    now = time.time()
                    elapsed = now - last_time
                    # Guard against a zero interval on coarse clocks.
                    if elapsed > 0:
                        rate = PROGRESS_INTERVAL / elapsed * 60
                    else:
                        rate = float('inf')
                    print(
                        f'{done*100/total:.2f}% {done}/{total} '
                        f'({rate:.0f}/min)'
                    )
                    last_time = now
# Run the clone job only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

Event Timeline

vlorentz created this object with visibility "All Users".
vlorentz changed the visibility from "All Users" to "Public (No Login Required)". Feb 11 2020, 6:04 PM