Page MenuHomeSoftware Heritage

fix_revisions.py
No OneTemporary

fix_revisions.py

#!/usr/bin/env python3
import logging
import pickle
import os
import sys
from .reader import GitCommitRemoteReader
def get_revisions_and_origins_from_file(filename):
    """Parse a whitespace-separated listing of revisions and their origins.

    Each non-blank line of the file is split on whitespace: the first field
    is taken as the revision identifier and every remaining field as an
    origin for that revision.

    Args:
        filename: path of the text file to read.

    Returns:
        A dict mapping each revision identifier to the set of origins found
        on its line. When an identifier occurs on several lines, the last
        line read replaces the earlier entries.
    """
    revs = {}
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: there is no revision field,
                # and indexing fields[0] would raise IndexError.
                continue
            revs[fields[0]] = set(fields[1:])
    return revs
def revisions_from_origin(origin_url):
    """Load the revisions of an origin, keyed by decoded identifier.

    A fresh ``GitCommitRemoteReader`` is asked to ``load`` *origin_url*; the
    mapping it returns is copied into a new dict whose keys are the result of
    calling ``.decode()`` on the original keys. Values are passed through
    unchanged.

    Args:
        origin_url: the origin to hand to the reader's ``load`` method.

    Returns:
        A dict mapping each decoded identifier to its revision.
    """
    loaded = GitCommitRemoteReader().load(origin_url)
    revisions_by_id = {}
    for raw_id, revision in loaded.items():
        revisions_by_id[raw_id.decode()] = revision
    return revisions_by_id
def dump_to_file(filename, data):
    """Pickle *data* into *filename* by way of a temporary sibling file.

    The pickle is written to ``<filename>.tmp`` first and only then moved
    over *filename*, so *filename* itself is never opened for writing and is
    only ever replaced by a fully written file.

    Args:
        filename: destination path of the pickle file.
        data: any picklable object.
    """
    tmp_filename = '%s.tmp' % filename
    with open(tmp_filename, 'wb') as f:
        pickle.dump(data, f)
    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when the destination already exists, which
    # would break every dump after the first one.
    os.replace(tmp_filename, filename)
if __name__ == '__main__':
    # Usage: <script> REVISIONS_FILE SNAPSHOT_FILE
    #
    # REVISIONS_FILE is the listing parsed by
    # get_revisions_and_origins_from_file; SNAPSHOT_FILE is a pickle holding
    # [revs, parsed_revs, origins], used to resume an interrupted run.
    if len(sys.argv) < 3:
        sys.exit('usage: %s REVISIONS_FILE SNAPSHOT_FILE' % sys.argv[0])
    filename = sys.argv[1]
    snapshot = sys.argv[2]

    if os.path.exists(snapshot):
        # Resume from the previous state.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point this script at snapshot files it wrote itself.
        with open(snapshot, 'rb') as f:
            revs, parsed_revs, origins = pickle.load(f)
    else:
        revs = get_revisions_and_origins_from_file(filename)
        # Only origins whose URL starts with https://github.com/ are queued.
        origins = set()
        for urls in revs.values():
            origins |= {url for url in urls
                        if url.startswith('https://github.com/')}
        parsed_revs = {}
        dump_to_file(snapshot, [revs, parsed_revs, origins])

    # Number of origins popped since the last snapshot was written.
    ctr = 0
    while origins:
        print("%s origins, %s/%s revs remaining" % (
            len(origins), len(revs) - len(parsed_revs), len(revs)
        ))
        ctr += 1
        origin_url = origins.pop()
        try:
            origin_revs = revisions_from_origin(origin_url)
        except Exception:
            # Best effort: a failing origin is logged (with its URL, so the
            # failure can be traced) and dropped from the queue.
            logging.exception(
                'Failed to load revisions from %s', origin_url)
            continue

        # Keep only the revisions we were asked about, first origin wins.
        for rev_id, revision in origin_revs.items():
            if rev_id in revs and rev_id not in parsed_revs:
                parsed_revs[rev_id] = revision

        # Checkpoint once at least 10 origins have been popped since the
        # last snapshot (failed origins count too, but do not trigger it).
        if ctr >= 10:
            ctr = 0
            dump_to_file(snapshot, [revs, parsed_revs, origins])

    # Final state once the queue is empty.
    dump_to_file(snapshot, [revs, parsed_revs, origins])

File Metadata

Mime Type
text/x-python
Expires
Jun 4 2025, 6:52 PM (11 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3398848

Event Timeline