Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9697476
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
22 KB
Subscribers
None
View Options
diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
index abf70e2..572a89d 100644
--- a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
+++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
@@ -1,646 +1,645 @@
#!/usr/local/bin/python
#
# Copyright (c) 2012 YASUOKA Masahiko <yasuoka@yasuoka.net>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Usage
#
# First import:
# % git init --bare /git/openbsd.git
# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
# > openbsd.dump
# % git --git-dir /git/openbsd.git fast-import < openbsd.dump
#
# Periodic import:
# % sudo cvsync
# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
# /git/openbsd.git > openbsd2.dump
# % git --git-dir /git/openbsd.git fast-import < openbsd2.dump
#
import getopt
import os
import re
import subprocess
import sys
import time
import swh.loader.cvs.rcsparse as rcsparse
CHANGESET_FUZZ_SEC = 300
def usage():
print('usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] '
'[-E log_encodings]\n'
'\t[-k rcs_keywords] [-b branch] [-m module] [-l last_revision]\n'
'\tcvsroot [git_dir]', file=sys.stderr)
def main():
email_domain = None
do_incremental = False
git_tip = None
git_branch = 'master'
dump_all = False
log_encoding = 'utf-8,iso-8859-1'
rcs = RcsKeywords()
modules = []
last_revision = None
fuzzsec = CHANGESET_FUZZ_SEC
try:
opts, args = getopt.getopt(sys.argv[1:], 'ab:hm:z:e:E:k:t:l:')
for opt, v in opts:
if opt == '-z':
fuzzsec = int(v)
elif opt == '-e':
email_domain = v
elif opt == '-a':
dump_all = True
elif opt == '-b':
git_branch = v
elif opt == '-E':
log_encoding = v
elif opt == '-k':
rcs.add_id_keyword(v)
elif opt == '-m':
if v == '.git':
print('Cannot handle the path named \'.git\'',
file=sys.stderr)
sys.exit(1)
modules.append(v)
elif opt == '-l':
last_revision = v
elif opt == '-h':
usage()
sys.exit(1)
except getopt.GetoptError as msg:
print(msg, file=sys.stderr)
usage()
sys.exit(1)
if len(args) == 0 or len(args) > 2:
usage()
sys.exit(1)
log_encodings = log_encoding.split(',')
cvsroot = args[0]
while cvsroot[-1] == '/':
cvsroot = cvsroot[:-1]
if len(args) == 2:
do_incremental = True
git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', git_branch],
encoding='utf-8', stdout=subprocess.PIPE)
outs = git.stdout.readlines()
git.wait()
if git.returncode != 0:
print("Couldn't exec git", file=sys.stderr)
sys.exit(git.returncode)
git_tip = outs[2].strip()
if last_revision is not None:
git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', last_revision],
encoding='utf-8', stdout=subprocess.PIPE)
outs = git.stdout.readlines()
git.wait()
if git.returncode != 0:
print("Coundn't exec git", file=sys.stderr)
sys.exit(git.returncode)
last_author = outs[0].strip()
last_ctime = float(outs[1].split()[0])
# strip off the domain part from the last author since cvs doesn't have
# the domain part.
if do_incremental and email_domain is not None and \
last_author.lower().endswith(('@' + email_domain).lower()):
last_author = last_author[:-1 * (1 + len(email_domain))]
cvs = CvsConv(cvsroot, rcs, not do_incremental, fuzzsec)
print('** walk cvs tree', file=sys.stderr)
if len(modules) == 0:
cvs.walk()
else:
for module in modules:
cvs.walk(module)
changesets = sorted(cvs.changesets)
nchangesets = len(changesets)
print('** cvs has %d changeset' % (nchangesets), file=sys.stderr)
if nchangesets <= 0:
sys.exit(0)
if not dump_all:
# don't use last 10 minutes for safety
max_time_max = changesets[-1].max_time - 600
else:
max_time_max = changesets[-1].max_time
found_last_revision = False
markseq = cvs.markseq
extags = set()
for k in changesets:
if do_incremental and not found_last_revision:
if k.min_time == last_ctime and k.author == last_author:
found_last_revision = True
for tag in k.tags:
extags.add(tag)
continue
if k.max_time > max_time_max:
break
marks = {}
for f in k.revs:
if not do_incremental:
marks[f.markseq] = f
else:
markseq = markseq + 1
git_dump_file(f.path, f.rev, rcs, markseq)
marks[markseq] = f
log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev)
for i, e in enumerate(log_encodings):
try:
how = 'ignore' if i == len(log_encodings) - 1 else 'strict'
log = log.decode(e, how)
break
except UnicodeError:
pass
log = log.encode('utf-8', 'ignore')
output('commit refs/heads/' + git_branch)
markseq = markseq + 1
output('mark :%d' % (markseq))
email = k.author if email_domain is None \
else k.author + '@' + email_domain
output('author %s <%s> %d +0000' % (k.author, email, k.min_time))
output('committer %s <%s> %d +0000' % (k.author, email, k.min_time))
output('data', len(log))
output(log, end='')
if do_incremental and git_tip is not None:
output('from', git_tip)
git_tip = None
for m in marks:
f = marks[m]
mode = 0o100755 if os.access(f.path, os.X_OK) else 0o100644
fn = file_path(cvs.cvsroot, f.path)
if f.state == 'dead':
output('D', fn)
else:
output('M %o :%d %s' % (mode, m, fn))
output('')
for tag in k.tags:
if tag in extags:
continue
output('reset refs/tags/%s' % (tag))
output('from :%d' % (markseq))
output('')
if do_incremental and not found_last_revision:
raise Exception('could not find the last revision')
print('** dumped', file=sys.stderr)
#
# Encode by UTF-8 always for string objects since encoding for git-fast-import
# is UTF-8. Also write without conversion for a bytes object (file bodies
# might be various encodings)
#
def output(*args, end='\n'):
if len(args) == 0:
pass
elif len(args) > 1 or isinstance(args[0], str):
lines = ' '.join(
[arg if isinstance(arg, str) else str(arg) for arg in args])
sys.stdout.buffer.write(lines.encode('utf-8'))
else:
sys.stdout.buffer.write(args[0])
if len(end) > 0:
sys.stdout.buffer.write(end.encode('utf-8'))
class FileRevision:
def __init__(self, path, rev, state, markseq):
self.path = path
self.rev = rev
self.state = state
self.markseq = markseq
class ChangeSetKey:
def __init__(self, branch, author, timestamp, log, commitid, fuzzsec):
self.branch = branch
self.author = author
self.min_time = timestamp
self.max_time = timestamp
self.commitid = commitid
self.fuzzsec = fuzzsec
self.revs = []
self.tags = []
self.log_hash = 0
h = 0
for c in log:
h = 31 * h + c
self.log_hash = h
def __lt__(self, other):
return self._cmp(other) < 0
def __gt__(self, other):
return self._cmp(other) > 0
def __eq__(self, other):
return self._cmp(other) == 0
def __le__(self, other):
return self._cmp(other) <= 0
def __ge__(self, other):
return self._cmp(other) >= 0
def __ne__(self, other):
return self._cmp(other) != 0
def _cmp(self, anon):
# compare by the commitid
cid = _cmp2(self.commitid, anon.commitid)
if cid == 0 and self.commitid is not None:
# both have commitid and they are same
return 0
# compare by the time
ma = anon.min_time - self.max_time
mi = self.min_time - anon.max_time
ct = self.min_time - anon.min_time
if ma > self.fuzzsec or mi > self.fuzzsec:
return ct
if cid != 0:
# only one has the commitid, this means different commit
return cid if ct == 0 else ct
# compare by log, branch and author
c = _cmp2(self.log_hash, anon.log_hash)
if c == 0:
c = _cmp2(self.branch, anon.branch)
if c == 0:
c = _cmp2(self.author, anon.author)
if c == 0:
return 0
return ct if ct != 0 else c
def merge(self, anot):
self.max_time = max(self.max_time, anot.max_time)
self.min_time = min(self.min_time, anot.min_time)
self.revs.extend(anot.revs)
def __hash__(self):
return hash(self.branch + '/' + self.author) * 31 + self.log_hash
def put_file(self, path, rev, state, markseq):
self.revs.append(FileRevision(path, rev, state, markseq))
def _cmp2(a, b):
_a = a is not None
_b = b is not None
return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b)
class CvsConv:
def __init__(self, cvsroot, rcs, dumpfile, fuzzsec):
self.cvsroot = cvsroot
self.rcs = rcs
self.changesets = dict()
self.dumpfile = dumpfile
self.markseq = 0
self.tags = dict()
self.fuzzsec = fuzzsec
def walk(self, module=None):
p = [self.cvsroot]
if module is not None:
p.append(module)
path = os.path.join(*p)
for root, dirs, files in os.walk(path):
if '.git' in dirs:
print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)
dirs.remove('.git')
if '.git' in files:
print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)
files.remove('.git')
for f in files:
if not f[-2:] == ',v':
continue
self.parse_file(root + os.sep + f)
for t, c in list(self.tags.items()):
c.tags.append(t)
def parse_file(self, path):
rtags = dict()
rcsfile = rcsparse.rcsfile(path)
branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
for k, v in list(rcsfile.symbols.items()):
r = v.split('.')
if len(r) == 3:
branches[v] = 'VENDOR'
elif len(r) >= 3 and r[-2] == '0':
branches['.'.join(r[:-2] + r[-1:])] = k
if len(r) == 2 and branches[r[0]] == 'HEAD':
if v not in rtags:
rtags[v] = list()
rtags[v].append(k)
revs = rcsfile.revs.items()
# sort by revision descending to priorize 1.1.1.1 than 1.1
revs = sorted(revs, key=lambda a: a[1][0], reverse=True)
# sort by time
revs = sorted(revs, key=lambda a: a[1][1])
novendor = False
have_initial_revision = False
last_vendor_status = None
for k, v in revs:
r = k.split('.')
if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
and r[3] == '1':
if have_initial_revision:
continue
if v[3] == 'dead':
continue
last_vendor_status = v[3]
have_initial_revision = True
elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1':
if novendor:
continue
last_vendor_status = v[3]
elif len(r) == 2:
if r[0] == '1' and r[1] == '1':
if have_initial_revision:
continue
if v[3] == 'dead':
continue
have_initial_revision = True
elif r[0] == '1' and r[1] != '1':
novendor = True
if last_vendor_status == 'dead' and v[3] == 'dead':
last_vendor_status = None
continue
last_vendor_status = None
else:
# trunk only
continue
if self.dumpfile:
self.markseq = self.markseq + 1
git_dump_file(path, k, self.rcs, self.markseq)
b = '.'.join(r[:-1])
try:
a = ChangeSetKey(
branches[b], v[2], v[1], rcsfile.getlog(v[0]), v[6],
self.fuzzsec)
except Exception as e:
print('Aborted at %s %s' % (path, v[0]), file=sys.stderr)
raise e
a.put_file(path, k, v[3], self.markseq)
while a in self.changesets:
c = self.changesets[a]
del self.changesets[a]
c.merge(a)
a = c
self.changesets[a] = a
if k in rtags:
for t in rtags[k]:
if t not in self.tags or \
self.tags[t].max_time < a.max_time:
self.tags[t] = a
def file_path(r, p):
if r.endswith('/'):
r = r[:-1]
path = p[:-2] # drop ",v"
p = path.split('/')
if len(p) > 0 and p[-2] == 'Attic':
path = '/'.join(p[:-2] + [p[-1]])
if path.startswith(r):
path = path[len(r) + 1:]
return path
def git_dump_file(path, k, rcs, markseq):
try:
- cont = rcs.expand_keyword(path, k)
+ cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k)
except RuntimeError as msg:
print('Unexpected runtime error on parsing',
path, k, ':', msg, file=sys.stderr)
print('unlimit the resource limit may fix this problem.',
file=sys.stderr)
sys.exit(1)
output('blob')
output('mark :%d' % markseq)
output('data', len(cont))
output(cont)
class RcsKeywords:
RCS_KW_AUTHOR = (1 << 0)
RCS_KW_DATE = (1 << 1)
RCS_KW_LOG = (1 << 2)
RCS_KW_NAME = (1 << 3)
RCS_KW_RCSFILE = (1 << 4)
RCS_KW_REVISION = (1 << 5)
RCS_KW_SOURCE = (1 << 6)
RCS_KW_STATE = (1 << 7)
RCS_KW_FULLPATH = (1 << 8)
RCS_KW_MDOCDATE = (1 << 9)
RCS_KW_LOCKER = (1 << 10)
RCS_KW_ID = (RCS_KW_RCSFILE | RCS_KW_REVISION | RCS_KW_DATE |
RCS_KW_AUTHOR | RCS_KW_STATE)
RCS_KW_HEADER = (RCS_KW_ID | RCS_KW_FULLPATH)
rcs_expkw = {
b"Author": RCS_KW_AUTHOR,
b"Date": RCS_KW_DATE,
b"Header": RCS_KW_HEADER,
b"Id": RCS_KW_ID,
b"Log": RCS_KW_LOG,
b"Name": RCS_KW_NAME,
b"RCSfile": RCS_KW_RCSFILE,
b"Revision": RCS_KW_REVISION,
b"Source": RCS_KW_SOURCE,
b"State": RCS_KW_STATE,
b"Mdocdate": RCS_KW_MDOCDATE,
b"Locker": RCS_KW_LOCKER
}
RCS_KWEXP_NONE = (1 << 0)
RCS_KWEXP_NAME = (1 << 1) # include keyword name
RCS_KWEXP_VAL = (1 << 2) # include keyword value
RCS_KWEXP_LKR = (1 << 3) # include name of locker
RCS_KWEXP_OLD = (1 << 4) # generate old keyword string
RCS_KWEXP_ERR = (1 << 5) # mode has an error
RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL)
RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR)
def __init__(self):
self.rerecomple()
def rerecomple(self):
pat = b'|'.join(list(self.rcs_expkw.keys()))
self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]")
def add_id_keyword(self, keyword):
self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID
self.rerecomple()
def kflag_get(self, flags):
if flags is None:
return self.RCS_KWEXP_DEFAULT
fl = 0
for fc in flags:
if fc == 'k':
fl |= self.RCS_KWEXP_NAME
elif fc == 'v':
fl |= self.RCS_KWEXP_VAL
elif fc == 'l':
fl |= self.RCS_KWEXP_LKR
elif fc == 'o':
if len(flags) != 1:
fl |= self.RCS_KWEXP_ERR
fl |= self.RCS_KWEXP_OLD
elif fc == 'b':
if len(flags) != 1:
fl |= self.RCS_KWEXP_ERR
fl |= self.RCS_KWEXP_NONE
else:
fl |= self.RCS_KWEXP_ERR
return fl
- def expand_keyword(self, filename, r):
- rcs = rcsparse.rcsfile(filename)
+ def expand_keyword(self, filename, rcs, r):
rev = rcs.revs[r]
mode = self.kflag_get(rcs.expand)
if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0:
return rcs.checkout(rev[0])
ret = []
for line in rcs.checkout(rev[0]).split(b'\n'):
logbuf = None
m = self.re_kw.match(line)
if m is None:
# No RCS Keywords, use it as it is
ret += [line]
continue
line0 = b''
while m is not None:
try:
dsign = m.end(1) + line[m.end(1):].index(b'$')
except ValueError:
break
prefix = line[:m.start(1) - 1]
line = line[dsign + 1:]
line0 += prefix
expbuf = ''
if (mode & self.RCS_KWEXP_NAME) != 0:
expbuf += '$'
expbuf += m.group(1).decode('ascii')
if (mode & self.RCS_KWEXP_VAL) != 0:
expbuf += ': '
if (mode & self.RCS_KWEXP_VAL) != 0:
expkw = self.rcs_expkw[m.group(1)]
if (expkw & self.RCS_KW_RCSFILE) != 0:
expbuf += filename \
if (expkw & self.RCS_KW_FULLPATH) != 0 \
else os.path.basename(filename)
expbuf += " "
if (expkw & self.RCS_KW_REVISION) != 0:
expbuf += rev[0]
expbuf += " "
if (expkw & self.RCS_KW_DATE) != 0:
expbuf += time.strftime(
"%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1]))
if (expkw & self.RCS_KW_MDOCDATE) != 0:
d = time.gmtime(rev[1])
expbuf += time.strftime(
"%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d)
if (expkw & self.RCS_KW_AUTHOR) != 0:
expbuf += rev[2]
expbuf += " "
if (expkw & self.RCS_KW_STATE) != 0:
expbuf += rev[3]
expbuf += " "
if (expkw & self.RCS_KW_LOG) != 0:
p = prefix
expbuf += filename \
if (expkw & self.RCS_KW_FULLPATH) != 0 \
else os.path.basename(filename)
expbuf += " "
logbuf = p + (
'Revision %s %s %s\n' % (
rev[0], time.strftime(
"%Y/%m/%d %H:%M:%S", time.gmtime(rev[1])),
rev[2])).encode('ascii')
for lline in rcs.getlog(rev[0]).rstrip().split(b'\n'):
if len(lline) == 0:
logbuf += p.rstrip() + b'\n'
else:
logbuf += p + lline.lstrip() + b'\n'
if len(line) == 0:
logbuf += p.rstrip()
else:
logbuf += p + line.lstrip()
line = b''
if (expkw & self.RCS_KW_SOURCE) != 0:
expbuf += filename
expbuf += " "
if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0:
expbuf += " "
if (mode & self.RCS_KWEXP_NAME) != 0:
expbuf += '$'
line0 += expbuf[:255].encode('ascii')
m = self.re_kw.match(line)
ret += [line0 + line]
if logbuf is not None:
ret += [logbuf]
return b'\n'.join(ret)
# ----------------------------------------------------------------------
# entry point
# ----------------------------------------------------------------------
if __name__ == '__main__':
main()
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Mon, Aug 18, 11:55 PM (2 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3256058
Attached To
rDLDCVS CVS Loader
Event Timeline
Log In to Comment