No OneTemporary
Actions

Size

22 KB

Subscribers

None

View Options

	diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
	index abf70e2..572a89d 100644
	--- a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
	+++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
	@@ -1,646 +1,645 @@
	#!/usr/local/bin/python

	#
	# Copyright (c) 2012 YASUOKA Masahiko <yasuoka@yasuoka.net>
	#
	# Permission to use, copy, modify, and distribute this software for any
	# purpose with or without fee is hereby granted, provided that the above
	# copyright notice and this permission notice appear in all copies.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	# Usage
	#
	# First import:
	# % git init --bare /git/openbsd.git
	# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
	# > openbsd.dump
	# % git --git-dir /git/openbsd.git fast-import < openbsd.dump
	#
	# Periodic import:
	# % sudo cvsync
	# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
	# /git/openbsd.git > openbsd2.dump
	# % git --git-dir /git/openbsd.git fast-import < openbsd2.dump
	#

	import getopt
	import os
	import re
	import subprocess
	import sys
	import time
	import swh.loader.cvs.rcsparse as rcsparse

	# Default width, in seconds, of the time window used when comparing two
	# changesets: revisions further apart than this are ordered by time alone.
	# main() uses it as the default for the -z option.
	CHANGESET_FUZZ_SEC = 300


	def usage():
	    """Print the command-line synopsis to standard error."""
	    synopsis = (
	        'usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] '
	        '[-E log_encodings]\n'
	        '\t[-k rcs_keywords] [-b branch] [-m module] [-l last_revision]\n'
	        '\tcvsroot [git_dir]'
	    )
	    print(synopsis, file=sys.stderr)


	def main():
	    """Convert a CVS repository into a git fast-import stream on stdout.

	    Command-line arguments are read from ``sys.argv`` (see usage()).  With
	    one positional argument (``cvsroot``) every changeset is dumped.  With
	    a second one (``git_dir``) the run is incremental: the tip of the
	    target branch -- or the commit named by ``-l`` -- is looked up with
	    ``git log``, changesets are skipped up to and including the one whose
	    author and timestamp match that commit, and only later changesets are
	    dumped on top of the branch tip.

	    Unless ``-a`` is given, changesets whose newest revision is within the
	    last 600 seconds of the newest changeset are not dumped.

	    Exits through sys.exit() on usage errors and on git failures; raises
	    Exception when an incremental run cannot find the last imported
	    changeset.
	    """
	    email_domain = None
	    do_incremental = False
	    git_tip = None
	    git_branch = 'master'
	    dump_all = False
	    log_encoding = 'utf-8,iso-8859-1'
	    rcs = RcsKeywords()
	    modules = []
	    last_revision = None
	    fuzzsec = CHANGESET_FUZZ_SEC

	    try:
	        opts, args = getopt.getopt(sys.argv[1:], 'ab:hm:z:e:E:k:t:l:')
	        for opt, v in opts:
	            if opt == '-z':
	                fuzzsec = int(v)
	            elif opt == '-e':
	                email_domain = v
	            elif opt == '-a':
	                dump_all = True
	            elif opt == '-b':
	                git_branch = v
	            elif opt == '-E':
	                log_encoding = v
	            elif opt == '-k':
	                rcs.add_id_keyword(v)
	            elif opt == '-m':
	                if v == '.git':
	                    print('Cannot handle the path named \'.git\'',
	                          file=sys.stderr)
	                    sys.exit(1)
	                modules.append(v)
	            elif opt == '-l':
	                last_revision = v
	            elif opt == '-h':
	                usage()
	                sys.exit(1)
	    except getopt.GetoptError as msg:
	        print(msg, file=sys.stderr)
	        usage()
	        sys.exit(1)

	    if len(args) == 0 or len(args) > 2:
	        usage()
	        sys.exit(1)

	    log_encodings = log_encoding.split(',')

	    # Drop trailing slashes.  A root made only of slashes is kept as a
	    # single '/' (indexing cvsroot[-1] in a loop used to raise IndexError
	    # once the string became empty).
	    cvsroot = args[0].rstrip('/') or args[0][:1]

	    if len(args) == 2:
	        do_incremental = True
	        outs = _git_log_last(args[1], git_branch)
	        git_tip = outs[2].strip()

	        if last_revision is not None:
	            # resume from an explicit commit instead of the branch tip;
	            # git_tip still names the branch tip the new commits attach to
	            outs = _git_log_last(args[1], last_revision)
	        last_author = outs[0].strip()
	        last_ctime = float(outs[1].split()[0])

	    # strip off the domain part from the last author since cvs doesn't have
	    # the domain part.
	    if do_incremental and email_domain is not None and \
	            last_author.lower().endswith(('@' + email_domain).lower()):
	        last_author = last_author[:-1 * (1 + len(email_domain))]

	    cvs = CvsConv(cvsroot, rcs, not do_incremental, fuzzsec)
	    print('** walk cvs tree', file=sys.stderr)
	    if len(modules) == 0:
	        cvs.walk()
	    else:
	        for module in modules:
	            cvs.walk(module)

	    changesets = sorted(cvs.changesets)
	    nchangesets = len(changesets)
	    print('** cvs has %d changeset' % (nchangesets), file=sys.stderr)

	    if nchangesets <= 0:
	        sys.exit(0)

	    if not dump_all:
	        # don't use last 10 minutes for safety
	        max_time_max = changesets[-1].max_time - 600
	    else:
	        max_time_max = changesets[-1].max_time

	    found_last_revision = False
	    markseq = cvs.markseq
	    extags = set()
	    for k in changesets:
	        if do_incremental and not found_last_revision:
	            # still before (or at) the last imported changeset: remember
	            # its tags so they are not reset again, and emit nothing
	            if k.min_time == last_ctime and k.author == last_author:
	                found_last_revision = True
	            for tag in k.tags:
	                extags.add(tag)
	            continue
	        if k.max_time > max_time_max:
	            break

	        marks = {}

	        for f in k.revs:
	            if not do_incremental:
	                # the blob was already emitted while walking the tree
	                marks[f.markseq] = f
	            else:
	                markseq = markseq + 1
	                git_dump_file(f.path, f.rev, rcs, markseq)
	                marks[markseq] = f
	        log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev)
	        # try each encoding strictly; the last one is applied with
	        # 'ignore' so that decoding cannot fail
	        for i, e in enumerate(log_encodings):
	            try:
	                how = 'ignore' if i == len(log_encodings) - 1 else 'strict'
	                log = log.decode(e, how)
	                break
	            except UnicodeError:
	                pass
	        log = log.encode('utf-8', 'ignore')

	        output('commit refs/heads/' + git_branch)
	        markseq = markseq + 1
	        output('mark :%d' % (markseq))
	        email = k.author if email_domain is None \
	            else k.author + '@' + email_domain
	        output('author %s <%s> %d +0000' % (k.author, email, k.min_time))
	        output('committer %s <%s> %d +0000' % (k.author, email, k.min_time))

	        output('data', len(log))
	        output(log, end='')
	        if do_incremental and git_tip is not None:
	            # only the first new commit names its parent explicitly
	            output('from', git_tip)
	            git_tip = None

	        for m in marks:
	            f = marks[m]
	            mode = 0o100755 if os.access(f.path, os.X_OK) else 0o100644
	            fn = file_path(cvs.cvsroot, f.path)
	            if f.state == 'dead':
	                output('D', fn)
	            else:
	                output('M %o :%d %s' % (mode, m, fn))
	        output('')
	        for tag in k.tags:
	            if tag in extags:
	                continue
	            output('reset refs/tags/%s' % (tag))
	            output('from :%d' % (markseq))
	            output('')

	    if do_incremental and not found_last_revision:
	        raise Exception('could not find the last revision')

	    print('** dumped', file=sys.stderr)


	def _git_log_last(git_dir, revision):
	    """Return the output lines of ``git log -1`` for *revision*.

	    The lines are, in order: author e-mail, raw author date and commit
	    hash (format ``%ae%n%ad%n%H``).  Prints a message and exits with git's
	    return code when git fails.
	    """
	    cmd = ['git', '--git-dir=' + git_dir, '-c',
	           'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
	           '--date=raw', '--format=%ae%n%ad%n%H', revision]
	    # the context manager closes the pipe and waits for the process
	    with subprocess.Popen(cmd, encoding='utf-8',
	                          stdout=subprocess.PIPE) as git:
	        outs = git.stdout.readlines()
	    if git.returncode != 0:
	        print("Couldn't exec git", file=sys.stderr)
	        sys.exit(git.returncode)
	    return outs


	#
	# Encode by UTF-8 always for string objects since encoding for git-fast-import
	# is UTF-8. Also write without conversion for a bytes object (file bodies
	# might be various encodings)
	#
	def output(*args, end='\n'):
	    """Write *args* to the binary standard output, followed by *end*.

	    A single non-str argument is written unchanged.  In every other case
	    the arguments are converted to str, joined with single spaces and
	    encoded as UTF-8.  With no arguments only *end* is written; an empty
	    *end* writes nothing.
	    """
	    if len(args) == 1 and not isinstance(args[0], str):
	        payload = args[0]
	    elif args:
	        pieces = [item if isinstance(item, str) else str(item)
	                  for item in args]
	        payload = ' '.join(pieces).encode('utf-8')
	    else:
	        payload = None

	    if payload is not None:
	        sys.stdout.buffer.write(payload)
	    if len(end) != 0:
	        sys.stdout.buffer.write(end.encode('utf-8'))


	class FileRevision:
	    """One revision of one RCS file, as stored in a changeset."""

	    def __init__(self, path, rev, state, markseq):
	        # path: the RCS file; rev: its revision identifier;
	        # state: the revision state (compared against 'dead' by main());
	        # markseq: the fast-import mark number recorded for this revision.
	        self.path, self.rev = path, rev
	        self.state, self.markseq = state, markseq


	class ChangeSetKey:
	    """Identity of one changeset plus the file revisions collected for it.

	    Two keys compare equal when they carry the same commitid, or when
	    they lie within ``fuzzsec`` seconds of each other and agree on log
	    hash, branch and author.  Instances are used both as dictionary keys
	    and as sortable items.
	    """

	    def __init__(self, branch, author, timestamp, log, commitid, fuzzsec):
	        self.branch = branch
	        self.author = author
	        self.min_time = self.max_time = timestamp
	        self.commitid = commitid
	        self.fuzzsec = fuzzsec
	        self.revs = []
	        self.tags = []
	        # base-31 polynomial hash over the elements of the log message
	        digest = 0
	        for byte in log:
	            digest = digest * 31 + byte
	        self.log_hash = digest

	    def _cmp(self, anon):
	        """Three-way comparison: negative, zero or positive."""
	        # compare by the commitid
	        by_commitid = _cmp2(self.commitid, anon.commitid)
	        if self.commitid is not None and by_commitid == 0:
	            # both have commitid and they are same
	            return 0

	        # compare by the time
	        by_time = self.min_time - anon.min_time
	        gap_after = anon.min_time - self.max_time
	        gap_before = self.min_time - anon.max_time
	        if gap_after > self.fuzzsec or gap_before > self.fuzzsec:
	            return by_time

	        if by_commitid != 0:
	            # only one has the commitid, this means different commit
	            return by_time if by_time != 0 else by_commitid

	        # compare by log, branch and author
	        diff = _cmp2(self.log_hash, anon.log_hash)
	        if diff == 0:
	            diff = _cmp2(self.branch, anon.branch)
	        if diff == 0:
	            diff = _cmp2(self.author, anon.author)
	        if diff == 0:
	            return 0

	        return by_time if by_time != 0 else diff

	    def __eq__(self, other):
	        return self._cmp(other) == 0

	    def __ne__(self, other):
	        return self._cmp(other) != 0

	    def __lt__(self, other):
	        return self._cmp(other) < 0

	    def __le__(self, other):
	        return self._cmp(other) <= 0

	    def __gt__(self, other):
	        return self._cmp(other) > 0

	    def __ge__(self, other):
	        return self._cmp(other) >= 0

	    def __hash__(self):
	        ident = '/'.join((self.branch, self.author))
	        return hash(ident) * 31 + self.log_hash

	    def merge(self, anot):
	        """Absorb *anot*: widen the time range and take over its revisions."""
	        self.min_time = min(self.min_time, anot.min_time)
	        self.max_time = max(self.max_time, anot.max_time)
	        self.revs += anot.revs

	    def put_file(self, path, rev, state, markseq):
	        """Record one file revision as part of this changeset."""
	        entry = FileRevision(path, rev, state, markseq)
	        self.revs.append(entry)


	def _cmp2(a, b):
	_a = a is not None
	_b = b is not None
	return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b)


	class CvsConv:
	    """Walk a CVS repository and group its file revisions into changesets.

	    Attributes:
	        cvsroot: root directory of the CVS repository.
	        rcs: keyword expander handed to git_dump_file().
	        changesets: dict mapping every ChangeSetKey to itself, so that an
	            equal key can be looked up and merged.
	        dumpfile: when true, a blob is emitted through git_dump_file()
	            for every revision while parsing.
	        markseq: last mark number handed out.
	        tags: dict mapping a tag name to the changeset it points at.
	        fuzzsec: time window, in seconds, passed to every ChangeSetKey.
	    """

	    def __init__(self, cvsroot, rcs, dumpfile, fuzzsec):
	        self.cvsroot = cvsroot
	        self.rcs = rcs
	        self.changesets = dict()
	        self.dumpfile = dumpfile
	        self.markseq = 0
	        self.tags = dict()
	        self.fuzzsec = fuzzsec

	    def walk(self, module=None):
	        """Parse every ``,v`` file under cvsroot (or cvsroot/module).

	        Files and directories named ``.git`` are skipped with a message
	        on stderr.  Afterwards each known tag is attached to the changeset
	        it points at.
	        """
	        p = [self.cvsroot]
	        if module is not None:
	            p.append(module)
	        path = os.path.join(*p)

	        for root, dirs, files in os.walk(path):
	            if '.git' in dirs:
	                print('Ignore %s: cannot handle the path named \'.git\'' % (
	                    root + os.sep + '.git'), file=sys.stderr)
	                # pruning in place keeps os.walk() from descending into it
	                dirs.remove('.git')
	            if '.git' in files:
	                print('Ignore %s: cannot handle the path named \'.git\'' % (
	                    root + os.sep + '.git'), file=sys.stderr)
	                files.remove('.git')
	            for f in files:
	                if not f.endswith(',v'):
	                    continue
	                self.parse_file(root + os.sep + f)

	        for t, c in list(self.tags.items()):
	            # walk() runs once per module; do not register a tag twice on
	            # the same changeset
	            if t not in c.tags:
	                c.tags.append(t)

	    def parse_file(self, path):
	        """Parse one RCS file and add its revisions to the changesets.

	        Only trunk revisions and revisions on the 1.1.1 vendor branch are
	        considered.  Each accepted revision becomes a ChangeSetKey that is
	        merged with any equal key already present.  Raises whatever
	        building the key raises, after printing the file and revision to
	        stderr.
	        """
	        rtags = dict()
	        rcsfile = rcsparse.rcsfile(path)
	        branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
	        for k, v in list(rcsfile.symbols.items()):
	            r = v.split('.')
	            if len(r) == 3:
	                branches[v] = 'VENDOR'
	            elif len(r) >= 3 and r[-2] == '0':
	                # symbol of the form x.y.0.n names the branch x.y.n
	                branches['.'.join(r[:-2] + r[-1:])] = k
	            if len(r) == 2 and branches[r[0]] == 'HEAD':
	                # tag on a trunk revision: revision -> list of tag names
	                rtags.setdefault(v, []).append(k)

	        revs = rcsfile.revs.items()
	        # sort by revision descending to prioritize 1.1.1.1 over 1.1
	        revs = sorted(revs, key=lambda a: a[1][0], reverse=True)
	        # sort by time
	        revs = sorted(revs, key=lambda a: a[1][1])
	        novendor = False
	        have_initial_revision = False
	        last_vendor_status = None
	        for k, v in revs:
	            r = k.split('.')
	            if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
	                    and r[3] == '1':
	                if have_initial_revision:
	                    continue
	                if v[3] == 'dead':
	                    continue
	                last_vendor_status = v[3]
	                have_initial_revision = True
	            elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1':
	                if novendor:
	                    continue
	                last_vendor_status = v[3]
	            elif len(r) == 2:
	                if r[0] == '1' and r[1] == '1':
	                    if have_initial_revision:
	                        continue
	                    if v[3] == 'dead':
	                        continue
	                    have_initial_revision = True
	                elif r[0] == '1' and r[1] != '1':
	                    novendor = True
	                if last_vendor_status == 'dead' and v[3] == 'dead':
	                    last_vendor_status = None
	                    continue
	                last_vendor_status = None
	            else:
	                # trunk only
	                continue

	            if self.dumpfile:
	                self.markseq = self.markseq + 1
	                git_dump_file(path, k, self.rcs, self.markseq)

	            b = '.'.join(r[:-1])
	            try:
	                a = ChangeSetKey(
	                    branches[b], v[2], v[1], rcsfile.getlog(v[0]), v[6],
	                    self.fuzzsec)
	            except Exception:
	                print('Aborted at %s %s' % (path, v[0]), file=sys.stderr)
	                raise

	            a.put_file(path, k, v[3], self.markseq)
	            # merging widens the time range of the key, which may make it
	            # equal to yet another stored key; repeat until none matches
	            while a in self.changesets:
	                c = self.changesets[a]
	                del self.changesets[a]
	                c.merge(a)
	                a = c
	            self.changesets[a] = a
	            if k in rtags:
	                for t in rtags[k]:
	                    # a tag follows the newest changeset that carries it
	                    if t not in self.tags or \
	                            self.tags[t].max_time < a.max_time:
	                        self.tags[t] = a


	def file_path(r, p):
	    """Return the repository-relative working path for the RCS file *p*.

	    r is the cvsroot (one trailing slash is ignored) and p the path of a
	    ``,v`` file.  The ``,v`` suffix is dropped, an ``Attic`` directory
	    directly above the file is removed, and the cvsroot prefix is
	    stripped when p lies below it.
	    """
	    if r.endswith('/'):
	        r = r[:-1]
	    path = p[:-2]  # drop ",v"
	    parts = path.split('/')
	    # parts[-2] only exists when the path has a directory component
	    if len(parts) > 1 and parts[-2] == 'Attic':
	        path = '/'.join(parts[:-2] + [parts[-1]])
	    # match on a separator boundary so that a sibling directory sharing
	    # the same name prefix (e.g. "/cvs/srcfoo" vs "/cvs/src") is untouched
	    if path.startswith(r + '/'):
	        path = path[len(r) + 1:]
	    return path


	def git_dump_file(path, k, rcs, markseq):
	    """Emit revision *k* of the RCS file *path* as a fast-import blob.

	    The file content comes from ``rcs.expand_keyword()``, which receives
	    the file name, the parsed RCS file and the revision.  The blob is
	    written through output() with mark number *markseq*.  On RuntimeError
	    a diagnostic is printed to stderr and the process exits with status 1.
	    """
	    try:
	        cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k)
	    except RuntimeError as msg:
	        print('Unexpected runtime error on parsing',
	              path, k, ':', msg, file=sys.stderr)
	        print('unlimit the resource limit may fix this problem.',
	              file=sys.stderr)
	        sys.exit(1)
	    output('blob')
	    output('mark :%d' % markseq)
	    output('data', len(cont))
	    output(cont)


	class RcsKeywords:
	    """Expand RCS keywords ($Id$, $Date$, $Log$, ...) in checked-out text.

	    ``rcs_expkw`` maps each keyword name (bytes) to a bit mask of
	    ``RCS_KW_*`` fields making up its value.  Every instance works on its
	    own copy of that table, so add_id_keyword() on one instance does not
	    affect the others.
	    """

	    # fields a keyword value can be built from
	    RCS_KW_AUTHOR = (1 << 0)
	    RCS_KW_DATE = (1 << 1)
	    RCS_KW_LOG = (1 << 2)
	    RCS_KW_NAME = (1 << 3)
	    RCS_KW_RCSFILE = (1 << 4)
	    RCS_KW_REVISION = (1 << 5)
	    RCS_KW_SOURCE = (1 << 6)
	    RCS_KW_STATE = (1 << 7)
	    RCS_KW_FULLPATH = (1 << 8)
	    RCS_KW_MDOCDATE = (1 << 9)
	    RCS_KW_LOCKER = (1 << 10)

	    RCS_KW_ID = (RCS_KW_RCSFILE | RCS_KW_REVISION | RCS_KW_DATE |
	                 RCS_KW_AUTHOR | RCS_KW_STATE)
	    RCS_KW_HEADER = (RCS_KW_ID | RCS_KW_FULLPATH)

	    rcs_expkw = {
	        b"Author": RCS_KW_AUTHOR,
	        b"Date": RCS_KW_DATE,
	        b"Header": RCS_KW_HEADER,
	        b"Id": RCS_KW_ID,
	        b"Log": RCS_KW_LOG,
	        b"Name": RCS_KW_NAME,
	        b"RCSfile": RCS_KW_RCSFILE,
	        b"Revision": RCS_KW_REVISION,
	        b"Source": RCS_KW_SOURCE,
	        b"State": RCS_KW_STATE,
	        b"Mdocdate": RCS_KW_MDOCDATE,
	        b"Locker": RCS_KW_LOCKER
	    }

	    # keyword expansion modes, as returned by kflag_get()
	    RCS_KWEXP_NONE = (1 << 0)
	    RCS_KWEXP_NAME = (1 << 1)  # include keyword name
	    RCS_KWEXP_VAL = (1 << 2)  # include keyword value
	    RCS_KWEXP_LKR = (1 << 3)  # include name of locker
	    RCS_KWEXP_OLD = (1 << 4)  # generate old keyword string
	    RCS_KWEXP_ERR = (1 << 5)  # mode has an error
	    RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL)
	    RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR)

	    def __init__(self):
	        # private copy of the class-level table: add_id_keyword() must not
	        # leak keywords into other instances
	        self.rcs_expkw = dict(self.rcs_expkw)
	        self.rerecomple()

	    def rerecomple(self):
	        """(Re)build ``re_kw`` from the current keyword table."""
	        # keywords are escaped because add_id_keyword() accepts any text
	        pat = b'|'.join(re.escape(kw) for kw in self.rcs_expkw)
	        self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]")

	    def add_id_keyword(self, keyword):
	        """Register *keyword* (ASCII str) as an alias expanding like Id."""
	        self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID
	        self.rerecomple()

	    def kflag_get(self, flags):
	        """Translate a keyword substitution flag string into a bit mask.

	        Returns RCS_KWEXP_DEFAULT when *flags* is None.  Unknown letters,
	        and 'o' or 'b' combined with other letters, set RCS_KWEXP_ERR.
	        """
	        if flags is None:
	            return self.RCS_KWEXP_DEFAULT
	        fl = 0
	        for fc in flags:
	            if fc == 'k':
	                fl |= self.RCS_KWEXP_NAME
	            elif fc == 'v':
	                fl |= self.RCS_KWEXP_VAL
	            elif fc == 'l':
	                fl |= self.RCS_KWEXP_LKR
	            elif fc == 'o':
	                if len(flags) != 1:
	                    fl |= self.RCS_KWEXP_ERR
	                fl |= self.RCS_KWEXP_OLD
	            elif fc == 'b':
	                if len(flags) != 1:
	                    fl |= self.RCS_KWEXP_ERR
	                fl |= self.RCS_KWEXP_NONE
	            else:
	                fl |= self.RCS_KWEXP_ERR
	        return fl

	    def expand_keyword(self, filename, rcs, r):
	        """Return revision *r* of *rcs* as bytes with RCS keywords expanded.

	        filename is the RCS file name used for the file-name fields, rcs
	        the parsed RCS file (providing ``revs``, ``expand``, ``checkout``
	        and ``getlog``) and r a key of ``rcs.revs``.  When the expansion
	        mode of the file is 'b' or 'o' the checkout is returned untouched.
	        """
	        rev = rcs.revs[r]

	        mode = self.kflag_get(rcs.expand)
	        if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0:
	            return rcs.checkout(rev[0])

	        ret = []
	        for line in rcs.checkout(rev[0]).split(b'\n'):
	            logbuf = None
	            m = self.re_kw.match(line)
	            if m is None:
	                # No RCS Keywords, use it as it is
	                ret += [line]
	                continue

	            line0 = b''
	            while m is not None:
	                try:
	                    # position of the '$' closing the keyword
	                    dsign = m.end(1) + line[m.end(1):].index(b'$')
	                except ValueError:
	                    break
	                # text before the opening '$'
	                prefix = line[:m.start(1) - 1]
	                line = line[dsign + 1:]
	                line0 += prefix
	                expbuf = ''
	                if (mode & self.RCS_KWEXP_NAME) != 0:
	                    expbuf += '$'
	                    expbuf += m.group(1).decode('ascii')
	                    if (mode & self.RCS_KWEXP_VAL) != 0:
	                        expbuf += ': '
	                if (mode & self.RCS_KWEXP_VAL) != 0:
	                    expkw = self.rcs_expkw[m.group(1)]
	                    if (expkw & self.RCS_KW_RCSFILE) != 0:
	                        expbuf += filename \
	                            if (expkw & self.RCS_KW_FULLPATH) != 0 \
	                            else os.path.basename(filename)
	                        expbuf += " "
	                    if (expkw & self.RCS_KW_REVISION) != 0:
	                        expbuf += rev[0]
	                        expbuf += " "
	                    if (expkw & self.RCS_KW_DATE) != 0:
	                        expbuf += time.strftime(
	                            "%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1]))
	                    if (expkw & self.RCS_KW_MDOCDATE) != 0:
	                        d = time.gmtime(rev[1])
	                        expbuf += time.strftime(
	                            "%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d)
	                    if (expkw & self.RCS_KW_AUTHOR) != 0:
	                        expbuf += rev[2]
	                        expbuf += " "
	                    if (expkw & self.RCS_KW_STATE) != 0:
	                        expbuf += rev[3]
	                        expbuf += " "
	                    if (expkw & self.RCS_KW_LOG) != 0:
	                        # the log text goes on extra lines after this one,
	                        # each starting with the text that preceded $Log$
	                        p = prefix
	                        expbuf += filename \
	                            if (expkw & self.RCS_KW_FULLPATH) != 0 \
	                            else os.path.basename(filename)
	                        expbuf += " "
	                        # NOTE(review): the spacing inside this literal may
	                        # have been collapsed in transit -- confirm against
	                        # the upstream file
	                        logbuf = p + (
	                            'Revision %s %s %s\n' % (
	                                rev[0], time.strftime(
	                                    "%Y/%m/%d %H:%M:%S", time.gmtime(rev[1])),
	                                rev[2])).encode('ascii')
	                        for lline in rcs.getlog(rev[0]).rstrip().split(b'\n'):
	                            if len(lline) == 0:
	                                logbuf += p.rstrip() + b'\n'
	                            else:
	                                logbuf += p + lline.lstrip() + b'\n'
	                        if len(line) == 0:
	                            logbuf += p.rstrip()
	                        else:
	                            logbuf += p + line.lstrip()
	                        line = b''
	                    if (expkw & self.RCS_KW_SOURCE) != 0:
	                        expbuf += filename
	                        expbuf += " "
	                    if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0:
	                        expbuf += " "
	                if (mode & self.RCS_KWEXP_NAME) != 0:
	                    expbuf += '$'
	                # the expanded keyword is capped at 255 characters
	                line0 += expbuf[:255].encode('ascii')
	                m = self.re_kw.match(line)

	            ret += [line0 + line]
	            if logbuf is not None:
	                ret += [logbuf]
	        return b'\n'.join(ret)


	# ----------------------------------------------------------------------
	# entry point: run main() only when this file is executed as a script,
	# not when it is imported as a module
	# ----------------------------------------------------------------------
	if __name__ == '__main__':
	    main()

File Metadata

Mime Type: text/x-diff
Expires: Mon, Aug 18, 11:55 PM (2 w, 1 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3256058

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions