Differential D6598 Diff 23979 swh/loader/cvs/cvs2gitdump/cvs2gitdump.py

Changeset View

Standalone View

swh/loader/cvs/cvs2gitdump/cvs2gitdump.py

Show All 30 Lines
#		#

import getopt		import getopt
import os		import os
import re		import re
import subprocess		import subprocess
import sys		import sys
import time		import time
		from typing import Dict, List, Optional, Tuple, TypeVar

import swh.loader.cvs.rcsparse as rcsparse		import swh.loader.cvs.rcsparse as rcsparse

CHANGESET_FUZZ_SEC = 300		CHANGESET_FUZZ_SEC = 300


def usage() -> None:
    """Print the cvs2gitdump command-line synopsis to standard error."""
    print('usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] '
          '[-E log_encodings]\n'
          '\t[-k rcs_keywords] [-b branch] [-m module] [-l last_revision]\n'
          '\tcvsroot [git_dir]', file=sys.stderr)


def main():		def main() -> None:
email_domain = None		email_domain = None
do_incremental = False		do_incremental = False
git_tip = None		git_tip = None
git_branch = 'master'		git_branch = 'master'
dump_all = False		dump_all = False
log_encoding = 'utf-8,iso-8859-1'		log_encoding = 'utf-8,iso-8859-1'
rcs = RcsKeywords()		rcs = RcsKeywords()
modules = []		modules = []
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	def main() -> None:

if len(args) == 2:		if len(args) == 2:
do_incremental = True		do_incremental = True
git = subprocess.Popen(		git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',		['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',		'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', git_branch],		'--date=raw', '--format=%ae%n%ad%n%H', git_branch],
encoding='utf-8', stdout=subprocess.PIPE)		encoding='utf-8', stdout=subprocess.PIPE)
		assert git.stdout is not None
outs = git.stdout.readlines()		outs = git.stdout.readlines()
git.wait()		git.wait()
if git.returncode != 0:		if git.returncode != 0:
print("Couldn't exec git", file=sys.stderr)		print("Couldn't exec git", file=sys.stderr)
sys.exit(git.returncode)		sys.exit(git.returncode)
git_tip = outs[2].strip()		git_tip = outs[2].strip()

if last_revision is not None:		if last_revision is not None:
git = subprocess.Popen(		git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',		['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',		'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', last_revision],		'--date=raw', '--format=%ae%n%ad%n%H', last_revision],
encoding='utf-8', stdout=subprocess.PIPE)		encoding='utf-8', stdout=subprocess.PIPE)
		assert git.stdout is not None
outs = git.stdout.readlines()		outs = git.stdout.readlines()
git.wait()		git.wait()
if git.returncode != 0:		if git.returncode != 0:
print("Coundn't exec git", file=sys.stderr)		print("Coundn't exec git", file=sys.stderr)
sys.exit(git.returncode)		sys.exit(git.returncode)
last_author = outs[0].strip()		last_author = outs[0].strip()
last_ctime = float(outs[1].split()[0])		last_ctime = float(outs[1].split()[0])

▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	for k in changesets:
else:		else:
markseq = markseq + 1		markseq = markseq + 1
git_dump_file(f.path, f.rev, rcs, markseq)		git_dump_file(f.path, f.rev, rcs, markseq)
marks[markseq] = f		marks[markseq] = f
log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev)		log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev)
for i, e in enumerate(log_encodings):		for i, e in enumerate(log_encodings):
try:		try:
how = 'ignore' if i == len(log_encodings) - 1 else 'strict'		how = 'ignore' if i == len(log_encodings) - 1 else 'strict'
log = log.decode(e, how)		log_str = log.decode(e, how)
break		break
except UnicodeError:		except UnicodeError:
pass		pass
log = log.encode('utf-8', 'ignore')		log = log_str.encode('utf-8', 'ignore')

output('commit refs/heads/' + git_branch)		output('commit refs/heads/' + git_branch)
markseq = markseq + 1		markseq = markseq + 1
output('mark :%d' % (markseq))		output('mark :%d' % (markseq))
email = k.author if email_domain is None \		email = k.author if email_domain is None \
else k.author + '@' + email_domain		else k.author + '@' + email_domain
output('author %s <%s> %d +0000' % (k.author, email, k.min_time))		output('author %s <%s> %d +0000' % (k.author, email, k.min_time))
output('committer %s <%s> %d +0000' % (k.author, email, k.min_time))		output('committer %s <%s> %d +0000' % (k.author, email, k.min_time))
Show All 26 Lines	def main() -> None:
print('** dumped', file=sys.stderr)		print('** dumped', file=sys.stderr)


#
# String arguments are always encoded as UTF-8, because that is the encoding
# git-fast-import expects.  A single bytes object is written without any
# conversion, since file bodies may be in various encodings.
#
def output(*args, end='\n') -> None:
    """Write *args* followed by *end* to the binary layer of ``sys.stdout``.

    * No arguments: only *end* is written.
    * Several arguments, or a single ``str``: every argument is converted
      with ``str()``, the pieces are joined with one space and the result is
      encoded as UTF-8.
    * A single non-``str`` argument: it is handed to
      ``sys.stdout.buffer.write`` unchanged, so it must be bytes-like.

    *end* is encoded as UTF-8 and written last unless it is empty.
    """
    if len(args) > 1 or (args and isinstance(args[0], str)):
        # str() leaves str objects untouched, so no per-item type check
        # is needed before joining.
        text = ' '.join(str(arg) for arg in args)
        sys.stdout.buffer.write(text.encode('utf-8'))
    elif args:
        # Lone non-str argument: raw bytes, written without conversion.
        sys.stdout.buffer.write(args[0])
    if end:
        sys.stdout.buffer.write(end.encode('utf-8'))


class FileRevision:
    """Plain record describing one revision of one file in a changeset.

    All four constructor arguments are stored unchanged as attributes of
    the same name.
    """

    def __init__(self, path: str, rev: str, state: str, markseq: int) -> None:
        # Path of the file this revision belongs to.
        self.path = path
        # Revision identifier within that file.
        self.rev = rev
        # Revision state string -- presumably the RCS state (e.g. dead or
        # alive); confirm against the caller that builds these records.
        self.state = state
        # Mark sequence number associated with this revision.
        self.markseq = markseq


class ChangeSetKey:		class ChangeSetKey:
def __init__(self, branch, author, timestamp, log, commitid, fuzzsec):		def __init__(
		self,
		branch: str,
		author,
		timestamp: int,
		log: bytes,
		commitid: Optional[str],
		fuzzsec: int
		) -> None:
self.branch = branch		self.branch = branch
self.author = author		self.author = author
self.min_time = timestamp		self.min_time = timestamp
self.max_time = timestamp		self.max_time = timestamp
self.commitid = commitid		self.commitid = commitid
self.fuzzsec = fuzzsec		self.fuzzsec = fuzzsec
self.revs = []		self.revs: List[FileRevision] = []
self.tags = []		self.tags: List[str] = []
self.log_hash = 0		self.log_hash = 0
h = 0		h = 0
for c in log:		for c in log:
h = 31 * h + c		h = 31 * h + c
self.log_hash = h		self.log_hash = h

def __lt__(self, other):		def __lt__(self, other) -> bool:
return self._cmp(other) < 0		return self._cmp(other) < 0

def __gt__(self, other):		def __gt__(self, other) -> bool:
return self._cmp(other) > 0		return self._cmp(other) > 0

def __eq__(self, other):		def __eq__(self, other) -> bool:
return self._cmp(other) == 0		return self._cmp(other) == 0

def __le__(self, other):		def __le__(self, other) -> bool:
return self._cmp(other) <= 0		return self._cmp(other) <= 0

def __ge__(self, other):		def __ge__(self, other) -> bool:
return self._cmp(other) >= 0		return self._cmp(other) >= 0

def __ne__(self, other):		def __ne__(self, other) -> bool:
return self._cmp(other) != 0		return self._cmp(other) != 0

def _cmp(self, anon):		def _cmp(self, anon) -> int:
		if not isinstance(anon, ChangeSetKey):
		raise TypeError()
# compare by the commitid		# compare by the commitid
cid = _cmp2(self.commitid, anon.commitid)		cid = _cmp2(self.commitid, anon.commitid)
if cid == 0 and self.commitid is not None:		if cid == 0 and self.commitid is not None:
# both have commitid and they are same		# both have commitid and they are same
return 0		return 0

# compare by the time		# compare by the time
ma = anon.min_time - self.max_time		ma = anon.min_time - self.max_time
Show All 12 Lines	def _cmp(self, anon) -> int:
c = _cmp2(self.branch, anon.branch)		c = _cmp2(self.branch, anon.branch)
if c == 0:		if c == 0:
c = _cmp2(self.author, anon.author)		c = _cmp2(self.author, anon.author)
if c == 0:		if c == 0:
return 0		return 0

return ct if ct != 0 else c		return ct if ct != 0 else c

def merge(self, anot):		def merge(self, anot: "ChangeSetKey") -> None:
self.max_time = max(self.max_time, anot.max_time)		self.max_time = max(self.max_time, anot.max_time)
self.min_time = min(self.min_time, anot.min_time)		self.min_time = min(self.min_time, anot.min_time)
self.revs.extend(anot.revs)		self.revs.extend(anot.revs)

def __hash__(self):		def __hash__(self) -> int:
return hash(self.branch + '/' + self.author) * 31 + self.log_hash		return hash(self.branch + '/' + self.author) * 31 + self.log_hash

def put_file(self, path, rev, state, markseq):		def put_file(self, path: str, rev: str, state: str, markseq: int):
self.revs.append(FileRevision(path, rev, state, markseq))		self.revs.append(FileRevision(path, rev, state, markseq))


def _cmp2(a, b):		TCmp = TypeVar("TCmp", int, str)
		def _cmp2(a: Optional[TCmp], b: Optional[TCmp]) -> int:
_a = a is not None		_a = a is not None
_b = b is not None		_b = b is not None
return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b)		return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b) # type: ignore


class CvsConv:		class CvsConv:
def __init__(self, cvsroot, rcs, dumpfile, fuzzsec):		def __init__(self, cvsroot: str, rcs: "RcsKeywords", dumpfile: bool, fuzzsec: int) -> None:
self.cvsroot = cvsroot		self.cvsroot = cvsroot
self.rcs = rcs		self.rcs = rcs
self.changesets = dict()		self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict()
self.dumpfile = dumpfile		self.dumpfile = dumpfile
self.markseq = 0		self.markseq = 0
self.tags = dict()		self.tags: Dict[str, ChangeSetKey] = dict()
self.fuzzsec = fuzzsec		self.fuzzsec = fuzzsec

def walk(self, module=None):		def walk(self, module: Optional[str] =None) -> None:
p = [self.cvsroot]		p = [self.cvsroot]
if module is not None:		if module is not None:
p.append(module)		p.append(module)
path = os.path.join(*p)		path = os.path.join(*p)

for root, dirs, files in os.walk(path):		for root, dirs, files in os.walk(path):
if '.git' in dirs:		if '.git' in dirs:
print('Ignore %s: cannot handle the path named \'.git\'' % (		print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)		root + os.sep + '.git'), file=sys.stderr)
dirs.remove('.git')		dirs.remove('.git')
if '.git' in files:		if '.git' in files:
print('Ignore %s: cannot handle the path named \'.git\'' % (		print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)		root + os.sep + '.git'), file=sys.stderr)
files.remove('.git')		files.remove('.git')
for f in files:		for f in files:
if not f[-2:] == ',v':		if not f[-2:] == ',v':
continue		continue
self.parse_file(root + os.sep + f)		self.parse_file(root + os.sep + f)

for t, c in list(self.tags.items()):		for t, c in list(self.tags.items()):
c.tags.append(t)		c.tags.append(t)

def parse_file(self, path):		def parse_file(self, path: str) -> None:
rtags = dict()		rtags: Dict[str, List[str]] = dict()
rcsfile = rcsparse.rcsfile(path)		rcsfile = rcsparse.rcsfile(path)
branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}		branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
for k, v in list(rcsfile.symbols.items()):		for k, v_ in list(rcsfile.symbols.items()):
r = v.split('.')		r = v_.split('.')
if len(r) == 3:		if len(r) == 3:
branches[v] = 'VENDOR'		branches[v_] = 'VENDOR'
elif len(r) >= 3 and r[-2] == '0':		elif len(r) >= 3 and r[-2] == '0':
branches['.'.join(r[:-2] + r[-1:])] = k		branches['.'.join(r[:-2] + r[-1:])] = k
if len(r) == 2 and branches[r[0]] == 'HEAD':		if len(r) == 2 and branches[r[0]] == 'HEAD':
if v not in rtags:		if v_ not in rtags:
rtags[v] = list()		rtags[v_] = list()
rtags[v].append(k)		rtags[v_].append(k)

revs = rcsfile.revs.items()		revs: List[Tuple[str, Tuple[str, int, str, str, List[str], str, str]]] = list(rcsfile.revs.items())
# sort by revision descending to priorize 1.1.1.1 than 1.1		# sort by revision descending to priorize 1.1.1.1 than 1.1
revs = sorted(revs, key=lambda a: a[1][0], reverse=True)		revs.sort(key=lambda a: a[1][0], reverse=True)
# sort by time		# sort by time
revs = sorted(revs, key=lambda a: a[1][1])		revs.sort(key=lambda a: a[1][1])
novendor = False		novendor = False
have_initial_revision = False		have_initial_revision = False
last_vendor_status = None		last_vendor_status = None
for k, v in revs:		for k, v in revs:
r = k.split('.')		r = k.split('.')
if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \		if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
and r[3] == '1':		and r[3] == '1':
if have_initial_revision:		if have_initial_revision:
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	def parse_file(self, path: str) -> None:
self.changesets[a] = a		self.changesets[a] = a
if k in rtags:		if k in rtags:
for t in rtags[k]:		for t in rtags[k]:
if t not in self.tags or \		if t not in self.tags or \
self.tags[t].max_time < a.max_time:		self.tags[t].max_time < a.max_time:
self.tags[t] = a		self.tags[t] = a


def file_path(r: str, p: str) -> str:
    """Turn the path of an RCS file into a path relative to the root *r*.

    The following steps are applied in order:

    1. one trailing ``/`` is removed from *r*;
    2. a trailing ``,v`` suffix is removed from *p*;
    3. if the parent directory of the file is named ``Attic``, that
       directory component is dropped;
    4. if the result lies under *r*, the root and the separator that
       follows it are removed.

    :param r: root directory to strip from the front of the path.
    :param p: path of the RCS file, using ``/`` as separator.
    :returns: the cleaned-up path.
    """
    if r.endswith('/'):
        r = r[:-1]
    path = p[:-2] if p.endswith(',v') else p

    parts = path.split('/')
    # A path without any '/' has no parent directory to inspect; indexing
    # parts[-2] unconditionally would raise IndexError for it.
    if len(parts) > 1 and parts[-2] == 'Attic':
        path = '/'.join(parts[:-2] + [parts[-1]])

    # Strip the root only on a directory boundary, so that a sibling such
    # as "/cvsroot2" is not truncated when the root is "/cvs".
    if path == r or path.startswith(r + '/'):
        path = path[len(r) + 1:]
    return path


def git_dump_file(path, k, rcs, markseq):		def git_dump_file(path: str, k, rcs, markseq) -> None:
try:		try:
cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k)		cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k)
except RuntimeError as msg:		except RuntimeError as msg:
print('Unexpected runtime error on parsing',		print('Unexpected runtime error on parsing',
path, k, ':', msg, file=sys.stderr)		path, k, ':', msg, file=sys.stderr)
print('unlimit the resource limit may fix this problem.',		print('unlimit the resource limit may fix this problem.',
file=sys.stderr)		file=sys.stderr)
sys.exit(1)		sys.exit(1)
Show All 39 Lines	class RcsKeywords:
RCS_KWEXP_NAME = (1 << 1) # include keyword name		RCS_KWEXP_NAME = (1 << 1) # include keyword name
RCS_KWEXP_VAL = (1 << 2) # include keyword value		RCS_KWEXP_VAL = (1 << 2) # include keyword value
RCS_KWEXP_LKR = (1 << 3) # include name of locker		RCS_KWEXP_LKR = (1 << 3) # include name of locker
RCS_KWEXP_OLD = (1 << 4) # generate old keyword string		RCS_KWEXP_OLD = (1 << 4) # generate old keyword string
RCS_KWEXP_ERR = (1 << 5) # mode has an error		RCS_KWEXP_ERR = (1 << 5) # mode has an error
RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME \| RCS_KWEXP_VAL)		RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME \| RCS_KWEXP_VAL)
RCS_KWEXP_KVL = (RCS_KWEXP_NAME \| RCS_KWEXP_VAL \| RCS_KWEXP_LKR)		RCS_KWEXP_KVL = (RCS_KWEXP_NAME \| RCS_KWEXP_VAL \| RCS_KWEXP_LKR)

def __init__(self):		def __init__(self) -> None:
self.rerecomple()		self.rerecomple()

def rerecomple(self) -> None:
    """Recompile ``self.re_kw`` from the current keys of ``self.rcs_expkw``.

    The resulting bytes pattern lazily skips leading text, then matches a
    ``$``, one of the known keyword names (captured as group 1) and a
    following ``$`` or ``:``.
    """
    # The keyword names must be joined with a bare '|' so that the regex
    # treats them as alternatives rather than as one literal string.
    pat = b'|'.join(self.rcs_expkw)
    self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]")

def add_id_keyword(self, keyword) -> None:
    """Register *keyword* as an additional keyword mapped to ``RCS_KW_ID``.

    The name is stored ASCII-encoded in ``self.rcs_expkw`` and the keyword
    regular expression is rebuilt so the new name is recognised.

    :param keyword: keyword name as ``str``; must be ASCII-encodable.
    """
    self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID
    self.rerecomple()

def kflag_get(self, flags: Optional[str]) -> int:
    """Translate a keyword-expansion flag string into an RCS_KWEXP_* bitmask.

    :param flags: flag characters, or ``None`` to get the default mode.
    :returns: ``RCS_KWEXP_DEFAULT`` when *flags* is ``None``; otherwise the
        bitwise OR of the bits selected by each character.
        ``RCS_KWEXP_ERR`` is included when a character is unknown, or when
        ``o`` or ``b`` is combined with any other character.
    """
    if flags is None:
        return self.RCS_KWEXP_DEFAULT

    # Flags that may be freely combined with each other.
    combinable = {
        'k': self.RCS_KWEXP_NAME,
        'v': self.RCS_KWEXP_VAL,
        'l': self.RCS_KWEXP_LKR,
    }
    # Flags that are only valid when they are the whole flag string.
    exclusive = {
        'o': self.RCS_KWEXP_OLD,
        'b': self.RCS_KWEXP_NONE,
    }

    fl = 0
    for fc in flags:
        if fc in combinable:
            fl |= combinable[fc]
        elif fc in exclusive:
            if len(flags) != 1:
                fl |= self.RCS_KWEXP_ERR
            fl |= exclusive[fc]
        else:
            fl |= self.RCS_KWEXP_ERR
    return fl

def expand_keyword(self, filename, rcs, r):		def expand_keyword(self, filename: str, rcs: rcsparse.rcsfile, r: str) -> bytes:
rev = rcs.revs[r]		rev = rcs.revs[r]

mode = self.kflag_get(rcs.expand)		mode = self.kflag_get(rcs.expand)
if (mode & (self.RCS_KWEXP_NONE \| self.RCS_KWEXP_OLD)) != 0:		if (mode & (self.RCS_KWEXP_NONE \| self.RCS_KWEXP_OLD)) != 0:
return rcs.checkout(rev[0])		return rcs.checkout(rev[0])

ret = []		ret = []
for line in rcs.checkout(rev[0]).split(b'\n'):		for line in rcs.checkout(rev[0]).split(b'\n'):
▲ Show 20 Lines • Show All 87 Lines • Show Last 20 Lines