Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
Show All 30 Lines | |||||
# | # | ||||
import getopt | import getopt | ||||
import os | import os | ||||
import re | import re | ||||
import subprocess | import subprocess | ||||
import sys | import sys | ||||
import time | import time | ||||
from typing import Dict, List, Optional, Tuple, TypeVar | |||||
import swh.loader.cvs.rcsparse as rcsparse | import swh.loader.cvs.rcsparse as rcsparse | ||||
CHANGESET_FUZZ_SEC = 300 | CHANGESET_FUZZ_SEC = 300 | ||||
def usage():
    """Print the command-line synopsis to standard error."""
    synopsis = (
        'usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] '
        '[-E log_encodings]\n'
        '\t[-k rcs_keywords] [-b branch] [-m module] [-l last_revision]\n'
        '\tcvsroot [git_dir]'
    )
    print(synopsis, file=sys.stderr)
def main(): | def main() -> None: | ||||
email_domain = None | email_domain = None | ||||
do_incremental = False | do_incremental = False | ||||
git_tip = None | git_tip = None | ||||
git_branch = 'master' | git_branch = 'master' | ||||
dump_all = False | dump_all = False | ||||
log_encoding = 'utf-8,iso-8859-1' | log_encoding = 'utf-8,iso-8859-1' | ||||
rcs = RcsKeywords() | rcs = RcsKeywords() | ||||
modules = [] | modules = [] | ||||
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines | def main() -> None: | ||||
if len(args) == 2: | if len(args) == 2: | ||||
do_incremental = True | do_incremental = True | ||||
git = subprocess.Popen( | git = subprocess.Popen( | ||||
['git', '--git-dir=' + args[1], '-c', | ['git', '--git-dir=' + args[1], '-c', | ||||
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', | 'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', | ||||
'--date=raw', '--format=%ae%n%ad%n%H', git_branch], | '--date=raw', '--format=%ae%n%ad%n%H', git_branch], | ||||
encoding='utf-8', stdout=subprocess.PIPE) | encoding='utf-8', stdout=subprocess.PIPE) | ||||
assert git.stdout is not None | |||||
outs = git.stdout.readlines() | outs = git.stdout.readlines() | ||||
git.wait() | git.wait() | ||||
if git.returncode != 0: | if git.returncode != 0: | ||||
print("Couldn't exec git", file=sys.stderr) | print("Couldn't exec git", file=sys.stderr) | ||||
sys.exit(git.returncode) | sys.exit(git.returncode) | ||||
git_tip = outs[2].strip() | git_tip = outs[2].strip() | ||||
if last_revision is not None: | if last_revision is not None: | ||||
git = subprocess.Popen( | git = subprocess.Popen( | ||||
['git', '--git-dir=' + args[1], '-c', | ['git', '--git-dir=' + args[1], '-c', | ||||
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', | 'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', | ||||
'--date=raw', '--format=%ae%n%ad%n%H', last_revision], | '--date=raw', '--format=%ae%n%ad%n%H', last_revision], | ||||
encoding='utf-8', stdout=subprocess.PIPE) | encoding='utf-8', stdout=subprocess.PIPE) | ||||
assert git.stdout is not None | |||||
outs = git.stdout.readlines() | outs = git.stdout.readlines() | ||||
git.wait() | git.wait() | ||||
if git.returncode != 0: | if git.returncode != 0: | ||||
print("Coundn't exec git", file=sys.stderr) | print("Coundn't exec git", file=sys.stderr) | ||||
sys.exit(git.returncode) | sys.exit(git.returncode) | ||||
last_author = outs[0].strip() | last_author = outs[0].strip() | ||||
last_ctime = float(outs[1].split()[0]) | last_ctime = float(outs[1].split()[0]) | ||||
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines | for k in changesets: | ||||
else: | else: | ||||
markseq = markseq + 1 | markseq = markseq + 1 | ||||
git_dump_file(f.path, f.rev, rcs, markseq) | git_dump_file(f.path, f.rev, rcs, markseq) | ||||
marks[markseq] = f | marks[markseq] = f | ||||
log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev) | log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev) | ||||
for i, e in enumerate(log_encodings): | for i, e in enumerate(log_encodings): | ||||
try: | try: | ||||
how = 'ignore' if i == len(log_encodings) - 1 else 'strict' | how = 'ignore' if i == len(log_encodings) - 1 else 'strict' | ||||
log = log.decode(e, how) | log_str = log.decode(e, how) | ||||
break | break | ||||
except UnicodeError: | except UnicodeError: | ||||
pass | pass | ||||
log = log.encode('utf-8', 'ignore') | log = log_str.encode('utf-8', 'ignore') | ||||
output('commit refs/heads/' + git_branch) | output('commit refs/heads/' + git_branch) | ||||
markseq = markseq + 1 | markseq = markseq + 1 | ||||
output('mark :%d' % (markseq)) | output('mark :%d' % (markseq)) | ||||
email = k.author if email_domain is None \ | email = k.author if email_domain is None \ | ||||
else k.author + '@' + email_domain | else k.author + '@' + email_domain | ||||
output('author %s <%s> %d +0000' % (k.author, email, k.min_time)) | output('author %s <%s> %d +0000' % (k.author, email, k.min_time)) | ||||
output('committer %s <%s> %d +0000' % (k.author, email, k.min_time)) | output('committer %s <%s> %d +0000' % (k.author, email, k.min_time)) | ||||
Show All 26 Lines | def main() -> None: | ||||
print('** dumped', file=sys.stderr) | print('** dumped', file=sys.stderr) | ||||
# | # | ||||
# Encode by UTF-8 always for string objects since encoding for git-fast-import | # Encode by UTF-8 always for string objects since encoding for git-fast-import | ||||
# is UTF-8. Also write without conversion for a bytes object (file bodies | # is UTF-8. Also write without conversion for a bytes object (file bodies | ||||
# might be various encodings) | # might be various encodings) | ||||
# | # | ||||
def output(*args, end='\n') -> None:
    """Write *args* followed by *end* to the binary standard output.

    A single non-``str`` argument is written as-is, so it must be a
    bytes-like object.  Otherwise every argument is converted with
    ``str()`` where needed, the pieces are joined with one space and the
    result is encoded as UTF-8.  With no arguments only *end* is written.

    Args:
        *args: values to emit.
        end: text terminator, UTF-8 encoded; nothing is written when empty.
    """
    def emit(data) -> None:
        sys.stdout.buffer.write(data)

    if args:
        raw_passthrough = len(args) == 1 and not isinstance(args[0], str)
        if raw_passthrough:
            emit(args[0])
        else:
            pieces = []
            for arg in args:
                pieces.append(arg if isinstance(arg, str) else str(arg))
            emit(' '.join(pieces).encode('utf-8'))
    if len(end) > 0:
        emit(end.encode('utf-8'))
class FileRevision:
    """Plain record describing one revision of one RCS file.

    Attributes:
        path: path of the RCS file.
        rev: revision identifier.
        state: revision state string.
        markseq: mark sequence number associated with the revision.
    """

    def __init__(self, path: str, rev: str, state: str, markseq: int) -> None:
        self.path, self.rev = path, rev
        self.state, self.markseq = state, markseq
class ChangeSetKey: | class ChangeSetKey: | ||||
def __init__(self, branch, author, timestamp, log, commitid, fuzzsec): | def __init__( | ||||
self, | |||||
branch: str, | |||||
author, | |||||
timestamp: int, | |||||
log: bytes, | |||||
commitid: Optional[str], | |||||
fuzzsec: int | |||||
) -> None: | |||||
self.branch = branch | self.branch = branch | ||||
self.author = author | self.author = author | ||||
self.min_time = timestamp | self.min_time = timestamp | ||||
self.max_time = timestamp | self.max_time = timestamp | ||||
self.commitid = commitid | self.commitid = commitid | ||||
self.fuzzsec = fuzzsec | self.fuzzsec = fuzzsec | ||||
self.revs = [] | self.revs: List[FileRevision] = [] | ||||
self.tags = [] | self.tags: List[str] = [] | ||||
self.log_hash = 0 | self.log_hash = 0 | ||||
h = 0 | h = 0 | ||||
for c in log: | for c in log: | ||||
h = 31 * h + c | h = 31 * h + c | ||||
self.log_hash = h | self.log_hash = h | ||||
def __lt__(self, other): | def __lt__(self, other) -> bool: | ||||
return self._cmp(other) < 0 | return self._cmp(other) < 0 | ||||
def __gt__(self, other): | def __gt__(self, other) -> bool: | ||||
return self._cmp(other) > 0 | return self._cmp(other) > 0 | ||||
def __eq__(self, other): | def __eq__(self, other) -> bool: | ||||
return self._cmp(other) == 0 | return self._cmp(other) == 0 | ||||
def __le__(self, other): | def __le__(self, other) -> bool: | ||||
return self._cmp(other) <= 0 | return self._cmp(other) <= 0 | ||||
def __ge__(self, other): | def __ge__(self, other) -> bool: | ||||
return self._cmp(other) >= 0 | return self._cmp(other) >= 0 | ||||
def __ne__(self, other): | def __ne__(self, other) -> bool: | ||||
return self._cmp(other) != 0 | return self._cmp(other) != 0 | ||||
def _cmp(self, anon): | def _cmp(self, anon) -> int: | ||||
if not isinstance(anon, ChangeSetKey): | |||||
raise TypeError() | |||||
# compare by the commitid | # compare by the commitid | ||||
cid = _cmp2(self.commitid, anon.commitid) | cid = _cmp2(self.commitid, anon.commitid) | ||||
if cid == 0 and self.commitid is not None: | if cid == 0 and self.commitid is not None: | ||||
# both have commitid and they are same | # both have commitid and they are same | ||||
return 0 | return 0 | ||||
# compare by the time | # compare by the time | ||||
ma = anon.min_time - self.max_time | ma = anon.min_time - self.max_time | ||||
Show All 12 Lines | def _cmp(self, anon) -> int: | ||||
c = _cmp2(self.branch, anon.branch) | c = _cmp2(self.branch, anon.branch) | ||||
if c == 0: | if c == 0: | ||||
c = _cmp2(self.author, anon.author) | c = _cmp2(self.author, anon.author) | ||||
if c == 0: | if c == 0: | ||||
return 0 | return 0 | ||||
return ct if ct != 0 else c | return ct if ct != 0 else c | ||||
def merge(self, anot): | def merge(self, anot: "ChangeSetKey") -> None: | ||||
self.max_time = max(self.max_time, anot.max_time) | self.max_time = max(self.max_time, anot.max_time) | ||||
self.min_time = min(self.min_time, anot.min_time) | self.min_time = min(self.min_time, anot.min_time) | ||||
self.revs.extend(anot.revs) | self.revs.extend(anot.revs) | ||||
def __hash__(self): | def __hash__(self) -> int: | ||||
return hash(self.branch + '/' + self.author) * 31 + self.log_hash | return hash(self.branch + '/' + self.author) * 31 + self.log_hash | ||||
def put_file(self, path: str, rev: str, state: str, markseq: int):
    """Append a new FileRevision built from the arguments to ``self.revs``."""
    revision = FileRevision(path, rev, state, markseq)
    self.revs.append(revision)
def _cmp2(a, b): | TCmp = TypeVar("TCmp", int, str) | ||||
def _cmp2(a: Optional[TCmp], b: Optional[TCmp]) -> int: | |||||
_a = a is not None | _a = a is not None | ||||
_b = b is not None | _b = b is not None | ||||
return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b) | return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b) # type: ignore | ||||
class CvsConv: | class CvsConv: | ||||
def __init__(self, cvsroot, rcs, dumpfile, fuzzsec): | def __init__(self, cvsroot: str, rcs: "RcsKeywords", dumpfile: bool, fuzzsec: int) -> None: | ||||
self.cvsroot = cvsroot | self.cvsroot = cvsroot | ||||
self.rcs = rcs | self.rcs = rcs | ||||
self.changesets = dict() | self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict() | ||||
self.dumpfile = dumpfile | self.dumpfile = dumpfile | ||||
self.markseq = 0 | self.markseq = 0 | ||||
self.tags = dict() | self.tags: Dict[str, ChangeSetKey] = dict() | ||||
self.fuzzsec = fuzzsec | self.fuzzsec = fuzzsec | ||||
def walk(self, module=None): | def walk(self, module: Optional[str] =None) -> None: | ||||
p = [self.cvsroot] | p = [self.cvsroot] | ||||
if module is not None: | if module is not None: | ||||
p.append(module) | p.append(module) | ||||
path = os.path.join(*p) | path = os.path.join(*p) | ||||
for root, dirs, files in os.walk(path): | for root, dirs, files in os.walk(path): | ||||
if '.git' in dirs: | if '.git' in dirs: | ||||
print('Ignore %s: cannot handle the path named \'.git\'' % ( | print('Ignore %s: cannot handle the path named \'.git\'' % ( | ||||
root + os.sep + '.git'), file=sys.stderr) | root + os.sep + '.git'), file=sys.stderr) | ||||
dirs.remove('.git') | dirs.remove('.git') | ||||
if '.git' in files: | if '.git' in files: | ||||
print('Ignore %s: cannot handle the path named \'.git\'' % ( | print('Ignore %s: cannot handle the path named \'.git\'' % ( | ||||
root + os.sep + '.git'), file=sys.stderr) | root + os.sep + '.git'), file=sys.stderr) | ||||
files.remove('.git') | files.remove('.git') | ||||
for f in files: | for f in files: | ||||
if not f[-2:] == ',v': | if not f[-2:] == ',v': | ||||
continue | continue | ||||
self.parse_file(root + os.sep + f) | self.parse_file(root + os.sep + f) | ||||
for t, c in list(self.tags.items()): | for t, c in list(self.tags.items()): | ||||
c.tags.append(t) | c.tags.append(t) | ||||
def parse_file(self, path): | def parse_file(self, path: str) -> None: | ||||
rtags = dict() | rtags: Dict[str, List[str]] = dict() | ||||
rcsfile = rcsparse.rcsfile(path) | rcsfile = rcsparse.rcsfile(path) | ||||
branches = {'1': 'HEAD', '1.1.1': 'VENDOR'} | branches = {'1': 'HEAD', '1.1.1': 'VENDOR'} | ||||
for k, v in list(rcsfile.symbols.items()): | for k, v_ in list(rcsfile.symbols.items()): | ||||
r = v.split('.') | r = v_.split('.') | ||||
if len(r) == 3: | if len(r) == 3: | ||||
branches[v] = 'VENDOR' | branches[v_] = 'VENDOR' | ||||
elif len(r) >= 3 and r[-2] == '0': | elif len(r) >= 3 and r[-2] == '0': | ||||
branches['.'.join(r[:-2] + r[-1:])] = k | branches['.'.join(r[:-2] + r[-1:])] = k | ||||
if len(r) == 2 and branches[r[0]] == 'HEAD': | if len(r) == 2 and branches[r[0]] == 'HEAD': | ||||
if v not in rtags: | if v_ not in rtags: | ||||
rtags[v] = list() | rtags[v_] = list() | ||||
rtags[v].append(k) | rtags[v_].append(k) | ||||
revs = rcsfile.revs.items() | revs: List[Tuple[str, Tuple[str, int, str, str, List[str], str, str]]] = list(rcsfile.revs.items()) | ||||
# sort by revision descending to prioritize 1.1.1.1 over 1.1 | # sort by revision descending to prioritize 1.1.1.1 over 1.1 | ||||
revs = sorted(revs, key=lambda a: a[1][0], reverse=True) | revs.sort(key=lambda a: a[1][0], reverse=True) | ||||
# sort by time | # sort by time | ||||
revs = sorted(revs, key=lambda a: a[1][1]) | revs.sort(key=lambda a: a[1][1]) | ||||
novendor = False | novendor = False | ||||
have_initial_revision = False | have_initial_revision = False | ||||
last_vendor_status = None | last_vendor_status = None | ||||
for k, v in revs: | for k, v in revs: | ||||
r = k.split('.') | r = k.split('.') | ||||
if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \ | if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \ | ||||
and r[3] == '1': | and r[3] == '1': | ||||
if have_initial_revision: | if have_initial_revision: | ||||
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines | def parse_file(self, path: str) -> None: | ||||
self.changesets[a] = a | self.changesets[a] = a | ||||
if k in rtags: | if k in rtags: | ||||
for t in rtags[k]: | for t in rtags[k]: | ||||
if t not in self.tags or \ | if t not in self.tags or \ | ||||
self.tags[t].max_time < a.max_time: | self.tags[t].max_time < a.max_time: | ||||
self.tags[t] = a | self.tags[t] = a | ||||
def file_path(r: str, p: str) -> str:
    """Map an RCS file path to the corresponding working-tree path.

    The ``,v`` suffix is dropped, an ``Attic`` directory directly above the
    file is removed, and the root prefix *r* (plus the separator that
    follows it) is stripped when the path starts with it.

    Args:
        r: repository root; one trailing ``/`` is ignored.
        p: path of the RCS file, using ``/`` as separator.

    Returns:
        The converted path.
    """
    if r.endswith('/'):
        r = r[:-1]
    path = p[:-2] if p.endswith(',v') else p  # drop ",v"
    parts = path.split('/')
    # A parent component only exists when there are at least two parts;
    # indexing parts[-2] on a bare file name would raise IndexError.
    if len(parts) > 1 and parts[-2] == 'Attic':
        path = '/'.join(parts[:-2] + [parts[-1]])
    if path.startswith(r):
        path = path[len(r) + 1:]
    return path
def git_dump_file(path, k, rcs, markseq): | def git_dump_file(path: str, k, rcs, markseq) -> None: | ||||
try: | try: | ||||
cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k) | cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k) | ||||
except RuntimeError as msg: | except RuntimeError as msg: | ||||
print('Unexpected runtime error on parsing', | print('Unexpected runtime error on parsing', | ||||
path, k, ':', msg, file=sys.stderr) | path, k, ':', msg, file=sys.stderr) | ||||
print('unlimit the resource limit may fix this problem.', | print('unlimit the resource limit may fix this problem.', | ||||
file=sys.stderr) | file=sys.stderr) | ||||
sys.exit(1) | sys.exit(1) | ||||
Show All 39 Lines | class RcsKeywords: | ||||
RCS_KWEXP_NAME = (1 << 1) # include keyword name | RCS_KWEXP_NAME = (1 << 1) # include keyword name | ||||
RCS_KWEXP_VAL = (1 << 2) # include keyword value | RCS_KWEXP_VAL = (1 << 2) # include keyword value | ||||
RCS_KWEXP_LKR = (1 << 3) # include name of locker | RCS_KWEXP_LKR = (1 << 3) # include name of locker | ||||
RCS_KWEXP_OLD = (1 << 4) # generate old keyword string | RCS_KWEXP_OLD = (1 << 4) # generate old keyword string | ||||
RCS_KWEXP_ERR = (1 << 5) # mode has an error | RCS_KWEXP_ERR = (1 << 5) # mode has an error | ||||
RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL) | RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL) | ||||
RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR) | RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR) | ||||
def __init__(self): | def __init__(self) -> None: | ||||
self.rerecomple() | self.rerecomple() | ||||
def rerecomple(self): | def rerecomple(self) -> None: | ||||
pat = b'|'.join(list(self.rcs_expkw.keys())) | pat = b'|'.join(list(self.rcs_expkw.keys())) | ||||
self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]") | self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]") | ||||
def add_id_keyword(self, keyword): | def add_id_keyword(self, keyword) -> None: | ||||
self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID | self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID | ||||
self.rerecomple() | self.rerecomple() | ||||
def kflag_get(self, flags): | def kflag_get(self, flags: Optional[str]) -> int: | ||||
if flags is None: | if flags is None: | ||||
return self.RCS_KWEXP_DEFAULT | return self.RCS_KWEXP_DEFAULT | ||||
fl = 0 | fl = 0 | ||||
for fc in flags: | for fc in flags: | ||||
if fc == 'k': | if fc == 'k': | ||||
fl |= self.RCS_KWEXP_NAME | fl |= self.RCS_KWEXP_NAME | ||||
elif fc == 'v': | elif fc == 'v': | ||||
fl |= self.RCS_KWEXP_VAL | fl |= self.RCS_KWEXP_VAL | ||||
elif fc == 'l': | elif fc == 'l': | ||||
fl |= self.RCS_KWEXP_LKR | fl |= self.RCS_KWEXP_LKR | ||||
elif fc == 'o': | elif fc == 'o': | ||||
if len(flags) != 1: | if len(flags) != 1: | ||||
fl |= self.RCS_KWEXP_ERR | fl |= self.RCS_KWEXP_ERR | ||||
fl |= self.RCS_KWEXP_OLD | fl |= self.RCS_KWEXP_OLD | ||||
elif fc == 'b': | elif fc == 'b': | ||||
if len(flags) != 1: | if len(flags) != 1: | ||||
fl |= self.RCS_KWEXP_ERR | fl |= self.RCS_KWEXP_ERR | ||||
fl |= self.RCS_KWEXP_NONE | fl |= self.RCS_KWEXP_NONE | ||||
else: | else: | ||||
fl |= self.RCS_KWEXP_ERR | fl |= self.RCS_KWEXP_ERR | ||||
return fl | return fl | ||||
def expand_keyword(self, filename, rcs, r): | def expand_keyword(self, filename: str, rcs: rcsparse.rcsfile, r: str) -> bytes: | ||||
rev = rcs.revs[r] | rev = rcs.revs[r] | ||||
mode = self.kflag_get(rcs.expand) | mode = self.kflag_get(rcs.expand) | ||||
if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0: | if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0: | ||||
return rcs.checkout(rev[0]) | return rcs.checkout(rev[0]) | ||||
ret = [] | ret = [] | ||||
for line in rcs.checkout(rev[0]).split(b'\n'): | for line in rcs.checkout(rev[0]).split(b'\n'): | ||||
▲ Show 20 Lines • Show All 87 Lines • Show Last 20 Lines |