diff --git a/swh/loader/cvs/cvs2gitdump/.github/workflows/python-app.yml b/swh/loader/cvs/cvs2gitdump/.github/workflows/python-app.yml new file mode 100644 index 0000000..daa09e8 --- /dev/null +++ b/swh/loader/cvs/cvs2gitdump/.github/workflows/python-app.yml @@ -0,0 +1,33 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --ignore=E221,E241,W504 --max-line-length=127 --statistics diff --git a/swh/loader/cvs/cvs2gitdump/README.md b/swh/loader/cvs/cvs2gitdump/README.md new file mode 100644 index 0000000..08f1075 --- /dev/null +++ b/swh/loader/cvs/cvs2gitdump/README.md @@ -0,0 +1,204 @@ +**cvs2gitdump.py and cvs2svndump.py can run on Python 3 now. But + "rcsparse" doesn't support running on Python 3. For OpenBSD, use + py3-rcsparse-20151027p1 package which will be available for 6.9.** + +cvs2gitdump +=========== + + +A small python script which imports cvs tree into git repository. + +Pros: +- Small footprint +- Supports incremental import. 
It's very fast +- Converts tags on HEAD +- Everything is done in memory + +Cons: +- Doesn't convert any branches + +An alternative to +- [git-cvs](https://github.com/ustuehler/git-cvs) +- [cvs2svn](http://cvs2svn.tigris.org/) +- [cvs-fast-export](http://www.catb.org/~esr/cvs-fast-export/cvs-fast-export.html) + +Prerequisite: +- [rcsparse](https://github.com/corecode/rcsparse) + + +Usage +----- + + usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] [-E log_encodings] + [-k rcs_keywords] [-b branch] [-m module] [-l last_revision] + cvsroot [git_dir] + + +### Options + +* -a + + As the default the script will only use commits 10 minutes older than + the most recent commit because recent commits may not stable if the + repository is changing. This option will change this behavior, it + will use the entire commits. + +* -b branch + + The branch name of the git repository which is used for incremental + import. + +* -h + + Show the usage. + +* -z fuzz + + When the script collects changesets from CVS repository, commits by + the same author, using the same log message and within ``fuzz`` + seconds are collected into the same changeset. 300 (seconds) is used + as the default. + +* -e email_domain + + Append the email domain to the author. + +* -E log_encodings + + Specify the character encodings used for decoding CVS logs. Multiple + encodings can be specified by spearating with ','. Specified encodings + are used in order for decoding the log. Default is 'utf-8,iso-8859-1' + +* -k rcs_keywords + + Add an extra RCS keyword which are used by CVS. The script + substitutes the RCS keyword by the same way as $Id$. + +* -m module + + Specify the target module name in the target cvsroot. The script will + dump only the directory specified by this option. + +* -l last_rev + + Specify the last revision which is used for finding the last change + set in the CVS tree. Specify in SHA-1. + +* cvsroot + + The target cvsroot or the sub directory of the cvsroot. 
The script treats + this directory as the root directory. + +* git_dir + + The git repository. Specify this for incremental import. + +Example +------- + +First import: + + % git init --bare /git/openbsd.git + % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src > openbsd.dump + % git --git-dir /git/openbsd.git fast-import < openbsd.dump + +Periodic import: + + % doas reposync + % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src /git/openbsd.git > openbsd2.dump + % git --git-dir /git/openbsd.git fast-import < openbsd2.dump + + +cvs2svndump +=========== + +A small python script which imports cvs tree into subversion repository. + +Pros: +- Small footprint +- Supports incremental import is supported. It's very fast +- Everythings is done in memory + +Cons: +- Doesn't convert tags and branches + +Prerequirement: +- [rcsparse](http://gitorious.org/fromcvs/rcsparse) +- svn (Python interface for subversion) + + +Usage +----- + + usage: cvs2svndump [-ah] [-z fuzz] [-e email_domain] [-E log_encodings] + [-k rcs_keywords] [-m module] cvsroot [svnroot svnpath]] + + +### Options + +* -a + + As the default the script will only use commits 10 minutes older than + the most recent commit because recent commits may not stable if the + repository is changing. This option will change this behavior, it + will use the entire commits. + +* -h + + Show the usage. + +* -z fuzz + + When the script collects changesets from CVS repository, commits by + the same author, using the same log message and within ``fuzz`` + seconds are collected into the same changeset. 300 (seconds) is used + as the default. + +* -e email_domain + + Append the email domain to the author. + +* -E log_encodings + + Specify the character encodings used for decoding CVS logs. Multiple + encodings can be specified by spearating with ',' and they are used in + order. Default is 'utf-8,iso-8859-1' + +* -k rcs_keywords + + Add an extra RCS keyword which are used by CVS. 
The script + substitutes the RCS keyword by the same way as $Id$. + +* -m module + + Specify the target module name in the target cvsroot. The script will + dump only the directory specified by this option. + +* cvsroot + + The target cvsroot or the sub directory of the cvsroot. The script treats + this directory as the root directory. + +* svn_dir svn_path + + Specify the svn repository and path. Specify these for incremental + import. When the script searches the last commit, it excepts the commits + whose author are 'svnadmin'. Use 'svnadmin' for manually fixing. + + +Example +------- + +First import: + + % python cvs2svndump.py -k OpenBSD /cvs/openbsd/src > openbsd.dump + % svnadmin create /svnrepo + % svn mkdir --parents -m 'mkdir /vendor/openbsd/head/src' file:///svnrepo/vendor/openbsd/head/src + % svnadmin load --parent-dir /vendor/openbsd/head/src /svnrepo < openbsd.dump + +Periodic import: + + % doas cvsync + % python cvs2svndump.py -k OpenBSD /cvs/openbsd/src file:///svnrepo vendor/openbsd/head/src > openbsd2.dump + % svnadmin load /svnrepo < openbsd2.dump + diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.1 b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.1 new file mode 100644 index 0000000..f6de405 --- /dev/null +++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.1 @@ -0,0 +1,81 @@ +.Dd August 1, 2016 +.Dt CVS2GITDUMP 1 +.Os +.Sh NAME +.Nm cvs2gitdump +.Nd imports a cvs tree into a git repository +.Sh SYNOPSIS +.Nm +.Op Fl ah +.Op Fl z Ar fuzz +.Op Fl e Ar email_domain +.Op Fl E Ar log_encodings +.Op Fl k Ar rcs_keywords +.Op Fl b Ar branch +.Op Fl m Ar module +.Op Fl l Ar last_revision +.Ar cvsroot +.Op Ar git_dir +.Sh DESCRIPTION +.Nm +is a small python script which imports a cvs tree into a git repository. +.Nm +has a small footprint, supports incremental imports and converts tags to HEAD. +It's very fast because the conversion is done in memory. 
+.Pp +Options: +.Bl -tag -width Ds +.It Fl a +By default, the script will only use commits 10 minutes older than the most +recent commit because recent commits are not stable if the repository is +changing. This option will change this behavior. It will use all the commits. +.It Fl b Ar branch +The git branch which is used for incremental import. +.It Fl h +Show the usage. +.It Fl z Ar fuzz +When the script collects changesets from the CVS repository, commits by the +same author, using the same log message and within fuzz seconds are collected +into the same changeset. 300 (seconds) is used as default. +.It Fl e Ar email_domain +Append the email domain to the author field. +.It Fl E Ar log_encodings +Specify the character encodings used for decoding CVS logs. Multiple encodings +can be specified by separating with ','. Specified encodings are used in order +for decoding the log. Default is 'utf-8,iso-8859-1' +.It Fl k Ar rcs_keywords +Add an extra RCS keyword which are used by CVS. The script substitutes the RCS +keyword by the same way as $Id$. +.It Fl m Ar module +Specify the target module name in the target cvsroot. The script will dump only +the directory specified by this option. +.It Fl l Ar last_revision +Specify the last SHA-1 revision which is used for finding the last change set +in the CVS tree. +.It Ar cvsroot +The target cvsroot or the sub directory of the cvsroot. The script treats this +directory as the root directory. +.It Ar git_dir +The git repository. Specify this for incremental import. 
+.El +.Sh EXAMPLES +First import: +.Bd -literal +$ git init --bare /git/openbsd.git +$ cvs2gitdump -k OpenBSD -e openbsd.org \(rs + /cvs/openbsd/src > openbsd.dump +$ git --git-dir /git/openbsd.git fast-import < openbsd.dump +.Ed +.Pp +Periodic import: +.Bd -literal +$ cvsync +$ cvs2gitdump -k OpenBSD -e openbsd.org /cvs/openbsd/src \(rs + /git/openbsd.git > openbsd2.dump +$ git --git-dir /git/openbsd.git fast-import < openbsd2.dump +.Ed +.Sh AUTHORS +.An YASUOKA Masahiko. +.Sh CAVEATS +.Nm +doesn't convert branches. diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py new file mode 100644 index 0000000..f6ae171 --- /dev/null +++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py @@ -0,0 +1,646 @@ +#!/usr/local/bin/python + +# +# Copyright (c) 2012 YASUOKA Masahiko +# +# Permission to use, copy, modify, and distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +# Usage +# +# First import: +# % git init --bare /git/openbsd.git +# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \ +# > openbsd.dump +# % git --git-dir /git/openbsd.git fast-import < openbsd.dump +# +# Periodic import: +# % sudo cvsync +# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \ +# /git/openbsd.git > openbsd2.dump +# % git --git-dir /git/openbsd.git fast-import < openbsd2.dump +# + +import getopt +import os +import re +import subprocess +import sys +import time +import rcsparse + +CHANGESET_FUZZ_SEC = 300 + + +def usage(): + print('usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] ' + '[-E log_encodings]\n' + '\t[-k rcs_keywords] [-b branch] [-m module] [-l last_revision]\n' + '\tcvsroot [git_dir]', file=sys.stderr) + + +def main(): + email_domain = None + do_incremental = False + git_tip = None + git_branch = 'master' + dump_all = False + log_encoding = 'utf-8,iso-8859-1' + rcs = RcsKeywords() + modules = [] + last_revision = None + fuzzsec = CHANGESET_FUZZ_SEC + + try: + opts, args = getopt.getopt(sys.argv[1:], 'ab:hm:z:e:E:k:t:l:') + for opt, v in opts: + if opt == '-z': + fuzzsec = int(v) + elif opt == '-e': + email_domain = v + elif opt == '-a': + dump_all = True + elif opt == '-b': + git_branch = v + elif opt == '-E': + log_encoding = v + elif opt == '-k': + rcs.add_id_keyword(v) + elif opt == '-m': + if v == '.git': + print('Cannot handle the path named \'.git\'', + file=sys.stderr) + sys.exit(1) + modules.append(v) + elif opt == '-l': + last_revision = v + elif opt == '-h': + usage() + sys.exit(1) + except getopt.GetoptError as msg: + print(msg, file=sys.stderr) + usage() + sys.exit(1) + + if len(args) == 0 or len(args) > 2: + usage() + sys.exit(1) + + log_encodings = log_encoding.split(',') + + cvsroot = args[0] + while cvsroot[-1] == '/': + cvsroot = cvsroot[:-1] + + if len(args) == 2: + do_incremental = True + git = subprocess.Popen( + ['git', '--git-dir=' + args[1], '-c', + 
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', + '--date=raw', '--format=%ae%n%ad%n%H', git_branch], + encoding='utf-8', stdout=subprocess.PIPE) + outs = git.stdout.readlines() + git.wait() + if git.returncode != 0: + print("Couldn't exec git", file=sys.stderr) + sys.exit(git.returncode) + git_tip = outs[2].strip() + + if last_revision is not None: + git = subprocess.Popen( + ['git', '--git-dir=' + args[1], '-c', + 'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', + '--date=raw', '--format=%ae%n%ad%n%H', last_revision], + encoding='utf-8', stdout=subprocess.PIPE) + outs = git.stdout.readlines() + git.wait() + if git.returncode != 0: + print("Coundn't exec git", file=sys.stderr) + sys.exit(git.returncode) + last_author = outs[0].strip() + last_ctime = float(outs[1].split()[0]) + + # strip off the domain part from the last author since cvs doesn't have + # the domain part. + if do_incremental and email_domain is not None and \ + last_author.lower().endswith(('@' + email_domain).lower()): + last_author = last_author[:-1 * (1 + len(email_domain))] + + cvs = CvsConv(cvsroot, rcs, not do_incremental, fuzzsec) + print('** walk cvs tree', file=sys.stderr) + if len(modules) == 0: + cvs.walk() + else: + for module in modules: + cvs.walk(module) + + changesets = sorted(cvs.changesets) + nchangesets = len(changesets) + print('** cvs has %d changeset' % (nchangesets), file=sys.stderr) + + if nchangesets <= 0: + sys.exit(0) + + if not dump_all: + # don't use last 10 minutes for safety + max_time_max = changesets[-1].max_time - 600 + else: + max_time_max = changesets[-1].max_time + + found_last_revision = False + markseq = cvs.markseq + extags = set() + for k in changesets: + if do_incremental and not found_last_revision: + if k.min_time == last_ctime and k.author == last_author: + found_last_revision = True + for tag in k.tags: + extags.add(tag) + continue + if k.max_time > max_time_max: + break + + marks = {} + + for f in k.revs: + if not do_incremental: + 
marks[f.markseq] = f + else: + markseq = markseq + 1 + git_dump_file(f.path, f.rev, rcs, markseq) + marks[markseq] = f + log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev) + for i, e in enumerate(log_encodings): + try: + how = 'ignore' if i == len(log_encodings) - 1 else 'strict' + log = log.decode(e, how) + break + except UnicodeError: + pass + log = log.encode('utf-8', 'ignore') + + output('commit refs/heads/' + git_branch) + markseq = markseq + 1 + output('mark :%d' % (markseq)) + email = k.author if email_domain is None \ + else k.author + '@' + email_domain + output('author %s <%s> %d +0000' % (k.author, email, k.min_time)) + output('committer %s <%s> %d +0000' % (k.author, email, k.min_time)) + + output('data', len(log)) + output(log, end='') + if do_incremental and git_tip is not None: + output('from', git_tip) + git_tip = None + + for m in marks: + f = marks[m] + mode = 0o100755 if os.access(f.path, os.X_OK) else 0o100644 + fn = file_path(cvs.cvsroot, f.path) + if f.state == 'dead': + output('D', fn) + else: + output('M %o :%d %s' % (mode, m, fn)) + output('') + for tag in k.tags: + if tag in extags: + continue + output('reset refs/tags/%s' % (tag)) + output('from :%d' % (markseq)) + output('') + + if do_incremental and not found_last_revision: + raise Exception('could not find the last revision') + + print('** dumped', file=sys.stderr) + + +# +# Encode by UTF-8 always for string objects since encoding for git-fast-import +# is UTF-8. 
Also write without conversion for a bytes object (file bodies +# might be various encodings) +# +def output(*args, end='\n'): + if len(args) == 0: + pass + elif len(args) > 1 or isinstance(args[0], str): + lines = ' '.join( + [arg if isinstance(arg, str) else str(arg) for arg in args]) + sys.stdout.buffer.write(lines.encode('utf-8')) + else: + sys.stdout.buffer.write(args[0]) + if len(end) > 0: + sys.stdout.buffer.write(end.encode('utf-8')) + + +class FileRevision: + def __init__(self, path, rev, state, markseq): + self.path = path + self.rev = rev + self.state = state + self.markseq = markseq + + +class ChangeSetKey: + def __init__(self, branch, author, timestamp, log, commitid, fuzzsec): + self.branch = branch + self.author = author + self.min_time = timestamp + self.max_time = timestamp + self.commitid = commitid + self.fuzzsec = fuzzsec + self.revs = [] + self.tags = [] + self.log_hash = 0 + h = 0 + for c in log: + h = 31 * h + c + self.log_hash = h + + def __lt__(self, other): + return self._cmp(other) < 0 + + def __gt__(self, other): + return self._cmp(other) > 0 + + def __eq__(self, other): + return self._cmp(other) == 0 + + def __le__(self, other): + return self._cmp(other) <= 0 + + def __ge__(self, other): + return self._cmp(other) >= 0 + + def __ne__(self, other): + return self._cmp(other) != 0 + + def _cmp(self, anon): + # compare by the commitid + cid = _cmp2(self.commitid, anon.commitid) + if cid == 0 and self.commitid is not None: + # both have commitid and they are same + return 0 + + # compare by the time + ma = anon.min_time - self.max_time + mi = self.min_time - anon.max_time + ct = self.min_time - anon.min_time + if ma > self.fuzzsec or mi > self.fuzzsec: + return ct + + if cid != 0: + # only one has the commitid, this means different commit + return cid if ct == 0 else ct + + # compare by log, branch and author + c = _cmp2(self.log_hash, anon.log_hash) + if c == 0: + c = _cmp2(self.branch, anon.branch) + if c == 0: + c = _cmp2(self.author, 
anon.author) + if c == 0: + return 0 + + return ct if ct != 0 else c + + def merge(self, anot): + self.max_time = max(self.max_time, anot.max_time) + self.min_time = min(self.min_time, anot.min_time) + self.revs.extend(anot.revs) + + def __hash__(self): + return hash(self.branch + '/' + self.author) * 31 + self.log_hash + + def put_file(self, path, rev, state, markseq): + self.revs.append(FileRevision(path, rev, state, markseq)) + + +def _cmp2(a, b): + _a = a is not None + _b = b is not None + return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b) + + +class CvsConv: + def __init__(self, cvsroot, rcs, dumpfile, fuzzsec): + self.cvsroot = cvsroot + self.rcs = rcs + self.changesets = dict() + self.dumpfile = dumpfile + self.markseq = 0 + self.tags = dict() + self.fuzzsec = fuzzsec + + def walk(self, module=None): + p = [self.cvsroot] + if module is not None: + p.append(module) + path = os.path.join(*p) + + for root, dirs, files in os.walk(path): + if '.git' in dirs: + print('Ignore %s: cannot handle the path named \'.git\'' % ( + root + os.sep + '.git'), file=sys.stderr) + dirs.remove('.git') + if '.git' in files: + print('Ignore %s: cannot handle the path named \'.git\'' % ( + root + os.sep + '.git'), file=sys.stderr) + files.remove('.git') + for f in files: + if not f[-2:] == ',v': + continue + self.parse_file(root + os.sep + f) + + for t, c in list(self.tags.items()): + c.tags.append(t) + + def parse_file(self, path): + rtags = dict() + rcsfile = rcsparse.rcsfile(path) + branches = {'1': 'HEAD', '1.1.1': 'VENDOR'} + for k, v in list(rcsfile.symbols.items()): + r = v.split('.') + if len(r) == 3: + branches[v] = 'VENDOR' + elif len(r) >= 3 and r[-2] == '0': + branches['.'.join(r[:-2] + r[-1:])] = k + if len(r) == 2 and branches[r[0]] == 'HEAD': + if v not in rtags: + rtags[v] = list() + rtags[v].append(k) + + revs = rcsfile.revs.items() + # sort by revision descending to priorize 1.1.1.1 than 1.1 + revs = sorted(revs, key=lambda a: a[1][0], reverse=True) + 
# sort by time + revs = sorted(revs, key=lambda a: a[1][1]) + novendor = False + have_initial_revision = False + last_vendor_status = None + for k, v in revs: + r = k.split('.') + if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \ + and r[3] == '1': + if have_initial_revision: + continue + if v[3] == 'dead': + continue + last_vendor_status = v[3] + have_initial_revision = True + elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1': + if novendor: + continue + last_vendor_status = v[3] + elif len(r) == 2: + if r[0] == '1' and r[1] == '1': + if have_initial_revision: + continue + if v[3] == 'dead': + continue + have_initial_revision = True + elif r[0] == '1' and r[1] != '1': + novendor = True + if last_vendor_status == 'dead' and v[3] == 'dead': + last_vendor_status = None + continue + last_vendor_status = None + else: + # trunk only + continue + + if self.dumpfile: + self.markseq = self.markseq + 1 + git_dump_file(path, k, self.rcs, self.markseq) + + b = '.'.join(r[:-1]) + try: + a = ChangeSetKey( + branches[b], v[2], v[1], rcsfile.getlog(v[0]), v[6], + self.fuzzsec) + except Exception as e: + print('Aborted at %s %s' % (path, v[0]), file=sys.stderr) + raise e + + a.put_file(path, k, v[3], self.markseq) + while a in self.changesets: + c = self.changesets[a] + del self.changesets[a] + c.merge(a) + a = c + self.changesets[a] = a + if k in rtags: + for t in rtags[k]: + if t not in self.tags or \ + self.tags[t].max_time < a.max_time: + self.tags[t] = a + + +def file_path(r, p): + if r.endswith('/'): + r = r[:-1] + path = p[:-2] # drop ",v" + p = path.split('/') + if len(p) > 0 and p[-2] == 'Attic': + path = '/'.join(p[:-2] + [p[-1]]) + if path.startswith(r): + path = path[len(r) + 1:] + return path + + +def git_dump_file(path, k, rcs, markseq): + try: + cont = rcs.expand_keyword(path, k) + except RuntimeError as msg: + print('Unexpected runtime error on parsing', + path, k, ':', msg, file=sys.stderr) + print('unlimit the resource limit may fix 
this problem.', + file=sys.stderr) + sys.exit(1) + output('blob') + output('mark :%d' % markseq) + output('data', len(cont)) + output(cont) + + +class RcsKeywords: + RCS_KW_AUTHOR = (1 << 0) + RCS_KW_DATE = (1 << 1) + RCS_KW_LOG = (1 << 2) + RCS_KW_NAME = (1 << 3) + RCS_KW_RCSFILE = (1 << 4) + RCS_KW_REVISION = (1 << 5) + RCS_KW_SOURCE = (1 << 6) + RCS_KW_STATE = (1 << 7) + RCS_KW_FULLPATH = (1 << 8) + RCS_KW_MDOCDATE = (1 << 9) + RCS_KW_LOCKER = (1 << 10) + + RCS_KW_ID = (RCS_KW_RCSFILE | RCS_KW_REVISION | RCS_KW_DATE | + RCS_KW_AUTHOR | RCS_KW_STATE) + RCS_KW_HEADER = (RCS_KW_ID | RCS_KW_FULLPATH) + + rcs_expkw = { + b"Author": RCS_KW_AUTHOR, + b"Date": RCS_KW_DATE, + b"Header": RCS_KW_HEADER, + b"Id": RCS_KW_ID, + b"Log": RCS_KW_LOG, + b"Name": RCS_KW_NAME, + b"RCSfile": RCS_KW_RCSFILE, + b"Revision": RCS_KW_REVISION, + b"Source": RCS_KW_SOURCE, + b"State": RCS_KW_STATE, + b"Mdocdate": RCS_KW_MDOCDATE, + b"Locker": RCS_KW_LOCKER + } + + RCS_KWEXP_NONE = (1 << 0) + RCS_KWEXP_NAME = (1 << 1) # include keyword name + RCS_KWEXP_VAL = (1 << 2) # include keyword value + RCS_KWEXP_LKR = (1 << 3) # include name of locker + RCS_KWEXP_OLD = (1 << 4) # generate old keyword string + RCS_KWEXP_ERR = (1 << 5) # mode has an error + RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL) + RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR) + + def __init__(self): + self.rerecomple() + + def rerecomple(self): + pat = b'|'.join(list(self.rcs_expkw.keys())) + self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]") + + def add_id_keyword(self, keyword): + self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID + self.rerecomple() + + def kflag_get(self, flags): + if flags is None: + return self.RCS_KWEXP_DEFAULT + fl = 0 + for fc in flags: + if fc == 'k': + fl |= self.RCS_KWEXP_NAME + elif fc == 'v': + fl |= self.RCS_KWEXP_VAL + elif fc == 'l': + fl |= self.RCS_KWEXP_LKR + elif fc == 'o': + if len(flags) != 1: + fl |= self.RCS_KWEXP_ERR + fl |= self.RCS_KWEXP_OLD + elif 
fc == 'b': + if len(flags) != 1: + fl |= self.RCS_KWEXP_ERR + fl |= self.RCS_KWEXP_NONE + else: + fl |= self.RCS_KWEXP_ERR + return fl + + def expand_keyword(self, filename, r): + rcs = rcsparse.rcsfile(filename) + rev = rcs.revs[r] + + mode = self.kflag_get(rcs.expand) + if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0: + return rcs.checkout(rev[0]) + + ret = [] + for line in rcs.checkout(rev[0]).split(b'\n'): + logbuf = None + m = self.re_kw.match(line) + if m is None: + # No RCS Keywords, use it as it is + ret += [line] + continue + + line0 = b'' + while m is not None: + try: + dsign = m.end(1) + line[m.end(1):].index(b'$') + except ValueError: + break + prefix = line[:m.start(1) - 1] + line = line[dsign + 1:] + line0 += prefix + expbuf = '' + if (mode & self.RCS_KWEXP_NAME) != 0: + expbuf += '$' + expbuf += m.group(1).decode('ascii') + if (mode & self.RCS_KWEXP_VAL) != 0: + expbuf += ': ' + if (mode & self.RCS_KWEXP_VAL) != 0: + expkw = self.rcs_expkw[m.group(1)] + if (expkw & self.RCS_KW_RCSFILE) != 0: + expbuf += filename \ + if (expkw & self.RCS_KW_FULLPATH) != 0 \ + else os.path.basename(filename) + expbuf += " " + if (expkw & self.RCS_KW_REVISION) != 0: + expbuf += rev[0] + expbuf += " " + if (expkw & self.RCS_KW_DATE) != 0: + expbuf += time.strftime( + "%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1])) + if (expkw & self.RCS_KW_MDOCDATE) != 0: + d = time.gmtime(rev[1]) + expbuf += time.strftime( + "%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d) + if (expkw & self.RCS_KW_AUTHOR) != 0: + expbuf += rev[2] + expbuf += " " + if (expkw & self.RCS_KW_STATE) != 0: + expbuf += rev[3] + expbuf += " " + if (expkw & self.RCS_KW_LOG) != 0: + p = prefix + expbuf += filename \ + if (expkw & self.RCS_KW_FULLPATH) != 0 \ + else os.path.basename(filename) + expbuf += " " + logbuf = p + ( + 'Revision %s %s %s\n' % ( + rev[0], time.strftime( + "%Y/%m/%d %H:%M:%S", time.gmtime(rev[1])), + rev[2])).encode('ascii') + for lline in rcs.getlog(rev[0]).rstrip().split(b'\n'): 
+ if len(lline) == 0: + logbuf += p.rstrip() + b'\n' + else: + logbuf += p + lline.lstrip() + b'\n' + if len(line) == 0: + logbuf += p.rstrip() + else: + logbuf += p + line.lstrip() + line = b'' + if (expkw & self.RCS_KW_SOURCE) != 0: + expbuf += filename + expbuf += " " + if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0: + expbuf += " " + if (mode & self.RCS_KWEXP_NAME) != 0: + expbuf += '$' + line0 += expbuf[:255].encode('ascii') + m = self.re_kw.match(line) + + ret += [line0 + line] + if logbuf is not None: + ret += [logbuf] + return b'\n'.join(ret) + + +# ---------------------------------------------------------------------- +# entry point +# ---------------------------------------------------------------------- +if __name__ == '__main__': + main() diff --git a/swh/loader/cvs/cvs2gitdump/cvs2svndump.1 b/swh/loader/cvs/cvs2gitdump/cvs2svndump.1 new file mode 100644 index 0000000..ebd95d1 --- /dev/null +++ b/swh/loader/cvs/cvs2gitdump/cvs2svndump.1 @@ -0,0 +1,78 @@ +.Dd August 1, 2016 +.Dt CVS2SVNDUMP 1 +.Os +.Sh NAME +.Nm cvs2svndump +.Nd imports a cvs tree into a subversion repository +.Sh SYNOPSIS +.Nm +.Op Fl ah +.Op Fl z Ar fuzz +.Op Fl e Ar email_domain +.Op Fl E Ar log_encodings +.Op Fl k Ar rcs_keywords +.Op Fl m Ar module +.Ar cvsroot +.Op Ar svnroot svnpath +.Sh DESCRIPTION +.Nm +is a small python script which imports a cvs tree into a subversion repository. +.Nm +has a small footprint and supports incremental imports. It's very fast because +the conversion is done in memory. +.Pp +Options: +.Bl -tag -width Ds +.It Fl a +By default, the script will only use commits 10 minutes older than the most +recent commit because recent commits are not stable if the repository is +changing. This option will change this behavior. It will use all the commits. +.It Fl h +Show the usage. 
+.It Fl z Ar fuzz +When the script collects changesets from the CVS repository, commits by the +same author, using the same log message and within fuzz seconds are collected +into the same changeset. 300 (seconds) is used as default. +.It Fl e Ar email_domain +Append the email domain to the author field. +.It Fl E Ar log_encodings +Specify the character encodings used for decoding CVS logs. Multiple encodings +can be specified by separating with ','. Specified encodings are used in order +for decoding the log. Default is 'utf-8,iso-8859-1' +.It Fl k Ar rcs_keywords +Add an extra RCS keyword which are used by CVS. The script substitutes the RCS +keyword by the same way as $Id$. +.It Fl m Ar module +Specify the target module name in the target cvsroot. The script will dump only +the directory specified by this option. +.It Ar cvsroot +The target cvsroot or the sub directory of the cvsroot. The script treats this +directory as the root directory. +.It Ar svn_dir svn_path +Specify the svn repository and path. Specify these for incremental import. When +the script searches the last commit, it excepts the commits whose author +are 'svnadmin'. Use 'svnadmin' for manually fixing. +.El +.Sh EXAMPLES +First import: +.Bd -literal +$ cvs2svndump -k OpenBSD /cvs/openbsd/src > openbsd.dump +$ svnadmin create /svnrepo +$ svn mkdir --parents -m 'mkdir /vendor/openbsd/head/src' \(rs + file:///svnrepo/vendor/openbsd/head/src +$ svnadmin load --parent-dir /vendor/openbsd/head/src \(rs + /svnrepo < openbsd.dump +.Ed +.Pp +Periodic import: +.Bd -literal +$ cvsync +$ cvs2svndump -k OpenBSD /cvs/openbsd/src file:///svnrepo \(rs + vendor/openbsd/head/src > openbsd2.dump +$ svnadmin load /svnrepo < openbsd2.dump +.Ed +.Sh AUTHORS +.An YASUOKA Masahiko. +.Sh CAVEATS +.Nm +doesn't convert tags and branches. 
diff --git a/swh/loader/cvs/cvs2gitdump/cvs2svndump.py b/swh/loader/cvs/cvs2gitdump/cvs2svndump.py new file mode 100644 index 0000000..fac2919 --- /dev/null +++ b/swh/loader/cvs/cvs2gitdump/cvs2svndump.py @@ -0,0 +1,742 @@ +#!/usr/local/bin/python + +# +# Copyright (c) 2012 YASUOKA Masahiko +# +# Permission to use, copy, modify, and distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 

# Usage
#
# First import:
# % python cvs2svndump.py -k OpenBSD /cvs/openbsd/src > openbsd.dump
# % svnadmin create /svnrepo
# % svn mkdir --parents -m 'mkdir /vendor/openbsd/head/src' \
#       file:///svnrepo/vendor/openbsd/head/src
# % svnadmin load --parent-dir /vendor/openbsd/head/src /svnrepo \
#       < openbsd.dump
#
# Periodic import:
# % sudo cvsync
# % python cvs2svndump.py -k OpenBSD /cvs/openbsd/src file:///svnrepo \
#       vendor/openbsd/head/src > openbsd2.dump
# % svnadmin load /svnrepo < openbsd2.dump
#

import getopt
import os
import re
import sys
import time

from hashlib import md5

from svn import core, fs, delta, repos
import rcsparse

# Default window (seconds) within which commits by the same author with the
# same log message are folded into one changeset (see ChangeSetKey).
CHANGESET_FUZZ_SEC = 300


def usage():
    """Print the command-line synopsis to stderr."""
    print('usage: cvs2svndump [-ah] [-z fuzz] [-e email_domain] '
          '[-E log_encodings]\n'
          '\t[-k rcs_keywords] [-m module] cvsroot [svnroot svnpath]]',
          file=sys.stderr)


def main():
    """Convert a CVS repository into an SVN dump stream on stdout.

    Parses command-line options, optionally loads an existing SVN
    repository to find the last imported revision (incremental mode),
    walks the CVS tree collecting changesets, then emits an
    "SVN-fs-dump-format-version: 2" stream.  Progress messages go to
    stderr; the dump itself goes to stdout via output().
    """
    email_domain = None
    do_incremental = False
    dump_all = False
    log_encoding = 'utf-8,iso-8859-1'
    rcs = RcsKeywords()
    modules = []
    fuzzsec = CHANGESET_FUZZ_SEC

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'ahm:z:e:E:k:')
        for opt, v in opts:
            if opt == '-z':
                fuzzsec = int(v)
            elif opt == '-e':
                email_domain = v
            elif opt == '-a':
                dump_all = True
            elif opt == '-E':
                log_encoding = v
            elif opt == '-k':
                rcs.add_id_keyword(v)
            elif opt == '-m':
                modules.append(v)
            elif opt == '-h':
                usage()
                sys.exit(1)
    except getopt.GetoptError as msg:
        print(msg, file=sys.stderr)
        usage()
        sys.exit(1)

    # Either just a cvsroot, or cvsroot + svnroot + svnpath (incremental).
    if len(args) != 1 and len(args) != 3:
        usage()
        sys.exit(1)

    log_encodings = log_encoding.split(',')

    cvsroot = args[0]
    while cvsroot[-1] == '/':
        cvsroot = cvsroot[:-1]
    if len(args) == 3:
        svnroot = args[1]
        svnpath = args[2]
    else:
        svnroot = None
        svnpath = None

    if svnroot is None:
        svn = SvnDumper()
    else:
        svn = SvnDumper(svnpath)
        svn.load(svnroot)
        if svn.last_rev is not None:
            do_incremental = True
            print('** svn loaded revision r%d by %s' %
                  (svn.last_rev, svn.last_author), file=sys.stderr)

    # strip off the domain part from the last author since cvs doesn't have
    # the domain part.
    if do_incremental and email_domain is not None and \
            svn.last_author.lower().endswith(('@' + email_domain).lower()):
        last_author = svn.last_author[:-1 * (1 + len(email_domain))]
    else:
        last_author = svn.last_author

    cvs = CvsConv(cvsroot, rcs, not do_incremental, fuzzsec)
    print('** walk cvs tree', file=sys.stderr)
    if len(modules) == 0:
        cvs.walk()
    else:
        for module in modules:
            cvs.walk(module)

    # From here on, SvnDumper mutations emit dump records.
    svn.dump = True

    changesets = sorted(cvs.changesets)
    nchangesets = len(changesets)
    print('** cvs has %d changeset' % (nchangesets), file=sys.stderr)

    if nchangesets <= 0:
        sys.exit(0)

    if not dump_all:
        # don't use last 10 minutes for safety
        max_time_max = changesets[-1].max_time - 600
    else:
        max_time_max = changesets[-1].max_time
    # Emit the dump-format header only once, just before the first revision.
    printOnce = False

    found_last_revision = False
    for chg_idx, k in enumerate(changesets):
        # In incremental mode, skip everything up to and including the
        # changeset matching the last revision already in the SVN repo.
        if do_incremental and not found_last_revision:
            if k.min_time == svn.last_date and k.author == last_author:
                found_last_revision = True
            continue
        if k.max_time > max_time_max:
            break
        if not printOnce:
            output('SVN-fs-dump-format-version: 2')
            output('')
            printOnce = True

        # parse the first file to get log
        log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev)
        # Try each configured encoding in order; the last one decodes with
        # errors ignored so a log message can never abort the dump.
        for i, e in enumerate(log_encodings):
            try:
                how = 'ignore' if i == len(log_encodings) - 1 else 'strict'
                log = log.decode(e, how)
                break
            except UnicodeError:
                pass

        if email_domain is None:
            email = k.author
        else:
            email = k.author + '@' + email_domain

        # Revision properties in the svn dump "K/V ... PROPS-END" format.
        revprops = str_prop('svn:author', email)
        revprops += str_prop('svn:date', svn_time(k.min_time))
        revprops += str_prop('svn:log', log)
        revprops += 'PROPS-END\n'

        output('Revision-number: %d' % (chg_idx + 1))
        output('Prop-content-length: %d' % (len(revprops)))
        output('Content-length: %d' % (len(revprops)))
        output('')
        output(revprops)

        for f in k.revs:
            fileprops = ''
            if os.access(f.path, os.X_OK):
                fileprops += str_prop('svn:executable', '*')
            fileprops += 'PROPS-END\n'
            # File body with RCS keywords ($Id$ etc.) expanded; bytes.
            filecont = rcs.expand_keyword(f.path, f.rev)

            md5sum = md5()
            md5sum.update(filecont)

            p = node_path(cvs.cvsroot, svnpath, f.path)
            if f.state == 'dead':
                # CVS "dead" state means the file was removed.
                if not svn.exists(p):
                    print("Warning: remove '%s', but it does "
                          "not exist." % (p), file=sys.stderr)
                    continue
                output('Node-path: %s' % (p))
                output('Node-kind: file')
                output('Node-action: delete')
                output('')
                svn.remove(p)
                continue
            if not svn.exists(p):
                svn.add(p)
                output('Node-path: %s' % (p))
                output('Node-kind: file')
                output('Node-action: add')
            else:
                output('Node-path: %s' % (p))
                output('Node-kind: file')
                output('Node-action: change')

            output('Prop-content-length: %d' % (len(fileprops)))
            output('Text-content-length: %s' % (len(filecont)))
            output('Text-content-md5: %s' % (md5sum.hexdigest()))
            output('Content-length: %d' % (len(fileprops) + len(filecont)))
            output('')
            output(fileprops, end='')
            output(filecont)
            output('')

    if do_incremental and not found_last_revision:
        raise Exception('could not find the last revision')

    print('** dumped', file=sys.stderr)


#
# Write string objects to stdout with the code decided by Python.
# Also write byte objects in raw, without any code conversion (file
# bodies might be various encoding).
#
def output(*args, end='\n'):
    """Write args to stdout.

    Strings (or several args) are joined with spaces and written through
    the text layer; a single bytes arg is written raw via
    sys.stdout.buffer so file bodies keep their original encoding.
    """
    if len(args) == 0:
        pass
    elif len(args) > 1 or isinstance(args[0], str):
        lines = ' '.join(
            [arg if isinstance(arg, str) else str(arg) for arg in args])
        sys.stdout.write(lines)
    else:
        sys.stdout.buffer.write(args[0])
    if len(end) > 0:
        sys.stdout.write(end)
    sys.stdout.flush()


class FileRevision:
    """One revision of one RCS file belonging to a changeset."""

    def __init__(self, path, rev, state, markseq):
        self.path = path        # path of the ,v file
        self.rev = rev          # revision number string, e.g. '1.3'
        self.state = state      # RCS state, e.g. 'Exp' or 'dead'
        self.markseq = markseq  # sequence number assigned during the walk

    def __repr__(self):
        return 'FileRevision(path=%s, rev=%s)' % (self.path, self.rev)


class ChangeSetKey:
    """Aggregates CVS file commits into a changeset.

    Two keys compare equal when they share a commitid, or when they have
    the same branch, author and log message and their time ranges are
    within ``fuzzsec`` seconds of each other; equal keys are merged.
    """

    def __init__(self, branch, author, timestamp, log, commitid, fuzzsec):
        self.branch = branch
        self.author = author
        self.min_time = timestamp
        self.max_time = timestamp
        self.commitid = commitid
        self.fuzzsec = fuzzsec
        self.revs = []
        self.tags = []
        self.log_hash = 0
        # Java-style 31*h+c rolling hash over the raw log bytes.
        h = 0
        for c in log:
            h = 31 * h + c
        self.log_hash = h

    def __lt__(self, other):
        return self._cmp(other) < 0

    def __gt__(self, other):
        return self._cmp(other) > 0

    def __eq__(self, other):
        return self._cmp(other) == 0

    def __le__(self, other):
        return self._cmp(other) <= 0

    def __ge__(self, other):
        return self._cmp(other) >= 0

    def __ne__(self, other):
        return self._cmp(other) != 0

    def _cmp(self, anon):
        """Three-way compare: 0 means "same changeset" (mergeable)."""
        # compare by the commitid
        cid = _cmp2(self.commitid, anon.commitid)
        if cid == 0 and self.commitid is not None:
            # both have commitid and they are same
            return 0

        # compare by the time
        ma = anon.min_time - self.max_time
        mi = self.min_time - anon.max_time
        ct = self.min_time - anon.min_time
        if ma > self.fuzzsec or mi > self.fuzzsec:
            # time ranges too far apart: different changesets
            return ct

        if cid != 0:
            # only one has the commitid, this means different commit
            return cid if ct == 0 else ct

        # compare by log, branch and author
        c = _cmp2(self.log_hash, anon.log_hash)
        if c == 0:
            c = _cmp2(self.branch, anon.branch)
        if c == 0:
            c = _cmp2(self.author, anon.author)
        if c == 0:
            return 0

        return ct if ct != 0 else c

    def merge(self, anot):
        """Fold another equal key into this one, widening the time range."""
        self.max_time = max(self.max_time, anot.max_time)
        self.min_time = min(self.min_time, anot.min_time)
        self.revs.extend(anot.revs)

    def __hash__(self):
        # Consistent with _cmp: only fields compared for equality are hashed.
        return hash(self.branch + '/' + self.author) * 31 + self.log_hash

    def put_file(self, path, rev, state, markseq):
        self.revs.append(FileRevision(path, rev, state, markseq))


def _cmp2(a, b):
    """Three-way compare that orders None before any non-None value."""
    _a = a is not None
    _b = b is not None
    return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b)


class CvsConv:
    """Walks a CVS repository and collects ChangeSetKey changesets."""

    def __init__(self, cvsroot, rcs, dumpfile, fuzzsec):
        self.cvsroot = cvsroot    # repository root (no trailing '/')
        self.rcs = rcs            # RcsKeywords instance for expansion
        self.changesets = dict()  # ChangeSetKey -> ChangeSetKey (self-map)
        self.dumpfile = dumpfile  # when true, assign mark sequence numbers
        self.markseq = 0
        self.tags = dict()        # tag name -> newest ChangeSetKey
        self.fuzzsec = fuzzsec

    def walk(self, module=None):
        """Parse every ,v file under cvsroot (or cvsroot/module)."""
        p = [self.cvsroot]
        if module is not None:
            p.append(module)
        path = os.path.join(*p)

        for root, _, files in os.walk(path):
            for f in files:
                if not f[-2:] == ',v':
                    continue
                self.parse_file(root + os.sep + f)

        for t, c in list(self.tags.items()):
            c.tags.append(t)

    def parse_file(self, path):
        """Parse one RCS ,v file and fold its revisions into changesets.

        Only HEAD and the vendor branch (1.1.1) are considered; other
        branches are skipped.  Vendor imports shadowed by later trunk
        commits are handled via last_vendor_status.
        """
        rtags = dict()
        rcsfile = rcsparse.rcsfile(path)
        branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
        for k, v in list(rcsfile.symbols.items()):
            r = v.split('.')
            if len(r) == 3:
                branches[v] = 'VENDOR'
            elif len(r) >= 3 and r[-2] == '0':
                # magic branch number: x.y.0.z names branch x.y.z
                branches['.'.join(r[:-2] + r[-1:])] = k
            if len(r) == 2 and branches[r[0]] == 'HEAD':
                # plain tag on a trunk revision
                if v not in rtags:
                    rtags[v] = list()
                rtags[v].append(k)

        revs = rcsfile.revs.items()
        # sort by revision descending to priorize 1.1.1.1 than 1.1
        revs = sorted(revs, key=lambda a: a[1][0], reverse=True)
        # sort by time
        revs = sorted(revs, key=lambda a: a[1][1])
        novendor = False
        have_initial_revision = False
        last_vendor_status = None
        for k, v in revs:
            r = k.split('.')
            if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
                    and r[3] == '1':
                # 1.1.1.1: initial vendor import
                if have_initial_revision:
                    continue
                if v[3] == 'dead':
                    continue
                last_vendor_status = v[3]
                have_initial_revision = True
            elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1':
                # later vendor-branch revision; ignored once trunk diverged
                if novendor:
                    continue
                last_vendor_status = v[3]
            elif len(r) == 2:
                if r[0] == '1' and r[1] == '1':
                    if have_initial_revision:
                        continue
                    if v[3] == 'dead':
                        continue
                    have_initial_revision = True
                elif r[0] == '1' and r[1] != '1':
                    novendor = True
                if last_vendor_status == 'dead' and v[3] == 'dead':
                    # deletion already recorded on the vendor branch
                    last_vendor_status = None
                    continue
                last_vendor_status = None
            else:
                # trunk only
                continue

            if self.dumpfile:
                self.markseq = self.markseq + 1

            b = '.'.join(r[:-1])
            try:
                a = ChangeSetKey(
                    branches[b], v[2], v[1], rcsfile.getlog(v[0]), v[6],
                    self.fuzzsec)
            except Exception as e:
                print('Aborted at %s %s' % (path, v[0]), file=sys.stderr)
                raise e

            a.put_file(path, k, v[3], self.markseq)
            # Merge with any existing equal changeset (equality is fuzzy,
            # so merging can widen the range and match yet another one).
            while a in self.changesets:
                c = self.changesets[a]
                del self.changesets[a]
                c.merge(a)
                a = c
            self.changesets[a] = a
            if k in rtags:
                for t in rtags[k]:
                    if t not in self.tags or \
                            self.tags[t].max_time < a.max_time:
                        self.tags[t] = a


def node_path(r, n, p):
    """Map an RCS file path *p* to an svn node path.

    Strips the cvsroot prefix *r* and the ",v" suffix, removes an
    intermediate "Attic" directory component, and prepends the svn
    prefix *n* (if any).
    """
    if r.endswith('/'):
        r = r[:-1]
    path = p[:-2]  # drop ",v"
    p = path.split('/')
    # BUGFIX: was `len(p) > 0`, which is always true after split() and let
    # p[-2] raise IndexError on a single-component path; the intended
    # guard is that at least two components exist.
    if len(p) > 1 and p[-2] == 'Attic':
        path = '/'.join(p[:-2] + [p[-1]])
    if path.startswith(r):
        path = path[len(r) + 1:]
    if n is None or len(n) == 0:
        return path
    return '%s/%s' % (n, path)


def str_prop(k, v):
    """Format one property in the svn dump K/V record syntax."""
    return 'K %d\n%s\nV %d\n%s\n' % (len(k), k, len(v), v)


def svn_time(t):
    """Format a Unix timestamp as an svn:date value (UTC, microseconds)."""
    return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(t))


class SvnDumper:
    """Tracks the directory tree of the target svn repository.

    Mirrors adds/removes of files so the dump never emits a node action
    that would conflict with existing repository state.  When self.dump
    is true, directory creations/deletions are written to the dump too.
    """

    def __init__(self, root=''):
        self.root = root
        if self.root != '' and self.root[-1] == '/':
            self.root = self.root[:-1]
        self.dirs = {}
        # the root itself must never be deleted by rmdir()
        self.dirs[self.root] = {'dontdelete': 1}
        self.dump = False
        self.last_author = None
        self.last_date = None
        self.last_rev = None

    def exists(self, path):
        """Return True if *path* is a known file in the tree."""
        d = os.path.dirname(path)
        if d not in self.dirs:
            return False
        return os.path.basename(path) in self.dirs[d]

    def add(self, path):
        """Record file *path*, creating parent directories as needed."""
        d = os.path.dirname(path)
        if d not in self.dirs:
            self.mkdir(d)
        self.dirs[d][os.path.basename(path)] = 1

    def remove(self, path):
        """Forget file *path* and prune now-empty parent directories."""
        d = os.path.dirname(path)
        if d == path:
            return
        del self.dirs[d][os.path.basename(path)]
        self.rmdir(d)

    def rmdir(self, path):
        """Delete directory *path* (and empty ancestors) if truly empty."""
        if len(self.dirs[path]) > 0:
            return
        # any known subdirectory keeps the directory alive
        for r in list(self.dirs.keys()):
            if r != path and r.startswith(path + '/'):
                return
        if self.dump:
            output('Node-path: %s' % (path))
            output('Node-kind: dir')
            output('Node-action: delete')
            output('')
        del self.dirs[path]
        d = os.path.dirname(path)
        if d == path or d not in self.dirs:
            return
        self.rmdir(d)

    def mkdir(self, path):
        """Create directory *path*, recursively creating ancestors."""
        if path not in self.dirs:
            d = os.path.dirname(path)
            if d == path:
                return
            self.mkdir(d)
            if self.dump:
                output('Node-path: %s' % (path))
                output('Node-kind: dir')
                output('Node-action: add')
                output('')
                output('')
            self.dirs[path] = {}

    def load(self, repo_path):
        """Scan an existing svn repository under self.root.

        Finds the most recent non-'svnadmin' revision (author/date/rev
        recorded in last_author/last_date/last_rev) and replays the tree
        delta from revision 0 to populate self.dirs.
        """
        repo_path = core.svn_path_canonicalize(repo_path)
        repos_ptr = repos.open(repo_path)
        fs_ptr = repos.fs(repos_ptr)
        rev = fs.youngest_rev(fs_ptr)
        base_root = fs.revision_root(fs_ptr, 0)
        root = fs.revision_root(fs_ptr, rev)
        hist = fs.node_history(root, self.root)
        while hist is not None:
            hist = fs.history_prev(hist, 0)
            dummy, rev = fs.history_location(hist)
            d = fs.revision_prop(fs_ptr, rev, core.SVN_PROP_REVISION_DATE)
            author = fs.revision_prop(
                fs_ptr, rev, core.SVN_PROP_REVISION_AUTHOR)
            if author == 'svnadmin':
                # skip housekeeping revisions (e.g. svnadmin load itself)
                continue
            self.last_author = author
            self.last_date = core.svn_time_from_cstring(d) / 1000000
            self.last_rev = rev

            def authz_cb(root, path, pool):
                return 1

            editor = SvnDumperEditor(self)
            e_ptr, e_baton = delta.make_editor(editor)
            repos.dir_delta(
                base_root, '', '', root, self.root, e_ptr, e_baton, authz_cb,
                0, 1, 0, 0)
            break


class SvnDumperEditor(delta.Editor):
    """Delta editor that mirrors the repository tree into an SvnDumper."""

    def __init__(self, dumper):
        self.dumper = dumper

    def add_file(self, path, *args):
        self.dumper.add(self.dumper.root + '/' + path)

    def add_directory(self, path, *args):
        self.dumper.mkdir(self.dumper.root + '/' + path)


class RcsKeywords:
    """Expands RCS keywords ($Id$, $Revision$, ...) in checked-out files."""

    # Bit flags identifying each keyword's components.
    RCS_KW_AUTHOR = (1 << 0)
    RCS_KW_DATE = (1 << 1)
    RCS_KW_LOG = (1 << 2)
    RCS_KW_NAME = (1 << 3)
    RCS_KW_RCSFILE = (1 << 4)
    RCS_KW_REVISION = (1 << 5)
    RCS_KW_SOURCE = (1 << 6)
    RCS_KW_STATE = (1 << 7)
    RCS_KW_FULLPATH = (1 << 8)
    RCS_KW_MDOCDATE = (1 << 9)
    RCS_KW_LOCKER = (1 << 10)

    RCS_KW_ID = (RCS_KW_RCSFILE | RCS_KW_REVISION | RCS_KW_DATE |
                 RCS_KW_AUTHOR | RCS_KW_STATE)
    RCS_KW_HEADER = (RCS_KW_ID | RCS_KW_FULLPATH)

    # Keyword name (bytes, as matched in file content) -> component flags.
    rcs_expkw = {
        b"Author": RCS_KW_AUTHOR,
        b"Date": RCS_KW_DATE,
        b"Header": RCS_KW_HEADER,
        b"Id": RCS_KW_ID,
        b"Log": RCS_KW_LOG,
        b"Name": RCS_KW_NAME,
        b"RCSfile": RCS_KW_RCSFILE,
        b"Revision": RCS_KW_REVISION,
        b"Source": RCS_KW_SOURCE,
        b"State": RCS_KW_STATE,
        b"Mdocdate": RCS_KW_MDOCDATE,
        b"Locker": RCS_KW_LOCKER
    }

    # Expansion-mode flags (RCS "expand" / -k option semantics).
    RCS_KWEXP_NONE = (1 << 0)
    RCS_KWEXP_NAME = (1 << 1)    # include keyword name
    RCS_KWEXP_VAL = (1 << 2)     # include keyword value
    RCS_KWEXP_LKR = (1 << 3)     # include name of locker
    RCS_KWEXP_OLD = (1 << 4)     # generate old keyword string
    RCS_KWEXP_ERR = (1 << 5)     # mode has an error
    RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL)
    RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR)

    def __init__(self):
        self.rerecomple()

    def rerecomple(self):
        """(Re)build the keyword-matching regex from rcs_expkw."""
        pat = b'|'.join(list(self.rcs_expkw.keys()))
        self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]")

    def add_id_keyword(self, keyword):
        """Register an extra keyword expanded exactly like $Id$ (-k opt)."""
        self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID
        self.rerecomple()

    def kflag_get(self, flags):
        """Translate an RCS expand string ('kv', 'o', 'b', ...) to flags."""
        if flags is None:
            return self.RCS_KWEXP_DEFAULT
        fl = 0
        for fc in flags:
            if fc == 'k':
                fl |= self.RCS_KWEXP_NAME
            elif fc == 'v':
                fl |= self.RCS_KWEXP_VAL
            elif fc == 'l':
                fl |= self.RCS_KWEXP_LKR
            elif fc == 'o':
                if len(flags) != 1:
                    fl |= self.RCS_KWEXP_ERR
                fl |= self.RCS_KWEXP_OLD
            elif fc == 'b':
                if len(flags) != 1:
                    fl |= self.RCS_KWEXP_ERR
                fl |= self.RCS_KWEXP_NONE
            else:
                fl |= self.RCS_KWEXP_ERR
        return fl

    def expand_keyword(self, filename, r):
        """Return revision *r* of *filename* with RCS keywords expanded.

        Returns bytes.  With 'o'/'b' expansion modes the checkout is
        returned untouched.  $Log$ appends the log message below the
        keyword line, reusing the keyword line's prefix for each line.
        """
        rcs = rcsparse.rcsfile(filename)
        rev = rcs.revs[r]

        mode = self.kflag_get(rcs.expand)
        if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0:
            return rcs.checkout(rev[0])

        ret = []
        for line in rcs.checkout(rev[0]).split(b'\n'):
            logbuf = None
            m = self.re_kw.match(line)
            if m is None:
                # No RCS Keywords, use it as it is
                ret += [line]
                continue

            line0 = b''
            while m is not None:
                try:
                    # position of the closing '$' of the keyword
                    dsign = m.end(1) + line[m.end(1):].index(b'$')
                except ValueError:
                    break
                prefix = line[:m.start(1) - 1]
                line = line[dsign + 1:]
                line0 += prefix
                expbuf = ''
                if (mode & self.RCS_KWEXP_NAME) != 0:
                    expbuf += '$'
                    expbuf += m.group(1).decode('ascii')
                    if (mode & self.RCS_KWEXP_VAL) != 0:
                        expbuf += ': '
                if (mode & self.RCS_KWEXP_VAL) != 0:
                    expkw = self.rcs_expkw[m.group(1)]
                    if (expkw & self.RCS_KW_RCSFILE) != 0:
                        expbuf += filename \
                            if (expkw & self.RCS_KW_FULLPATH) != 0 \
                            else os.path.basename(filename)
                        expbuf += " "
                    if (expkw & self.RCS_KW_REVISION) != 0:
                        expbuf += rev[0]
                        expbuf += " "
                    if (expkw & self.RCS_KW_DATE) != 0:
                        expbuf += time.strftime(
                            "%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1]))
                    if (expkw & self.RCS_KW_MDOCDATE) != 0:
                        # mdoc(7) date: single-digit days take no padding
                        d = time.gmtime(rev[1])
                        expbuf += time.strftime(
                            "%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d)
                    if (expkw & self.RCS_KW_AUTHOR) != 0:
                        expbuf += rev[2]
                        expbuf += " "
                    if (expkw & self.RCS_KW_STATE) != 0:
                        expbuf += rev[3]
                        expbuf += " "
                    if (expkw & self.RCS_KW_LOG) != 0:
                        # $Log$: emit the log message after this line,
                        # each line carrying the keyword line's prefix.
                        p = prefix
                        expbuf += filename \
                            if (expkw & self.RCS_KW_FULLPATH) != 0 \
                            else os.path.basename(filename)
                        expbuf += " "
                        logbuf = p + (
                            'Revision %s  %s  %s\n' % (
                                rev[0], time.strftime(
                                    "%Y/%m/%d %H:%M:%S", time.gmtime(rev[1])),
                                rev[2])).encode('ascii')
                        for lline in rcs.getlog(rev[0]).rstrip().split(b'\n'):
                            if len(lline) == 0:
                                logbuf += p.rstrip() + b'\n'
                            else:
                                logbuf += p + lline.lstrip() + b'\n'
                        if len(line) == 0:
                            logbuf += p.rstrip()
                        else:
                            logbuf += p + line.lstrip()
                        line = b''
                    if (expkw & self.RCS_KW_SOURCE) != 0:
                        expbuf += filename
                        expbuf += " "
                    if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0:
                        expbuf += " "
                if (mode & self.RCS_KWEXP_NAME) != 0:
                    expbuf += '$'
                # RCS limits an expanded keyword to 255 characters
                line0 += expbuf[:255].encode('ascii')
                m = self.re_kw.match(line)

            ret += [line0 + line]
            if logbuf is not None:
                ret += [logbuf]
        return b'\n'.join(ret)


# ----------------------------------------------------------------------
# entry point
# ----------------------------------------------------------------------
if __name__ == '__main__':
    main()