diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
index 572a89d..8bfa321 100644
--- a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
+++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
@@ -1,645 +1,648 @@
#!/usr/local/bin/python
#
# Copyright (c) 2012 YASUOKA Masahiko <yasuoka@yasuoka.net>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Usage
#
# First import:
# % git init --bare /git/openbsd.git
# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
# > openbsd.dump
# % git --git-dir /git/openbsd.git fast-import < openbsd.dump
#
# Periodic import:
# % sudo cvsync
# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
# /git/openbsd.git > openbsd2.dump
# % git --git-dir /git/openbsd.git fast-import < openbsd2.dump
#
import getopt
import os
import re
import subprocess
import sys
import time
import swh.loader.cvs.rcsparse as rcsparse
CHANGESET_FUZZ_SEC = 300
def usage():
print('usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] '
'[-E log_encodings]\n'
'\t[-k rcs_keywords] [-b branch] [-m module] [-l last_revision]\n'
'\tcvsroot [git_dir]', file=sys.stderr)
def main():
email_domain = None
do_incremental = False
git_tip = None
git_branch = 'master'
dump_all = False
log_encoding = 'utf-8,iso-8859-1'
rcs = RcsKeywords()
modules = []
last_revision = None
fuzzsec = CHANGESET_FUZZ_SEC
try:
opts, args = getopt.getopt(sys.argv[1:], 'ab:hm:z:e:E:k:t:l:')
for opt, v in opts:
if opt == '-z':
fuzzsec = int(v)
elif opt == '-e':
email_domain = v
elif opt == '-a':
dump_all = True
elif opt == '-b':
git_branch = v
elif opt == '-E':
log_encoding = v
elif opt == '-k':
rcs.add_id_keyword(v)
elif opt == '-m':
if v == '.git':
print('Cannot handle the path named \'.git\'',
file=sys.stderr)
sys.exit(1)
modules.append(v)
elif opt == '-l':
last_revision = v
elif opt == '-h':
usage()
sys.exit(1)
except getopt.GetoptError as msg:
print(msg, file=sys.stderr)
usage()
sys.exit(1)
if len(args) == 0 or len(args) > 2:
usage()
sys.exit(1)
log_encodings = log_encoding.split(',')
cvsroot = args[0]
while cvsroot[-1] == '/':
cvsroot = cvsroot[:-1]
if len(args) == 2:
do_incremental = True
git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', git_branch],
encoding='utf-8', stdout=subprocess.PIPE)
outs = git.stdout.readlines()
git.wait()
if git.returncode != 0:
print("Couldn't exec git", file=sys.stderr)
sys.exit(git.returncode)
git_tip = outs[2].strip()
if last_revision is not None:
git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', last_revision],
encoding='utf-8', stdout=subprocess.PIPE)
outs = git.stdout.readlines()
git.wait()
if git.returncode != 0:
print("Coundn't exec git", file=sys.stderr)
sys.exit(git.returncode)
last_author = outs[0].strip()
last_ctime = float(outs[1].split()[0])
# strip off the domain part from the last author since cvs doesn't have
# the domain part.
if do_incremental and email_domain is not None and \
last_author.lower().endswith(('@' + email_domain).lower()):
last_author = last_author[:-1 * (1 + len(email_domain))]
cvs = CvsConv(cvsroot, rcs, not do_incremental, fuzzsec)
print('** walk cvs tree', file=sys.stderr)
if len(modules) == 0:
cvs.walk()
else:
for module in modules:
cvs.walk(module)
changesets = sorted(cvs.changesets)
nchangesets = len(changesets)
    print('** cvs has %d changesets' % (nchangesets), file=sys.stderr)
if nchangesets <= 0:
sys.exit(0)
if not dump_all:
# don't use last 10 minutes for safety
max_time_max = changesets[-1].max_time - 600
else:
max_time_max = changesets[-1].max_time
found_last_revision = False
markseq = cvs.markseq
extags = set()
for k in changesets:
if do_incremental and not found_last_revision:
if k.min_time == last_ctime and k.author == last_author:
found_last_revision = True
for tag in k.tags:
extags.add(tag)
continue
if k.max_time > max_time_max:
break
marks = {}
for f in k.revs:
if not do_incremental:
marks[f.markseq] = f
else:
markseq = markseq + 1
git_dump_file(f.path, f.rev, rcs, markseq)
marks[markseq] = f
log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev)
for i, e in enumerate(log_encodings):
try:
how = 'ignore' if i == len(log_encodings) - 1 else 'strict'
log = log.decode(e, how)
break
except UnicodeError:
pass
log = log.encode('utf-8', 'ignore')
output('commit refs/heads/' + git_branch)
markseq = markseq + 1
output('mark :%d' % (markseq))
email = k.author if email_domain is None \
else k.author + '@' + email_domain
output('author %s <%s> %d +0000' % (k.author, email, k.min_time))
output('committer %s <%s> %d +0000' % (k.author, email, k.min_time))
output('data', len(log))
output(log, end='')
if do_incremental and git_tip is not None:
output('from', git_tip)
git_tip = None
for m in marks:
f = marks[m]
mode = 0o100755 if os.access(f.path, os.X_OK) else 0o100644
fn = file_path(cvs.cvsroot, f.path)
if f.state == 'dead':
output('D', fn)
else:
output('M %o :%d %s' % (mode, m, fn))
output('')
for tag in k.tags:
if tag in extags:
continue
output('reset refs/tags/%s' % (tag))
output('from :%d' % (markseq))
output('')
if do_incremental and not found_last_revision:
raise Exception('could not find the last revision')
print('** dumped', file=sys.stderr)
#
# Encode by UTF-8 always for string objects since encoding for git-fast-import
# is UTF-8. Also write without conversion for a bytes object (file bodies
# might be various encodings)
#
def output(*args, end='\n'):
if len(args) == 0:
pass
elif len(args) > 1 or isinstance(args[0], str):
lines = ' '.join(
[arg if isinstance(arg, str) else str(arg) for arg in args])
sys.stdout.buffer.write(lines.encode('utf-8'))
else:
sys.stdout.buffer.write(args[0])
if len(end) > 0:
sys.stdout.buffer.write(end.encode('utf-8'))
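# Usage sketch (hypothetical values): output() emits git-fast-import records,
# for example:
#   output('blob')        # writes b'blob\n'
#   output('data', 4)     # writes b'data 4\n'
#   output(b'\x00\x01')   # a bytes object is written as-is, then end ('\n')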
class FileRevision:
def __init__(self, path, rev, state, markseq):
self.path = path
self.rev = rev
self.state = state
self.markseq = markseq
class ChangeSetKey:
def __init__(self, branch, author, timestamp, log, commitid, fuzzsec):
self.branch = branch
self.author = author
self.min_time = timestamp
self.max_time = timestamp
self.commitid = commitid
self.fuzzsec = fuzzsec
self.revs = []
self.tags = []
self.log_hash = 0
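        # Java-style rolling hash (h = 31 * h + byte) over the log message bytes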
h = 0
for c in log:
h = 31 * h + c
self.log_hash = h
def __lt__(self, other):
return self._cmp(other) < 0
def __gt__(self, other):
return self._cmp(other) > 0
def __eq__(self, other):
return self._cmp(other) == 0
def __le__(self, other):
return self._cmp(other) <= 0
def __ge__(self, other):
return self._cmp(other) >= 0
def __ne__(self, other):
return self._cmp(other) != 0
def _cmp(self, anon):
# compare by the commitid
cid = _cmp2(self.commitid, anon.commitid)
if cid == 0 and self.commitid is not None:
# both have commitid and they are same
return 0
# compare by the time
ma = anon.min_time - self.max_time
mi = self.min_time - anon.max_time
ct = self.min_time - anon.min_time
if ma > self.fuzzsec or mi > self.fuzzsec:
return ct
if cid != 0:
# only one has the commitid, this means different commit
return cid if ct == 0 else ct
# compare by log, branch and author
c = _cmp2(self.log_hash, anon.log_hash)
if c == 0:
c = _cmp2(self.branch, anon.branch)
if c == 0:
c = _cmp2(self.author, anon.author)
if c == 0:
return 0
return ct if ct != 0 else c
def merge(self, anot):
self.max_time = max(self.max_time, anot.max_time)
self.min_time = min(self.min_time, anot.min_time)
self.revs.extend(anot.revs)
def __hash__(self):
return hash(self.branch + '/' + self.author) * 31 + self.log_hash
def put_file(self, path, rev, state, markseq):
self.revs.append(FileRevision(path, rev, state, markseq))
def _cmp2(a, b):
_a = a is not None
_b = b is not None
return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b)
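# Worked examples for the ordering helper: _cmp2(1, 2) == -1 and
# _cmp2('a', 'a') == 0, while a missing value sorts before any present one:
# _cmp2(None, 'x') == -1 and _cmp2('x', None) == 1.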
class CvsConv:
def __init__(self, cvsroot, rcs, dumpfile, fuzzsec):
self.cvsroot = cvsroot
self.rcs = rcs
self.changesets = dict()
self.dumpfile = dumpfile
self.markseq = 0
self.tags = dict()
self.fuzzsec = fuzzsec
def walk(self, module=None):
p = [self.cvsroot]
if module is not None:
p.append(module)
path = os.path.join(*p)
for root, dirs, files in os.walk(path):
if '.git' in dirs:
print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)
dirs.remove('.git')
if '.git' in files:
print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)
files.remove('.git')
for f in files:
if not f[-2:] == ',v':
continue
self.parse_file(root + os.sep + f)
for t, c in list(self.tags.items()):
c.tags.append(t)
def parse_file(self, path):
rtags = dict()
rcsfile = rcsparse.rcsfile(path)
branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
for k, v in list(rcsfile.symbols.items()):
r = v.split('.')
if len(r) == 3:
branches[v] = 'VENDOR'
elif len(r) >= 3 and r[-2] == '0':
branches['.'.join(r[:-2] + r[-1:])] = k
if len(r) == 2 and branches[r[0]] == 'HEAD':
if v not in rtags:
rtags[v] = list()
rtags[v].append(k)
revs = rcsfile.revs.items()
        # sort by revision in descending order to prioritize 1.1.1.1 over 1.1
revs = sorted(revs, key=lambda a: a[1][0], reverse=True)
# sort by time
revs = sorted(revs, key=lambda a: a[1][1])
novendor = False
have_initial_revision = False
last_vendor_status = None
for k, v in revs:
r = k.split('.')
if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
and r[3] == '1':
if have_initial_revision:
continue
if v[3] == 'dead':
continue
last_vendor_status = v[3]
have_initial_revision = True
elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1':
if novendor:
continue
last_vendor_status = v[3]
elif len(r) == 2:
if r[0] == '1' and r[1] == '1':
if have_initial_revision:
continue
if v[3] == 'dead':
continue
have_initial_revision = True
elif r[0] == '1' and r[1] != '1':
novendor = True
if last_vendor_status == 'dead' and v[3] == 'dead':
last_vendor_status = None
continue
last_vendor_status = None
else:
# trunk only
continue
if self.dumpfile:
self.markseq = self.markseq + 1
git_dump_file(path, k, self.rcs, self.markseq)
b = '.'.join(r[:-1])
try:
a = ChangeSetKey(
branches[b], v[2], v[1], rcsfile.getlog(v[0]), v[6],
self.fuzzsec)
except Exception as e:
print('Aborted at %s %s' % (path, v[0]), file=sys.stderr)
raise e
a.put_file(path, k, v[3], self.markseq)
while a in self.changesets:
c = self.changesets[a]
del self.changesets[a]
c.merge(a)
a = c
self.changesets[a] = a
if k in rtags:
for t in rtags[k]:
if t not in self.tags or \
self.tags[t].max_time < a.max_time:
self.tags[t] = a
def file_path(r, p):
if r.endswith('/'):
r = r[:-1]
- path = p[:-2] # drop ",v"
+ if p[-2:] == ',v':
+ path = p[:-2] # drop ",v"
+ else:
+ path = p
p = path.split('/')
    if len(p) > 1 and p[-2] == 'Attic':
path = '/'.join(p[:-2] + [p[-1]])
if path.startswith(r):
path = path[len(r) + 1:]
return path
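# Worked examples (hypothetical paths):
#   file_path('/cvs', '/cvs/module/foo.c,v')       -> 'module/foo.c'
#   file_path('/cvs', '/cvs/module/Attic/foo.c,v') -> 'module/foo.c'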
def git_dump_file(path, k, rcs, markseq):
try:
cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k)
except RuntimeError as msg:
print('Unexpected runtime error on parsing',
path, k, ':', msg, file=sys.stderr)
        print('Raising the resource limit may fix this problem.',
              file=sys.stderr)
sys.exit(1)
output('blob')
output('mark :%d' % markseq)
output('data', len(cont))
output(cont)
class RcsKeywords:
RCS_KW_AUTHOR = (1 << 0)
RCS_KW_DATE = (1 << 1)
RCS_KW_LOG = (1 << 2)
RCS_KW_NAME = (1 << 3)
RCS_KW_RCSFILE = (1 << 4)
RCS_KW_REVISION = (1 << 5)
RCS_KW_SOURCE = (1 << 6)
RCS_KW_STATE = (1 << 7)
RCS_KW_FULLPATH = (1 << 8)
RCS_KW_MDOCDATE = (1 << 9)
RCS_KW_LOCKER = (1 << 10)
RCS_KW_ID = (RCS_KW_RCSFILE | RCS_KW_REVISION | RCS_KW_DATE |
RCS_KW_AUTHOR | RCS_KW_STATE)
RCS_KW_HEADER = (RCS_KW_ID | RCS_KW_FULLPATH)
rcs_expkw = {
b"Author": RCS_KW_AUTHOR,
b"Date": RCS_KW_DATE,
b"Header": RCS_KW_HEADER,
b"Id": RCS_KW_ID,
b"Log": RCS_KW_LOG,
b"Name": RCS_KW_NAME,
b"RCSfile": RCS_KW_RCSFILE,
b"Revision": RCS_KW_REVISION,
b"Source": RCS_KW_SOURCE,
b"State": RCS_KW_STATE,
b"Mdocdate": RCS_KW_MDOCDATE,
b"Locker": RCS_KW_LOCKER
}
RCS_KWEXP_NONE = (1 << 0)
RCS_KWEXP_NAME = (1 << 1) # include keyword name
RCS_KWEXP_VAL = (1 << 2) # include keyword value
RCS_KWEXP_LKR = (1 << 3) # include name of locker
RCS_KWEXP_OLD = (1 << 4) # generate old keyword string
RCS_KWEXP_ERR = (1 << 5) # mode has an error
RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL)
RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR)
def __init__(self):
self.rerecomple()
def rerecomple(self):
pat = b'|'.join(list(self.rcs_expkw.keys()))
self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]")
def add_id_keyword(self, keyword):
self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID
self.rerecomple()
def kflag_get(self, flags):
if flags is None:
return self.RCS_KWEXP_DEFAULT
fl = 0
for fc in flags:
if fc == 'k':
fl |= self.RCS_KWEXP_NAME
elif fc == 'v':
fl |= self.RCS_KWEXP_VAL
elif fc == 'l':
fl |= self.RCS_KWEXP_LKR
elif fc == 'o':
if len(flags) != 1:
fl |= self.RCS_KWEXP_ERR
fl |= self.RCS_KWEXP_OLD
elif fc == 'b':
if len(flags) != 1:
fl |= self.RCS_KWEXP_ERR
fl |= self.RCS_KWEXP_NONE
else:
fl |= self.RCS_KWEXP_ERR
return fl
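    # Examples: kflag_get(None) and kflag_get('kv') both yield
    # RCS_KWEXP_DEFAULT; kflag_get('b') yields RCS_KWEXP_NONE (binary file,
    # no expansion); kflag_get('o') yields RCS_KWEXP_OLD.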
def expand_keyword(self, filename, rcs, r):
rev = rcs.revs[r]
mode = self.kflag_get(rcs.expand)
if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0:
return rcs.checkout(rev[0])
ret = []
for line in rcs.checkout(rev[0]).split(b'\n'):
logbuf = None
m = self.re_kw.match(line)
if m is None:
# No RCS Keywords, use it as it is
ret += [line]
continue
line0 = b''
while m is not None:
try:
dsign = m.end(1) + line[m.end(1):].index(b'$')
except ValueError:
break
prefix = line[:m.start(1) - 1]
line = line[dsign + 1:]
line0 += prefix
expbuf = ''
if (mode & self.RCS_KWEXP_NAME) != 0:
expbuf += '$'
expbuf += m.group(1).decode('ascii')
if (mode & self.RCS_KWEXP_VAL) != 0:
expbuf += ': '
if (mode & self.RCS_KWEXP_VAL) != 0:
expkw = self.rcs_expkw[m.group(1)]
if (expkw & self.RCS_KW_RCSFILE) != 0:
expbuf += filename \
if (expkw & self.RCS_KW_FULLPATH) != 0 \
else os.path.basename(filename)
expbuf += " "
if (expkw & self.RCS_KW_REVISION) != 0:
expbuf += rev[0]
expbuf += " "
if (expkw & self.RCS_KW_DATE) != 0:
expbuf += time.strftime(
"%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1]))
if (expkw & self.RCS_KW_MDOCDATE) != 0:
d = time.gmtime(rev[1])
expbuf += time.strftime(
"%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d)
if (expkw & self.RCS_KW_AUTHOR) != 0:
expbuf += rev[2]
expbuf += " "
if (expkw & self.RCS_KW_STATE) != 0:
expbuf += rev[3]
expbuf += " "
if (expkw & self.RCS_KW_LOG) != 0:
p = prefix
expbuf += filename \
if (expkw & self.RCS_KW_FULLPATH) != 0 \
else os.path.basename(filename)
expbuf += " "
logbuf = p + (
'Revision %s %s %s\n' % (
rev[0], time.strftime(
"%Y/%m/%d %H:%M:%S", time.gmtime(rev[1])),
rev[2])).encode('ascii')
for lline in rcs.getlog(rev[0]).rstrip().split(b'\n'):
if len(lline) == 0:
logbuf += p.rstrip() + b'\n'
else:
logbuf += p + lline.lstrip() + b'\n'
if len(line) == 0:
logbuf += p.rstrip()
else:
logbuf += p + line.lstrip()
line = b''
if (expkw & self.RCS_KW_SOURCE) != 0:
expbuf += filename
expbuf += " "
if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0:
expbuf += " "
if (mode & self.RCS_KWEXP_NAME) != 0:
expbuf += '$'
line0 += expbuf[:255].encode('ascii')
m = self.re_kw.match(line)
ret += [line0 + line]
if logbuf is not None:
ret += [logbuf]
return b'\n'.join(ret)
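    # Expansion sketch for the default 'kv' mode (hypothetical revision data):
    # a '$Id$' marker in revision 1.2 of foo.c, committed by 'anon', comes back
    # as '$Id: foo.c,v 1.2 2021/01/01 00:00:00 anon Exp $'.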
# ----------------------------------------------------------------------
# entry point
# ----------------------------------------------------------------------
if __name__ == '__main__':
main()
diff --git a/swh/loader/cvs/cvsclient.py b/swh/loader/cvs/cvsclient.py
new file mode 100644
index 0000000..e670f96
--- /dev/null
+++ b/swh/loader/cvs/cvsclient.py
@@ -0,0 +1,334 @@
+# Copyright (C) 2015-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Minimal CVS client implementation
+
+"""
+
+import socket
+import subprocess
+import os.path
+import tempfile
+import re
+
+from swh.loader.exception import NotFound
+
+CVS_PSERVER_PORT = 2401
+CVS_PROTOCOL_BUFFER_SIZE = 8192
+EXAMPLE_PSERVER_URL = "pserver://user:password@cvs.example.com/cvsroot/repository"
+EXAMPLE_SSH_URL = "ssh://user@cvs.example.com/cvsroot/repository"
+
+VALID_RESPONSES = [ "ok", "error", "Valid-requests", "Checked-in",
+ "New-entry", "Checksum", "Copy-file", "Updated", "Created",
+ "Update-existing", "Merged", "Patched", "Rcs-diff", "Mode",
+ "Removed", "Remove-entry", "Template", "Notified", "Module-expansion",
+ "Wrapper-rcsOption", "M", "Mbinary", "E", "F", "MT" ]
+
+# Trivially encode strings to protect them from innocent eyes (i.e.,
+# inadvertent password compromises, like a network administrator
+# who's watching packets for legitimate reasons and accidentally sees
+# the password protocol go by).
+#
+# This is NOT secure encryption.
+def scramble_password(password):
+ s = ['A'] # scramble scheme version number
+ scramble_shifts = [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 114,120, 53, 79, 96,109, 72,108, 70, 64, 76, 67,116, 74, 68, 87,
+ 111, 52, 75,119, 49, 34, 82, 81, 95, 65,112, 86,118,110,122,105,
+ 41, 57, 83, 43, 46,102, 40, 89, 38,103, 45, 50, 42,123, 91, 35,
+ 125, 55, 54, 66,124,126, 59, 47, 92, 71,115, 78, 88,107,106, 56,
+ 36,121,117,104,101,100, 69, 73, 99, 63, 94, 93, 39, 37, 61, 48,
+ 58,113, 32, 90, 44, 98, 60, 51, 33, 97, 62, 77, 84, 80, 85,223,
+ 225,216,187,166,229,189,222,188,141,249,148,200,184,136,248,190,
+ 199,170,181,204,138,232,218,183,255,234,220,247,213,203,226,193,
+ 174,172,228,252,217,201,131,230,197,211,145,238,161,179,160,212,
+ 207,221,254,173,202,146,224,151,140,196,205,130,135,133,143,246,
+ 192,159,244,239,185,168,215,144,139,165,180,157,147,186,214,176,
+ 227,231,219,169,175,156,206,198,129,164,150,210,154,177,134,127,
+ 182,128,158,208,162,132,167,209,149,241,153,251,237,236,171,195,
+ 243,233,253,240,194,250,191,155,142,137,245,235,163,242,178,152 ]
+ for c in password:
+ s.append('%c' % scramble_shifts[ord(c)])
+ return "".join(s)
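+# Round-trip sketch: the shift table above is an involution over the printable
+# range, so scrambling the scrambled text (minus the leading 'A' version byte)
+# restores the original password:
+#
+#   assert scramble_password(scramble_password("secret")[1:])[1:] == "secret"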
+
+
+class CVSProtocolError(Exception):
+ pass
+
+_re_kb_opt = re.compile(rb'/-kb/')
+
+class CVSClient:
+
+ def connect_pserver(self, hostname, port, auth):
+        if port is None:
+            port = CVS_PSERVER_PORT
+        if auth is None:
+            raise NotFound("Username and password are required for a pserver connection: %s" % EXAMPLE_PSERVER_URL)
+ try:
+ user = auth.split(':')[0]
+ password = auth.split(':')[1]
+ except IndexError:
+ raise NotFound("Username and password are required for a pserver connection: %s" % EXAMPLE_PSERVER_URL)
+
+ try:
+ self.socket = socket.create_connection((hostname, port))
+ except ConnectionRefusedError:
+            raise NotFound("Could not connect to %s:%s" % (hostname, port))
+
+ scrambled_password = scramble_password(password)
+ request = "BEGIN AUTH REQUEST\n%s/%s\n%s\n%s\nEND AUTH REQUEST\n" \
+ % (self.cvsroot_path, self.cvs_module_name, user, scrambled_password)
+ self.socket.sendall(request.encode('UTF-8'))
+
+ response = self.socket.recv(11)
+ if response != b"I LOVE YOU\n":
+ raise NotFound("pserver authentication failed for %s:%s" % (hostname, port))
+
+ def connect_ssh(self, hostname, port, auth):
+ command = [ 'ssh' ]
+        if auth is not None:
+ # Assume 'auth' contains only a user name.
+ # We do not support password authentication with SSH since the
+ # anoncvs user is usually granted access without a password.
+ command += [ '-l' , '%s' % auth ]
+        if port is not None:
+ command += [ '-p' , '%d' % port ]
+
+ # accept new SSH hosts keys upon first use; changed host keys will require intervention
+ command += ['-o', "StrictHostKeyChecking=accept-new" ]
+
+ # disable interactive prompting
+ command += ['-o', "BatchMode=yes" ]
+
+ # disable further option processing by adding '--'
+ command += [ '--' ]
+
+ command += ['%s' % hostname, 'cvs', 'server']
+ self.ssh = subprocess.Popen(command,
+ bufsize=0, # use non-buffered I/O to match behaviour of self.socket
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+
+ def connect_fake(self, hostname, port, auth):
+ command = [ 'cvs', 'server' ]
+ self.ssh = subprocess.Popen(command,
+ bufsize=0, # use non-buffered I/O to match behaviour of self.socket
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+
+ def conn_read_line(self, require_newline=True):
+ if len(self.linebuffer) != 0:
+ return self.linebuffer.pop(0)
+ buf = b''
+ idx = -1
+ while idx == -1:
+ if len(buf) >= CVS_PROTOCOL_BUFFER_SIZE:
+ if require_newline:
+ raise CVSProtocolError("Overlong response from CVS server: %s" % buf)
+ else:
+ break
+ if self.socket:
+ buf += self.socket.recv(CVS_PROTOCOL_BUFFER_SIZE)
+ elif self.ssh:
+ buf += self.ssh.stdout.read(CVS_PROTOCOL_BUFFER_SIZE)
+ else:
+ raise Exception("No valid connection")
+ if not buf:
+ return None
+ idx = buf.rfind(b'\n')
+ if idx != -1:
+ self.linebuffer = buf[:idx + 1].splitlines(keepends=True)
+ else:
+ if require_newline:
+ raise CVSProtocolError("Invalid response from CVS server: %s" % buf)
+ else:
+ self.linebuffer.append(buf)
+ if len(self.incomplete_line) > 0:
+ self.linebuffer[0] = self.incomplete_line + self.linebuffer[0]
+ if idx != -1:
+ self.incomplete_line = buf[idx + 1:]
+ else:
+ self.incomplete_line = b''
+ return self.linebuffer.pop(0)
+
+ def conn_write(self, data):
+ if self.socket:
+ return self.socket.sendall(data)
+ if self.ssh:
+ self.ssh.stdin.write(data)
+ return self.ssh.stdin.flush()
+ raise Exception("No valid connection")
+
+ def conn_write_str(self, s):
+ return self.conn_write(s.encode('UTF-8'))
+
+ def conn_close(self):
+ if self.socket:
+ self.socket.close()
+ if self.ssh:
+ self.ssh.kill()
+ try:
+ self.ssh.wait(timeout=10)
+        except subprocess.TimeoutExpired as e:
+            raise CVSProtocolError("Could not terminate ssh program: %s" % e)
+
+ def __init__(self, url):
+ """
+ Connect to a CVS server at the specified URL and perform the initial
+ CVS protocol handshake.
+ """
+ self.hostname = url.host
+ self.cvsroot_path = os.path.dirname(url.path)
+ self.cvs_module_name = os.path.basename(url.path)
+ self.socket = None
+ self.ssh = None
+ self.linebuffer = list()
+ self.incomplete_line = b''
+
+ if url.scheme == 'pserver':
+ self.connect_pserver(url.host, url.port, url.auth)
+ elif url.scheme == 'ssh':
+ self.connect_ssh(url.host, url.port, url.auth)
+ elif url.scheme == 'fake':
+ self.connect_fake(url.host, url.port, url.auth)
+ else:
+ raise NotFound("Invalid CVS origin URL '%s'" % url)
+
+ # we should have a connection now
+ assert self.socket or self.ssh
+
+ self.conn_write_str("Root %s\nValid-responses %s\nvalid-requests\nUseUnchanged\n" % \
+ (self.cvsroot_path, ' '.join(VALID_RESPONSES)))
+ response = self.conn_read_line()
+ if not response:
+ raise CVSProtocolError("No response from CVS server")
+ try:
+ if response[0:15] != b"Valid-requests ":
+ raise CVSProtocolError("Invalid response from CVS server: %s" % response)
+ except IndexError:
+ raise CVSProtocolError("Invalid response from CVS server: %s" % response)
+ response = self.conn_read_line()
+ if response != b"ok\n":
+ raise CVSProtocolError("Invalid response from CVS server: %s" % response)
+
+ def __del__(self):
+ self.conn_close()
+
+ def _parse_rlog_response(self, fp):
+ rlog_output = tempfile.TemporaryFile()
+ expect_error = False
+ for line in fp.readlines():
+ if expect_error:
+ raise CVSProtocolError('CVS server error: %s' % line)
+ if line == b'ok\n':
+ break
+ elif line == b'M \n':
+ continue
+ elif line[0:2] == b'M ':
+ rlog_output.write(line[2:])
+ elif line[0:8] == b'MT text ':
+ rlog_output.write(line[8:-1])
+ elif line[0:8] == b'MT date ':
+ rlog_output.write(line[8:-1])
+ elif line[0:10] == b'MT newline':
+ rlog_output.write(line[10:])
+            elif line[0:6] == b'error ':
+                expect_error = True
+ continue
+ else:
+ raise CVSProtocolError('Bad CVS protocol response: %s' % line)
+ rlog_output.seek(0)
+ return rlog_output
+
+
+ def fetch_rlog(self):
+ fp = tempfile.TemporaryFile()
+ self.conn_write_str("Global_option -q\nArgument --\nArgument %s\nrlog\n" % \
+ self.cvs_module_name)
+ while True:
+ response = self.conn_read_line()
+            if response is None:
+ raise CVSProtocolError("No response from CVS server")
+ if response[0:2] == b"E ":
+ raise CVSProtocolError("Error response from CVS server: %s" % response)
+ fp.write(response)
+ if response == b"ok\n":
+ break
+ fp.seek(0)
+ return self._parse_rlog_response(fp)
+
+ def checkout(self, path, rev, dest_dir):
+ skip_line = False
+ expect_modeline = False
+ expect_bytecount = False
+ have_bytecount = False
+ bytecount = 0
+ dirname = os.path.dirname(path)
+ if dirname:
+ self.conn_write_str("Directory %s\n%s\n" % (dirname, dirname))
+ filename = os.path.basename(path)
+ co_output = tempfile.NamedTemporaryFile(dir=dest_dir, delete=True,
+ prefix='cvsclient-checkout-%s-r%s-' % (filename, rev))
+ # TODO: cvs <= 1.10 servers expect to be given every Directory along the path.
+ self.conn_write_str("Directory %s\n%s\n"
+ "Global_option -q\n"
+ "Argument -r%s\n"
+ "Argument -kb\n"
+ "Argument --\nArgument %s\nco \n" % (self.cvs_module_name,
+ self.cvs_module_name, rev, path))
+ while True:
+ if have_bytecount and bytecount > 0:
+ response = self.conn_read_line(require_newline=False)
+                if response is None:
+ raise CVSProtocolError("Incomplete response from CVS server")
+ co_output.write(response)
+ bytecount -= len(response)
+ if bytecount < 0:
+ raise CVSProtocolError("Overlong response from CVS server: %s" % response)
+ continue
+ else:
+ response = self.conn_read_line()
+ if response[0:2] == b'E ':
+ raise CVSProtocolError('Error from CVS server: %s' % response)
+ if have_bytecount and bytecount == 0 and response == b'ok\n':
+ break
+ if skip_line:
+ skip_line = False
+ continue
+ elif expect_bytecount:
+ try:
+ bytecount = int(response[0:-1]) # strip trailing \n
+ except ValueError:
+ raise CVSProtocolError('Bad CVS protocol response: %s' % response)
+ have_bytecount = True
+ continue
+ elif response == b'M \n':
+ continue
+ elif response == b'MT +updated\n':
+ continue
+ elif response == b'MT -updated\n':
+ continue
+ elif response[0:9] == b'MT fname ':
+ continue
+ elif response[0:8] == b'Created ':
+ skip_line = True
+ continue
+ elif response[0:1] == b'/' and _re_kb_opt.search(response):
+ expect_modeline = True
+ continue
+ elif expect_modeline and response[0:2] == b'u=':
+ expect_modeline = False
+ expect_bytecount = True
+ continue
+ elif response[0:2] == b'M ':
+ continue
+ elif response[0:8] == b'MT text ':
+ continue
+ elif response[0:10] == b'MT newline':
+ continue
+ else:
+ raise CVSProtocolError('Bad CVS protocol response: %s' % response)
+ co_output.seek(0)
+ return co_output
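
A minimal usage sketch for CVSClient (hypothetical host and module; it assumes
urllib3's parse_url, as used by loader.py below):

    from urllib3.util import parse_url
    from swh.loader.cvs.cvsclient import CVSClient

    url = parse_url("pserver://anonymous:anoncvs@cvs.example.com/cvsroot/module")
    client = CVSClient(url)          # connects and performs the handshake
    rlog_file = client.fetch_rlog()  # temporary file holding the full rlog
    fp = client.checkout("module/file.c", "1.1", "/tmp")  # one file revision
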
diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py
index d5c7ae2..e7a187f 100644
--- a/swh/loader/cvs/loader.py
+++ b/swh/loader/cvs/loader.py
@@ -1,372 +1,466 @@
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Loader in charge of injecting either new or existing cvs repositories to
swh-storage.
"""
from datetime import datetime
import os
import subprocess
import tempfile
import time
from typing import Iterator, List, Optional, Sequence, Tuple
from urllib3.util import parse_url
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders
from swh.loader.exception import NotFound
import swh.loader.cvs.rcsparse as rcsparse
+import swh.loader.cvs.cvsclient as cvsclient
+from swh.loader.cvs.rlog import RlogConv
from swh.loader.cvs.cvs2gitdump.cvs2gitdump import CvsConv, RcsKeywords, CHANGESET_FUZZ_SEC, file_path, ChangeSetKey
from swh.model import from_disk, hashutil
from swh.model.model import Person, Revision, RevisionType, TimestampWithTimezone
from swh.model.model import (
Content,
Directory,
Origin,
Revision,
SkippedContent,
Snapshot,
SnapshotBranch,
TargetType,
)
from swh.storage.interface import StorageInterface
DEFAULT_BRANCH = b"HEAD"
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.cvs."
class CvsLoader(BaseLoader):
"""Swh cvs loader.
    The repository is local. The loader deals with
    updates to a previously loaded repository.
"""
visit_type = "cvs"
def __init__(
self,
storage: StorageInterface,
url: str,
origin_url: Optional[str] = None,
visit_date: Optional[datetime] = None,
cvsroot_path: Optional[str] = None,
temp_directory: str = "/tmp",
max_content_size: Optional[int] = None,
):
super().__init__(
storage=storage,
logging_class="swh.loader.cvs.CvsLoader",
max_content_size=max_content_size,
)
self.cvsroot_url = url
# origin url as unique identifier for origin in swh archive
self.origin_url = origin_url if origin_url else self.cvsroot_url
self.temp_directory = temp_directory
self.done = False
+
self.cvs_module_name = None
- self.cvs_module_path = None
+
+ # XXX At present changeset IDs are recomputed on the fly during every visit.
+    # XXX At present changeset IDs are recomputed on the fly during every visit.
+    # If we were able to maintain a cache somewhere which can be indexed by a
+ # cvs2gitdump.ChangeSetKey and yields an SWH revision hash we could avoid
+ # doing a lot of redundant work during every visit.
+
self.cvs_changesets = None
+
+ # remote CVS repository access (history is parsed from CVS rlog):
+ self.cvsclient = None
+ self.rlog_file = None
+
# internal state used to store swh objects
self._contents: List[Content] = []
self._skipped_contents: List[SkippedContent] = []
self._directories: List[Directory] = []
self._revisions: List[Revision] = []
self.swh_revision_gen = None
# internal state, current visit
self._last_revision = None
self._visit_status = "full"
self._load_status = "uneventful"
self.visit_date = visit_date
self.cvsroot_path = cvsroot_path
self.snapshot = None
+ def compute_swh_revision(self, k, logmsg):
+ """Compute swh hash data per CVS changeset.
+
+ Returns:
+ tuple (rev, swh_directory)
+ - rev: current SWH revision computed from checked out work tree
+ - swh_directory: dictionary of path, swh hash data with type
+
+ """
+ # Compute SWH revision from the on-disk state
+ swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path))
+ if self._last_revision:
+ parents = tuple([bytes(self._last_revision.id)])
+ else:
+ parents = ()
+ revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents)
+ self.log.debug("SWH revision ID: %s" % hashutil.hash_to_hex(revision.id))
+ self._last_revision = revision
+ if self._load_status == "uneventful":
+ # We have an eventful load if this revision is not already present in the archive
+ if not self.storage.revision_get([revision.id])[0]:
+ self._load_status = "eventful"
+ return (revision, swh_dir)
+
def swh_hash_data_per_cvs_changeset(self):
"""Compute swh hash data per CVS changeset.
Yields:
tuple (rev, swh_directory)
- rev: current SWH revision computed from checked out work tree
- swh_directory: dictionary of path, swh hash data with type
"""
- # XXX At present changeset IDs are recomputed on the fly during every visit.
- # If we were able to maintain a cached somewhere which can be indexed by a
- # cvs2gitdump.ChangeSetKey and yields an SWH revision hash we could avoid
- # doing a lot of redundant work during every visit.
for k in self.cvs_changesets:
tstr = time.strftime('%c', time.gmtime(k.max_time))
self.log.info("changeset from %s by %s on branch %s", tstr, k.author, k.branch);
logmsg = ""
# Check out the on-disk state of this revision
for f in k.revs:
rcsfile = None
path = file_path(self.cvsroot_path, f.path)
wtpath = os.path.join(self.worktree_path, path)
self.log.info("rev %s of file %s" % (f.rev, f.path));
if not logmsg:
rcsfile = rcsparse.rcsfile(f.path)
logmsg = rcsfile.getlog(k.revs[0].rev)
if f.state == 'dead':
# remove this file from work tree
try:
os.remove(wtpath)
except FileNotFoundError:
pass
else:
# create, or update, this file in the work tree
if not rcsfile:
rcsfile = rcsparse.rcsfile(f.path)
rcs = RcsKeywords()
contents = rcs.expand_keyword(f.path, rcsfile, f.rev)
try:
outfile = open(wtpath, mode='wb')
except FileNotFoundError:
os.makedirs(os.path.dirname(wtpath))
outfile = open(wtpath, mode='wb')
outfile.write(contents)
outfile.close()
- # Compute SWH revision from the on-disk state
- swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path))
- if self._last_revision:
- parents = tuple([bytes(self._last_revision.id)])
- else:
- parents = ()
- revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents)
- self.log.debug("SWH revision ID: %s" % hashutil.hash_to_hex(revision.id))
- self._last_revision = revision
- if self._load_status == "uneventful":
- # We have an eventful load if this revision is not already present in the archive
- if not self.storage.revision_get([revision.id])[0]:
- self._load_status = "eventful"
-
+ (revision, swh_dir) = self.compute_swh_revision(k, logmsg)
yield revision, swh_dir
+ def swh_hash_data_per_cvs_rlog_changeset(self):
+ """Compute swh hash data per CVS rlog changeset.
+
+ Yields:
+ tuple (rev, swh_directory)
+ - rev: current SWH revision computed from checked out work tree
+ - swh_directory: dictionary of path, swh hash data with type
+
+ """
+ for k in self.cvs_changesets:
+ tstr = time.strftime('%c', time.gmtime(k.max_time))
+            self.log.info("changeset from %s by %s on branch %s", tstr, k.author, k.branch)
+ logmsg = ""
+ # Check out the on-disk state of this revision
+ for f in k.revs:
+ path = file_path(self.cvsroot_path, f.path)
+ wtpath = os.path.join(self.worktree_path, path)
+                self.log.info("rev %s of file %s" % (f.rev, f.path))
+ if not logmsg:
+ logmsg = self.rlog.getlog(self.rlog_file, f.path, k.revs[0].rev)
+ self.log.debug("f.state is %s\n" % f.state)
+ if f.state == 'dead':
+ # remove this file from work tree
+ try:
+ os.remove(wtpath)
+ except FileNotFoundError:
+ pass
+ else:
+ dirname = os.path.dirname(wtpath)
+ try:
+ os.makedirs(dirname)
+ except FileExistsError:
+ pass
+ self.log.debug("checkout to %s\n" % wtpath)
+ fp = self.cvsclient.checkout(f.path, f.rev, dirname)
+ os.rename(fp.name, wtpath)
+ try:
+ fp.close()
+ except FileNotFoundError:
+ # Well, we have just renamed the file...
+ pass
+
+ # TODO: prune empty directories?
+ (revision, swh_dir) = self.compute_swh_revision(k, logmsg)
+ yield revision, swh_dir
def process_cvs_changesets(self) -> Iterator[
Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
]:
"""Process CVS revisions.
At each CVS revision, check out contents and compute swh hashes.
Yields:
tuple (contents, skipped-contents, directories, revision) of dict as a
dictionary with keys, sha1_git, sha1, etc...
"""
for swh_revision, swh_dir in self.swh_hash_data_per_cvs_changeset():
# Send the associated contents/directories
(_contents, _skipped_contents, _directories) = from_disk.iter_directory(swh_dir)
yield _contents, _skipped_contents, _directories, swh_revision
+ def process_cvs_rlog_changesets(self) -> Iterator[
+ Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
+ ]:
+ """Process CVS rlog revisions.
+
+ At each CVS revision, check out contents and compute swh hashes.
+
+ Yields:
+ tuple (contents, skipped-contents, directories, revision) of dict as a
+ dictionary with keys, sha1_git, sha1, etc...
+
+ """
+ for swh_revision, swh_dir in self.swh_hash_data_per_cvs_rlog_changeset():
+ # Send the associated contents/directories
+ (_contents, _skipped_contents, _directories) = from_disk.iter_directory(swh_dir)
+ yield _contents, _skipped_contents, _directories, swh_revision
def prepare_origin_visit(self):
self.origin = Origin(url=self.origin_url if self.origin_url else self.cvsroot_url)
def pre_cleanup(self):
"""Cleanup potential dangling files from prior runs (e.g. OOM killed
tasks)
"""
clean_dangling_folders(
self.temp_directory,
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
log=self.log,
)
def cleanup(self):
self.log.info("cleanup")
def fetch_cvs_repo_with_rsync(self, host, path):
# URL *must* end with a trailing slash in order to get CVSROOT listed
url = 'rsync://%s%s/' % (host, os.path.dirname(path))
rsync = subprocess.run(['rsync', url], capture_output=True, encoding='ascii')
rsync.check_returncode()
have_cvsroot = False
have_module = False
for line in rsync.stdout.split('\n'):
self.log.debug("rsync server: %s" % line)
if line.endswith(' CVSROOT'):
have_cvsroot = True
elif line.endswith(' %s' % self.cvs_module_name):
have_module = True
if have_module and have_cvsroot:
break
if not have_module:
raise NotFound("CVS module %s not found at %s" \
% (self.cvs_module_name, host, url))
if not have_cvsroot:
raise NotFound("No CVSROOT directory found at %s" % url)
rsync = subprocess.run(['rsync', '-a', url, self.cvsroot_path])
rsync.check_returncode()
def prepare(self):
self._last_revision = None
self._load_status = "uneventful"
self.swh_revision_gen = None
if not self.cvsroot_path:
self.cvsroot_path = tempfile.mkdtemp(
suffix="-%s" % os.getpid(),
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
dir=self.temp_directory,
)
self.worktree_path = tempfile.mkdtemp(
suffix="-%s" % os.getpid(),
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
dir=self.temp_directory,
)
url = parse_url(self.origin_url)
self.log.debug("prepare; origin_url=%s scheme=%s path=%s" % (self.origin_url, url.scheme, url.path))
if not url.path:
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)
self.cvs_module_name = os.path.basename(url.path)
        os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name))
- self.cvs_module_path = os.path.join(self.cvsroot_path, self.cvs_module_name)
if url.scheme == 'file':
if not os.path.exists(url.path):
raise NotFound
elif url.scheme == 'rsync':
- self.fetch_cvs_repo_with_rsync(url.host, url.path)
+ self.fetch_cvs_repo_with_rsync(url.host, url.path)
+
+ if url.scheme == 'file' or url.scheme == 'rsync':
+ # local CVS repository conversion
+ have_rcsfile = False
+ have_cvsroot = False
+ for root, dirs, files in os.walk(self.cvsroot_path):
+ if 'CVSROOT' in dirs:
+ have_cvsroot = True
+ dirs.remove('CVSROOT')
+                continue
+ for f in files:
+ filepath = os.path.join(root, f)
+ if f[-2:] == ',v':
+ try:
+ rcsfile = rcsparse.rcsfile(filepath)
+                    except Exception:
+ raise
+ else:
+ self.log.debug("Looks like we have data to convert; "
+ "found a valid RCS file at %s" % filepath)
+ have_rcsfile = True
+ break
+ if have_rcsfile:
+                break
+
+ if not have_rcsfile:
+                raise NotFound("Directory %s does not contain any valid RCS files" % self.cvsroot_path)
+ if not have_cvsroot:
+ self.log.warn("The CVS repository at '%s' lacks a CVSROOT directory; "
+ "we might be ingesting an incomplete copy of the repository" % self.cvsroot_path)
+
+ # Unfortunately, there is no way to convert CVS history in an iterative fashion
+ # because the data is not indexed by any kind of changeset ID. We need to walk
+ # the history of each and every RCS file in the repository during every visit,
+ # even if no new changes will be added to the SWH archive afterwards.
+ # "CVS’s repository is the software equivalent of a telephone book sorted by telephone number."
+ # https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/
+ #
+ # An implicit assumption made here is that self.cvs_changesets will fit into
+ # memory in its entirety. If it won't fit then the CVS walker will need to
+ # be modified such that it spools the list of changesets to disk instead.
+ cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC)
+ self.log.info("Walking CVS module %s", self.cvs_module_name)
+ cvs.walk(self.cvs_module_name)
+ self.cvs_changesets = sorted(cvs.changesets)
+ self.log.info('CVS changesets found in %s: %d' % (self.cvs_module_name, len(self.cvs_changesets)))
+ self.swh_revision_gen = self.process_cvs_changesets()
+ elif url.scheme == 'pserver' or url.scheme == 'fake':
+ # remote CVS repository conversion
+ self.cvsclient = cvsclient.CVSClient(url)
+ cvsroot_path = os.path.dirname(url.path)
+ self.log.info("Fetching CVS rlog from %s:%s/%s", url.host, cvsroot_path, self.cvs_module_name)
+ self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC)
+ self.rlog_file = self.cvsclient.fetch_rlog()
+ self.rlog.parse_rlog(self.rlog_file)
+ self.cvs_changesets = sorted(self.rlog.changesets)
+ self.log.info('CVS changesets found for %s: %d' % (self.cvs_module_name, len(self.cvs_changesets)))
+ self.swh_revision_gen = self.process_cvs_rlog_changesets()
else:
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)
- have_rcsfile = False
- have_cvsroot = False
- for root, dirs, files in os.walk(self.cvsroot_path):
- if 'CVSROOT' in dirs:
- have_cvsroot = True
- dirs.remove('CVSROOT')
- continue;
- for f in files:
- filepath = os.path.join(root, f)
- if f[-2:] == ',v':
- try:
- rcsfile = rcsparse.rcsfile(filepath)
- except(Exception):
- raise
- else:
- self.log.debug("Looks like we have data to convert; "
- "found a valid RCS file at %s" % filepath)
- have_rcsfile = True
- break
- if have_rcsfile:
- break;
-
- if not have_rcsfile:
- raise NotFound("Directory %s does not contain any valid RCS files %s" % self.cvsroot_path)
- if not have_cvsroot:
- self.log.warn("The CVS repository at '%s' lacks a CVSROOT directory; "
- "we might be ingesting an incomplete copy of the repository" % self.cvsroot_path)
-
- # Unfortunately, there is no way to convert CVS history in an iterative fashion
- # because the data is not indexed by any kind of changeset ID. We need to walk
- # the history of each and every RCS file in the repository during every visit,
- # even if no new changes will be added to the SWH archive afterwards.
- # "CVS’s repository is the software equivalent of a telephone book sorted by telephone number."
- # https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/
- #
- # An implicit assumption made here is that self.cvs_changesets will fit into
- # memory in its entirety. If it won't fit then the CVS walker will need to
- # be modified such that it spools the list of changesets to disk instead.
- cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC)
- self.log.info("Walking CVS module %s", self.cvs_module_name)
- cvs.walk(self.cvs_module_name)
- self.cvs_changesets = sorted(cvs.changesets)
- self.log.info('CVS changesets found in %s: %d' % (self.cvs_module_name, len(self.cvs_changesets)))
- # SWH revisions are generated and stored iteratively to avoid high memory consumption
- self.swh_revision_gen = self.process_cvs_changesets()
+
def fetch_data(self):
"""Fetch the next CVS revision."""
try:
data = next(self.swh_revision_gen)
except StopIteration:
return False
except Exception as e:
self.log.exception(e)
return False # Stopping iteration
self._contents, self._skipped_contents, self._directories, rev = data
self._revisions = [rev]
return True
def build_swh_revision(self,
k: ChangeSetKey, logmsg: bytes, dir_id: bytes, parents: Sequence[bytes]
) -> Revision:
"""Given a CVS revision, build a swh revision.
Args:
k: changeset data
logmsg: the changeset's log message
dir_id: the tree's hash identifier
parents: the revision's parents identifier
Returns:
The swh revision dictionary.
"""
author = Person.from_fullname(k.author.encode('UTF-8'))
date = TimestampWithTimezone.from_datetime(k.max_time)
return Revision(
type=RevisionType.CVS,
date=date,
committer_date=date,
directory=dir_id,
message=logmsg,
author=author,
committer=author,
synthetic=True,
extra_headers=[],
parents=tuple(parents))
def generate_and_load_snapshot(self, revision) -> Snapshot:
"""Create the snapshot either from existing revision.
Args:
revision (dict): Last revision seen if any (None by default)
Returns:
Optional[Snapshot] The newly created snapshot
"""
snap = Snapshot(
branches={
DEFAULT_BRANCH: SnapshotBranch(
target=revision.id, target_type=TargetType.REVISION
)
}
)
self.log.debug("snapshot: %s" % snap)
self.storage.snapshot_add([snap])
return snap
def store_data(self):
"Add our current CVS changeset to the archive."
self.storage.skipped_content_add(self._skipped_contents)
self.storage.content_add(self._contents)
self.storage.directory_add(self._directories)
self.storage.revision_add(self._revisions)
self.snapshot = self.generate_and_load_snapshot(self._last_revision)
self.log.debug("SWH snapshot ID: %s" % hashutil.hash_to_hex(self.snapshot.id))
self.flush()
self.loaded_snapshot_id = self.snapshot.id
self._skipped_contents = []
self._contents = []
self._directories = []
self._revisions = []
def load_status(self):
return {
"status": self._load_status,
}
def visit_status(self):
return self._visit_status
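
End to end, the loader follows the usual BaseLoader flow (sketch; storage is an
initialized StorageInterface and the repository URL is hypothetical):

    loader = CvsLoader(storage, "rsync://cvs.example.com/cvsroot/module")
    status = loader.load()  # prepare() -> fetch_data()/store_data() loop
    assert status["status"] in ("eventful", "uneventful", "failed")
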
diff --git a/swh/loader/cvs/rlog.py b/swh/loader/cvs/rlog.py
new file mode 100644
index 0000000..1a046c3
--- /dev/null
+++ b/swh/loader/cvs/rlog.py
@@ -0,0 +1,391 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+""" RCS/CVS rlog parser, derived from viewvc and cvs2gitdump.py """
+
+# Copyright (C) 1999-2021 The ViewCVS Group. All Rights Reserved.
+#
+# By using ViewVC, you agree to the terms and conditions set forth
+# below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following
+# disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Copyright (c) 2012 YASUOKA Masahiko <yasuoka@yasuoka.net>
+#
+# Permission to use, copy, modify, and distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+import re
+import calendar
+import time
+
+from swh.loader.cvs.cvs2gitdump.cvs2gitdump import CHANGESET_FUZZ_SEC, file_path, ChangeSetKey
+
+# TODO: actual path encoding should be specified as a parameter
+path_encodings = [ 'ascii', 'utf-8' ]
+
+class RlogConv:
+ def __init__(self, cvsroot_path, fuzzsec):
+ self.cvsroot_path = cvsroot_path
+ self.fuzzsec = fuzzsec
+ self.changesets = dict()
+ self.tags = dict()
+ self.offsets = dict()
+
+ def _process_rlog_entry(self, path, taginfo, revisions, logmsgs):
+ """ Convert an rlog entry into an item in self.changesets """
+ rtags = dict()
+ branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
+ for k, v in list(taginfo.items()):
+ r = v.split('.')
+ if len(r) == 3:
+ branches[v] = 'VENDOR'
+ elif len(r) >= 3 and r[-2] == '0':
+ branches['.'.join(r[:-2] + r[-1:])] = k
+ if len(r) == 2 and branches[r[0]] == 'HEAD':
+ if v not in rtags:
+ rtags[v] = list()
+ rtags[v].append(k)
+
+ revs = revisions.items()
+        # sort by revision in descending order to prioritize 1.1.1.1 over 1.1
+ revs = sorted(revs, key=lambda a: a[1][0], reverse=True)
+ # sort by time
+ revs = sorted(revs, key=lambda a: a[1][1])
+ novendor = False
+ have_initial_revision = False
+ last_vendor_status = None
+ for k, v in revs:
+ r = k.split('.')
+ if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
+ and r[3] == '1':
+ if have_initial_revision:
+ continue
+ if v[3] == 'dead':
+ continue
+ last_vendor_status = v[3]
+ have_initial_revision = True
+ elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1':
+ if novendor:
+ continue
+ last_vendor_status = v[3]
+ elif len(r) == 2:
+ if r[0] == '1' and r[1] == '1':
+ if have_initial_revision:
+ continue
+ if v[3] == 'dead':
+ continue
+ have_initial_revision = True
+ elif r[0] == '1' and r[1] != '1':
+ novendor = True
+ if last_vendor_status == 'dead' and v[3] == 'dead':
+ last_vendor_status = None
+ continue
+ last_vendor_status = None
+ else:
+ # trunk only
+ continue
+
+ b = '.'.join(r[:-1])
+ # decode author name in a potentially lossy way;
+ # it is only used for internal hashing in this case
+ author = v[2].decode('utf-8', 'ignore')
+ a = ChangeSetKey(
+ branches[b], author, v[1], logmsgs[k], v[6],
+ self.fuzzsec)
+
+ a.put_file(path, k, v[3], 0)
+ while a in self.changesets:
+ c = self.changesets[a]
+ del self.changesets[a]
+ c.merge(a)
+ a = c
+ self.changesets[a] = a
+ if k in rtags:
+ for t in rtags[k]:
+ if t not in self.tags or \
+ self.tags[t].max_time < a.max_time:
+ self.tags[t] = a
+
+ def parse_rlog(self, fp):
+ eof = None
+ while eof != _EOF_LOG and eof != _EOF_ERROR:
+ filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp)
+ revisions = {}
+ logmsgs = {}
+ if filename:
+ for i, e in enumerate(path_encodings):
+ try:
+ how = 'ignore' if i == len(path_encodings) - 1 else 'strict'
+ fname = filename.decode(e, how)
+ break
+ except UnicodeError:
+ pass
+ while not eof:
+ off = fp.tell()
+ rev, logmsg, eof = _parse_log_entry(fp)
+ if rev:
+ revisions[rev[0]] = rev
+ logmsgs[rev[0]] = logmsg
+ if eof != _EOF_LOG and eof != _EOF_ERROR:
+ path = file_path(self.cvsroot_path, fname)
+                    if path not in self.offsets:
+ self.offsets[path] = dict()
+ if rev:
+ self.offsets[path][rev[0]] = off
+ self._process_rlog_entry(path, taginfo, revisions, logmsgs)
+
+ def getlog(self, fp, path, rev):
+ off = self.offsets[path][rev]
+ fp.seek(off)
+ rev, logmsg, eof = _parse_log_entry(fp)
+ return logmsg
+
+# if your rlog doesn't use 77 '=' characters, then this must change
+LOG_END_MARKER = b'=' * 77 + b'\n'
+ENTRY_END_MARKER = b'-' * 28 + b'\n'
+
+_EOF_FILE = b'end of file entries' # no more entries for this RCS file
+_EOF_LOG = b'end of log' # hit the true EOF on the pipe
+_EOF_ERROR = b'error message found' # rlog issued an error
+
+# rlog error messages look like
+#
+# rlog: filename/goes/here,v: error message
+# rlog: filename/goes/here,v:123: error message
+#
+# so we should be able to match them with a regex like
+#
+# ^rlog\: (.*)(?:\:\d+)?\: (.*)$
+#
+# But for some reason the windows version of rlog omits the "rlog: " prefix
+# for the first error message when the standard error stream has been
+# redirected to a file or pipe. (the prefix is present in subsequent errors
+# and when rlog is run from the console). So the expression below is more
+# complicated
+_re_log_error = re.compile(rb'^(?:rlog: )*(.*,v)(?::\d+)?: (.*)$')
+
+# CVSNT error messages look like:
+# cvs rcsfile: `C:/path/to/file,v' does not appear to be a valid rcs file
+# cvs [rcsfile aborted]: C:/path/to/file,v: No such file or directory
+# cvs [rcsfile aborted]: cannot open C:/path/to/file,v: Permission denied
+_re_cvsnt_error = re.compile(rb'^(?:cvs rcsfile: |cvs \[rcsfile aborted\]: )'
+                             rb'(?:`(.*,v)\' |'
+                             rb'cannot open (.*,v): |(.*,v): |)'
+                             rb'(.*)$')
+
+
+def _parse_log_header(fp):
+    """Parse an RCS/CVS log header.
+
+ fp is a file (pipe) opened for reading the log information.
+
+ On entry, fp should point to the start of a log entry.
+ On exit, fp will have consumed the separator line between the header and
+ the first revision log.
+
+ If there is no revision information (e.g. the "-h" switch was passed to
+    rlog), then fp will have consumed the file separator line on exit.
+
+ Returns: filename, default branch, tag dictionary, lock dictionary,
+ rlog error message, and eof flag
+ """
+
+ filename = branch = msg = b""
+ taginfo = {} # tag name => number
+ lockinfo = {} # revision => locker
+ state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks
+ eof = None
+
+ while 1:
+ line = fp.readline()
+ if not line:
+ # the true end-of-file
+ eof = _EOF_LOG
+ break
+
+ if state == 1:
+            if line[:1] == b'\t':
+ [tag, rev] = [x.strip() for x in line.split(b':')]
+ taginfo[tag] = rev
+ else:
+ # oops. this line isn't tag info. stop parsing tags.
+ state = 0
+
+ if state == 2:
+            if line[:1] == b'\t':
+ [locker, rev] = [x.strip() for x in line.split(b':')]
+ lockinfo[rev] = locker
+ else:
+ # oops. this line isn't lock info. stop parsing tags.
+ state = 0
+
+ if state == 0:
+ if line[:9] == b'RCS file:':
+ filename = line[10:-1]
+ elif line[:5] == b'head:':
+ # head = line[6:-1]
+ pass
+ elif line[:7] == b'branch:':
+ branch = line[8:-1]
+ elif line[:6] == b'locks:':
+ # start parsing the lock information
+ state = 2
+ elif line[:14] == b'symbolic names':
+ # start parsing the tag information
+ state = 1
+ elif line == ENTRY_END_MARKER:
+ # end of the headers
+ break
+ elif line == LOG_END_MARKER:
+ # end of this file's log information
+ eof = _EOF_FILE
+ break
+ else:
+ error = _re_cvsnt_error.match(line)
+ if error:
+ p1, p2, p3, msg = error.groups()
+ filename = p1 or p2 or p3
+ if not filename:
+                        raise ValueError("Could not get filename from CVSNT error:\n%s"
+                                         % line)
+ eof = _EOF_ERROR
+ break
+
+ error = _re_log_error.match(line)
+ if error:
+ filename, msg = error.groups()
+ if msg[:30] == b'warning: Unknown phrases like ':
+ # don't worry about this warning. it can happen with some RCS
+                    # files that have unknown fields in them (e.g. "permissions 644;")
+ continue
+ eof = _EOF_ERROR
+ break
+
+ return filename, branch, taginfo, lockinfo, msg, eof
+
+
+_re_log_info = re.compile(rb'^date:\s+([^;]+);'
+                          rb'\s+author:\s+([^;]+);'
+                          rb'\s+state:\s+([^;]+);'
+                          rb'(\s+lines:\s+([0-9\s+-]+);?)?'
+                          rb'(\s+commitid:\s+([a-zA-Z0-9]+))?\n$')
+
+# TODO: _re_rev should be updated to extract the "locked" flag
+_re_rev = re.compile(rb'^revision\s+([0-9.]+).*')
+
+def cvs_strptime(timestr):
+ try:
+ return time.strptime(timestr, '%Y/%m/%d %H:%M:%S')[:-1] + (0,)
+ except ValueError:
+ return time.strptime(timestr, '%Y-%m-%d %H:%M:%S %z')[:-1] + (0,)
+
+def _parse_log_entry(fp):
+ """Parse a single log entry.
+
+ On entry, fp should point to the first line of the entry (the "revision"
+ line).
+ On exit, fp will have consumed the log separator line (dashes) or the
+ end-of-file marker (equals).
+
+ Returns: Revision data tuple, and eof flag (see _EOF_*)
+ """
+ rev = None
+ line = fp.readline()
+ if not line:
+ return None, None, _EOF_LOG
+ if line == LOG_END_MARKER:
+ # Needed because some versions of RCS precede LOG_END_MARKER
+ # with ENTRY_END_MARKER
+ return None, None, _EOF_FILE
+ if line[:8] == b'revision':
+ match = _re_rev.match(line)
+ if not match:
+ return None, None, _EOF_LOG
+ rev = match.group(1)
+
+ line = fp.readline()
+ if not line:
+ return None, None, _EOF_LOG
+ match = _re_log_info.match(line)
+
+ eof = None
+ log = b''
+ while 1:
+ line = fp.readline()
+ if not line:
+ # true end-of-file
+ eof = _EOF_LOG
+ break
+ if line[:9] == b'branches:':
+ continue
+ if line == ENTRY_END_MARKER:
+ break
+ if line == LOG_END_MARKER:
+ # end of this file's log information
+ eof = _EOF_FILE
+ break
+
+ log = log + line
+
+ if not rev or not match:
+ # there was a parsing error
+ return None, None, eof
+
+ # parse out a time tuple for the local time
+ tm = cvs_strptime(match.group(1).decode('UTF-8'))
+
+ # rlog seems to assume that two-digit years are 1900-based (so, "04"
+ # comes out as "1904", not "2004").
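+ # Worked example: a two-digit RCS year "04" is printed by rlog as 1904,
+ # parses to tm[0] == 1904 below, and is shifted forward a century to 2004.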
+ EPOCH = 1970
+ if tm[0] < EPOCH:
+ tm = list(tm)
+ if (tm[0] - 1900) < 70:
+ tm[0] = tm[0] + 100
+ if tm[0] < EPOCH:
+ raise ValueError('invalid year')
+ date = calendar.timegm(tm)
+
+ # return a revision tuple compatible with 'rcsparse', the log message, and the EOF marker
+ return (rev.decode('ascii'), # revision number string
+ date,
+ match.group(2), # author (encoding is arbitrary; don't attempt to decode)
+ match.group(3).decode('ascii'), # state, usually "Exp" or "dead"; non-ASCII data here would be weird
+ None, # TODO: branches of this rev
+ None, # TODO: revnumstr of previous rev
+ None, # TODO: commitid
+ ), log, eof
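+
+
+# Illustrative sketch (not used by the loader): how the two parsers above
+# are meant to cooperate over a binary stream of rlog output, e.g. the
+# stdout of an rlog subprocess. The stream argument fp is the only
+# assumption here; everything else names functions from this module.
+def _example_walk_rlog(fp):
+    eof = None
+    while eof != _EOF_LOG:
+        filename, branch, taginfo, lockinfo, msg, eof = _parse_log_header(fp)
+        if eof:
+            continue  # rlog error entry, or the true end of the stream
+        while not eof:
+            revdata, log, eof = _parse_log_entry(fp)
+            if revdata:
+                print(filename, revdata[0])  # RCS filename and revision number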
diff --git a/swh/loader/cvs/tests/data/nano.rlog.tgz b/swh/loader/cvs/tests/data/nano.rlog.tgz
new file mode 100644
index 0000000..ffffa87
Binary files /dev/null and b/swh/loader/cvs/tests/data/nano.rlog.tgz differ
diff --git a/swh/loader/cvs/tests/data/runbaby.tgz b/swh/loader/cvs/tests/data/runbaby.tgz
index c2256f3..354845d 100644
Binary files a/swh/loader/cvs/tests/data/runbaby.tgz and b/swh/loader/cvs/tests/data/runbaby.tgz differ
diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py
index c4abe57..d899a9a 100644
--- a/swh/loader/cvs/tests/test_loader.py
+++ b/swh/loader/cvs/tests/test_loader.py
@@ -1,223 +1,259 @@
# Copyright (C) 2016-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import pytest
from swh.loader.cvs.loader import CvsLoader
from swh.loader.tests import (
assert_last_visit_matches,
check_snapshot,
get_stats,
prepare_repository_from_archive,
)
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Snapshot, SnapshotBranch, TargetType

RUNBABY_SNAPSHOT = Snapshot(
id=hash_to_bytes("1cff69ab9bd70822d5e3006092f943ccaafdcf57"),
branches={
b"HEAD": SnapshotBranch(
target=hash_to_bytes("ef511d258fa55035c2bc2a5b05cad233cee1d328"),
target_type=TargetType.REVISION,
)
},
)

def test_loader_cvs_not_found_no_mock(swh_storage, tmp_path):
"""Given an unknown repository, the loader visit ends up in status not_found"""
unknown_repo_url = "unknown-repository"
loader = CvsLoader(swh_storage, unknown_repo_url, cvsroot_path=tmp_path)
assert loader.load() == {"status": "uneventful"}
assert_last_visit_matches(
swh_storage, unknown_repo_url, status="not_found", type="cvs",
)

def test_loader_cvs_visit(swh_storage, datadir, tmp_path):
"""Eventful visit should yield 1 snapshot"""
archive_name = "runbaby"
archive_path = os.path.join(datadir, f"{archive_name}.tgz")
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))
assert loader.load() == {"status": "eventful"}
assert_last_visit_matches(
loader.storage,
repo_url,
status="full",
type="cvs",
snapshot=RUNBABY_SNAPSHOT.id,
)
stats = get_stats(loader.storage)
assert stats == {
"content": 5,
"directory": 2,
"origin": 1,
"origin_visit": 1,
"release": 0,
"revision": 1,
"skipped_content": 0,
"snapshot": 1,
}
check_snapshot(RUNBABY_SNAPSHOT, loader.storage)

def test_loader_cvs_2_visits_no_change(swh_storage, datadir, tmp_path):
"""Eventful visit followed by uneventful visit should yield the same snapshot
"""
archive_name = "runbaby"
archive_path = os.path.join(datadir, f"{archive_name}.tgz")
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))
assert loader.load() == {"status": "eventful"}
visit_status1 = assert_last_visit_matches(
loader.storage,
repo_url,
status="full",
type="cvs",
snapshot=RUNBABY_SNAPSHOT.id,
)
assert loader.load() == {"status": "uneventful"}
visit_status2 = assert_last_visit_matches(
loader.storage,
repo_url,
status="full",
type="cvs",
snapshot=RUNBABY_SNAPSHOT.id,
)
assert visit_status1.date < visit_status2.date
assert visit_status1.snapshot == visit_status2.snapshot
stats = get_stats(loader.storage)
assert stats["origin_visit"] == 1 + 1 # computed twice the same snapshot
assert stats["snapshot"] == 1

GREEK_SNAPSHOT = Snapshot(
id=hash_to_bytes("5e74af67d69dfd7aea0eb118154d062f71f50120"),
branches={
b"HEAD": SnapshotBranch(
target=hash_to_bytes("e18b92f14cd5b3efb3fcb4ea46cfaf97f25f301b"),
target_type=TargetType.REVISION,
)
},
)

def test_loader_cvs_with_file_additions_and_deletions(swh_storage, datadir, tmp_path):
"""Eventful conversion of history with file additions and deletions"""
archive_name = "greek-repository"
archive_path = os.path.join(datadir, f"{archive_name}.tgz")
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
repo_url += '/greek-tree' # CVS module name
loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))
assert loader.load() == {"status": "eventful"}
assert_last_visit_matches(
loader.storage,
repo_url,
status="full",
type="cvs",
snapshot=GREEK_SNAPSHOT.id,
)
stats = get_stats(loader.storage)
assert stats == {
"content": 8,
"directory": 20,
"origin": 1,
"origin_visit": 1,
"release": 0,
"revision": 7,
"skipped_content": 0,
"snapshot": 7,
}
check_snapshot(GREEK_SNAPSHOT, loader.storage)

GREEK_SNAPSHOT2 = Snapshot(
id=hash_to_bytes("048885ae2145ffe81588aea95dcf75c536ecdf26"),
branches={
b"HEAD": SnapshotBranch(
target=hash_to_bytes("55eb1438c03588607ce4b8db8f45e8e23075951b"),
target_type=TargetType.REVISION,
)
},
)

def test_loader_cvs_2_visits_with_change(swh_storage, datadir, tmp_path):
"""Eventful visit followed by eventful visit should yield two snapshots"""
archive_name = "greek-repository"
archive_path = os.path.join(datadir, f"{archive_name}.tgz")
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
repo_url += '/greek-tree' # CVS module name
loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))
assert loader.load() == {"status": "eventful"}
visit_status1 = assert_last_visit_matches(
loader.storage,
repo_url,
status="full",
type="cvs",
snapshot=GREEK_SNAPSHOT.id,
)
stats = get_stats(loader.storage)
assert stats == {
"content": 8,
"directory": 20,
"origin": 1,
"origin_visit": 1,
"release": 0,
"revision": 7,
"skipped_content": 0,
"snapshot": 7,
}
archive_name2 = "greek-repository2"
archive_path2 = os.path.join(datadir, f"{archive_name2}.tgz")
repo_url = prepare_repository_from_archive(archive_path2, archive_name, tmp_path)
repo_url += '/greek-tree' # CVS module name
loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))
assert loader.load() == {"status": "eventful"}
visit_status2 = assert_last_visit_matches(
loader.storage,
repo_url,
status="full",
type="cvs",
snapshot=GREEK_SNAPSHOT2.id,
)
stats = get_stats(loader.storage)
assert stats == {
"content": 10,
"directory": 23,
"origin": 1,
"origin_visit": 2,
"release": 0,
"revision": 8,
"skipped_content": 0,
"snapshot": 8,
}
check_snapshot(GREEK_SNAPSHOT2, loader.storage)
assert visit_status1.date < visit_status2.date
assert visit_status1.snapshot != visit_status2.snapshot
+
+def test_loader_cvs_visit_pserver(swh_storage, datadir, tmp_path):
+ """Eventful visit to CVS pserver should yield 1 snapshot"""
+ archive_name = "runbaby"
+ archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+ repo_url += '/runbaby' # CVS module name
+
+ # Ask our cvsclient to connect via the 'cvs server' command
+ repo_url = 'fake://' + repo_url[7:]
+
+ loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))
+
+ assert loader.load() == {"status": "eventful"}
+
+ assert_last_visit_matches(
+ loader.storage,
+ repo_url,
+ status="full",
+ type="cvs",
+ snapshot=RUNBABY_SNAPSHOT.id,
+ )
+
+ stats = get_stats(loader.storage)
+ assert stats == {
+ "content": 5,
+ "directory": 2,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 0,
+ "revision": 1,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+
+ check_snapshot(RUNBABY_SNAPSHOT, loader.storage)