diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
index 572a89d..8bfa321 100644
--- a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
+++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py
@@ -1,645 +1,648 @@
#!/usr/local/bin/python
#
# Copyright (c) 2012 YASUOKA Masahiko <yasuoka@yasuoka.net>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Usage
#
# First import:
# % git init --bare /git/openbsd.git
# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
# > openbsd.dump
# % git --git-dir /git/openbsd.git fast-import < openbsd.dump
#
# Periodic import:
# % sudo cvsync
# % python cvs2gitdump.py -k OpenBSD -e openbsd.org /cvs/openbsd/src \
# /git/openbsd.git > openbsd2.dump
# % git --git-dir /git/openbsd.git fast-import < openbsd2.dump
#
import getopt
import os
import re
import subprocess
import sys
import time
import swh.loader.cvs.rcsparse as rcsparse
CHANGESET_FUZZ_SEC = 300
def usage():
print('usage: cvs2gitdump [-ah] [-z fuzz] [-e email_domain] '
'[-E log_encodings]\n'
'\t[-k rcs_keywords] [-b branch] [-m module] [-l last_revision]\n'
'\tcvsroot [git_dir]', file=sys.stderr)
def main():
email_domain = None
do_incremental = False
git_tip = None
git_branch = 'master'
dump_all = False
log_encoding = 'utf-8,iso-8859-1'
rcs = RcsKeywords()
modules = []
last_revision = None
fuzzsec = CHANGESET_FUZZ_SEC
try:
opts, args = getopt.getopt(sys.argv[1:], 'ab:hm:z:e:E:k:t:l:')
for opt, v in opts:
if opt == '-z':
fuzzsec = int(v)
elif opt == '-e':
email_domain = v
elif opt == '-a':
dump_all = True
elif opt == '-b':
git_branch = v
elif opt == '-E':
log_encoding = v
elif opt == '-k':
rcs.add_id_keyword(v)
elif opt == '-m':
if v == '.git':
print('Cannot handle the path named \'.git\'',
file=sys.stderr)
sys.exit(1)
modules.append(v)
elif opt == '-l':
last_revision = v
elif opt == '-h':
usage()
sys.exit(1)
except getopt.GetoptError as msg:
print(msg, file=sys.stderr)
usage()
sys.exit(1)
if len(args) == 0 or len(args) > 2:
usage()
sys.exit(1)
log_encodings = log_encoding.split(',')
cvsroot = args[0]
while cvsroot[-1] == '/':
cvsroot = cvsroot[:-1]
if len(args) == 2:
do_incremental = True
git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', git_branch],
encoding='utf-8', stdout=subprocess.PIPE)
outs = git.stdout.readlines()
git.wait()
if git.returncode != 0:
print("Couldn't exec git", file=sys.stderr)
sys.exit(git.returncode)
git_tip = outs[2].strip()
if last_revision is not None:
git = subprocess.Popen(
['git', '--git-dir=' + args[1], '-c',
'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1',
'--date=raw', '--format=%ae%n%ad%n%H', last_revision],
encoding='utf-8', stdout=subprocess.PIPE)
outs = git.stdout.readlines()
git.wait()
if git.returncode != 0:
print("Coundn't exec git", file=sys.stderr)
sys.exit(git.returncode)
last_author = outs[0].strip()
last_ctime = float(outs[1].split()[0])
# strip off the domain part from the last author since cvs doesn't have
# the domain part.
if do_incremental and email_domain is not None and \
last_author.lower().endswith(('@' + email_domain).lower()):
last_author = last_author[:-1 * (1 + len(email_domain))]
cvs = CvsConv(cvsroot, rcs, not do_incremental, fuzzsec)
print('** walk cvs tree', file=sys.stderr)
if len(modules) == 0:
cvs.walk()
else:
for module in modules:
cvs.walk(module)
changesets = sorted(cvs.changesets)
nchangesets = len(changesets)
print('** cvs has %d changesets' % (nchangesets), file=sys.stderr)
if nchangesets <= 0:
sys.exit(0)
if not dump_all:
# don't use last 10 minutes for safety
max_time_max = changesets[-1].max_time - 600
else:
max_time_max = changesets[-1].max_time
found_last_revision = False
markseq = cvs.markseq
extags = set()
for k in changesets:
if do_incremental and not found_last_revision:
if k.min_time == last_ctime and k.author == last_author:
found_last_revision = True
for tag in k.tags:
extags.add(tag)
continue
if k.max_time > max_time_max:
break
marks = {}
for f in k.revs:
if not do_incremental:
marks[f.markseq] = f
else:
markseq = markseq + 1
git_dump_file(f.path, f.rev, rcs, markseq)
marks[markseq] = f
log = rcsparse.rcsfile(k.revs[0].path).getlog(k.revs[0].rev)
for i, e in enumerate(log_encodings):
try:
how = 'ignore' if i == len(log_encodings) - 1 else 'strict'
log = log.decode(e, how)
break
except UnicodeError:
pass
log = log.encode('utf-8', 'ignore')
output('commit refs/heads/' + git_branch)
markseq = markseq + 1
output('mark :%d' % (markseq))
email = k.author if email_domain is None \
else k.author + '@' + email_domain
output('author %s <%s> %d +0000' % (k.author, email, k.min_time))
output('committer %s <%s> %d +0000' % (k.author, email, k.min_time))
output('data', len(log))
output(log, end='')
if do_incremental and git_tip is not None:
output('from', git_tip)
git_tip = None
for m in marks:
f = marks[m]
mode = 0o100755 if os.access(f.path, os.X_OK) else 0o100644
fn = file_path(cvs.cvsroot, f.path)
if f.state == 'dead':
output('D', fn)
else:
output('M %o :%d %s' % (mode, m, fn))
output('')
for tag in k.tags:
if tag in extags:
continue
output('reset refs/tags/%s' % (tag))
output('from :%d' % (markseq))
output('')
if do_incremental and not found_last_revision:
raise Exception('could not find the last revision')
print('** dumped', file=sys.stderr)
#
# Always encode str objects as UTF-8, since git-fast-import expects UTF-8.
# Write bytes objects without conversion (file bodies may use various
# encodings).
#
def output(*args, end='\n'):
if len(args) == 0:
pass
elif len(args) > 1 or isinstance(args[0], str):
lines = ' '.join(
[arg if isinstance(arg, str) else str(arg) for arg in args])
sys.stdout.buffer.write(lines.encode('utf-8'))
else:
sys.stdout.buffer.write(args[0])
if len(end) > 0:
sys.stdout.buffer.write(end.encode('utf-8'))
class FileRevision:
def __init__(self, path, rev, state, markseq):
self.path = path
self.rev = rev
self.state = state
self.markseq = markseq
class ChangeSetKey:
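"""Identity of a CVS changeset: file revisions are grouped into the same
changeset if they share a commitid, or share log message, branch and author
with commit times within fuzzsec seconds of each other (see _cmp below)."""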
def __init__(self, branch, author, timestamp, log, commitid, fuzzsec):
self.branch = branch
self.author = author
self.min_time = timestamp
self.max_time = timestamp
self.commitid = commitid
self.fuzzsec = fuzzsec
self.revs = []
self.tags = []
self.log_hash = 0
h = 0
for c in log:
h = 31 * h + c
self.log_hash = h
def __lt__(self, other):
return self._cmp(other) < 0
def __gt__(self, other):
return self._cmp(other) > 0
def __eq__(self, other):
return self._cmp(other) == 0
def __le__(self, other):
return self._cmp(other) <= 0
def __ge__(self, other):
return self._cmp(other) >= 0
def __ne__(self, other):
return self._cmp(other) != 0
def _cmp(self, anon):
# compare by the commitid
cid = _cmp2(self.commitid, anon.commitid)
if cid == 0 and self.commitid is not None:
# both have commitid and they are same
return 0
# compare by the time
ma = anon.min_time - self.max_time
mi = self.min_time - anon.max_time
ct = self.min_time - anon.min_time
if ma > self.fuzzsec or mi > self.fuzzsec:
return ct
if cid != 0:
# only one has the commitid, this means different commit
return cid if ct == 0 else ct
# compare by log, branch and author
c = _cmp2(self.log_hash, anon.log_hash)
if c == 0:
c = _cmp2(self.branch, anon.branch)
if c == 0:
c = _cmp2(self.author, anon.author)
if c == 0:
return 0
return ct if ct != 0 else c
def merge(self, anot):
self.max_time = max(self.max_time, anot.max_time)
self.min_time = min(self.min_time, anot.min_time)
self.revs.extend(anot.revs)
def __hash__(self):
return hash(self.branch + '/' + self.author) * 31 + self.log_hash
def put_file(self, path, rev, state, markseq):
self.revs.append(FileRevision(path, rev, state, markseq))
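# Three-way comparison helper: returns -1, 0 or 1; None sorts before any value.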
def _cmp2(a, b):
_a = a is not None
_b = b is not None
return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b)
class CvsConv:
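"""Walk ,v RCS files below cvsroot and group their revisions into
changesets; when dumpfile is set, also emit git-fast-import blobs."""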
def __init__(self, cvsroot, rcs, dumpfile, fuzzsec):
self.cvsroot = cvsroot
self.rcs = rcs
self.changesets = dict()
self.dumpfile = dumpfile
self.markseq = 0
self.tags = dict()
self.fuzzsec = fuzzsec
def walk(self, module=None):
p = [self.cvsroot]
if module is not None:
p.append(module)
path = os.path.join(*p)
for root, dirs, files in os.walk(path):
if '.git' in dirs:
print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)
dirs.remove('.git')
if '.git' in files:
print('Ignore %s: cannot handle the path named \'.git\'' % (
root + os.sep + '.git'), file=sys.stderr)
files.remove('.git')
for f in files:
if not f[-2:] == ',v':
continue
self.parse_file(root + os.sep + f)
for t, c in list(self.tags.items()):
c.tags.append(t)
def parse_file(self, path):
rtags = dict()
rcsfile = rcsparse.rcsfile(path)
branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
for k, v in list(rcsfile.symbols.items()):
r = v.split('.')
if len(r) == 3:
branches[v] = 'VENDOR'
elif len(r) >= 3 and r[-2] == '0':
branches['.'.join(r[:-2] + r[-1:])] = k
if len(r) == 2 and branches[r[0]] == 'HEAD':
if v not in rtags:
rtags[v] = list()
rtags[v].append(k)
revs = rcsfile.revs.items()
# sort by revision descending to prioritize 1.1.1.1 over 1.1
revs = sorted(revs, key=lambda a: a[1][0], reverse=True)
# sort by time
revs = sorted(revs, key=lambda a: a[1][1])
novendor = False
have_initial_revision = False
last_vendor_status = None
for k, v in revs:
r = k.split('.')
if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
and r[3] == '1':
if have_initial_revision:
continue
if v[3] == 'dead':
continue
last_vendor_status = v[3]
have_initial_revision = True
elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1':
if novendor:
continue
last_vendor_status = v[3]
elif len(r) == 2:
if r[0] == '1' and r[1] == '1':
if have_initial_revision:
continue
if v[3] == 'dead':
continue
have_initial_revision = True
elif r[0] == '1' and r[1] != '1':
novendor = True
if last_vendor_status == 'dead' and v[3] == 'dead':
last_vendor_status = None
continue
last_vendor_status = None
else:
# trunk only
continue
if self.dumpfile:
self.markseq = self.markseq + 1
git_dump_file(path, k, self.rcs, self.markseq)
b = '.'.join(r[:-1])
try:
a = ChangeSetKey(
branches[b], v[2], v[1], rcsfile.getlog(v[0]), v[6],
self.fuzzsec)
except Exception as e:
print('Aborted at %s %s' % (path, v[0]), file=sys.stderr)
raise e
a.put_file(path, k, v[3], self.markseq)
while a in self.changesets:
c = self.changesets[a]
del self.changesets[a]
c.merge(a)
a = c
self.changesets[a] = a
if k in rtags:
for t in rtags[k]:
if t not in self.tags or \
self.tags[t].max_time < a.max_time:
self.tags[t] = a
def file_path(r, p):
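"""Map RCS file path p below cvsroot r to its work-tree path: drop the ",v"
suffix if present, remove any "Attic" component (where CVS keeps files
deleted on HEAD), and strip the cvsroot prefix."""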
if r.endswith('/'):
r = r[:-1]
- path = p[:-2] # drop ",v"
+ if p[-2:] == ',v':
+ path = p[:-2] # drop ",v"
+ else:
+ path = p
p = path.split('/')
if len(p) > 0 and p[-2] == 'Attic':
path = '/'.join(p[:-2] + [p[-1]])
if path.startswith(r):
path = path[len(r) + 1:]
return path
def git_dump_file(path, k, rcs, markseq):
try:
cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k)
except RuntimeError as msg:
print('Unexpected runtime error on parsing',
path, k, ':', msg, file=sys.stderr)
print('Raising the resource limit may fix this problem.',
file=sys.stderr)
sys.exit(1)
output('blob')
output('mark :%d' % markseq)
output('data', len(cont))
output(cont)
class RcsKeywords:
RCS_KW_AUTHOR = (1 << 0)
RCS_KW_DATE = (1 << 1)
RCS_KW_LOG = (1 << 2)
RCS_KW_NAME = (1 << 3)
RCS_KW_RCSFILE = (1 << 4)
RCS_KW_REVISION = (1 << 5)
RCS_KW_SOURCE = (1 << 6)
RCS_KW_STATE = (1 << 7)
RCS_KW_FULLPATH = (1 << 8)
RCS_KW_MDOCDATE = (1 << 9)
RCS_KW_LOCKER = (1 << 10)
RCS_KW_ID = (RCS_KW_RCSFILE | RCS_KW_REVISION | RCS_KW_DATE |
RCS_KW_AUTHOR | RCS_KW_STATE)
RCS_KW_HEADER = (RCS_KW_ID | RCS_KW_FULLPATH)
rcs_expkw = {
b"Author": RCS_KW_AUTHOR,
b"Date": RCS_KW_DATE,
b"Header": RCS_KW_HEADER,
b"Id": RCS_KW_ID,
b"Log": RCS_KW_LOG,
b"Name": RCS_KW_NAME,
b"RCSfile": RCS_KW_RCSFILE,
b"Revision": RCS_KW_REVISION,
b"Source": RCS_KW_SOURCE,
b"State": RCS_KW_STATE,
b"Mdocdate": RCS_KW_MDOCDATE,
b"Locker": RCS_KW_LOCKER
}
RCS_KWEXP_NONE = (1 << 0)
RCS_KWEXP_NAME = (1 << 1) # include keyword name
RCS_KWEXP_VAL = (1 << 2) # include keyword value
RCS_KWEXP_LKR = (1 << 3) # include name of locker
RCS_KWEXP_OLD = (1 << 4) # generate old keyword string
RCS_KWEXP_ERR = (1 << 5) # mode has an error
RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL)
RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR)
def __init__(self):
self.rerecomple()
def rerecomple(self):
pat = b'|'.join(list(self.rcs_expkw.keys()))
self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]")
def add_id_keyword(self, keyword):
self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID
self.rerecomple()
def kflag_get(self, flags):
if flags is None:
return self.RCS_KWEXP_DEFAULT
fl = 0
for fc in flags:
if fc == 'k':
fl |= self.RCS_KWEXP_NAME
elif fc == 'v':
fl |= self.RCS_KWEXP_VAL
elif fc == 'l':
fl |= self.RCS_KWEXP_LKR
elif fc == 'o':
if len(flags) != 1:
fl |= self.RCS_KWEXP_ERR
fl |= self.RCS_KWEXP_OLD
elif fc == 'b':
if len(flags) != 1:
fl |= self.RCS_KWEXP_ERR
fl |= self.RCS_KWEXP_NONE
else:
fl |= self.RCS_KWEXP_ERR
return fl
def expand_keyword(self, filename, rcs, r):
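"""Return the checked-out content of revision r of the parsed RCS file,
with RCS keywords expanded according to the file's expansion mode."""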
rev = rcs.revs[r]
mode = self.kflag_get(rcs.expand)
if (mode & (self.RCS_KWEXP_NONE | self.RCS_KWEXP_OLD)) != 0:
return rcs.checkout(rev[0])
ret = []
for line in rcs.checkout(rev[0]).split(b'\n'):
logbuf = None
m = self.re_kw.match(line)
if m is None:
# No RCS Keywords, use it as it is
ret += [line]
continue
line0 = b''
while m is not None:
try:
dsign = m.end(1) + line[m.end(1):].index(b'$')
except ValueError:
break
prefix = line[:m.start(1) - 1]
line = line[dsign + 1:]
line0 += prefix
expbuf = ''
if (mode & self.RCS_KWEXP_NAME) != 0:
expbuf += '$'
expbuf += m.group(1).decode('ascii')
if (mode & self.RCS_KWEXP_VAL) != 0:
expbuf += ': '
if (mode & self.RCS_KWEXP_VAL) != 0:
expkw = self.rcs_expkw[m.group(1)]
if (expkw & self.RCS_KW_RCSFILE) != 0:
expbuf += filename \
if (expkw & self.RCS_KW_FULLPATH) != 0 \
else os.path.basename(filename)
expbuf += " "
if (expkw & self.RCS_KW_REVISION) != 0:
expbuf += rev[0]
expbuf += " "
if (expkw & self.RCS_KW_DATE) != 0:
expbuf += time.strftime(
"%Y/%m/%d %H:%M:%S ", time.gmtime(rev[1]))
if (expkw & self.RCS_KW_MDOCDATE) != 0:
d = time.gmtime(rev[1])
expbuf += time.strftime(
"%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d)
if (expkw & self.RCS_KW_AUTHOR) != 0:
expbuf += rev[2]
expbuf += " "
if (expkw & self.RCS_KW_STATE) != 0:
expbuf += rev[3]
expbuf += " "
if (expkw & self.RCS_KW_LOG) != 0:
p = prefix
expbuf += filename \
if (expkw & self.RCS_KW_FULLPATH) != 0 \
else os.path.basename(filename)
expbuf += " "
logbuf = p + (
'Revision %s %s %s\n' % (
rev[0], time.strftime(
"%Y/%m/%d %H:%M:%S", time.gmtime(rev[1])),
rev[2])).encode('ascii')
for lline in rcs.getlog(rev[0]).rstrip().split(b'\n'):
if len(lline) == 0:
logbuf += p.rstrip() + b'\n'
else:
logbuf += p + lline.lstrip() + b'\n'
if len(line) == 0:
logbuf += p.rstrip()
else:
logbuf += p + line.lstrip()
line = b''
if (expkw & self.RCS_KW_SOURCE) != 0:
expbuf += filename
expbuf += " "
if (expkw & (self.RCS_KW_NAME | self.RCS_KW_LOCKER)) != 0:
expbuf += " "
if (mode & self.RCS_KWEXP_NAME) != 0:
expbuf += '$'
line0 += expbuf[:255].encode('ascii')
m = self.re_kw.match(line)
ret += [line0 + line]
if logbuf is not None:
ret += [logbuf]
return b'\n'.join(ret)
# ----------------------------------------------------------------------
# entry point
# ----------------------------------------------------------------------
if __name__ == '__main__':
main()
diff --git a/swh/loader/cvs/cvsclient.py b/swh/loader/cvs/cvsclient.py
new file mode 100644
index 0000000..e670f96
--- /dev/null
+++ b/swh/loader/cvs/cvsclient.py
@@ -0,0 +1,334 @@
+# Copyright (C) 2015-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Minimal CVS client implementation
+
+"""
+
+import socket
+import subprocess
+import os.path
+import tempfile
+import re
+
+from swh.loader.exception import NotFound
+
+CVS_PSERVER_PORT = 2401
+CVS_PROTOCOL_BUFFER_SIZE = 8192
+EXAMPLE_PSERVER_URL = "pserver://user:password@cvs.example.com/cvsroot/repository"
+EXAMPLE_SSH_URL = "ssh://user@cvs.example.com/cvsroot/repository"
+
+VALID_RESPONSES = [ "ok", "error", "Valid-requests", "Checked-in",
+ "New-entry", "Checksum", "Copy-file", "Updated", "Created",
+ "Update-existing", "Merged", "Patched", "Rcs-diff", "Mode",
+ "Removed", "Remove-entry", "Template", "Notified", "Module-expansion",
+ "Wrapper-rcsOption", "M", "Mbinary", "E", "F", "MT" ]
+
+# Trivially encode strings to protect them from innocent eyes (i.e.,
+# inadvertent password compromises, like a network administrator
+# who's watching packets for legitimate reasons and accidentally sees
+# the password protocol go by).
+#
+# This is NOT secure encryption.
+def scramble_password(password):
+ s = ['A'] # scramble scheme version number
+ scramble_shifts = [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 114,120, 53, 79, 96,109, 72,108, 70, 64, 76, 67,116, 74, 68, 87,
+ 111, 52, 75,119, 49, 34, 82, 81, 95, 65,112, 86,118,110,122,105,
+ 41, 57, 83, 43, 46,102, 40, 89, 38,103, 45, 50, 42,123, 91, 35,
+ 125, 55, 54, 66,124,126, 59, 47, 92, 71,115, 78, 88,107,106, 56,
+ 36,121,117,104,101,100, 69, 73, 99, 63, 94, 93, 39, 37, 61, 48,
+ 58,113, 32, 90, 44, 98, 60, 51, 33, 97, 62, 77, 84, 80, 85,223,
+ 225,216,187,166,229,189,222,188,141,249,148,200,184,136,248,190,
+ 199,170,181,204,138,232,218,183,255,234,220,247,213,203,226,193,
+ 174,172,228,252,217,201,131,230,197,211,145,238,161,179,160,212,
+ 207,221,254,173,202,146,224,151,140,196,205,130,135,133,143,246,
+ 192,159,244,239,185,168,215,144,139,165,180,157,147,186,214,176,
+ 227,231,219,169,175,156,206,198,129,164,150,210,154,177,134,127,
+ 182,128,158,208,162,132,167,209,149,241,153,251,237,236,171,195,
+ 243,233,253,240,194,250,191,155,142,137,245,235,163,242,178,152 ]
+ for c in password:
+ s.append('%c' % scramble_shifts[ord(c)])
+ return "".join(s)
+
+
+class CVSProtocolError(Exception):
+ pass
+
+_re_kb_opt = re.compile(rb'/-kb/')
+
+class CVSClient:
+
+ def connect_pserver(self, hostname, port, auth):
+ if port is None:
+ port = CVS_PSERVER_PORT
+ if auth is None:
+ raise NotFound("Username and password are required for a pserver connection: %s" % EXAMPLE_PSERVER_URL)
+ try:
+ user = auth.split(':')[0]
+ password = auth.split(':')[1]
+ except IndexError:
+ raise NotFound("Username and password are required for a pserver connection: %s" % EXAMPLE_PSERVER_URL)
+
+ try:
+ self.socket = socket.create_connection((hostname, port))
+ except ConnectionRefusedError:
+ raise NotFound("Could not connect to %s:%s", hostname, port)
+
+ scrambled_password = scramble_password(password)
+ request = "BEGIN AUTH REQUEST\n%s/%s\n%s\n%s\nEND AUTH REQUEST\n" \
+ % (self.cvsroot_path, self.cvs_module_name, user, scrambled_password)
+ self.socket.sendall(request.encode('UTF-8'))
+
+ response = self.socket.recv(11)
+ if response != b"I LOVE YOU\n":
+ raise NotFound("pserver authentication failed for %s:%s" % (hostname, port))
+
+ def connect_ssh(self, hostname, port, auth):
+ command = [ 'ssh' ]
+ if auth is not None:
+ # Assume 'auth' contains only a user name.
+ # We do not support password authentication with SSH since the
+ # anoncvs user is usually granted access without a password.
+ command += [ '-l' , '%s' % auth ]
+ if port is not None:
+ command += [ '-p' , '%d' % port ]
+
+ # accept new SSH host keys upon first use; changed host keys will require intervention
+ command += ['-o', "StrictHostKeyChecking=accept-new" ]
+
+ # disable interactive prompting
+ command += ['-o', "BatchMode=yes" ]
+
+ # disable further option processing by adding '--'
+ command += [ '--' ]
+
+ command += ['%s' % hostname, 'cvs', 'server']
+ self.ssh = subprocess.Popen(command,
+ bufsize=0, # use non-buffered I/O to match behaviour of self.socket
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+
+ def connect_fake(self, hostname, port, auth):
+ command = [ 'cvs', 'server' ]
+ self.ssh = subprocess.Popen(command,
+ bufsize=0, # use non-buffered I/O to match behaviour of self.socket
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+
+ def conn_read_line(self, require_newline=True):
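+ """Return one line of the server's response.
+
+ Lines already split out of an earlier read are served from self.linebuffer
+ first. Bytes after the last newline of a read are kept in
+ self.incomplete_line and prepended to the next read. With
+ require_newline=False, a partial line may be returned during file data
+ transfers instead of raising a protocol error.
+ """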
+ if len(self.linebuffer) != 0:
+ return self.linebuffer.pop(0)
+ buf = b''
+ idx = -1
+ while idx == -1:
+ if len(buf) >= CVS_PROTOCOL_BUFFER_SIZE:
+ if require_newline:
+ raise CVSProtocolError("Overlong response from CVS server: %s" % buf)
+ else:
+ break
+ if self.socket:
+ buf += self.socket.recv(CVS_PROTOCOL_BUFFER_SIZE)
+ elif self.ssh:
+ buf += self.ssh.stdout.read(CVS_PROTOCOL_BUFFER_SIZE)
+ else:
+ raise Exception("No valid connection")
+ if not buf:
+ return None
+ idx = buf.rfind(b'\n')
+ if idx != -1:
+ self.linebuffer = buf[:idx + 1].splitlines(keepends=True)
+ else:
+ if require_newline:
+ raise CVSProtocolError("Invalid response from CVS server: %s" % buf)
+ else:
+ self.linebuffer.append(buf)
+ if len(self.incomplete_line) > 0:
+ self.linebuffer[0] = self.incomplete_line + self.linebuffer[0]
+ if idx != -1:
+ self.incomplete_line = buf[idx + 1:]
+ else:
+ self.incomplete_line = b''
+ return self.linebuffer.pop(0)
+
+ def conn_write(self, data):
+ if self.socket:
+ return self.socket.sendall(data)
+ if self.ssh:
+ self.ssh.stdin.write(data)
+ return self.ssh.stdin.flush()
+ raise Exception("No valid connection")
+
+ def conn_write_str(self, s):
+ return self.conn_write(s.encode('UTF-8'))
+
+ def conn_close(self):
+ if self.socket:
+ self.socket.close()
+ if self.ssh:
+ self.ssh.kill()
+ try:
+ self.ssh.wait(timeout=10)
+ except subprocess.TimeoutExpired as e:
+ raise CVSProtocolError("Could not terminate ssh program: %s" % e)
+
+ def __init__(self, url):
+ """
+ Connect to a CVS server at the specified URL and perform the initial
+ CVS protocol handshake.
+ """
+ self.hostname = url.host
+ self.cvsroot_path = os.path.dirname(url.path)
+ self.cvs_module_name = os.path.basename(url.path)
+ self.socket = None
+ self.ssh = None
+ self.linebuffer = list()
+ self.incomplete_line = b''
+
+ if url.scheme == 'pserver':
+ self.connect_pserver(url.host, url.port, url.auth)
+ elif url.scheme == 'ssh':
+ self.connect_ssh(url.host, url.port, url.auth)
+ elif url.scheme == 'fake':
+ self.connect_fake(url.host, url.port, url.auth)
+ else:
+ raise NotFound("Invalid CVS origin URL '%s'" % url)
+
+ # we should have a connection now
+ assert self.socket or self.ssh
+
+ self.conn_write_str("Root %s\nValid-responses %s\nvalid-requests\nUseUnchanged\n" % \
+ (self.cvsroot_path, ' '.join(VALID_RESPONSES)))
+ response = self.conn_read_line()
+ if not response:
+ raise CVSProtocolError("No response from CVS server")
+ try:
+ if response[0:15] != b"Valid-requests ":
+ raise CVSProtocolError("Invalid response from CVS server: %s" % response)
+ except IndexError:
+ raise CVSProtocolError("Invalid response from CVS server: %s" % response)
+ response = self.conn_read_line()
+ if response != b"ok\n":
+ raise CVSProtocolError("Invalid response from CVS server: %s" % response)
+
+ def __del__(self):
+ self.conn_close()
+
+ def _parse_rlog_response(self, fp):
+ rlog_output = tempfile.TemporaryFile()
+ expect_error = False
+ for line in fp.readlines():
+ if expect_error:
+ raise CVSProtocolError('CVS server error: %s' % line)
+ if line == b'ok\n':
+ break
+ elif line == b'M \n':
+ continue
+ elif line[0:2] == b'M ':
+ rlog_output.write(line[2:])
+ elif line[0:8] == b'MT text ':
+ rlog_output.write(line[8:-1])
+ elif line[0:8] == b'MT date ':
+ rlog_output.write(line[8:-1])
+ elif line[0:10] == b'MT newline':
+ rlog_output.write(line[10:])
+ elif line[0:6] == b'error ':
+ expect_error = True
+ continue
+ else:
+ raise CVSProtocolError('Bad CVS protocol response: %s' % line)
+ rlog_output.seek(0)
+ return rlog_output
+
+
+ def fetch_rlog(self):
+ fp = tempfile.TemporaryFile()
+ self.conn_write_str("Global_option -q\nArgument --\nArgument %s\nrlog\n" % \
+ self.cvs_module_name)
+ while True:
+ response = self.conn_read_line()
+ if response is None:
+ raise CVSProtocolError("No response from CVS server")
+ if response[0:2] == b"E ":
+ raise CVSProtocolError("Error response from CVS server: %s" % response)
+ fp.write(response)
+ if response == b"ok\n":
+ break
+ fp.seek(0)
+ return self._parse_rlog_response(fp)
+
+ def checkout(self, path, rev, dest_dir):
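+ """Check out revision rev of the file at path into a temporary file.
+
+ The server is expected to answer with a "Created" header, an entry line
+ containing "-kb", a mode line ("u=..."), a byte count, that many bytes of
+ file data, and a final "ok"; see the response handling below.
+ """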
+ skip_line = False
+ expect_modeline = False
+ expect_bytecount = False
+ have_bytecount = False
+ bytecount = 0
+ dirname = os.path.dirname(path)
+ if dirname:
+ self.conn_write_str("Directory %s\n%s\n" % (dirname, dirname))
+ filename = os.path.basename(path)
+ co_output = tempfile.NamedTemporaryFile(dir=dest_dir, delete=True,
+ prefix='cvsclient-checkout-%s-r%s-' % (filename, rev))
+ # TODO: cvs <= 1.10 servers expect to be given every Directory along the path.
+ self.conn_write_str("Directory %s\n%s\n"
+ "Global_option -q\n"
+ "Argument -r%s\n"
+ "Argument -kb\n"
+ "Argument --\nArgument %s\nco \n" % (self.cvs_module_name,
+ self.cvs_module_name, rev, path))
+ while True:
+ if have_bytecount and bytecount > 0:
+ response = self.conn_read_line(require_newline=False)
+ if response is None:
+ raise CVSProtocolError("Incomplete response from CVS server")
+ co_output.write(response)
+ bytecount -= len(response)
+ if bytecount < 0:
+ raise CVSProtocolError("Overlong response from CVS server: %s" % response)
+ continue
+ else:
+ response = self.conn_read_line()
+ if response[0:2] == b'E ':
+ raise CVSProtocolError('Error from CVS server: %s' % response)
+ if have_bytecount and bytecount == 0 and response == b'ok\n':
+ break
+ if skip_line:
+ skip_line = False
+ continue
+ elif expect_bytecount:
+ try:
+ bytecount = int(response[0:-1]) # strip trailing \n
+ except ValueError:
+ raise CVSProtocolError('Bad CVS protocol response: %s' % response)
+ have_bytecount = True
+ continue
+ elif response == b'M \n':
+ continue
+ elif response == b'MT +updated\n':
+ continue
+ elif response == b'MT -updated\n':
+ continue
+ elif response[0:9] == b'MT fname ':
+ continue
+ elif response[0:8] == b'Created ':
+ skip_line = True
+ continue
+ elif response[0:1] == b'/' and _re_kb_opt.search(response):
+ expect_modeline = True
+ continue
+ elif expect_modeline and response[0:2] == b'u=':
+ expect_modeline = False
+ expect_bytecount = True
+ continue
+ elif response[0:2] == b'M ':
+ continue
+ elif response[0:8] == b'MT text ':
+ continue
+ elif response[0:10] == b'MT newline':
+ continue
+ else:
+ raise CVSProtocolError('Bad CVS protocol response: %s' % response)
+ co_output.seek(0)
+ return co_output
diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py
index d5c7ae2..e7a187f 100644
--- a/swh/loader/cvs/loader.py
+++ b/swh/loader/cvs/loader.py
@@ -1,372 +1,466 @@
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Loader in charge of injecting either new or existing cvs repositories to
swh-storage.
"""
from datetime import datetime
import os
import subprocess
import tempfile
import time
from typing import Iterator, List, Optional, Sequence, Tuple
from urllib3.util import parse_url
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders
from swh.loader.exception import NotFound
import swh.loader.cvs.rcsparse as rcsparse
+import swh.loader.cvs.cvsclient as cvsclient
+from swh.loader.cvs.rlog import RlogConv
from swh.loader.cvs.cvs2gitdump.cvs2gitdump import CvsConv, RcsKeywords, CHANGESET_FUZZ_SEC, file_path, ChangeSetKey
from swh.model import from_disk, hashutil
from swh.model.model import Person, Revision, RevisionType, TimestampWithTimezone
from swh.model.model import (
Content,
Directory,
Origin,
Revision,
SkippedContent,
Snapshot,
SnapshotBranch,
TargetType,
)
from swh.storage.interface import StorageInterface
DEFAULT_BRANCH = b"HEAD"
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.cvs."
class CvsLoader(BaseLoader):
"""Swh cvs loader.
The repository may be local or remote. The loader deals with
updates to an already previously loaded repository.
"""
visit_type = "cvs"
def __init__(
self,
storage: StorageInterface,
url: str,
origin_url: Optional[str] = None,
visit_date: Optional[datetime] = None,
cvsroot_path: Optional[str] = None,
temp_directory: str = "/tmp",
max_content_size: Optional[int] = None,
):
super().__init__(
storage=storage,
logging_class="swh.loader.cvs.CvsLoader",
max_content_size=max_content_size,
)
self.cvsroot_url = url
# origin url as unique identifier for origin in swh archive
self.origin_url = origin_url if origin_url else self.cvsroot_url
self.temp_directory = temp_directory
self.done = False
+
self.cvs_module_name = None
- self.cvs_module_path = None
+
+ # XXX At present changeset IDs are recomputed on the fly during every visit.
+ # If we were able to maintain a cache somewhere, indexed by a
+ # cvs2gitdump.ChangeSetKey and yielding an SWH revision hash, we could
+ # avoid doing a lot of redundant work during every visit.
+
self.cvs_changesets = None
+
+ # remote CVS repository access (history is parsed from CVS rlog):
+ self.cvsclient = None
+ self.rlog_file = None
+
# internal state used to store swh objects
self._contents: List[Content] = []
self._skipped_contents: List[SkippedContent] = []
self._directories: List[Directory] = []
self._revisions: List[Revision] = []
self.swh_revision_gen = None
# internal state, current visit
self._last_revision = None
self._visit_status = "full"
self._load_status = "uneventful"
self.visit_date = visit_date
self.cvsroot_path = cvsroot_path
self.snapshot = None
+ def compute_swh_revision(self, k, logmsg):
+ """Compute swh hash data per CVS changeset.
+
+ Returns:
+ tuple (rev, swh_directory)
+ - rev: current SWH revision computed from checked out work tree
+ - swh_directory: dictionary of path, swh hash data with type
+
+ """
+ # Compute SWH revision from the on-disk state
+ swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path))
+ if self._last_revision:
+ parents = tuple([bytes(self._last_revision.id)])
+ else:
+ parents = ()
+ revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents)
+ self.log.debug("SWH revision ID: %s" % hashutil.hash_to_hex(revision.id))
+ self._last_revision = revision
+ if self._load_status == "uneventful":
+ # We have an eventful load if this revision is not already present in the archive
+ if not self.storage.revision_get([revision.id])[0]:
+ self._load_status = "eventful"
+ return (revision, swh_dir)
+
def swh_hash_data_per_cvs_changeset(self):
"""Compute swh hash data per CVS changeset.
Yields:
tuple (rev, swh_directory)
- rev: current SWH revision computed from checked out work tree
- swh_directory: dictionary of path, swh hash data with type
"""
- # XXX At present changeset IDs are recomputed on the fly during every visit.
- # If we were able to maintain a cached somewhere which can be indexed by a
- # cvs2gitdump.ChangeSetKey and yields an SWH revision hash we could avoid
- # doing a lot of redundant work during every visit.
for k in self.cvs_changesets:
tstr = time.strftime('%c', time.gmtime(k.max_time))
self.log.info("changeset from %s by %s on branch %s", tstr, k.author, k.branch);
logmsg = ""
# Check out the on-disk state of this revision
for f in k.revs:
rcsfile = None
path = file_path(self.cvsroot_path, f.path)
wtpath = os.path.join(self.worktree_path, path)
self.log.info("rev %s of file %s" % (f.rev, f.path));
if not logmsg:
rcsfile = rcsparse.rcsfile(f.path)
logmsg = rcsfile.getlog(k.revs[0].rev)
if f.state == 'dead':
# remove this file from work tree
try:
os.remove(wtpath)
except FileNotFoundError:
pass
else:
# create, or update, this file in the work tree
if not rcsfile:
rcsfile = rcsparse.rcsfile(f.path)
rcs = RcsKeywords()
contents = rcs.expand_keyword(f.path, rcsfile, f.rev)
try:
outfile = open(wtpath, mode='wb')
except FileNotFoundError:
os.makedirs(os.path.dirname(wtpath))
outfile = open(wtpath, mode='wb')
outfile.write(contents)
outfile.close()
- # Compute SWH revision from the on-disk state
- swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path))
- if self._last_revision:
- parents = tuple([bytes(self._last_revision.id)])
- else:
- parents = ()
- revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents)
- self.log.debug("SWH revision ID: %s" % hashutil.hash_to_hex(revision.id))
- self._last_revision = revision
- if self._load_status == "uneventful":
- # We have an eventful load if this revision is not already present in the archive
- if not self.storage.revision_get([revision.id])[0]:
- self._load_status = "eventful"
-
+ (revision, swh_dir) = self.compute_swh_revision(k, logmsg)
yield revision, swh_dir
+ def swh_hash_data_per_cvs_rlog_changeset(self):
+ """Compute swh hash data per CVS rlog changeset.
+
+ Yields:
+ tuple (rev, swh_directory)
+ - rev: current SWH revision computed from checked out work tree
+ - swh_directory: dictionary of path, swh hash data with type
+
+ """
+ for k in self.cvs_changesets:
+ tstr = time.strftime('%c', time.gmtime(k.max_time))
+ self.log.info("changeset from %s by %s on branch %s", tstr, k.author, k.branch);
+ logmsg = ""
+ # Check out the on-disk state of this revision
+ for f in k.revs:
+ path = file_path(self.cvsroot_path, f.path)
+ wtpath = os.path.join(self.worktree_path, path)
+ self.log.info("rev %s of file %s" % (f.rev, f.path));
+ if not logmsg:
+ logmsg = self.rlog.getlog(self.rlog_file, f.path, k.revs[0].rev)
+ self.log.debug("f.state is %s\n" % f.state)
+ if f.state == 'dead':
+ # remove this file from work tree
+ try:
+ os.remove(wtpath)
+ except FileNotFoundError:
+ pass
+ else:
+ dirname = os.path.dirname(wtpath)
+ try:
+ os.makedirs(dirname)
+ except FileExistsError:
+ pass
+ self.log.debug("checkout to %s\n" % wtpath)
+ fp = self.cvsclient.checkout(f.path, f.rev, dirname)
+ os.rename(fp.name, wtpath)
+ try:
+ fp.close()
+ except FileNotFoundError:
+ # Well, we have just renamed the file...
+ pass
+
+ # TODO: prune empty directories?
+ (revision, swh_dir) = self.compute_swh_revision(k, logmsg)
+ yield revision, swh_dir
def process_cvs_changesets(self) -> Iterator[
Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
]:
"""Process CVS revisions.
At each CVS revision, check out contents and compute swh hashes.
Yields:
tuple (contents, skipped-contents, directories, revision) of swh model
objects computed for one CVS changeset
"""
for swh_revision, swh_dir in self.swh_hash_data_per_cvs_changeset():
# Send the associated contents/directories
(_contents, _skipped_contents, _directories) = from_disk.iter_directory(swh_dir)
yield _contents, _skipped_contents, _directories, swh_revision
+ def process_cvs_rlog_changesets(self) -> Iterator[
+ Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
+ ]:
+ """Process CVS rlog revisions.
+
+ At each CVS revision, check out contents and compute swh hashes.
+
+ Yields:
+ tuple (contents, skipped-contents, directories, revision) of swh model
+ objects computed for one CVS rlog changeset
+
+ """
+ for swh_revision, swh_dir in self.swh_hash_data_per_cvs_rlog_changeset():
+ # Send the associated contents/directories
+ (_contents, _skipped_contents, _directories) = from_disk.iter_directory(swh_dir)
+ yield _contents, _skipped_contents, _directories, swh_revision
def prepare_origin_visit(self):
self.origin = Origin(url=self.origin_url if self.origin_url else self.cvsroot_url)
def pre_cleanup(self):
"""Cleanup potential dangling files from prior runs (e.g. OOM killed
tasks)
"""
clean_dangling_folders(
self.temp_directory,
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
log=self.log,
)
def cleanup(self):
self.log.info("cleanup")
def fetch_cvs_repo_with_rsync(self, host, path):
# URL *must* end with a trailing slash in order to get CVSROOT listed
url = 'rsync://%s%s/' % (host, os.path.dirname(path))
rsync = subprocess.run(['rsync', url], capture_output=True, encoding='ascii')
rsync.check_returncode()
have_cvsroot = False
have_module = False
for line in rsync.stdout.split('\n'):
self.log.debug("rsync server: %s" % line)
if line.endswith(' CVSROOT'):
have_cvsroot = True
elif line.endswith(' %s' % self.cvs_module_name):
have_module = True
if have_module and have_cvsroot:
break
if not have_module:
raise NotFound("CVS module %s not found at %s" \
% (self.cvs_module_name, host, url))
if not have_cvsroot:
raise NotFound("No CVSROOT directory found at %s" % url)
rsync = subprocess.run(['rsync', '-a', url, self.cvsroot_path])
rsync.check_returncode()
def prepare(self):
self._last_revision = None
self._load_status = "uneventful"
self.swh_revision_gen = None
if not self.cvsroot_path:
self.cvsroot_path = tempfile.mkdtemp(
suffix="-%s" % os.getpid(),
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
dir=self.temp_directory,
)
self.worktree_path = tempfile.mkdtemp(
suffix="-%s" % os.getpid(),
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
dir=self.temp_directory,
)
url = parse_url(self.origin_url)
self.log.debug("prepare; origin_url=%s scheme=%s path=%s" % (self.origin_url, url.scheme, url.path))
if not url.path:
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)
self.cvs_module_name = os.path.basename(url.path)
os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name))
- self.cvs_module_path = os.path.join(self.cvsroot_path, self.cvs_module_name)
if url.scheme == 'file':
if not os.path.exists(url.path):
raise NotFound
elif url.scheme == 'rsync':
- self.fetch_cvs_repo_with_rsync(url.host, url.path)
+ self.fetch_cvs_repo_with_rsync(url.host, url.path)
+
+ if url.scheme == 'file' or url.scheme == 'rsync':
+ # local CVS repository conversion
+ have_rcsfile = False
+ have_cvsroot = False
+ for root, dirs, files in os.walk(self.cvsroot_path):
+ if 'CVSROOT' in dirs:
+ have_cvsroot = True
+ dirs.remove('CVSROOT')
+ continue
+ for f in files:
+ filepath = os.path.join(root, f)
+ if f[-2:] == ',v':
+ try:
+ rcsfile = rcsparse.rcsfile(filepath)
+ except Exception:
+ raise
+ else:
+ self.log.debug("Looks like we have data to convert; "
+ "found a valid RCS file at %s" % filepath)
+ have_rcsfile = True
+ break
+ if have_rcsfile:
+ break
+
+ if not have_rcsfile:
+ raise NotFound("Directory %s does not contain any valid RCS files %s" % self.cvsroot_path)
+ if not have_cvsroot:
+ self.log.warning("The CVS repository at '%s' lacks a CVSROOT directory; "
+ "we might be ingesting an incomplete copy of the repository" % self.cvsroot_path)
+
+ # Unfortunately, there is no way to convert CVS history in an iterative fashion
+ # because the data is not indexed by any kind of changeset ID. We need to walk
+ # the history of each and every RCS file in the repository during every visit,
+ # even if no new changes will be added to the SWH archive afterwards.
+ # "CVS’s repository is the software equivalent of a telephone book sorted by telephone number."
+ # https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/
+ #
+ # An implicit assumption made here is that self.cvs_changesets will fit into
+ # memory in its entirety. If it won't fit then the CVS walker will need to
+ # be modified such that it spools the list of changesets to disk instead.
+ cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC)
+ self.log.info("Walking CVS module %s", self.cvs_module_name)
+ cvs.walk(self.cvs_module_name)
+ self.cvs_changesets = sorted(cvs.changesets)
+ self.log.info('CVS changesets found in %s: %d' % (self.cvs_module_name, len(self.cvs_changesets)))
+ self.swh_revision_gen = self.process_cvs_changesets()
+ elif url.scheme == 'pserver' or url.scheme == 'fake':
+ # remote CVS repository conversion
+ self.cvsclient = cvsclient.CVSClient(url)
+ cvsroot_path = os.path.dirname(url.path)
+ self.log.info("Fetching CVS rlog from %s:%s/%s", url.host, cvsroot_path, self.cvs_module_name)
+ self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC)
+ self.rlog_file = self.cvsclient.fetch_rlog()
+ self.rlog.parse_rlog(self.rlog_file)
+ self.cvs_changesets = sorted(self.rlog.changesets)
+ self.log.info('CVS changesets found for %s: %d' % (self.cvs_module_name, len(self.cvs_changesets)))
+ self.swh_revision_gen = self.process_cvs_rlog_changesets()
else:
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url)
- have_rcsfile = False
- have_cvsroot = False
- for root, dirs, files in os.walk(self.cvsroot_path):
- if 'CVSROOT' in dirs:
- have_cvsroot = True
- dirs.remove('CVSROOT')
- continue;
- for f in files:
- filepath = os.path.join(root, f)
- if f[-2:] == ',v':
- try:
- rcsfile = rcsparse.rcsfile(filepath)
- except(Exception):
- raise
- else:
- self.log.debug("Looks like we have data to convert; "
- "found a valid RCS file at %s" % filepath)
- have_rcsfile = True
- break
- if have_rcsfile:
- break;
-
- if not have_rcsfile:
- raise NotFound("Directory %s does not contain any valid RCS files %s" % self.cvsroot_path)
- if not have_cvsroot:
- self.log.warn("The CVS repository at '%s' lacks a CVSROOT directory; "
- "we might be ingesting an incomplete copy of the repository" % self.cvsroot_path)
-
- # Unfortunately, there is no way to convert CVS history in an iterative fashion
- # because the data is not indexed by any kind of changeset ID. We need to walk
- # the history of each and every RCS file in the repository during every visit,
- # even if no new changes will be added to the SWH archive afterwards.
- # "CVS’s repository is the software equivalent of a telephone book sorted by telephone number."
- # https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/
- #
- # An implicit assumption made here is that self.cvs_changesets will fit into
- # memory in its entirety. If it won't fit then the CVS walker will need to
- # be modified such that it spools the list of changesets to disk instead.
- cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC)
- self.log.info("Walking CVS module %s", self.cvs_module_name)
- cvs.walk(self.cvs_module_name)
- self.cvs_changesets = sorted(cvs.changesets)
- self.log.info('CVS changesets found in %s: %d' % (self.cvs_module_name, len(self.cvs_changesets)))
- # SWH revisions are generated and stored iteratively to avoid high memory consumption
- self.swh_revision_gen = self.process_cvs_changesets()
+
def fetch_data(self):
"""Fetch the next CVS revision."""
try:
data = next(self.swh_revision_gen)
except StopIteration:
return False
except Exception as e:
self.log.exception(e)
return False # Stopping iteration
self._contents, self._skipped_contents, self._directories, rev = data
self._revisions = [rev]
return True
def build_swh_revision(self,
k: ChangeSetKey, logmsg: bytes, dir_id: bytes, parents: Sequence[bytes]
) -> Revision:
"""Given a CVS revision, build a swh revision.
Args:
k: changeset data
logmsg: the changeset's log message
dir_id: the tree's hash identifier
parents: the revision's parents identifier
Returns:
The swh revision object.
"""
author = Person.from_fullname(k.author.encode('UTF-8'))
date = TimestampWithTimezone.from_datetime(k.max_time)
return Revision(
type=RevisionType.CVS,
date=date,
committer_date=date,
directory=dir_id,
message=logmsg,
author=author,
committer=author,
synthetic=True,
extra_headers=[],
parents=tuple(parents))
def generate_and_load_snapshot(self, revision) -> Snapshot:
"""Create the snapshot either from existing revision.
Args:
revision (dict): Last revision seen if any (None by default)
Returns:
Optional[Snapshot] The newly created snapshot
"""
snap = Snapshot(
branches={
DEFAULT_BRANCH: SnapshotBranch(
target=revision.id, target_type=TargetType.REVISION
)
}
)
self.log.debug("snapshot: %s" % snap)
self.storage.snapshot_add([snap])
return snap
def store_data(self):
"Add our current CVS changeset to the archive."
self.storage.skipped_content_add(self._skipped_contents)
self.storage.content_add(self._contents)
self.storage.directory_add(self._directories)
self.storage.revision_add(self._revisions)
self.snapshot = self.generate_and_load_snapshot(self._last_revision)
self.log.debug("SWH snapshot ID: %s" % hashutil.hash_to_hex(self.snapshot.id))
self.flush()
self.loaded_snapshot_id = self.snapshot.id
self._skipped_contents = []
self._contents = []
self._directories = []
self._revisions = []
def load_status(self):
return {
"status": self._load_status,
}
def visit_status(self):
return self._visit_status
diff --git a/swh/loader/cvs/rlog.py b/swh/loader/cvs/rlog.py
new file mode 100644
index 0000000..1a046c3
--- /dev/null
+++ b/swh/loader/cvs/rlog.py
@@ -0,0 +1,391 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+""" RCS/CVS rlog parser, derived from viewvc and cvs2gitdump.py """
+
+# Copyright (C) 1999-2021 The ViewCVS Group. All Rights Reserved.
+#
+# By using ViewVC, you agree to the terms and conditions set forth
+# below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following
+# disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Copyright (c) 2012 YASUOKA Masahiko <yasuoka@yasuoka.net>
+#
+# Permission to use, copy, modify, and distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+import re
+import calendar
+import time
+
+from swh.loader.cvs.cvs2gitdump.cvs2gitdump import CHANGESET_FUZZ_SEC, file_path, ChangeSetKey
+
+# TODO: actual path encoding should be specified as a parameter
+path_encodings = [ 'ascii', 'utf-8' ]
+
+class RlogConv:
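+ """Convert CVS rlog output into changesets keyed by ChangeSetKey.
+
+ The file offset of each revision's log entry is recorded in self.offsets
+ so that getlog() can later seek back and re-read an individual log
+ message.
+ """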
+ def __init__(self, cvsroot_path, fuzzsec):
+ self.cvsroot_path = cvsroot_path
+ self.fuzzsec = fuzzsec
+ self.changesets = dict()
+ self.tags = dict()
+ self.offsets = dict()
+
+ def _process_rlog_entry(self, path, taginfo, revisions, logmsgs):
+ """ Convert an rlog entry into an item in self.changesets """
+ rtags = dict()
+ branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
+ for k, v in list(taginfo.items()):
+ r = v.split('.')
+ if len(r) == 3:
+ branches[v] = 'VENDOR'
+ elif len(r) >= 3 and r[-2] == '0':
+ branches['.'.join(r[:-2] + r[-1:])] = k
+ if len(r) == 2 and branches[r[0]] == 'HEAD':
+ if v not in rtags:
+ rtags[v] = list()
+ rtags[v].append(k)
+
+ revs = revisions.items()
+ # sort by revision descending to prioritize 1.1.1.1 over 1.1
+ revs = sorted(revs, key=lambda a: a[1][0], reverse=True)
+ # sort by time
+ revs = sorted(revs, key=lambda a: a[1][1])
+ novendor = False
+ have_initial_revision = False
+ last_vendor_status = None
+ for k, v in revs:
+ r = k.split('.')
+ if len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1' \
+ and r[3] == '1':
+ if have_initial_revision:
+ continue
+ if v[3] == 'dead':
+ continue
+ last_vendor_status = v[3]
+ have_initial_revision = True
+ elif len(r) == 4 and r[0] == '1' and r[1] == '1' and r[2] == '1':
+ if novendor:
+ continue
+ last_vendor_status = v[3]
+ elif len(r) == 2:
+ if r[0] == '1' and r[1] == '1':
+ if have_initial_revision:
+ continue
+ if v[3] == 'dead':
+ continue
+ have_initial_revision = True
+ elif r[0] == '1' and r[1] != '1':
+ novendor = True
+ if last_vendor_status == 'dead' and v[3] == 'dead':
+ last_vendor_status = None
+ continue
+ last_vendor_status = None
+ else:
+ # trunk only
+ continue
+
+ b = '.'.join(r[:-1])
+ # decode author name in a potentially lossy way;
+ # it is only used for internal hashing in this case
+ author = v[2].decode('utf-8', 'ignore')
+ a = ChangeSetKey(
+ branches[b], author, v[1], logmsgs[k], v[6],
+ self.fuzzsec)
+
+ a.put_file(path, k, v[3], 0)
+ while a in self.changesets:
+ c = self.changesets[a]
+ del self.changesets[a]
+ c.merge(a)
+ a = c
+ self.changesets[a] = a
+ if k in rtags:
+ for t in rtags[k]:
+ if t not in self.tags or \
+ self.tags[t].max_time < a.max_time:
+ self.tags[t] = a
+
+ def parse_rlog(self, fp):
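+ """Parse rlog output from fp and fold each file's revisions and log
+ messages into self.changesets, recording per-revision file offsets."""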
+ eof = None
+ while eof != _EOF_LOG and eof != _EOF_ERROR:
+ filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp)
+ revisions = {}
+ logmsgs = {}
+ if filename:
+ for i, e in enumerate(path_encodings):
+ try:
+ how = 'ignore' if i == len(path_encodings) - 1 else 'strict'
+ fname = filename.decode(e, how)
+ break
+ except UnicodeError:
+ pass
+ while not eof:
+ off = fp.tell()
+ rev, logmsg, eof = _parse_log_entry(fp)
+ if rev:
+ revisions[rev[0]] = rev
+ logmsgs[rev[0]] = logmsg
+ if eof != _EOF_LOG and eof != _EOF_ERROR:
+ path = file_path(self.cvsroot_path, fname)
+ if path not in self.offsets:
+ self.offsets[path] = dict()
+ if rev:
+ self.offsets[path][rev[0]] = off
+ self._process_rlog_entry(path, taginfo, revisions, logmsgs)
+
+ def getlog(self, fp, path, rev):
+ off = self.offsets[path][rev]
+ fp.seek(off)
+ rev, logmsg, eof = _parse_log_entry(fp)
+ return logmsg
+
+# if your rlog doesn't use 77 '=' characters, then this must change
+LOG_END_MARKER = b'=' * 77 + b'\n'
+ENTRY_END_MARKER = b'-' * 28 + b'\n'
+
+_EOF_FILE = b'end of file entries' # no more entries for this RCS file
+_EOF_LOG = b'end of log' # hit the true EOF on the pipe
+_EOF_ERROR = b'error message found' # rlog issued an error
+
+# rlog error messages look like
+#
+# rlog: filename/goes/here,v: error message
+# rlog: filename/goes/here,v:123: error message
+#
+# so we should be able to match them with a regex like
+#
+# ^rlog\: (.*)(?:\:\d+)?\: (.*)$
+#
+# But for some reason the windows version of rlog omits the "rlog: " prefix
+# for the first error message when the standard error stream has been
+# redirected to a file or pipe. (the prefix is present in subsequent errors
+# and when rlog is run from the console). So the expression below is more
+# complicated
+_re_log_error = re.compile(rb'^(?:rlog\: )*(.*,v)(?:\:\d+)?\: (.*)$')
+
+# CVSNT error messages look like:
+# cvs rcsfile: `C:/path/to/file,v' does not appear to be a valid rcs file
+# cvs [rcsfile aborted]: C:/path/to/file,v: No such file or directory
+# cvs [rcsfile aborted]: cannot open C:/path/to/file,v: Permission denied
+_re_cvsnt_error = re.compile(rb'^(?:cvs rcsfile\: |cvs \[rcsfile aborted\]: )'
+ rb'(?:\`(.*,v)\' |'
+ rb'cannot open (.*,v)\: |(.*,v)\: |)'
+ rb'(.*)$')
+
+
+def _parse_log_header(fp):
+ """Parse and RCS/CVS log header.
+
+ fp is a file (pipe) opened for reading the log information.
+
+ On entry, fp should point to the start of a log entry.
+ On exit, fp will have consumed the separator line between the header and
+ the first revision log.
+
+ If there is no revision information (e.g. the "-h" switch was passed to
+ rlog), then fp will have consumed the file separator line on exit.
+
+ Returns: filename, default branch, tag dictionary, lock dictionary,
+ rlog error message, and eof flag
+ """
+
+ filename = branch = msg = b""
+ taginfo = {} # tag name => number
+ lockinfo = {} # revision => locker
+ state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks
+ eof = None
+
+ while 1:
+ line = fp.readline()
+ if not line:
+ # the true end-of-file
+ eof = _EOF_LOG
+ break
+
+ if state == 1:
+ if line[:1] == b'\t':
+ [tag, rev] = [x.strip() for x in line.split(b':')]
+ taginfo[tag] = rev
+ else:
+ # oops. this line isn't tag info. stop parsing tags.
+ state = 0
+
+ if state == 2:
+ if line[:1] == b'\t':
+ [locker, rev] = [x.strip() for x in line.split(b':')]
+ lockinfo[rev] = locker
+ else:
+ # oops. this line isn't lock info. stop parsing locks.
+ state = 0
+
+ if state == 0:
+ if line[:9] == b'RCS file:':
+ filename = line[10:-1]
+ elif line[:5] == b'head:':
+ # head = line[6:-1]
+ pass
+ elif line[:7] == b'branch:':
+ branch = line[8:-1]
+ elif line[:6] == b'locks:':
+ # start parsing the lock information
+ state = 2
+ elif line[:14] == b'symbolic names':
+ # start parsing the tag information
+ state = 1
+ elif line == ENTRY_END_MARKER:
+ # end of the headers
+ break
+ elif line == LOG_END_MARKER:
+ # end of this file's log information
+ eof = _EOF_FILE
+ break
+ else:
+ error = _re_cvsnt_error.match(line)
+ if error:
+ p1, p2, p3, msg = error.groups()
+ filename = p1 or p2 or p3
+ if not filename:
+ # vclib (viewvc) is not available here; raise a plain ValueError instead
+ raise ValueError("Could not get filename from CVSNT error:\n%s"
+ % line)
+ eof = _EOF_ERROR
+ break
+
+ error = _re_log_error.match(line)
+ if error:
+ filename, msg = error.groups()
+ if msg[:30] == b'warning: Unknown phrases like ':
+ # don't worry about this warning. it can happen with some RCS
+ # files that have unknown fields in them (e.g. "permissions 644;")
+ continue
+ eof = _EOF_ERROR
+ break
+
+ return filename, branch, taginfo, lockinfo, msg, eof
+
+
+_re_log_info = re.compile(rb'^date:\s+([^;]+);'
+ rb'\s+author:\s+([^;]+);'
+ rb'\s+state:\s+([^;]+);'
+ rb'(\s+lines:\s+([0-9\s+-]+);?)?'
+ rb'(\s+commitid:\s+([a-zA-Z0-9]+))?\n$')
+
+# TODO: _re_rev should be updated to extract the "locked" flag
+_re_rev = re.compile(rb'^revision\s+([0-9.]+).*')
+
+def cvs_strptime(timestr):
+ try:
+ return time.strptime(timestr, '%Y/%m/%d %H:%M:%S')[:-1] + (0,)
+ except ValueError:
+ return time.strptime(timestr, '%Y-%m-%d %H:%M:%S %z')[:-1] + (0,)
+
+def _parse_log_entry(fp):
+ """Parse a single log entry.
+
+ On entry, fp should point to the first line of the entry (the "revision"
+ line).
+ On exit, fp will have consumed the log separator line (dashes) or the
+ end-of-file marker (equals).
+
+ Returns: revision data tuple, log message, and eof flag (see _EOF_*)
+ """
+    rev = None
+    line = fp.readline()
+    if not line:
+        return None, None, _EOF_LOG
+    if line == LOG_END_MARKER:
+        # Needed because some versions of RCS precede LOG_END_MARKER
+        # with ENTRY_END_MARKER
+        return None, None, _EOF_FILE
+    if line[:8] == b'revision':
+        match = _re_rev.match(line)
+        if not match:
+            return None, None, _EOF_LOG
+        rev = match.group(1)
+
+        line = fp.readline()
+        if not line:
+            return None, None, _EOF_LOG
+        match = _re_log_info.match(line)
+
+    eof = None
+    log = b''
+    while 1:
+        line = fp.readline()
+        if not line:
+            # true end-of-file
+            eof = _EOF_LOG
+            break
+        if line[:9] == b'branches:':
+            continue
+        if line == ENTRY_END_MARKER:
+            break
+        if line == LOG_END_MARKER:
+            # end of this file's log information
+            eof = _EOF_FILE
+            break
+
+        log = log + line
+
+    if not rev or not match:
+        # there was a parsing error
+        return None, None, eof
+
+    # parse out a time tuple for the local time
+    tm = cvs_strptime(match.group(1).decode('UTF-8'))
+
+    # rlog seems to assume that two-digit years are 1900-based (so, "04"
+    # comes out as "1904", not "2004").
+    EPOCH = 1970
+    if tm[0] < EPOCH:
+        tm = list(tm)
+        if (tm[0] - 1900) < 70:
+            tm[0] = tm[0] + 100
+        if tm[0] < EPOCH:
+            raise ValueError('invalid year')
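+    # Worked example: a two-digit year "04" comes back from rlog as 1904;
+    # 1904 - 1900 = 4 < 70, so 100 is added and the year becomes 2004.
+    # Anything still before 1970 after the fixup is rejected above.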
+    date = calendar.timegm(tm)
+
+    # Return a revision tuple compatible with 'rcsparse', the log message,
+    # and the EOF marker.
+    return (rev.decode('ascii'),  # revision number string
+            date,
+            match.group(2),  # author (encoding is arbitrary; don't attempt to decode)
+            match.group(3).decode('ascii'),  # state, usually "Exp" or "dead"; non-ASCII data here would be weird
+            None,  # TODO: branches of this rev
+            None,  # TODO: revnumstr of previous rev
+            None,  # TODO: commitid
+            ), log, eof
diff --git a/swh/loader/cvs/tests/data/nano.rlog.tgz b/swh/loader/cvs/tests/data/nano.rlog.tgz
new file mode 100644
index 0000000..ffffa87
Binary files /dev/null and b/swh/loader/cvs/tests/data/nano.rlog.tgz differ
diff --git a/swh/loader/cvs/tests/data/runbaby.tgz b/swh/loader/cvs/tests/data/runbaby.tgz
index c2256f3..354845d 100644
Binary files a/swh/loader/cvs/tests/data/runbaby.tgz and b/swh/loader/cvs/tests/data/runbaby.tgz differ
diff --git a/swh/loader/cvs/tests/test_loader.py b/swh/loader/cvs/tests/test_loader.py
index c4abe57..d899a9a 100644
--- a/swh/loader/cvs/tests/test_loader.py
+++ b/swh/loader/cvs/tests/test_loader.py
@@ -1,223 +1,259 @@
# Copyright (C) 2016-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os

import pytest

from swh.loader.cvs.loader import CvsLoader
from swh.loader.tests import (
    assert_last_visit_matches,
    check_snapshot,
    get_stats,
    prepare_repository_from_archive,
)
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Snapshot, SnapshotBranch, TargetType

RUNBABY_SNAPSHOT = Snapshot(
    id=hash_to_bytes("1cff69ab9bd70822d5e3006092f943ccaafdcf57"),
    branches={
        b"HEAD": SnapshotBranch(
            target=hash_to_bytes("ef511d258fa55035c2bc2a5b05cad233cee1d328"),
            target_type=TargetType.REVISION,
        )
    },
)


def test_loader_cvs_not_found_no_mock(swh_storage, tmp_path):
    """Given an unknown repository, the loader visit ends up in status not_found"""
    unknown_repo_url = "unknown-repository"
    loader = CvsLoader(swh_storage, unknown_repo_url, cvsroot_path=tmp_path)

    assert loader.load() == {"status": "uneventful"}

    assert_last_visit_matches(
        swh_storage, unknown_repo_url, status="not_found", type="cvs",
    )


def test_loader_cvs_visit(swh_storage, datadir, tmp_path):
    """Eventful visit should yield 1 snapshot"""
    archive_name = "runbaby"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

    loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))

    assert loader.load() == {"status": "eventful"}

    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="cvs",
        snapshot=RUNBABY_SNAPSHOT.id,
    )

    stats = get_stats(loader.storage)
    assert stats == {
        "content": 5,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 1,
        "skipped_content": 0,
        "snapshot": 1,
    }

    check_snapshot(RUNBABY_SNAPSHOT, loader.storage)


def test_loader_cvs_2_visits_no_change(swh_storage, datadir, tmp_path):
    """Eventful visit followed by uneventful visit should yield the same snapshot"""
    archive_name = "runbaby"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

    loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))

    assert loader.load() == {"status": "eventful"}
    visit_status1 = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="cvs",
        snapshot=RUNBABY_SNAPSHOT.id,
    )

    assert loader.load() == {"status": "uneventful"}
    visit_status2 = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="cvs",
        snapshot=RUNBABY_SNAPSHOT.id,
    )

    assert visit_status1.date < visit_status2.date
    assert visit_status1.snapshot == visit_status2.snapshot

    stats = get_stats(loader.storage)
    assert stats["origin_visit"] == 1 + 1  # same snapshot computed by two visits
    assert stats["snapshot"] == 1


GREEK_SNAPSHOT = Snapshot(
    id=hash_to_bytes("5e74af67d69dfd7aea0eb118154d062f71f50120"),
    branches={
        b"HEAD": SnapshotBranch(
            target=hash_to_bytes("e18b92f14cd5b3efb3fcb4ea46cfaf97f25f301b"),
            target_type=TargetType.REVISION,
        )
    },
)


def test_loader_cvs_with_file_additions_and_deletions(swh_storage, datadir, tmp_path):
    """Eventful conversion of history with file additions and deletions"""
    archive_name = "greek-repository"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    repo_url += '/greek-tree'  # CVS module name

    loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))

    assert loader.load() == {"status": "eventful"}

    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="cvs",
        snapshot=GREEK_SNAPSHOT.id,
    )

    stats = get_stats(loader.storage)
    assert stats == {
        "content": 8,
        "directory": 20,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 7,
        "skipped_content": 0,
        "snapshot": 7,
    }

    check_snapshot(GREEK_SNAPSHOT, loader.storage)


GREEK_SNAPSHOT2 = Snapshot(
    id=hash_to_bytes("048885ae2145ffe81588aea95dcf75c536ecdf26"),
    branches={
        b"HEAD": SnapshotBranch(
            target=hash_to_bytes("55eb1438c03588607ce4b8db8f45e8e23075951b"),
            target_type=TargetType.REVISION,
        )
    },
)


def test_loader_cvs_2_visits_with_change(swh_storage, datadir, tmp_path):
    """Eventful visit followed by eventful visit should yield two snapshots"""
    archive_name = "greek-repository"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    repo_url += '/greek-tree'  # CVS module name

    loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))

    assert loader.load() == {"status": "eventful"}
    visit_status1 = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="cvs",
        snapshot=GREEK_SNAPSHOT.id,
    )

    stats = get_stats(loader.storage)
    assert stats == {
        "content": 8,
        "directory": 20,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 7,
        "skipped_content": 0,
        "snapshot": 7,
    }

    archive_name2 = "greek-repository2"
    archive_path2 = os.path.join(datadir, f"{archive_name2}.tgz")
    repo_url = prepare_repository_from_archive(archive_path2, archive_name, tmp_path)
    repo_url += '/greek-tree'  # CVS module name

    loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))

    assert loader.load() == {"status": "eventful"}
    visit_status2 = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="cvs",
        snapshot=GREEK_SNAPSHOT2.id,
    )

    stats = get_stats(loader.storage)
    assert stats == {
        "content": 10,
        "directory": 23,
        "origin": 1,
        "origin_visit": 2,
        "release": 0,
        "revision": 8,
        "skipped_content": 0,
        "snapshot": 8,
    }

    check_snapshot(GREEK_SNAPSHOT2, loader.storage)

    assert visit_status1.date < visit_status2.date
    assert visit_status1.snapshot != visit_status2.snapshot

+
+def test_loader_cvs_visit_pserver(swh_storage, datadir, tmp_path):
+    """Eventful visit to CVS pserver should yield 1 snapshot"""
+    archive_name = "runbaby"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+    repo_url += '/runbaby'  # CVS module name
+
+    # Ask our cvsclient to connect via the 'cvs server' command
+    repo_url = 'fake://' + repo_url[7:]
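+    # ('fake://' replaces the 'file://' prefix; as the comment above
+    # suggests, the loader's CVS client is expected to treat this scheme as
+    # a cue to spawn a local 'cvs server' process and talk to it over a
+    # pipe, rather than connect to a real network pserver.)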
+
+    loader = CvsLoader(swh_storage, repo_url, cvsroot_path=os.path.join(tmp_path, archive_name))
+
+    assert loader.load() == {"status": "eventful"}
+
+    assert_last_visit_matches(
+        loader.storage,
+        repo_url,
+        status="full",
+        type="cvs",
+        snapshot=RUNBABY_SNAPSHOT.id,
+    )
+
+    stats = get_stats(loader.storage)
+    assert stats == {
+        "content": 5,
+        "directory": 2,
+        "origin": 1,
+        "origin_visit": 1,
+        "release": 0,
+        "revision": 1,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+
+    check_snapshot(RUNBABY_SNAPSHOT, loader.storage)