Differential D6598 Diff 24021 swh/loader/cvs/rlog.py

Changeset View

Standalone View

swh/loader/cvs/rlog.py

Show First 20 Lines • Show All 41 Lines • ▼ Show 20 Lines
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES		# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN		# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF		# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.		# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

import calendar		import calendar
import re		import re
import time		import time
		from typing import BinaryIO, Dict, List, NamedTuple, Optional, Tuple

from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey, file_path		from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey, file_path

# There is no known encoding of path names in CVS. The actual encoding used		# There is no known encoding of path names in CVS. The actual encoding used
# will depend on the CVS server's operating system and perhaps even the		# will depend on the CVS server's operating system and perhaps even the
# underlying filesystem used to host a CVS repository.		# underlying filesystem used to host a CVS repository.
# It is even conceivable that a given repository may use multiple encodings,		# It is even conceivable that a given repository may use multiple encodings,
# e.g. due to migrations of the repository between different servers over time.		# e.g. due to migrations of the repository between different servers over time.
#		#
# This issue also affects the CVS network protocol which is communicating		# This issue also affects the CVS network protocol which is communicating
# paths between the CVS server and the CVS client. For this reason, most		# paths between the CVS server and the CVS client. For this reason, most
# public-facing repositories should stick to ASCII in practice.		# public-facing repositories should stick to ASCII in practice.
#		#
# TODO: If known, the actual path encoding used by the repository should		# TODO: If known, the actual path encoding used by the repository should
# be specified as a parameter. This parameter should be a list since		# be specified as a parameter. This parameter should be a list since
# multiple encodings may be present in a given repository.		# multiple encodings may be present in a given repository.
path_encodings = ["ascii", "utf-8"]		path_encodings = ["ascii", "utf-8"]


		class revtuple(NamedTuple):
		    """Revision record produced by _parse_log_entry for one rlog entry.
		    The field order is significant: other code in this module reads the
		    record by position (e.g. index 1 for the date, 6 for the commit id).
		    """
		    # revision number string, e.g. "1.2"
		    number: str
		    # commit time as returned by calendar.timegm()
		    date: int
		    # raw author bytes; the encoding is arbitrary, so it is not decoded
		    author: bytes
		    # RCS state decoded as ASCII, usually "Exp" or "dead"
		    state: str
		    # TODO placeholder: branches of this revision (always None for now)
		    branches: None
		    # TODO placeholder: revision number of the previous revision
		    revnumstr: None
		    # TODO placeholder: commit id
		    commitid: None


class RlogConv:		class RlogConv:
def __init__(self, cvsroot_path, fuzzsec):		def __init__(self, cvsroot_path: str, fuzzsec: int) -> None:
self.cvsroot_path = cvsroot_path		self.cvsroot_path = cvsroot_path
self.fuzzsec = fuzzsec		self.fuzzsec = fuzzsec
self.changesets = dict()		self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict()
self.tags = dict()		self.tags: Dict[str, ChangeSetKey] = dict()
self.offsets = dict()		self.offsets: Dict[str, Dict[str, int]] = dict()

def _process_rlog_revisions(self, path, taginfo, revisions, logmsgs):		def _process_rlog_revisions(
		self,
		path: str,
		taginfo: Dict[bytes, bytes],
		revisions: Dict[str, revtuple],
		logmsgs: Dict[str, Optional[bytes]]
		) -> None:
""" Convert RCS revision history of a file into self.changesets items """		""" Convert RCS revision history of a file into self.changesets items """
rtags = dict()		rtags: Dict[str, List[str]] = dict()
# RCS and CVS represent branches by adding digits to revision numbers.		# RCS and CVS represent branches by adding digits to revision numbers.
# And CVS assigns special meaning to certain revision number ranges.		# And CVS assigns special meaning to certain revision number ranges.
#		#
# Revision numbers on the main branch have only two digits:		# Revision numbers on the main branch have only two digits:
#		#
# 1.1, 1.2, 1.3, ...		# 1.1, 1.2, 1.3, ...
#		#
# Branches created with 'cvs tag -b' use even numbers for		# Branches created with 'cvs tag -b' use even numbers for
Show All 26 Lines	) -> None:
# 1.1, 1.2, 1.3, ... main branch history of the file		# 1.1, 1.2, 1.3, ... main branch history of the file
# \|		# \|
# 1.1.2.0.1 magic branch (2)		# 1.1.2.0.1 magic branch (2)
#		#
# This allows CVS to store information about a branch's existence		# This allows CVS to store information about a branch's existence
# before any files on this branch have been modified.		# before any files on this branch have been modified.
# Even-numbered branch revisions appear once the file is modified.		# Even-numbered branch revisions appear once the file is modified.
branches = {"1": "HEAD", "1.1.1": "VENDOR"}		branches = {"1": "HEAD", "1.1.1": "VENDOR"}
for k, v in list(taginfo.items()):
r = v.split(".")		k: str
		v_: str
		for k, v_ in list(taginfo.items()): # type: ignore # FIXME, inconsistent types
		r = v_.split(".")
if len(r) == 3:		if len(r) == 3:
# vendor branch number		# vendor branch number
branches[v] = "VENDOR"		branches[v_] = "VENDOR"
elif len(r) >= 3 and r[-2] == "0":		elif len(r) >= 3 and r[-2] == "0":
# magic branch number		# magic branch number
branches[".".join(r[:-2] + r[-1:])] = k		branches[".".join(r[:-2] + r[-1:])] = k
if len(r) == 2 and branches[r[0]] == "HEAD":		if len(r) == 2 and branches[r[0]] == "HEAD":
# main branch number		# main branch number
if v not in rtags:		if v_ not in rtags:
rtags[v] = list()		rtags[v_] = list()
rtags[v].append(k)		rtags[v_].append(k)

revs = revisions.items()		revs: List[Tuple[str, revtuple]] = list(revisions.items())
# sort by revision descending to prioritize 1.1.1.1 over 1.1		# sort by revision descending to prioritize 1.1.1.1 over 1.1
revs = sorted(revs, key=lambda a: a[1][0], reverse=True)		revs.sort(key=lambda a: a[1][0], reverse=True)
# sort by time		# sort by time
revs = sorted(revs, key=lambda a: a[1][1])		revs.sort(key=lambda a: a[1][1])
novendor = False		novendor = False
have_initial_revision = False		have_initial_revision = False
last_vendor_status = None		last_vendor_status = None
for k, v in revs:		for k, v in revs:
r = k.split(".")		r = k.split(".")
if (		if (
len(r) == 4		len(r) == 4
and r[0] == "1"		and r[0] == "1"
Show All 27 Lines	) -> None:
else:		else:
# trunk only		# trunk only
continue		continue

b = ".".join(r[:-1])		b = ".".join(r[:-1])
# decode author name in a potentially lossy way;		# decode author name in a potentially lossy way;
# it is only used for internal hashing in this case		# it is only used for internal hashing in this case
author = v[2].decode("utf-8", "ignore")		author = v[2].decode("utf-8", "ignore")
a = ChangeSetKey(branches[b], author, v[1], logmsgs[k], v[6], self.fuzzsec)		logmsg = logmsgs[k]
		assert logmsg is not None
		a = ChangeSetKey(branches[b], author, v[1], logmsg, v[6], self.fuzzsec)

a.put_file(path, k, v[3], 0)		a.put_file(path, k, v[3], 0)
while a in self.changesets:		while a in self.changesets:
c = self.changesets[a]		c = self.changesets[a]
del self.changesets[a]		del self.changesets[a]
c.merge(a)		c.merge(a)
a = c		a = c
self.changesets[a] = a		self.changesets[a] = a
if k in rtags:		if k in rtags:
for t in rtags[k]:		for t in rtags[k]:
if t not in self.tags or self.tags[t].max_time < a.max_time:		if t not in self.tags or self.tags[t].max_time < a.max_time:
self.tags[t] = a		self.tags[t] = a

def parse_rlog(self, fp):		def parse_rlog(self, fp: BinaryIO) -> None:
eof = None		eof = None
while eof != _EOF_LOG and eof != _EOF_ERROR:		while eof != _EOF_LOG and eof != _EOF_ERROR:
filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp)		filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp)
revisions = {}		revisions: Dict[str, revtuple] = {}
logmsgs = {}		logmsgs: Dict[str, Optional[bytes]] = {}
path = ""		path = ""
if filename:		if filename:
# There is no known encoding of filenames in CVS.		# There is no known encoding of filenames in CVS.
# Attempt to decode the path with our list of known encodings.		# Attempt to decode the path with our list of known encodings.
# If none of them work, forcefully decode the path assuming		# If none of them work, forcefully decode the path assuming
# the final path encoding provided in the list.		# the final path encoding provided in the list.
for i, e in enumerate(path_encodings):		for i, e in enumerate(path_encodings):
try:		try:
Show All 14 Lines	def parse_rlog(self, fp: BinaryIO) -> None:
if eof != _EOF_LOG and eof != _EOF_ERROR:		if eof != _EOF_LOG and eof != _EOF_ERROR:
if path not in self.offsets.keys():		if path not in self.offsets.keys():
self.offsets[path] = dict()		self.offsets[path] = dict()
if rev:		if rev:
self.offsets[path][rev[0]] = off		self.offsets[path][rev[0]] = off

self._process_rlog_revisions(path, taginfo, revisions, logmsgs)		self._process_rlog_revisions(path, taginfo, revisions, logmsgs)

def getlog(self, fp: BinaryIO, path: str, rev: str) -> Optional[bytes]:
    """Return the log message stored for revision rev of path.

    The offset saved in self.offsets for (path, rev) is looked up first
    (a KeyError propagates if it is missing), fp is positioned there, and
    the entry found at that position is parsed again. Only the log message
    of the parsed entry is returned.
    """
    position = self.offsets[path][rev]
    fp.seek(position)
    # _parse_log_entry yields (revision tuple, log message, eof flag)
    return _parse_log_entry(fp)[1]


# if your rlog doesn't use 77 '=' characters, then this must change		# if your rlog doesn't use 77 '=' characters, then this must change
LOG_END_MARKER = b"=" * 77 + b"\n"		LOG_END_MARKER = b"=" * 77 + b"\n"
ENTRY_END_MARKER = b"-" * 28 + b"\n"		ENTRY_END_MARKER = b"-" * 28 + b"\n"

_EOF_FILE = b"end of file entries" # no more entries for this RCS file		_EOF_FILE = b"end of file entries" # no more entries for this RCS file
Show All 23 Lines
# Matches an error line emitted by CVSNT's rcsfile handling.
#
# Groups 1-3 are the three alternative places the ",v" file name can show
# up (quoted after a backtick, after "cannot open", or bare before a colon);
# at most one of them is set. Group 4 is the remaining message text.
#
# The alternation bars must be unescaped: an escaped bar is a literal pipe
# character, which never occurs in these messages, and the file-name groups
# need ".*" so they can span a whole path rather than a single character.
_re_cvsnt_error = re.compile(
    rb"^(?:cvs rcsfile\: |cvs \[rcsfile aborted\]: )"
    rb"(?:\`(.*,v)' |"
    rb"cannot open (.*,v)\: |(.*,v)\: |)"
    rb"(.*)$"
)


def _parse_log_header(fp):		def _parse_log_header(fp: BinaryIO) -> Tuple[
		bytes, bytes, Dict[bytes, bytes], Dict[bytes, bytes], bytes, Optional[bytes]
		]:
"""Parse and RCS/CVS log header.		"""Parse and RCS/CVS log header.

fp is a file (pipe) opened for reading the log information.		fp is a file (pipe) opened for reading the log information.

On entry, fp should point to the start of a log entry.		On entry, fp should point to the start of a log entry.
On exit, fp will have consumed the separator line between the header and		On exit, fp will have consumed the separator line between the header and
the first revision log.		the first revision log.

If there is no revision information (e.g. the "-h" switch was passed to		If there is no revision information (e.g. the "-h" switch was passed to
rlog), then fp will have consumed the file separator line on exit.		rlog), then fp will have consumed the file separator line on exit.

Returns: filename, default branch, tag dictionary, lock dictionary,		Returns: filename, default branch, tag dictionary, lock dictionary,
rlog error message, and eof flag		rlog error message, and eof flag
"""		"""

filename = branch = msg = b""		filename = branch = msg = b""
taginfo = {} # tag name => number		taginfo: Dict[bytes, bytes] = {} # tag name => number
lockinfo = {} # revision => locker		lockinfo: Dict[bytes, bytes] = {} # revision => locker
state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks		state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks
eof = None		eof = None

while 1:		while 1:
line = fp.readline()		line = fp.readline()
if not line:		if not line:
# the true end-of-file		# the true end-of-file
eof = _EOF_LOG		eof = _EOF_LOG
Show All 38 Lines	while 1:
break		break
else:		else:
error = _re_cvsnt_error.match(line)		error = _re_cvsnt_error.match(line)
if error:		if error:
p1, p2, p3, msg = error.groups()		p1, p2, p3, msg = error.groups()
filename = p1 or p2 or p3		filename = p1 or p2 or p3
if not filename:		if not filename:
raise ValueError(		raise ValueError(
"Could not get filename from CVSNT error:\n%s" % line		"Could not get filename from CVSNT error:\n%r" % line
)		)
eof = _EOF_ERROR		eof = _EOF_ERROR
break		break

error = _re_log_error.match(line)		error = _re_log_error.match(line)
if error:		if error:
filename, msg = error.groups()		filename, msg = error.groups()
if msg[:30] == b"warning: Unknown phrases like ":		if msg[:30] == b"warning: Unknown phrases like ":
Show All 20 Lines

def cvs_strptime(timestr: str) -> tuple:
    """Parse an rlog date string into a 9-item UTC time tuple.

    Two layouts are accepted:

    * "YYYY/MM/DD HH:MM:SS", which carries no zone and is taken as-is;
    * "YYYY-MM-DD HH:MM:SS +ZZZZ", whose UTC offset is applied so that the
      returned fields are expressed in UTC.

    The result has the same layout as a time.struct_time converted to a
    tuple, with the DST flag forced to 0, so it can be passed straight to
    calendar.timegm().

    Raises:
        ValueError: if timestr matches neither layout.
    """
    from datetime import datetime

    try:
        return time.strptime(timestr, "%Y/%m/%d %H:%M:%S")[:-1] + (0,)
    except ValueError:
        # Slicing a struct_time drops the parsed offset, which would make a
        # "+0200" stamp be read as if it were UTC. Normalise to UTC first.
        aware = datetime.strptime(timestr, "%Y-%m-%d %H:%M:%S %z")
        return tuple(aware.utctimetuple())


def _parse_log_entry(fp):		def _parse_log_entry(fp) -> Tuple[Optional[revtuple], Optional[bytes], Optional[bytes]]:
"""Parse a single log entry.		"""Parse a single log entry.

On entry, fp should point to the first line of the entry (the "revision"		On entry, fp should point to the first line of the entry (the "revision"
line).		line).
On exit, fp will have consumed the log separator line (dashes) or the		On exit, fp will have consumed the log separator line (dashes) or the
end-of-file marker (equals).		end-of-file marker (equals).

Returns: Revision data tuple, and eof flag (see _EOF_*)		Returns: Revision data tuple (number string, date, author, state, branches, revnumstr,
		commitid) if any, log, and eof flag (see _EOF_*)
"""		"""
rev = None		rev = None
line = fp.readline()		line = fp.readline()
if not line:		if not line:
return None, None, _EOF_LOG		return None, None, _EOF_LOG
if line == LOG_END_MARKER:		if line == LOG_END_MARKER:
# Needed because some versions of RCS precede LOG_END_MARKER		# Needed because some versions of RCS precede LOG_END_MARKER
# with ENTRY_END_MARKER		# with ENTRY_END_MARKER
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	if tm[0] < EPOCH:
tm[0] = tm[0] + 100		tm[0] = tm[0] + 100
if tm[0] < EPOCH:		if tm[0] < EPOCH:
raise ValueError("invalid year")		raise ValueError("invalid year")
date = calendar.timegm(tm)		date = calendar.timegm(tm)

# return a revision tuple compatible with 'rcsparse', the log message,		# return a revision tuple compatible with 'rcsparse', the log message,
# and the EOF marker		# and the EOF marker
return (		return (
(		revtuple(
rev.decode("ascii"), # revision number string		rev.decode("ascii"), # revision number string
date,		date,
match.group(2), # author (encoding is arbitrary; don't attempt to decode)		match.group(2), # author (encoding is arbitrary; don't attempt to decode)
match.group(3).decode(		match.group(3).decode(
"ascii"		"ascii"
), # state, usually "Exp" or "dead"; non-ASCII data here would be weird		), # state, usually "Exp" or "dead"; non-ASCII data here would be weird
None, # TODO: branches of this rev		None, # TODO: branches of this rev
None, # TODO: revnumstr of previous rev		None, # TODO: revnumstr of previous rev
None, # TODO: commitid		None, # TODO: commitid
),		),
log,		log,
eof,		eof,
)		)