Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/cvs/rlog.py
Show First 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | |||||
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||||
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||||
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||||
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||||
import calendar | import calendar | ||||
import re | import re | ||||
import time | import time | ||||
from typing import BinaryIO, Dict, List, NamedTuple, Optional, Tuple | |||||
from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey, file_path | from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey, file_path | ||||
# There is no known encoding of path names in CVS. The actual encoding used | # There is no known encoding of path names in CVS. The actual encoding used | ||||
# will depend on the CVS server's operating system and perhaps even the | # will depend on the CVS server's operating system and perhaps even the | ||||
# underlying filesystem used to host a CVS repository. | # underlying filesystem used to host a CVS repository. | ||||
# It is even conceivable that a given repository may use multiple encodings, | # It is even conceivable that a given repository may use multiple encodings, | ||||
# e.g. due to migrations of the repository between different servers over time. | # e.g. due to migrations of the repository between different servers over time. | ||||
# | # | ||||
# This issue also affects the CVS network protocol which is communicating | # This issue also affects the CVS network protocol which is communicating | ||||
# paths between the CVS server and the CVS client. For this reason, most | # paths between the CVS server and the CVS client. For this reason, most | ||||
# public-facing repositories should stick to ASCII in practice. | # public-facing repositories should stick to ASCII in practice. | ||||
# | # | ||||
# TODO: If known, the actual path encoding used by the repository should | # TODO: If known, the actual path encoding used by the repository should | ||||
# be specified as a parameter. This parameter should be a list since | # be specified as a parameter. This parameter should be a list since | ||||
# multiple encodings may be present in a given repository. | # multiple encodings may be present in a given repository. | ||||
path_encodings = ["ascii", "utf-8"] | path_encodings = ["ascii", "utf-8"] | ||||
class revtuple(NamedTuple): | |||||
number: str | |||||
date: int | |||||
author: bytes | |||||
state: str | |||||
branches: None | |||||
revnumstr: None | |||||
commitid: None | |||||
class RlogConv: | class RlogConv: | ||||
def __init__(self, cvsroot_path, fuzzsec): | def __init__(self, cvsroot_path: str, fuzzsec: int) -> None: | ||||
self.cvsroot_path = cvsroot_path | self.cvsroot_path = cvsroot_path | ||||
self.fuzzsec = fuzzsec | self.fuzzsec = fuzzsec | ||||
self.changesets = dict() | self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict() | ||||
self.tags = dict() | self.tags: Dict[str, ChangeSetKey] = dict() | ||||
self.offsets = dict() | self.offsets: Dict[str, Dict[str, int]] = dict() | ||||
def _process_rlog_revisions(self, path, taginfo, revisions, logmsgs): | def _process_rlog_revisions( | ||||
self, | |||||
path: str, | |||||
taginfo: Dict[bytes, bytes], | |||||
revisions: Dict[str, revtuple], | |||||
logmsgs: Dict[str, Optional[bytes]] | |||||
) -> None: | |||||
""" Convert RCS revision history of a file into self.changesets items """ | """ Convert RCS revision history of a file into self.changesets items """ | ||||
rtags = dict() | rtags: Dict[str, List[str]] = dict() | ||||
# RCS and CVS represent branches by adding digits to revision numbers. | # RCS and CVS represent branches by adding digits to revision numbers. | ||||
# And CVS assigns special meaning to certain revision number ranges. | # And CVS assigns special meaning to certain revision number ranges. | ||||
# | # | ||||
# Revision numbers on the main branch have only two digits: | # Revision numbers on the main branch have only two digits: | ||||
# | # | ||||
# 1.1, 1.2, 1.3, ... | # 1.1, 1.2, 1.3, ... | ||||
# | # | ||||
# Branches created with 'cvs tag -b' use even numbers for | # Branches created with 'cvs tag -b' use even numbers for | ||||
Show All 26 Lines | ) -> None: | ||||
# 1.1, 1.2, 1.3, ... main branch history of the file | # 1.1, 1.2, 1.3, ... main branch history of the file | ||||
# | | # | | ||||
# 1.1.2.0.1 magic branch (2) | # 1.1.2.0.1 magic branch (2) | ||||
# | # | ||||
# This allows CVS to store information about a branch's existence | # This allows CVS to store information about a branch's existence | ||||
# before any files on this branch have been modified. | # before any files on this branch have been modified. | ||||
# Even-numbered branch revisions appear once the file is modified. | # Even-numbered branch revisions appear once the file is modified. | ||||
branches = {"1": "HEAD", "1.1.1": "VENDOR"} | branches = {"1": "HEAD", "1.1.1": "VENDOR"} | ||||
for k, v in list(taginfo.items()): | |||||
r = v.split(".") | k: str | ||||
v_: str | |||||
for k, v_ in list(taginfo.items()): # type: ignore # FIXME, inconsistent types | |||||
r = v_.split(".") | |||||
if len(r) == 3: | if len(r) == 3: | ||||
# vendor branch number | # vendor branch number | ||||
branches[v] = "VENDOR" | branches[v_] = "VENDOR" | ||||
elif len(r) >= 3 and r[-2] == "0": | elif len(r) >= 3 and r[-2] == "0": | ||||
# magic branch number | # magic branch number | ||||
branches[".".join(r[:-2] + r[-1:])] = k | branches[".".join(r[:-2] + r[-1:])] = k | ||||
if len(r) == 2 and branches[r[0]] == "HEAD": | if len(r) == 2 and branches[r[0]] == "HEAD": | ||||
# main branch number | # main branch number | ||||
if v not in rtags: | if v_ not in rtags: | ||||
rtags[v] = list() | rtags[v_] = list() | ||||
rtags[v].append(k) | rtags[v_].append(k) | ||||
revs = revisions.items() | revs: List[Tuple[str, revtuple]] = list(revisions.items()) | ||||
# sort by revision descending to prioritize 1.1.1.1 over 1.1 | # sort by revision descending to prioritize 1.1.1.1 over 1.1 | ||||
revs = sorted(revs, key=lambda a: a[1][0], reverse=True) | revs.sort(key=lambda a: a[1][0], reverse=True) | ||||
# sort by time | # sort by time | ||||
revs = sorted(revs, key=lambda a: a[1][1]) | revs.sort(key=lambda a: a[1][1]) | ||||
novendor = False | novendor = False | ||||
have_initial_revision = False | have_initial_revision = False | ||||
last_vendor_status = None | last_vendor_status = None | ||||
for k, v in revs: | for k, v in revs: | ||||
r = k.split(".") | r = k.split(".") | ||||
if ( | if ( | ||||
len(r) == 4 | len(r) == 4 | ||||
and r[0] == "1" | and r[0] == "1" | ||||
Show All 27 Lines | ) -> None: | ||||
else: | else: | ||||
# trunk only | # trunk only | ||||
continue | continue | ||||
b = ".".join(r[:-1]) | b = ".".join(r[:-1]) | ||||
# decode author name in a potentially lossy way; | # decode author name in a potentially lossy way; | ||||
# it is only used for internal hashing in this case | # it is only used for internal hashing in this case | ||||
author = v[2].decode("utf-8", "ignore") | author = v[2].decode("utf-8", "ignore") | ||||
a = ChangeSetKey(branches[b], author, v[1], logmsgs[k], v[6], self.fuzzsec) | logmsg = logmsgs[k] | ||||
assert logmsg is not None | |||||
a = ChangeSetKey(branches[b], author, v[1], logmsg, v[6], self.fuzzsec) | |||||
a.put_file(path, k, v[3], 0) | a.put_file(path, k, v[3], 0) | ||||
while a in self.changesets: | while a in self.changesets: | ||||
c = self.changesets[a] | c = self.changesets[a] | ||||
del self.changesets[a] | del self.changesets[a] | ||||
c.merge(a) | c.merge(a) | ||||
a = c | a = c | ||||
self.changesets[a] = a | self.changesets[a] = a | ||||
if k in rtags: | if k in rtags: | ||||
for t in rtags[k]: | for t in rtags[k]: | ||||
if t not in self.tags or self.tags[t].max_time < a.max_time: | if t not in self.tags or self.tags[t].max_time < a.max_time: | ||||
self.tags[t] = a | self.tags[t] = a | ||||
def parse_rlog(self, fp): | def parse_rlog(self, fp: BinaryIO) -> None: | ||||
eof = None | eof = None | ||||
while eof != _EOF_LOG and eof != _EOF_ERROR: | while eof != _EOF_LOG and eof != _EOF_ERROR: | ||||
filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp) | filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp) | ||||
revisions = {} | revisions: Dict[str, revtuple] = {} | ||||
logmsgs = {} | logmsgs: Dict[str, Optional[bytes]] = {} | ||||
path = "" | path = "" | ||||
if filename: | if filename: | ||||
# There is no known encoding of filenames in CVS. | # There is no known encoding of filenames in CVS. | ||||
# Attempt to decode the path with our list of known encodings. | # Attempt to decode the path with our list of known encodings. | ||||
# If none of them work, forcefully decode the path assuming | # If none of them work, forcefully decode the path assuming | ||||
# the final path encoding provided in the list. | # the final path encoding provided in the list. | ||||
for i, e in enumerate(path_encodings): | for i, e in enumerate(path_encodings): | ||||
try: | try: | ||||
Show All 14 Lines | def parse_rlog(self, fp: BinaryIO) -> None: | ||||
if eof != _EOF_LOG and eof != _EOF_ERROR: | if eof != _EOF_LOG and eof != _EOF_ERROR: | ||||
if path not in self.offsets.keys(): | if path not in self.offsets.keys(): | ||||
self.offsets[path] = dict() | self.offsets[path] = dict() | ||||
if rev: | if rev: | ||||
self.offsets[path][rev[0]] = off | self.offsets[path][rev[0]] = off | ||||
self._process_rlog_revisions(path, taginfo, revisions, logmsgs) | self._process_rlog_revisions(path, taginfo, revisions, logmsgs) | ||||
def getlog(self, fp, path, rev): | def getlog(self, fp: BinaryIO, path: str, rev: str) -> Optional[bytes]: | ||||
off = self.offsets[path][rev] | off = self.offsets[path][rev] | ||||
fp.seek(off) | fp.seek(off) | ||||
rev, logmsg, eof = _parse_log_entry(fp) | _rev, logmsg, eof = _parse_log_entry(fp) | ||||
return logmsg | return logmsg | ||||
# if your rlog doesn't use 77 '=' characters, then this must change | # if your rlog doesn't use 77 '=' characters, then this must change | ||||
LOG_END_MARKER = b"=" * 77 + b"\n" | LOG_END_MARKER = b"=" * 77 + b"\n" | ||||
ENTRY_END_MARKER = b"-" * 28 + b"\n" | ENTRY_END_MARKER = b"-" * 28 + b"\n" | ||||
_EOF_FILE = b"end of file entries" # no more entries for this RCS file | _EOF_FILE = b"end of file entries" # no more entries for this RCS file | ||||
Show All 23 Lines | |||||
_re_cvsnt_error = re.compile( | _re_cvsnt_error = re.compile( | ||||
rb"^(?:cvs rcsfile\: |cvs \[rcsfile aborted\]: )" | rb"^(?:cvs rcsfile\: |cvs \[rcsfile aborted\]: )" | ||||
rb"(?:\`(.*,v)' |" | rb"(?:\`(.*,v)' |" | ||||
rb"cannot open (.*,v)\: |(.*,v)\: |)" | rb"cannot open (.*,v)\: |(.*,v)\: |)" | ||||
rb"(.*)$" | rb"(.*)$" | ||||
) | ) | ||||
def _parse_log_header(fp): | def _parse_log_header(fp: BinaryIO) -> Tuple[ | ||||
bytes, bytes, Dict[bytes, bytes], Dict[bytes, bytes], bytes, Optional[bytes] | |||||
]: | ||||
"""Parse an RCS/CVS log header. | """Parse an RCS/CVS log header. | ||||
fp is a file (pipe) opened for reading the log information. | fp is a file (pipe) opened for reading the log information. | ||||
On entry, fp should point to the start of a log entry. | On entry, fp should point to the start of a log entry. | ||||
On exit, fp will have consumed the separator line between the header and | On exit, fp will have consumed the separator line between the header and | ||||
the first revision log. | the first revision log. | ||||
If there is no revision information (e.g. the "-h" switch was passed to | If there is no revision information (e.g. the "-h" switch was passed to | ||||
rlog), then fp will have consumed the file separator line on exit. | rlog), then fp will have consumed the file separator line on exit. | ||||
Returns: filename, default branch, tag dictionary, lock dictionary, | Returns: filename, default branch, tag dictionary, lock dictionary, | ||||
rlog error message, and eof flag | rlog error message, and eof flag | ||||
""" | """ | ||||
filename = branch = msg = b"" | filename = branch = msg = b"" | ||||
taginfo = {} # tag name => number | taginfo: Dict[bytes, bytes] = {} # tag name => number | ||||
lockinfo = {} # revision => locker | lockinfo: Dict[bytes, bytes] = {} # revision => locker | ||||
state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks | state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks | ||||
eof = None | eof = None | ||||
while 1: | while 1: | ||||
line = fp.readline() | line = fp.readline() | ||||
if not line: | if not line: | ||||
# the true end-of-file | # the true end-of-file | ||||
eof = _EOF_LOG | eof = _EOF_LOG | ||||
Show All 38 Lines | while 1: | ||||
break | break | ||||
else: | else: | ||||
error = _re_cvsnt_error.match(line) | error = _re_cvsnt_error.match(line) | ||||
if error: | if error: | ||||
p1, p2, p3, msg = error.groups() | p1, p2, p3, msg = error.groups() | ||||
filename = p1 or p2 or p3 | filename = p1 or p2 or p3 | ||||
if not filename: | if not filename: | ||||
raise ValueError( | raise ValueError( | ||||
"Could not get filename from CVSNT error:\n%s" % line | "Could not get filename from CVSNT error:\n%r" % line | ||||
) | ) | ||||
eof = _EOF_ERROR | eof = _EOF_ERROR | ||||
break | break | ||||
error = _re_log_error.match(line) | error = _re_log_error.match(line) | ||||
if error: | if error: | ||||
filename, msg = error.groups() | filename, msg = error.groups() | ||||
if msg[:30] == b"warning: Unknown phrases like ": | if msg[:30] == b"warning: Unknown phrases like ": | ||||
Show All 20 Lines | |||||
def cvs_strptime(timestr): | def cvs_strptime(timestr): | ||||
try: | try: | ||||
return time.strptime(timestr, "%Y/%m/%d %H:%M:%S")[:-1] + (0,) | return time.strptime(timestr, "%Y/%m/%d %H:%M:%S")[:-1] + (0,) | ||||
except ValueError: | except ValueError: | ||||
return time.strptime(timestr, "%Y-%m-%d %H:%M:%S %z")[:-1] + (0,) | return time.strptime(timestr, "%Y-%m-%d %H:%M:%S %z")[:-1] + (0,) | ||||
def _parse_log_entry(fp): | def _parse_log_entry(fp) -> Tuple[Optional[revtuple], Optional[bytes], Optional[bytes]]: | ||||
"""Parse a single log entry. | """Parse a single log entry. | ||||
On entry, fp should point to the first line of the entry (the "revision" | On entry, fp should point to the first line of the entry (the "revision" | ||||
line). | line). | ||||
On exit, fp will have consumed the log separator line (dashes) or the | On exit, fp will have consumed the log separator line (dashes) or the | ||||
end-of-file marker (equals). | end-of-file marker (equals). | ||||
Returns: Revision data tuple, and eof flag (see _EOF_*) | Returns: Revision data tuple (number string, date, author, state, branches, revnumstr, | ||||
commitid) if any, log, and eof flag (see _EOF_*) | |||||
""" | """ | ||||
rev = None | rev = None | ||||
line = fp.readline() | line = fp.readline() | ||||
if not line: | if not line: | ||||
return None, None, _EOF_LOG | return None, None, _EOF_LOG | ||||
if line == LOG_END_MARKER: | if line == LOG_END_MARKER: | ||||
# Needed because some versions of RCS precede LOG_END_MARKER | # Needed because some versions of RCS precede LOG_END_MARKER | ||||
# with ENTRY_END_MARKER | # with ENTRY_END_MARKER | ||||
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | if tm[0] < EPOCH: | ||||
tm[0] = tm[0] + 100 | tm[0] = tm[0] + 100 | ||||
if tm[0] < EPOCH: | if tm[0] < EPOCH: | ||||
raise ValueError("invalid year") | raise ValueError("invalid year") | ||||
date = calendar.timegm(tm) | date = calendar.timegm(tm) | ||||
# return a revision tuple compatible with 'rcsparse', the log message, | # return a revision tuple compatible with 'rcsparse', the log message, | ||||
# and the EOF marker | # and the EOF marker | ||||
return ( | return ( | ||||
( | revtuple( | ||||
rev.decode("ascii"), # revision number string | rev.decode("ascii"), # revision number string | ||||
date, | date, | ||||
match.group(2), # author (encoding is arbitrary; don't attempt to decode) | match.group(2), # author (encoding is arbitrary; don't attempt to decode) | ||||
match.group(3).decode( | match.group(3).decode( | ||||
"ascii" | "ascii" | ||||
), # state, usually "Exp" or "dead"; non-ASCII data here would be weird | ), # state, usually "Exp" or "dead"; non-ASCII data here would be weird | ||||
None, # TODO: branches of this rev | None, # TODO: branches of this rev | ||||
None, # TODO: revnumstr of previous rev | None, # TODO: revnumstr of previous rev | ||||
None, # TODO: commitid | None, # TODO: commitid | ||||
), | ), | ||||
log, | log, | ||||
eof, | eof, | ||||
) | ) |