diff --git a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py --- a/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py +++ b/swh/loader/cvs/cvs2gitdump/cvs2gitdump.py @@ -36,6 +36,8 @@ import subprocess import sys import time +from typing import Dict, List, Optional, Tuple, TypeVar + import swh.loader.cvs.rcsparse as rcsparse CHANGESET_FUZZ_SEC = 300 @@ -48,7 +50,7 @@ '\tcvsroot [git_dir]', file=sys.stderr) -def main(): +def main() -> None: email_domain = None do_incremental = False git_tip = None @@ -108,6 +110,7 @@ 'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', '--date=raw', '--format=%ae%n%ad%n%H', git_branch], encoding='utf-8', stdout=subprocess.PIPE) + assert git.stdout is not None outs = git.stdout.readlines() git.wait() if git.returncode != 0: @@ -121,6 +124,7 @@ 'i18n.logOutputEncoding=UTF-8', 'log', '--max-count', '1', '--date=raw', '--format=%ae%n%ad%n%H', last_revision], encoding='utf-8', stdout=subprocess.PIPE) + assert git.stdout is not None outs = git.stdout.readlines() git.wait() if git.returncode != 0: @@ -182,11 +186,11 @@ for i, e in enumerate(log_encodings): try: how = 'ignore' if i == len(log_encodings) - 1 else 'strict' - log = log.decode(e, how) + log_str = log.decode(e, how) break except UnicodeError: pass - log = log.encode('utf-8', 'ignore') + log = log_str.encode('utf-8', 'ignore') output('commit refs/heads/' + git_branch) markseq = markseq + 1 @@ -229,7 +233,7 @@ # is UTF-8. 
Also write without conversion for a bytes object (file bodies # might be various encodings) # -def output(*args, end='\n'): +def output(*args, end='\n') -> None: if len(args) == 0: pass elif len(args) > 1 or isinstance(args[0], str): @@ -243,7 +247,7 @@ class FileRevision: - def __init__(self, path, rev, state, markseq): + def __init__(self, path: str, rev: str, state: str, markseq: int) -> None: self.path = path self.rev = rev self.state = state @@ -251,40 +255,50 @@ class ChangeSetKey: - def __init__(self, branch, author, timestamp, log, commitid, fuzzsec): + def __init__( + self, + branch: str, + author, + timestamp: int, + log: bytes, + commitid: Optional[str], + fuzzsec: int + ) -> None: self.branch = branch self.author = author self.min_time = timestamp self.max_time = timestamp self.commitid = commitid self.fuzzsec = fuzzsec - self.revs = [] - self.tags = [] + self.revs: List[FileRevision] = [] + self.tags: List[str] = [] self.log_hash = 0 h = 0 for c in log: h = 31 * h + c self.log_hash = h - def __lt__(self, other): + def __lt__(self, other) -> bool: return self._cmp(other) < 0 - def __gt__(self, other): + def __gt__(self, other) -> bool: return self._cmp(other) > 0 - def __eq__(self, other): + def __eq__(self, other) -> bool: return self._cmp(other) == 0 - def __le__(self, other): + def __le__(self, other) -> bool: return self._cmp(other) <= 0 - def __ge__(self, other): + def __ge__(self, other) -> bool: return self._cmp(other) >= 0 - def __ne__(self, other): + def __ne__(self, other) -> bool: return self._cmp(other) != 0 - def _cmp(self, anon): + def _cmp(self, anon) -> int: + if not isinstance(anon, ChangeSetKey): + raise TypeError() # compare by the commitid cid = _cmp2(self.commitid, anon.commitid) if cid == 0 and self.commitid is not None: @@ -313,35 +327,36 @@ return ct if ct != 0 else c - def merge(self, anot): + def merge(self, anot: "ChangeSetKey") -> None: self.max_time = max(self.max_time, anot.max_time) self.min_time = min(self.min_time, 
anot.min_time) self.revs.extend(anot.revs) - def __hash__(self): + def __hash__(self) -> int: return hash(self.branch + '/' + self.author) * 31 + self.log_hash - def put_file(self, path, rev, state, markseq): + def put_file(self, path: str, rev: str, state: str, markseq: int) -> None: self.revs.append(FileRevision(path, rev, state, markseq)) -def _cmp2(a, b): +TCmp = TypeVar("TCmp", int, str) +def _cmp2(a: Optional[TCmp], b: Optional[TCmp]) -> int: _a = a is not None _b = b is not None - return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b) + return (a > b) - (a < b) if _a and _b else (_a > _b) - (_a < _b) # type: ignore class CvsConv: - def __init__(self, cvsroot, rcs, dumpfile, fuzzsec): + def __init__(self, cvsroot: str, rcs: "RcsKeywords", dumpfile: bool, fuzzsec: int) -> None: self.cvsroot = cvsroot self.rcs = rcs - self.changesets = dict() + self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict() self.dumpfile = dumpfile self.markseq = 0 - self.tags = dict() + self.tags: Dict[str, ChangeSetKey] = dict() self.fuzzsec = fuzzsec - def walk(self, module=None): + def walk(self, module: Optional[str] = None) -> None: p = [self.cvsroot] if module is not None: p.append(module) @@ -364,26 +379,26 @@ for t, c in list(self.tags.items()): c.tags.append(t) - def parse_file(self, path): - rtags = dict() + def parse_file(self, path: str) -> None: + rtags: Dict[str, List[str]] = dict() rcsfile = rcsparse.rcsfile(path) branches = {'1': 'HEAD', '1.1.1': 'VENDOR'} - for k, v in list(rcsfile.symbols.items()): - r = v.split('.') + for k, v_ in list(rcsfile.symbols.items()): + r = v_.split('.') if len(r) == 3: - branches[v] = 'VENDOR' + branches[v_] = 'VENDOR' elif len(r) >= 3 and r[-2] == '0': branches['.'.join(r[:-2] + r[-1:])] = k if len(r) == 2 and branches[r[0]] == 'HEAD': - if v not in rtags: - rtags[v] = list() - rtags[v].append(k) + if v_ not in rtags: + rtags[v_] = list() + rtags[v_].append(k) - revs = rcsfile.revs.items() + revs: List[Tuple[str, Tuple[str, int, str, 
str, List[str], str, str]]] = list(rcsfile.revs.items()) # sort by revision descending to priorize 1.1.1.1 than 1.1 - revs = sorted(revs, key=lambda a: a[1][0], reverse=True) + revs.sort(key=lambda a: a[1][0], reverse=True) # sort by time - revs = sorted(revs, key=lambda a: a[1][1]) + revs.sort(key=lambda a: a[1][1]) novendor = False have_initial_revision = False last_vendor_status = None @@ -445,22 +460,23 @@ self.tags[t] = a -def file_path(r, p): +def file_path(r: str, p: str) -> str: if r.endswith('/'): r = r[:-1] if p[-2:] == ',v': path = p[:-2] # drop ",v" else: path = p - p = path.split('/') - if len(p) > 0 and p[-2] == 'Attic': - path = '/'.join(p[:-2] + [p[-1]]) + p_ = path.split('/') + # a bare file name has no parent component, so only index p_[-2] when it exists + if len(p_) > 1 and p_[-2] == 'Attic': + path = '/'.join(p_[:-2] + [p_[-1]]) if path.startswith(r): path = path[len(r) + 1:] return path -def git_dump_file(path, k, rcs, markseq): +def git_dump_file(path: str, k, rcs, markseq) -> None: try: cont = rcs.expand_keyword(path, rcsparse.rcsfile(path), k) except RuntimeError as msg: @@ -516,18 +532,18 @@ RCS_KWEXP_DEFAULT = (RCS_KWEXP_NAME | RCS_KWEXP_VAL) RCS_KWEXP_KVL = (RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR) - def __init__(self): + def __init__(self) -> None: self.rerecomple() - def rerecomple(self): + def rerecomple(self) -> None: pat = b'|'.join(list(self.rcs_expkw.keys())) self.re_kw = re.compile(b".*?\\$(" + pat + b")[\\$:]") - def add_id_keyword(self, keyword): + def add_id_keyword(self, keyword) -> None: self.rcs_expkw[keyword.encode('ascii')] = self.RCS_KW_ID self.rerecomple() - def kflag_get(self, flags): + def kflag_get(self, flags: Optional[str]) -> int: if flags is None: return self.RCS_KWEXP_DEFAULT fl = 0 @@ -550,7 +566,7 @@ fl |= self.RCS_KWEXP_ERR return fl - def expand_keyword(self, filename, rcs, r): + def expand_keyword(self, filename: str, rcs: rcsparse.rcsfile, r: str) -> bytes: rev = rcs.revs[r] mode = self.kflag_get(rcs.expand) diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py --- 
a/swh/loader/cvs/loader.py +++ b/swh/loader/cvs/loader.py @@ -22,10 +22,11 @@ CHANGESET_FUZZ_SEC, ChangeSetKey, CvsConv, + FileRevision, RcsKeywords, file_path, ) -import swh.loader.cvs.cvsclient as cvsclient +from swh.loader.cvs.cvsclient import CVSClient import swh.loader.cvs.rcsparse as rcsparse from swh.loader.cvs.rlog import RlogConv from swh.loader.exception import NotFound @@ -52,6 +53,10 @@ TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.cvs." +class Foo: + pass + + class CvsLoader(BaseLoader): """Swh cvs loader. @@ -63,7 +68,7 @@ visit_type = "cvs" cvs_module_name: str - cvsclient: cvsclient.CVSClient + cvsclient: CVSClient # remote CVS repository access (history is parsed from CVS rlog): rlog_file: BinaryIO @@ -115,7 +120,9 @@ self.storage, self.origin_url ) - def compute_swh_revision(self, k, logmsg) -> Tuple[Revision, from_disk.Directory]: + def compute_swh_revision( + self, k: ChangeSetKey, logmsg: Optional[bytes] + ) -> Tuple[Revision, from_disk.Directory]: """Compute swh hash data per CVS changeset. 
Returns: @@ -136,7 +143,9 @@ self._last_revision = revision return (revision, swh_dir) - def checkout_file_with_rcsparse(self, k, f, rcsfile): + def checkout_file_with_rcsparse( + self, k: ChangeSetKey, f: FileRevision, rcsfile: rcsparse.rcsfile + ) -> None: path = file_path(self.cvsroot_path, f.path) wtpath = os.path.join(self.worktree_path, path) self.log.info("rev %s of file %s" % (f.rev, f.path)) @@ -157,7 +166,9 @@ outfile.write(contents) outfile.close() - def checkout_file_with_cvsclient(self, k, f, cvsclient): + def checkout_file_with_cvsclient( + self, k: ChangeSetKey, f: FileRevision, cvsclient: CVSClient + ): path = file_path(self.cvsroot_path, f.path) wtpath = os.path.join(self.worktree_path, path) self.log.info("rev %s of file %s" % (f.rev, f.path)) @@ -180,7 +191,9 @@ pass def process_cvs_changesets( - self, cvs_changesets, use_rcsparse, + self, + cvs_changesets: List[ChangeSetKey], + use_rcsparse: bool, ) -> Iterator[ Tuple[List[Content], List[SkippedContent], List[Directory], Revision] ]: @@ -198,7 +211,7 @@ self.log.info( "changeset from %s by %s on branch %s", tstr, k.author, k.branch ) - logmsg = "" + logmsg: Optional[bytes] = b"" # Check out all files of this revision and get a log message. # # The log message is obtained from the first file in the changeset. @@ -231,7 +244,7 @@ def pre_cleanup(self) -> None: """Cleanup potential dangling files from prior runs (e.g. 
OOM killed - tasks) + tasks) """ clean_dangling_folders( @@ -353,7 +366,7 @@ ) elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": # remote CVS repository conversion - self.cvsclient = cvsclient.CVSClient(url) + self.cvsclient = CVSClient(url) cvsroot_path = os.path.dirname(url.path) self.log.info( "Fetching CVS rlog from %s:%s/%s", @@ -390,7 +403,11 @@ return True def build_swh_revision( - self, k: ChangeSetKey, logmsg: bytes, dir_id: bytes, parents: Sequence[bytes] + self, + k: ChangeSetKey, + logmsg: Optional[bytes], + dir_id: bytes, + parents: Sequence[bytes], ) -> Revision: """Given a CVS revision, build a swh revision. @@ -405,7 +422,7 @@ """ author = Person.from_fullname(k.author.encode("UTF-8")) - date = TimestampWithTimezone.from_datetime(k.max_time) + date = TimestampWithTimezone.from_dict(k.max_time) return Revision( type=RevisionType.CVS, @@ -420,7 +437,7 @@ parents=tuple(parents), ) - def generate_and_load_snapshot(self, revision) -> Snapshot: + def generate_and_load_snapshot(self, revision: Revision) -> Snapshot: """Create the snapshot either from existing revision. 
Args: @@ -447,6 +464,7 @@ self.storage.content_add(self._contents) self.storage.directory_add(self._directories) self.storage.revision_add(self._revisions) + assert self._last_revision is not None self.snapshot = self.generate_and_load_snapshot(self._last_revision) self.log.debug("SWH snapshot ID: %s", hashutil.hash_to_hex(self.snapshot.id)) self.flush() @@ -466,5 +484,5 @@ "status": load_status, } - def visit_status(self): + def visit_status(self) -> str: return self._visit_status diff --git a/swh/loader/cvs/rcsparse.pyi b/swh/loader/cvs/rcsparse.pyi --- a/swh/loader/cvs/rcsparse.pyi +++ b/swh/loader/cvs/rcsparse.pyi @@ -3,7 +3,25 @@ # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any +from collections.abc import Mapping +from typing import Any, List, Tuple def __getattr__(name) -> Any: ... -def rcsfile(path): ... + +class rcsfile: + head: str + branch: str + access: List[str] + symbols: Mapping[str, str] # actually rcsparse.rcstokmap + locks: Mapping[str, str] # actually rcsparse.rcstokmap + strict: bool + comment: str + expand: str + revs: Mapping[str, Tuple[str, int, str, str, List[str], str, str]] # actually rcsparse.rcsrevtree + desc: str + + def __init__(self, path: str): ... + + def checkout(self, rev: str = "HEAD") -> bytes: ... + def getlog(self, rev: str) -> bytes: ... + def sym2rev(self, rev: str = "HEAD") -> str: ... 
diff --git a/swh/loader/cvs/rlog.py b/swh/loader/cvs/rlog.py --- a/swh/loader/cvs/rlog.py +++ b/swh/loader/cvs/rlog.py @@ -47,6 +47,7 @@ import calendar import re import time +from typing import BinaryIO, Dict, List, NamedTuple, Optional, Tuple from swh.loader.cvs.cvs2gitdump.cvs2gitdump import ChangeSetKey, file_path @@ -66,17 +67,33 @@ path_encodings = ["ascii", "utf-8"] +class revtuple(NamedTuple): + number: str + date: int + author: bytes + state: str + branches: None + revnumstr: None + commitid: None + + class RlogConv: - def __init__(self, cvsroot_path, fuzzsec): + def __init__(self, cvsroot_path: str, fuzzsec: int) -> None: self.cvsroot_path = cvsroot_path self.fuzzsec = fuzzsec - self.changesets = dict() - self.tags = dict() - self.offsets = dict() - - def _process_rlog_revisions(self, path, taginfo, revisions, logmsgs): + self.changesets: Dict[ChangeSetKey, ChangeSetKey] = dict() + self.tags: Dict[str, ChangeSetKey] = dict() + self.offsets: Dict[str, Dict[str, int]] = dict() + + def _process_rlog_revisions( + self, + path: str, + taginfo: Dict[bytes, bytes], + revisions: Dict[str, revtuple], + logmsgs: Dict[str, Optional[bytes]] + ) -> None: """ Convert RCS revision history of a file into self.changesets items """ - rtags = dict() + rtags: Dict[str, List[str]] = dict() # RCS and CVS represent branches by adding digits to revision numbers. # And CVS assigns special meaning to certain revision number ranges. # @@ -119,25 +136,28 @@ # before any files on this branch have been modified. # Even-numbered branch revisions appear once the file is modified. 
branches = {"1": "HEAD", "1.1.1": "VENDOR"} - for k, v in list(taginfo.items()): - r = v.split(".") + + k: str + v_: str + for k, v_ in list(taginfo.items()): # type: ignore # FIXME, inconsistent types + r = v_.split(".") if len(r) == 3: # vendor branch number - branches[v] = "VENDOR" + branches[v_] = "VENDOR" elif len(r) >= 3 and r[-2] == "0": # magic branch number branches[".".join(r[:-2] + r[-1:])] = k if len(r) == 2 and branches[r[0]] == "HEAD": # main branch number - if v not in rtags: - rtags[v] = list() - rtags[v].append(k) + if v_ not in rtags: + rtags[v_] = list() + rtags[v_].append(k) - revs = revisions.items() + revs: List[Tuple[str, revtuple]] = list(revisions.items()) # sort by revision descending to priorize 1.1.1.1 than 1.1 - revs = sorted(revs, key=lambda a: a[1][0], reverse=True) + revs.sort(key=lambda a: a[1][0], reverse=True) # sort by time - revs = sorted(revs, key=lambda a: a[1][1]) + revs.sort(key=lambda a: a[1][1]) novendor = False have_initial_revision = False last_vendor_status = None @@ -181,7 +201,9 @@ # decode author name in a potentially lossy way; # it is only used for internal hashing in this case author = v[2].decode("utf-8", "ignore") - a = ChangeSetKey(branches[b], author, v[1], logmsgs[k], v[6], self.fuzzsec) + logmsg = logmsgs[k] + assert logmsg is not None + a = ChangeSetKey(branches[b], author, v[1], logmsg, v[6], self.fuzzsec) a.put_file(path, k, v[3], 0) while a in self.changesets: @@ -195,12 +217,12 @@ if t not in self.tags or self.tags[t].max_time < a.max_time: self.tags[t] = a - def parse_rlog(self, fp): + def parse_rlog(self, fp: BinaryIO) -> None: eof = None while eof != _EOF_LOG and eof != _EOF_ERROR: filename, branch, taginfo, lockinfo, errmsg, eof = _parse_log_header(fp) - revisions = {} - logmsgs = {} + revisions: Dict[str, revtuple] = {} + logmsgs: Dict[str, Optional[bytes]] = {} path = "" if filename: # There is no known encoding of filenames in CVS. 
@@ -231,10 +253,10 @@ self._process_rlog_revisions(path, taginfo, revisions, logmsgs) - def getlog(self, fp, path, rev): + def getlog(self, fp: BinaryIO, path: str, rev: str) -> Optional[bytes]: off = self.offsets[path][rev] fp.seek(off) - rev, logmsg, eof = _parse_log_entry(fp) + _rev, logmsg, eof = _parse_log_entry(fp) return logmsg @@ -274,7 +296,9 @@ ) -def _parse_log_header(fp): +def _parse_log_header(fp: BinaryIO) -> Tuple[ + bytes, bytes, Dict[bytes, bytes], Dict[bytes, bytes], bytes, Optional[bytes] +]: """Parse and RCS/CVS log header. fp is a file (pipe) opened for reading the log information. @@ -291,8 +315,8 @@ """ filename = branch = msg = b"" - taginfo = {} # tag name => number - lockinfo = {} # revision => locker + taginfo: Dict[bytes, bytes] = {} # tag name => number + lockinfo: Dict[bytes, bytes] = {} # revision => locker state = 0 # 0 = base, 1 = parsing symbols, 2 = parsing locks eof = None @@ -347,7 +371,7 @@ filename = p1 or p2 or p3 if not filename: raise ValueError( - "Could not get filename from CVSNT error:\n%s" % line + "Could not get filename from CVSNT error:\n%r" % line ) eof = _EOF_ERROR break @@ -384,7 +408,7 @@ return time.strptime(timestr, "%Y-%m-%d %H:%M:%S %z")[:-1] + (0,) -def _parse_log_entry(fp): +def _parse_log_entry(fp) -> Tuple[Optional[revtuple], Optional[bytes], Optional[bytes]]: """Parse a single log entry. On entry, fp should point to the first line of the entry (the "revision" @@ -392,7 +416,8 @@ On exit, fp will have consumed the log separator line (dashes) or the end-of-file marker (equals). 
- Returns: Revision data tuple, and eof flag (see _EOF_*) + Returns: Revision data tuple (number string, date, author, state, branches, revnumstr, + commitid) if any, log, and eof flag (see _EOF_*) """ rev = None line = fp.readline() @@ -453,7 +478,7 @@ # return a revision tuple compatible with 'rcsparse', the log message, # and the EOF marker return ( - ( + revtuple( rev.decode("ascii"), # revision number string date, match.group(2), # author (encoding is arbitrary; don't attempt to decode)