Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/cvs/loader.py
Show First 20 Lines • Show All 130 Lines • ▼ Show 20 Lines | def compute_swh_revision(self, k, logmsg) -> Tuple[Revision, from_disk.Directory]: | ||||
parents = (self._last_revision.id,) | parents = (self._last_revision.id,) | ||||
else: | else: | ||||
parents = () | parents = () | ||||
revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | ||||
self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) | self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) | ||||
self._last_revision = revision | self._last_revision = revision | ||||
return (revision, swh_dir) | return (revision, swh_dir) | ||||
def checkout_file_with_rcsparse(self, k, f, rcsfile):
    """Create, update or remove one file of a changeset in the work tree.

    The file contents are produced locally by expanding RCS keywords of
    the requested revision with ``RcsKeywords.expand_keyword``.

    Args:
        k: changeset the file revision belongs to (not used by this method).
        f: file revision to check out; its ``path``, ``rev`` and ``state``
            attributes are read.
        rcsfile: an already opened ``rcsparse.rcsfile`` for ``f.path``, or a
            falsy value, in which case the RCS file is opened here.
    """
    path = file_path(self.cvsroot_path, f.path)
    wtpath = os.path.join(self.worktree_path, path)
    # Pass arguments lazily so the message is only formatted when emitted.
    self.log.info("rev %s of file %s", f.rev, f.path)

    if f.state == "dead":
        # remove this file from work tree
        try:
            os.remove(wtpath)
        except FileNotFoundError:
            # The file is already absent from the work tree.
            pass
        return

    # create, or update, this file in the work tree
    if not rcsfile:
        rcsfile = rcsparse.rcsfile(f.path)
    contents = RcsKeywords().expand_keyword(f.path, rcsfile, f.rev)
    os.makedirs(os.path.dirname(wtpath), exist_ok=True)
    # The context manager closes the file even if the write fails.
    with open(wtpath, mode="wb") as outfile:
        outfile.write(contents)
def checkout_file_with_cvsclient(self, k, f, cvsclient):
    """Create, update or remove one file of a changeset in the work tree.

    The file contents are obtained through ``cvsclient.checkout`` and the
    resulting file is then moved to its final work tree location.

    Args:
        k: changeset the file revision belongs to (not used by this method).
        f: file revision to check out; its ``path``, ``rev`` and ``state``
            attributes are read.
        cvsclient: object whose ``checkout(path, rev, dirname)`` method
            returns a file object exposing ``name`` and ``close()``.
    """
    path = file_path(self.cvsroot_path, f.path)
    wtpath = os.path.join(self.worktree_path, path)
    # Pass arguments lazily so the message is only formatted when emitted.
    self.log.info("rev %s of file %s", f.rev, f.path)

    if f.state == "dead":
        # remove this file from work tree
        try:
            os.remove(wtpath)
        except FileNotFoundError:
            # The file is already absent from the work tree.
            pass
        return

    dirname = os.path.dirname(wtpath)
    os.makedirs(dirname, exist_ok=True)
    self.log.debug("checkout to %s", wtpath)
    fp = cvsclient.checkout(f.path, f.rev, dirname)
    try:
        os.rename(fp.name, wtpath)
    finally:
        # Always release the handle, including when the rename fails.
        try:
            fp.close()
        except FileNotFoundError:
            # Well, we have just renamed the file...
            pass
def process_cvs_changesets(
    self, cvs_changesets, use_rcsparse,
) -> Iterator[
    Tuple[List[Content], List[SkippedContent], List[Directory], Revision]
]:
    """Process CVS revisions.

    At each CVS revision, check out contents and compute swh hashes.

    Args:
        cvs_changesets: iterable of changesets; each one exposes
            ``max_time``, ``author``, ``branch`` and ``revs``.
        use_rcsparse: when true, files and log messages are read through
            ``rcsparse`` and ``checkout_file_with_rcsparse``; otherwise
            they are obtained through ``self.rlog`` and
            ``checkout_file_with_cvsclient`` using ``self.cvsclient``.

    Yields:
        tuple (contents, skipped-contents, directories, revision), where the
        first three items come from ``from_disk.iter_directory`` and the
        revision from ``compute_swh_revision``.

    """
    # XXX At present changeset IDs are recomputed on the fly during every visit.
    # If we were able to maintain a cache somewhere which can be indexed by a
    # cvs2gitdump.ChangeSetKey and yields an SWH revision hash we could avoid
    # doing a lot of redundant work during every visit.
    for k in cvs_changesets:
        tstr = time.strftime("%c", time.gmtime(k.max_time))
        self.log.info(
            "changeset from %s by %s on branch %s", tstr, k.author, k.branch
        )
        logmsg = ""
        # Check out all files of this revision and get a log message.
        #
        # The log message is obtained from the first file in the changeset
        # which has a non-empty one. The message will usually be the same
        # for all affected files, and the SWH archive will only store one
        # version of the log message.
        #
        # The lookup uses the revision of the file being queried (f.rev):
        # a revision number taken from another file of the changeset is
        # meaningless for this file.
        for f in k.revs:
            if use_rcsparse:
                rcsfile = rcsparse.rcsfile(f.path)
                if not logmsg:
                    logmsg = rcsfile.getlog(f.rev)
                self.checkout_file_with_rcsparse(k, f, rcsfile)
            else:
                if not logmsg:
                    logmsg = self.rlog.getlog(self.rlog_file, f.path, f.rev)
                self.checkout_file_with_cvsclient(k, f, self.cvsclient)

        # TODO: prune empty directories?
        (revision, swh_dir) = self.compute_swh_revision(k, logmsg)
        (contents, skipped_contents, directories) = from_disk.iter_directory(
            swh_dir
        )
        yield contents, skipped_contents, directories, revision
▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines | def prepare(self) -> None: | ||||
self.log.info("Walking CVS module %s", self.cvs_module_name) | self.log.info("Walking CVS module %s", self.cvs_module_name) | ||||
cvs.walk(self.cvs_module_name) | cvs.walk(self.cvs_module_name) | ||||
cvs_changesets = sorted(cvs.changesets) | cvs_changesets = sorted(cvs.changesets) | ||||
self.log.info( | self.log.info( | ||||
"CVS changesets found in %s: %d", | "CVS changesets found in %s: %d", | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
len(cvs_changesets), | len(cvs_changesets), | ||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_changesets(cvs_changesets) | self.swh_revision_gen = self.process_cvs_changesets( | ||||
cvs_changesets, use_rcsparse=True | |||||
) | |||||
elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | ||||
# remote CVS repository conversion | # remote CVS repository conversion | ||||
self.cvsclient = cvsclient.CVSClient(url) | self.cvsclient = cvsclient.CVSClient(url) | ||||
cvsroot_path = os.path.dirname(url.path) | cvsroot_path = os.path.dirname(url.path) | ||||
self.log.info( | self.log.info( | ||||
"Fetching CVS rlog from %s:%s/%s", | "Fetching CVS rlog from %s:%s/%s", | ||||
url.host, | url.host, | ||||
cvsroot_path, | cvsroot_path, | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
) | ) | ||||
self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | ||||
self.rlog_file = self.cvsclient.fetch_rlog() | self.rlog_file = self.cvsclient.fetch_rlog() | ||||
self.rlog.parse_rlog(self.rlog_file) | self.rlog.parse_rlog(self.rlog_file) | ||||
cvs_changesets = sorted(self.rlog.changesets) | cvs_changesets = sorted(self.rlog.changesets) | ||||
self.log.info( | self.log.info( | ||||
"CVS changesets found for %s: %d", | "CVS changesets found for %s: %d", | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
len(cvs_changesets), | len(cvs_changesets), | ||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_rlog_changesets(cvs_changesets) | self.swh_revision_gen = self.process_cvs_changesets( | ||||
cvs_changesets, use_rcsparse=False | |||||
) | |||||
else: | else: | ||||
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | ||||
def fetch_data(self) -> bool: | def fetch_data(self) -> bool: | ||||
"""Fetch the next CVS revision.""" | """Fetch the next CVS revision.""" | ||||
try: | try: | ||||
data = next(self.swh_revision_gen) | data = next(self.swh_revision_gen) | ||||
except StopIteration: | except StopIteration: | ||||
▲ Show 20 Lines • Show All 87 Lines • Show Last 20 Lines |