Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/cvs/loader.py
Show First 20 Lines • Show All 114 Lines • ▼ Show 20 Lines | def compute_swh_revision(self, k, logmsg): | ||||
""" | """ | ||||
# Compute SWH revision from the on-disk state | # Compute SWH revision from the on-disk state | ||||
swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path)) | swh_dir = from_disk.Directory.from_disk(path=os.fsencode(self.worktree_path)) | ||||
if self._last_revision: | if self._last_revision: | ||||
parents = (self._last_revision.id,) | parents = (self._last_revision.id,) | ||||
else: | else: | ||||
parents = () | parents = () | ||||
revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | revision = self.build_swh_revision(k, logmsg, swh_dir.hash, parents) | ||||
self.log.debug("SWH revision ID: %s" % hashutil.hash_to_hex(revision.id)) | self.log.debug("SWH revision ID: %s", hashutil.hash_to_hex(revision.id)) | ||||
self._last_revision = revision | self._last_revision = revision | ||||
return (revision, swh_dir) | return (revision, swh_dir) | ||||
def process_cvs_changesets( | def process_cvs_changesets( | ||||
self, cvs_changesets, | self, cvs_changesets, | ||||
) -> Iterator[ | ) -> Iterator[ | ||||
Tuple[List[Content], List[SkippedContent], List[Directory], Revision] | Tuple[List[Content], List[SkippedContent], List[Directory], Revision] | ||||
]: | ]: | ||||
Show All 12 Lines | ]: | ||||
"changeset from %s by %s on branch %s", tstr, k.author, k.branch | "changeset from %s by %s on branch %s", tstr, k.author, k.branch | ||||
) | ) | ||||
logmsg = "" | logmsg = "" | ||||
# Check out the on-disk state of this revision | # Check out the on-disk state of this revision | ||||
for f in k.revs: | for f in k.revs: | ||||
rcsfile = None | rcsfile = None | ||||
path = file_path(self.cvsroot_path, f.path) | path = file_path(self.cvsroot_path, f.path) | ||||
wtpath = os.path.join(self.worktree_path, path) | wtpath = os.path.join(self.worktree_path, path) | ||||
self.log.info("rev %s of file %s" % (f.rev, f.path)) | self.log.info("rev %s of file %s", f.rev, f.path) | ||||
if not logmsg: | if not logmsg: | ||||
rcsfile = rcsparse.rcsfile(f.path) | rcsfile = rcsparse.rcsfile(f.path) | ||||
logmsg = rcsfile.getlog(k.revs[0].rev) | logmsg = rcsfile.getlog(k.revs[0].rev) | ||||
if f.state == "dead": | if f.state == "dead": | ||||
# remove this file from work tree | # remove this file from work tree | ||||
try: | try: | ||||
os.remove(wtpath) | os.remove(wtpath) | ||||
except FileNotFoundError: | except FileNotFoundError: | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | ]: | ||||
self.log.info( | self.log.info( | ||||
"changeset from %s by %s on branch %s", tstr, k.author, k.branch | "changeset from %s by %s on branch %s", tstr, k.author, k.branch | ||||
) | ) | ||||
logmsg = "" | logmsg = "" | ||||
# Check out the on-disk state of this revision | # Check out the on-disk state of this revision | ||||
for f in k.revs: | for f in k.revs: | ||||
path = file_path(self.cvsroot_path, f.path) | path = file_path(self.cvsroot_path, f.path) | ||||
wtpath = os.path.join(self.worktree_path, path) | wtpath = os.path.join(self.worktree_path, path) | ||||
self.log.info("rev %s of file %s" % (f.rev, f.path)) | self.log.info("rev %s of file %s", f.rev, f.path) | ||||
if not logmsg: | if not logmsg: | ||||
logmsg = self.rlog.getlog(self.rlog_file, f.path, k.revs[0].rev) | logmsg = self.rlog.getlog(self.rlog_file, f.path, k.revs[0].rev) | ||||
self.log.debug("f.state is %s\n" % f.state) | self.log.debug("f.state is %s", f.state) | ||||
if f.state == "dead": | if f.state == "dead": | ||||
# remove this file from work tree | # remove this file from work tree | ||||
try: | try: | ||||
os.remove(wtpath) | os.remove(wtpath) | ||||
except FileNotFoundError: | except FileNotFoundError: | ||||
pass | pass | ||||
else: | else: | ||||
dirname = os.path.dirname(wtpath) | dirname = os.path.dirname(wtpath) | ||||
os.makedirs(dirname, exist_ok=True) | os.makedirs(dirname, exist_ok=True) | ||||
self.log.debug("checkout to %s\n" % wtpath) | self.log.debug("checkout to %s", wtpath) | ||||
assert self.cvsclient # avoid None type error from mypy | assert self.cvsclient # avoid None type error from mypy | ||||
fp = self.cvsclient.checkout(f.path, f.rev, dirname) | fp = self.cvsclient.checkout(f.path, f.rev, dirname) | ||||
os.rename(fp.name, wtpath) | os.rename(fp.name, wtpath) | ||||
try: | try: | ||||
fp.close() | fp.close() | ||||
except FileNotFoundError: | except FileNotFoundError: | ||||
# Well, we have just renamed the file... | # Well, we have just renamed the file... | ||||
pass | pass | ||||
Show All 27 Lines | class CvsLoader(BaseLoader): | ||||
def fetch_cvs_repo_with_rsync(self, host, path): | def fetch_cvs_repo_with_rsync(self, host, path): | ||||
# URL *must* end with a trailing slash in order to get CVSROOT listed | # URL *must* end with a trailing slash in order to get CVSROOT listed | ||||
url = "rsync://%s%s/" % (host, os.path.dirname(path)) | url = "rsync://%s%s/" % (host, os.path.dirname(path)) | ||||
rsync = subprocess.run(["rsync", url], capture_output=True, encoding="ascii") | rsync = subprocess.run(["rsync", url], capture_output=True, encoding="ascii") | ||||
rsync.check_returncode() | rsync.check_returncode() | ||||
have_cvsroot = False | have_cvsroot = False | ||||
have_module = False | have_module = False | ||||
for line in rsync.stdout.split("\n"): | for line in rsync.stdout.split("\n"): | ||||
self.log.debug("rsync server: %s" % line) | self.log.debug("rsync server: %s", line) | ||||
if line.endswith(" CVSROOT"): | if line.endswith(" CVSROOT"): | ||||
have_cvsroot = True | have_cvsroot = True | ||||
elif line.endswith(" %s" % self.cvs_module_name): | elif line.endswith(" %s" % self.cvs_module_name): | ||||
have_module = True | have_module = True | ||||
if have_module and have_cvsroot: | if have_module and have_cvsroot: | ||||
break | break | ||||
if not have_module: | if not have_module: | ||||
raise NotFound( | raise NotFound( | ||||
Show All 16 Lines | def prepare(self): | ||||
) | ) | ||||
self.worktree_path = tempfile.mkdtemp( | self.worktree_path = tempfile.mkdtemp( | ||||
suffix="-%s" % os.getpid(), | suffix="-%s" % os.getpid(), | ||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | prefix=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
dir=self.temp_directory, | dir=self.temp_directory, | ||||
) | ) | ||||
url = parse_url(self.origin_url) | url = parse_url(self.origin_url) | ||||
self.log.debug( | self.log.debug( | ||||
"prepare; origin_url=%s scheme=%s path=%s" | "prepare; origin_url=%s scheme=%s path=%s", | ||||
% (self.origin_url, url.scheme, url.path) | self.origin_url, | ||||
url.scheme, | |||||
url.path, | |||||
) | ) | ||||
if not url.path: | if not url.path: | ||||
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | ||||
self.cvs_module_name = os.path.basename(url.path) | self.cvs_module_name = os.path.basename(url.path) | ||||
os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name)) | os.mkdir(os.path.join(self.worktree_path, self.cvs_module_name)) | ||||
if url.scheme == "file": | if url.scheme == "file": | ||||
if not os.path.exists(url.path): | if not os.path.exists(url.path): | ||||
raise NotFound | raise NotFound | ||||
Show All 14 Lines | def prepare(self): | ||||
if f[-2:] == ",v": | if f[-2:] == ",v": | ||||
try: | try: | ||||
rcsfile = rcsparse.rcsfile(filepath) # noqa: F841 | rcsfile = rcsparse.rcsfile(filepath) # noqa: F841 | ||||
except (Exception): | except (Exception): | ||||
raise | raise | ||||
else: | else: | ||||
self.log.debug( | self.log.debug( | ||||
"Looks like we have data to convert; " | "Looks like we have data to convert; " | ||||
"found a valid RCS file at %s" % filepath | "found a valid RCS file at %s", | ||||
filepath, | |||||
) | ) | ||||
have_rcsfile = True | have_rcsfile = True | ||||
break | break | ||||
if have_rcsfile: | if have_rcsfile: | ||||
break | break | ||||
if not have_rcsfile: | if not have_rcsfile: | ||||
raise NotFound( | raise NotFound( | ||||
"Directory %s does not contain any valid RCS files %s" | "Directory %s does not contain any valid RCS files %s", | ||||
% self.cvsroot_path | self.cvsroot_path, | ||||
) | ) | ||||
if not have_cvsroot: | if not have_cvsroot: | ||||
self.log.warn( | self.log.warn( | ||||
"The CVS repository at '%s' lacks a CVSROOT directory; " | "The CVS repository at '%s' lacks a CVSROOT directory; " | ||||
"we might be ingesting an incomplete copy of the repository" | "we might be ingesting an incomplete copy of the repository", | ||||
% self.cvsroot_path | self.cvsroot_path, | ||||
) | ) | ||||
# Unfortunately, there is no way to convert CVS history in an | # Unfortunately, there is no way to convert CVS history in an | ||||
# iterative fashion because the data is not indexed by any kind | # iterative fashion because the data is not indexed by any kind | ||||
# of changeset ID. We need to walk the history of each and every | # of changeset ID. We need to walk the history of each and every | ||||
# RCS file in the repository during every visit, even if no new | # RCS file in the repository during every visit, even if no new | ||||
# changes will be added to the SWH archive afterwards. | # changes will be added to the SWH archive afterwards. | ||||
# "CVS’s repository is the software equivalent of a telephone book | # "CVS’s repository is the software equivalent of a telephone book | ||||
# sorted by telephone number." | # sorted by telephone number." | ||||
# https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/ | # https://corecursive.com/software-that-doesnt-suck-with-jim-blandy/ | ||||
# | # | ||||
# An implicit assumption made here is that self.cvs_changesets will | # An implicit assumption made here is that self.cvs_changesets will | ||||
# fit into memory in its entirety. If it won't fit then the CVS walker | # fit into memory in its entirety. If it won't fit then the CVS walker | ||||
# will need to be modified such that it spools the list of changesets | # will need to be modified such that it spools the list of changesets | ||||
# to disk instead. | # to disk instead. | ||||
cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC) | cvs = CvsConv(self.cvsroot_path, RcsKeywords(), False, CHANGESET_FUZZ_SEC) | ||||
self.log.info("Walking CVS module %s", self.cvs_module_name) | self.log.info("Walking CVS module %s", self.cvs_module_name) | ||||
cvs.walk(self.cvs_module_name) | cvs.walk(self.cvs_module_name) | ||||
cvs_changesets = sorted(cvs.changesets) | cvs_changesets = sorted(cvs.changesets) | ||||
self.log.info( | self.log.info( | ||||
"CVS changesets found in %s: %d" | "CVS changesets found in %s: %d", | ||||
% (self.cvs_module_name, len(cvs_changesets)) | self.cvs_module_name, | ||||
len(cvs_changesets), | |||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_changesets(cvs_changesets) | self.swh_revision_gen = self.process_cvs_changesets(cvs_changesets) | ||||
elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | elif url.scheme == "pserver" or url.scheme == "fake" or url.scheme == "ssh": | ||||
# remote CVS repository conversion | # remote CVS repository conversion | ||||
self.cvsclient = cvsclient.CVSClient(url) | self.cvsclient = cvsclient.CVSClient(url) | ||||
cvsroot_path = os.path.dirname(url.path) | cvsroot_path = os.path.dirname(url.path) | ||||
self.log.info( | self.log.info( | ||||
"Fetching CVS rlog from %s:%s/%s", | "Fetching CVS rlog from %s:%s/%s", | ||||
url.host, | url.host, | ||||
cvsroot_path, | cvsroot_path, | ||||
self.cvs_module_name, | self.cvs_module_name, | ||||
) | ) | ||||
self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | self.rlog = RlogConv(cvsroot_path, CHANGESET_FUZZ_SEC) | ||||
self.rlog_file = self.cvsclient.fetch_rlog() | self.rlog_file = self.cvsclient.fetch_rlog() | ||||
self.rlog.parse_rlog(self.rlog_file) | self.rlog.parse_rlog(self.rlog_file) | ||||
cvs_changesets = sorted(self.rlog.changesets) | cvs_changesets = sorted(self.rlog.changesets) | ||||
self.log.info( | self.log.info( | ||||
"CVS changesets found for %s: %d" | "CVS changesets found for %s: %d", | ||||
% (self.cvs_module_name, len(cvs_changesets)) | self.cvs_module_name, | ||||
len(cvs_changesets), | |||||
) | ) | ||||
self.swh_revision_gen = self.process_cvs_rlog_changesets(cvs_changesets) | self.swh_revision_gen = self.process_cvs_rlog_changesets(cvs_changesets) | ||||
else: | else: | ||||
raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | raise NotFound("Invalid CVS origin URL '%s'" % self.origin_url) | ||||
def fetch_data(self):
    """Fetch the next CVS revision.

    Pulls the next item from ``self.swh_revision_gen`` and unpacks it into
    ``self._contents``, ``self._skipped_contents``, ``self._directories``
    and ``self._revisions`` (a one-element list holding the revision).

    Returns:
        True when a new item was fetched and stored on the instance.
        False when the generator is exhausted, or when it raised an
        exception (the exception is logged with its traceback and the
        iteration is stopped).
    """
    try:
        data = next(self.swh_revision_gen)
    except StopIteration:
        # Normal end of the changeset stream.
        return False
    except Exception:
        # Deliberate best-effort: log the traceback and stop iterating
        # instead of propagating the error to the caller.
        self.log.exception("Exception in fetch_data:")
        return False  # Stopping iteration
    self._contents, self._skipped_contents, self._directories, rev = data
    self._revisions = [rev]
    return True
def build_swh_revision( | def build_swh_revision( | ||||
self, k: ChangeSetKey, logmsg: bytes, dir_id: bytes, parents: Sequence[bytes] | self, k: ChangeSetKey, logmsg: bytes, dir_id: bytes, parents: Sequence[bytes] | ||||
) -> Revision: | ) -> Revision: | ||||
Show All 37 Lines | def generate_and_load_snapshot(self, revision) -> Snapshot: | ||||
""" | """ | ||||
snap = Snapshot( | snap = Snapshot( | ||||
branches={ | branches={ | ||||
DEFAULT_BRANCH: SnapshotBranch( | DEFAULT_BRANCH: SnapshotBranch( | ||||
target=revision.id, target_type=TargetType.REVISION | target=revision.id, target_type=TargetType.REVISION | ||||
) | ) | ||||
} | } | ||||
) | ) | ||||
self.log.debug("snapshot: %s" % snap) | self.log.debug("snapshot: %s", snap) | ||||
self.storage.snapshot_add([snap]) | self.storage.snapshot_add([snap]) | ||||
return snap | return snap | ||||
def store_data(self):
    """Add our current CVS changeset to the archive.

    Sends the pending skipped contents, contents, directories and
    revisions to ``self.storage``, then builds and stores a snapshot that
    targets ``self._last_revision``. The snapshot id is recorded in
    ``self.loaded_snapshot_id`` and the pending object lists are reset to
    empty lists afterwards.
    """
    self.storage.skipped_content_add(self._skipped_contents)
    self.storage.content_add(self._contents)
    self.storage.directory_add(self._directories)
    self.storage.revision_add(self._revisions)
    self.snapshot = self.generate_and_load_snapshot(self._last_revision)
    self.log.debug("SWH snapshot ID: %s", hashutil.hash_to_hex(self.snapshot.id))
    self.flush()
    self.loaded_snapshot_id = self.snapshot.id
    # Clear the per-changeset buffers so the next fetch starts empty.
    self._skipped_contents = []
    self._contents = []
    self._directories = []
    self._revisions = []
def load_status(self): | def load_status(self): | ||||
Show All 11 Lines |