Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/core/loader.py
Show First 20 Lines • Show All 732 Lines • ▼ Show 20 Lines | class ContentLoader(NodeLoader): | ||||
visit_type = "content" | visit_type = "content" | ||||
def __init__(self, *args, **kwargs):
    """Initialize the loader; the actual content is only fetched later.

    All arguments are forwarded unchanged to the ``NodeLoader`` parent.
    """
    super().__init__(*args, **kwargs)
    # Populated by fetch_data() once a mirror url yields a file whose
    # checksums match; stays None until then.
    self.content: Optional[Content] = None
def fetch_data(self) -> bool:
    """Retrieve the content file as a Content object.

    Each url in ``self.mirror_urls`` is tried in turn until one yields a
    file matching ``self.checksums``. A checksum mismatch or a 404 makes
    the loader move on to the next mirror.

    Returns:
        False as soon as a matching file was fetched (no more data to
        fetch).

    Raises:
        ValueError: when at least one mirror was reached but its file had
            mismatched checksums (first such error is re-raised).
        HTTPError: when a mirror failed with a non-404 HTTP error and no
            other mirror succeeded.
        NotFound: when no mirror url yielded any content at all.

    """
    errors: List[Exception] = []
    for url in self.mirror_urls:
        url_ = urlparse(url)
        self.log.debug(
            "prepare; origin_url=%s fallback=%s scheme=%s path=%s",
            self.origin.url,
            url,
            url_.scheme,
            url_.path,
        )
        try:
            # FIXME: Ensure no "nar" computations is required for file
            assert self.checksums_computation == "standard"
            with tempfile.TemporaryDirectory() as tmpdir:
                file_path, _ = download(url, dest=tmpdir, hashes=self.checksums)
                with open(file_path, "rb") as file:
                    self.content = Content.from_data(file.read())
        except ValueError as e:
            # Checksum mismatch on the downloaded file: remember it so the
            # failure can be reported if no mirror ends up matching.
            errors.append(e)
            self.log.debug(
                "Mismatched checksums <%s>: continue on next mirror url if any",
                url,
            )
            continue
        except HTTPError as http_error:
            if http_error.response.status_code == 404:
                self.log.debug(
                    "Not found '%s', continue on next mirror url if any", url
                )
                continue
            # Fix: a non-404 HTTP error used to be silently swallowed here,
            # after which the origin was wrongly reported as NotFound. Keep
            # the error so it is surfaced once all mirrors have been tried.
            errors.append(http_error)
            self.log.debug(
                "HTTP error on '%s': continue on next mirror url if any", url
            )
            continue
        else:
            return False  # no more data to fetch

    if errors:
        # Surface the first real failure rather than masking it as NotFound.
        raise errors[0]

    # If we reach this point, we did not find any proper content, consider the
    # origin not found
    raise NotFound(f"Unknown origin {self.origin.url}.")
def process_data(self) -> bool: | def process_data(self) -> bool: | ||||
"""Build the snapshot out of the Content retrieved.""" | """Build the snapshot out of the Content retrieved.""" | ||||
assert self.content is not None | assert self.content is not None | ||||
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines | def __init__(self, *args, **kwargs): | ||||
self.dirs: List[Directory] = None | self.dirs: List[Directory] = None | ||||
def fetch_data(self) -> bool: | def fetch_data(self) -> bool: | ||||
"""Fetch directory as a tarball amongst the self.mirror_urls. | """Fetch directory as a tarball amongst the self.mirror_urls. | ||||
Raises NotFound if no tarball is found | Raises NotFound if no tarball is found | ||||
""" | """ | ||||
errors = [] | |||||
for url in self.mirror_urls: | for url in self.mirror_urls: | ||||
url_ = urlparse(url) | url_ = urlparse(url) | ||||
self.log.debug( | self.log.debug( | ||||
"prepare; origin_url=%s fallback=%s scheme=%s path=%s", | "prepare; origin_url=%s fallback=%s scheme=%s path=%s", | ||||
self.origin.url, | self.origin.url, | ||||
url, | url, | ||||
url_.scheme, | url_.scheme, | ||||
url_.path, | url_.path, | ||||
) | ) | ||||
with tempfile.TemporaryDirectory() as tmpdir: | with tempfile.TemporaryDirectory() as tmpdir: | ||||
try: | try: | ||||
tarball_path, extrinsic_metadata = download( | tarball_path, extrinsic_metadata = download( | ||||
url, | url, | ||||
tmpdir, | tmpdir, | ||||
hashes=self.standard_hashes, | hashes=self.standard_hashes, | ||||
extra_request_headers={"Accept-Encoding": "identity"}, | extra_request_headers={"Accept-Encoding": "identity"}, | ||||
) | ) | ||||
except ValueError: | except ValueError as e: | ||||
# Checksum mismatch can happen, so we | errors.append(e) | ||||
self.log.debug( | self.log.debug( | ||||
"Mismatched checksums <%s>: continue on next mirror url if any", | "Mismatched checksums <%s>: continue on next mirror url if any", | ||||
url, | url, | ||||
) | ) | ||||
continue | continue | ||||
except HTTPError as http_error: | except HTTPError as http_error: | ||||
if http_error.response.status_code == 404: | if http_error.response.status_code == 404: | ||||
self.log.debug( | self.log.debug( | ||||
Show All 10 Lines | def fetch_data(self) -> bool: | ||||
# hashes are not "standard", so we need an extra check to happen | # hashes are not "standard", so we need an extra check to happen | ||||
# on the uncompressed tarball | # on the uncompressed tarball | ||||
dir_to_check = next(directory_path.iterdir()) | dir_to_check = next(directory_path.iterdir()) | ||||
self.log.debug("Directory to check nar hashes: %s", dir_to_check) | self.log.debug("Directory to check nar hashes: %s", dir_to_check) | ||||
actual_checksums = nix_hashes( | actual_checksums = nix_hashes( | ||||
dir_to_check, self.checksums.keys() | dir_to_check, self.checksums.keys() | ||||
).hexdigest() | ).hexdigest() | ||||
assert actual_checksums == self.checksums | if actual_checksums != self.checksums: | ||||
errors.append( | |||||
ValueError( | |||||
f"Checksum mismatched on <{url}>: " | |||||
f"{actual_checksums} != {self.checksums}" | |||||
) | |||||
) | |||||
self.log.debug( | |||||
"Mismatched checksums <%s>: continue on next mirror url if any", | |||||
url, | |||||
) | |||||
continue | |||||
self.directory = from_disk.Directory.from_disk( | self.directory = from_disk.Directory.from_disk( | ||||
path=bytes(directory_path), | path=bytes(directory_path), | ||||
max_content_length=self.max_content_size, | max_content_length=self.max_content_size, | ||||
) | ) | ||||
# Compute the merkle dag from the top-level directory | # Compute the merkle dag from the top-level directory | ||||
self.cnts, self.skipped_cnts, self.dirs = from_disk.iter_directory( | self.cnts, self.skipped_cnts, self.dirs = from_disk.iter_directory( | ||||
self.directory | self.directory | ||||
) | ) | ||||
if self.directory is not None: | if self.directory is not None: | ||||
return False # no more data to fetch | return False # no more data to fetch | ||||
if errors: | |||||
raise errors[0] | |||||
# if we reach here, we did not find any proper tarball, so consider the origin | # if we reach here, we did not find any proper tarball, so consider the origin | ||||
# not found | # not found | ||||
raise NotFound(f"Unknown origin {self.origin.url}.") | raise NotFound(f"Unknown origin {self.origin.url}.") | ||||
def process_data(self) -> bool: | def process_data(self) -> bool: | ||||
"""Build the snapshot out of the Directory retrieved.""" | """Build the snapshot out of the Directory retrieved.""" | ||||
assert self.directory is not None | assert self.directory is not None | ||||
Show All 26 Lines |