Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cpan/lister.py
Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | class CpanLister(StatelessLister[CpanListerPage]): | ||||
BASE_URL = "https://fastapi.metacpan.org/v1" | BASE_URL = "https://fastapi.metacpan.org/v1" | ||||
REQUIRED_DOC_FIELDS = [ | REQUIRED_DOC_FIELDS = [ | ||||
"download_url", | "download_url", | ||||
"checksum_sha256", | "checksum_sha256", | ||||
"distribution", | "distribution", | ||||
"version", | "version", | ||||
] | ] | ||||
OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] | OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] | ||||
ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}" | # CPAN hosts legacy modules known as backpan that do not have an HTML landing page | ||||
# so use fake origin URL pattern below instead | |||||
ORIGIN_URL_PATTERN = "cpan://{author}/{module_name}" | |||||
EXTRINSIC_METADATA_URL_PATTERN = BASE_URL + "/release/{author}/{release_name}" | EXTRINSIC_METADATA_URL_PATTERN = BASE_URL + "/release/{author}/{release_name}" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
scheduler: SchedulerInterface, | scheduler: SchedulerInterface, | ||||
credentials: Optional[CredentialsType] = None, | credentials: Optional[CredentialsType] = None, | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines | class CpanLister(StatelessLister[CpanListerPage]): | ||||
def get_origins_from_page( | def get_origins_from_page( | ||||
self, module_names: CpanListerPage | self, module_names: CpanListerPage | ||||
) -> Iterator[ListedOrigin]: | ) -> Iterator[ListedOrigin]: | ||||
"""Iterate on all pages and yield ListedOrigin instances.""" | """Iterate on all pages and yield ListedOrigin instances.""" | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
for module_name in module_names: | for module_name in module_names: | ||||
module_metadata = self.module_metadata[module_name] | module_metadata = self.module_metadata[module_name] | ||||
author = module_metadata[0]["cpan_author"] | |||||
yield ListedOrigin( | yield ListedOrigin( | ||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
visit_type=self.VISIT_TYPE, | visit_type=self.VISIT_TYPE, | ||||
url=self.ORIGIN_URL_PATTERN.format(module_name=module_name), | url=self.ORIGIN_URL_PATTERN.format( | ||||
author=author, module_name=module_name | |||||
), | |||||
last_update=max(self.release_dates[module_name]), | last_update=max(self.release_dates[module_name]), | ||||
extra_loader_arguments={ | extra_loader_arguments={ | ||||
"artifacts": self.artifacts[module_name], | "artifacts": self.artifacts[module_name], | ||||
"module_metadata": module_metadata, | "module_metadata": module_metadata, | ||||
}, | }, | ||||
) | ) |