Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cpan/lister.py
| Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | class CpanLister(StatelessLister[CpanListerPage]): | ||||
| BASE_URL = "https://fastapi.metacpan.org/v1" | BASE_URL = "https://fastapi.metacpan.org/v1" | ||||
| REQUIRED_DOC_FIELDS = [ | REQUIRED_DOC_FIELDS = [ | ||||
| "download_url", | "download_url", | ||||
| "checksum_sha256", | "checksum_sha256", | ||||
| "distribution", | "distribution", | ||||
| "version", | "version", | ||||
| ] | ] | ||||
| OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] | OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] | ||||
| ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}" | # CPAN hosts legacy modules (known as BackPAN) that do not have an HTML landing page, | ||||
| # so use the fake origin URL pattern below instead | |||||
| ORIGIN_URL_PATTERN = "cpan://{author}/{module_name}" | |||||
| EXTRINSIC_METADATA_URL_PATTERN = BASE_URL + "/release/{author}/{release_name}" | EXTRINSIC_METADATA_URL_PATTERN = BASE_URL + "/release/{author}/{release_name}" | ||||
| def __init__( | def __init__( | ||||
| self, | self, | ||||
| scheduler: SchedulerInterface, | scheduler: SchedulerInterface, | ||||
| credentials: Optional[CredentialsType] = None, | credentials: Optional[CredentialsType] = None, | ||||
| ): | ): | ||||
| super().__init__( | super().__init__( | ||||
| ▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines | class CpanLister(StatelessLister[CpanListerPage]): | ||||
| def get_origins_from_page( | def get_origins_from_page( | ||||
| self, module_names: CpanListerPage | self, module_names: CpanListerPage | ||||
| ) -> Iterator[ListedOrigin]: | ) -> Iterator[ListedOrigin]: | ||||
| """Iterate on all pages and yield ListedOrigin instances.""" | """Iterate on all pages and yield ListedOrigin instances.""" | ||||
| assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
| for module_name in module_names: | for module_name in module_names: | ||||
| module_metadata = self.module_metadata[module_name] | module_metadata = self.module_metadata[module_name] | ||||
| author = module_metadata[0]["cpan_author"] | |||||
| yield ListedOrigin( | yield ListedOrigin( | ||||
| lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
| visit_type=self.VISIT_TYPE, | visit_type=self.VISIT_TYPE, | ||||
| url=self.ORIGIN_URL_PATTERN.format(module_name=module_name), | url=self.ORIGIN_URL_PATTERN.format( | ||||
| author=author, module_name=module_name | |||||
| ), | |||||
| last_update=max(self.release_dates[module_name]), | last_update=max(self.release_dates[module_name]), | ||||
| extra_loader_arguments={ | extra_loader_arguments={ | ||||
| "artifacts": self.artifacts[module_name], | "artifacts": self.artifacts[module_name], | ||||
| "module_metadata": module_metadata, | "module_metadata": module_metadata, | ||||
| }, | }, | ||||
| ) | ) | ||||