Differential D1497 Diff 4911 swh/lister/maven_central/lister.py

Changeset View

Standalone View

swh/lister/maven_central/lister.py

This file was added.

				# Copyright (C) 2018 the Software Heritage developers
				# License: GNU General Public License version 3, or any later version
				# See top-level LICENSE file for more information

				from bs4 import BeautifulSoup

				from .models import Maven_CentralModel

				from swh.scheduler import utils
				from swh.lister.core.simple_lister import SimpleLister
				from swh.lister.core.lister_transports import ListerOnePageApiTransport


				class Maven_CentralLister(ListerOnePageApiTransport, SimpleLister):
				MODEL = Maven_CentralModel
				LISTER_NAME = 'maven_central'
				PAGE = 'http://central.maven.org/maven2/'
				_packages = []

				def __init__(self, override_config=None):
				ListerOnePageApiTransport .__init__(self)
				SimpleLister.__init__(self, override_config=override_config)

				def task_dict(self, origin_type, origin_url, **kwargs):
				"""(Override) Return task format dict
				douarddaUnsubmitted Done Inline Actions Please do not put this '(Override)' in your docstrings (here and below). douardda: Please do not put this '(Override)' in your docstrings (here and below).

				This is overridden from the lister_base as more information is
				needed for the ingestion task creation.

				"""
				_type = 'origin-update-%s' % origin_type
				anlambertUnsubmitted Done Inline Actions FYI, we are currently renaming the scheduler task names to something more rational (see T1508). This should be changed to: _type = 'load-%s' % origin_type anlambert: FYI, we are currently renaming the scheduler task names to something more rational (see T1508).
				_policy = 'recurring'
				project_name = kwargs.get('name')
				project_metadata_url = kwargs.get('html_url')
				douarddaUnsubmitted Done Inline Actions I am a bit surprised by these 2 `dict.get()` here. Aren't these parameters mandatory for the loader task creation and execution to succeed? What is expected to happen if 'name' or 'html_url' are not found in the `kwargs` dict? douardda: I am a bit surprised by these 2 `dict.get()` here. Aren't these parameters mandatory for the…
				return utils.create_task_dict(
				_type, _policy, project_name, origin_url,
				project_metadata_url=project_metadata_url)
				douarddaUnsubmitted Done Inline Actions A detail, but I'm not sure I see any added value to use variables in this method. Could be: def task_dict(self, origin_type, origin_url, kwargs): '''[snip]''' return utils.create_task_dict( 'load-%s' % origin_type, 'recurring', kwargs.get('name'), origin_url, project_metadata_url=kwargs.get('html_url')) douardda:** A detail, but I'm not sure I see any added value to use variables in this method. Could be…

				def list_packages(self, response):
				"""(Override) List the actual maven central origins from the response.

				"""
				soup = BeautifulSoup(response.text, features="lxml")
				groups = []
				for b in soup.find_all('a'):
				groups.append(b.text)
				douarddaUnsubmitted Done Inline Actions prefer the list comprehension form: groups = [b.text for b in soup.find_all('a')] douardda: prefer the list comprehension form: ``` groups = [b.text for b in soup.find_all('a')] ```

				for group in groups[1:-6]:
				self.find_packages_recursively(group, self.PAGE)
				douarddaUnsubmitted Not Done Inline Actions Reading this, I'm a bit worried this code is very fragile... Where do these 1 and -6 come from? Why are they needed? douardda: Reading this, I'm a bit worried this code is very fragile... Where do these 1 and -6 come from?

				return self._packages

				def find_packages_recursively(self, package_name, url):
				"""
				Visits all the directories recursively and populate _packages variable
				with all the packages present in different groups.
				"""
				self.PAGE = url+package_name
				douarddaUnsubmitted Not Done Inline Actions what? why are you keeping this transient state (currently visited page) in your instance (as the PAGE attribute)? This seems bad design to me. Where is this self.PAGE used elsewhere in the class? douardda: what? why are you keeping this transient state (currently visited page) in your instance (as…
				response = self.safely_issue_request(1)
				file_system = file_structure(response)

				if 'maven-metadata.xml' in file_system:
				package = {
				'name': package_name[:-1],
				'url': url+package_name,
				'metadata_url': url+package_name + 'maven-metadata.xml'
				}
				self._packages.append(package)
				return None
				douarddaUnsubmitted Not Done Inline Actions do not `return None` if the function/method is not expected to return anything, just use `return`. douardda: do not `return None` if the function/method is not expected to return anything, just use…

				# Some packages don't have maven-metadata.xml file, to deal with those
				# cases this loop checks of there is no more directory to visit in the
				douarddaUnsubmitted Not Done Inline Actions s/of/if/ I guess douardda: s/of/if/ I guess
				# file system, hence it has reached the package file.
				for files in file_system:
				if files[-1] != '/':
				previous_directory = special_name(url)
				package = {
				'name': previous_directory,
				'url': url,
				'metadata_url': None
				}
				self._packages.append(package)
				return None
				douarddaUnsubmitted Not Done Inline Actions I don't like this 'almost duplicated' code here. I believe this method can be simplified and improved. douardda: I don't like this 'almost duplicated' code here. I believe this method can be simplified and…

				for directory in file_system:
				self.find_packages_recursively(directory, url+package_name)

				return None

				def get_model_from_repo(self, repo):
				"""(Override) Transform from repository representation to model

				"""
				return {
				'uid': repo['name'],
				'name': repo['name'],
				'full_name': repo['name'],
				'html_url': repo['metadata_url'],
				'origin_url': repo['url'],
				'origin_type': 'maven_central',
				'description': None,
				}

				def transport_response_simplified(self, response):
				"""(Override) Transform response to list for model manipulation

				"""
				return [self.get_model_from_repo(repo) for repo in response]


				def file_structure(response):
				'''
				Lists all the files and folders present in the response

				Args :
				HTML response

				Returns:
				List of all the files and folders
				'''
				soup = BeautifulSoup(response.text, features="lxml")
				files = []
				for row in soup.find_all('a'):
				files.append(row.text)
				return files[1:]


				def special_name(url):
				'''
				Construct name of package from url for those packages which
				do not have maven-metadata.xml file
				'''
				position_in_url = ([pos for pos, char in enumerate(url) if char == '/'])
				douarddaUnsubmitted Not Done Inline Actions no need for parenthesis here. But more importantly, why no use `re.findall()`? douardda: no need for parenthesis here. But more importantly, why no use `re.findall()`?
				douarddaUnsubmitted Not Done Inline Actions or even just `url.split('/')` as starting point in fact... douardda: or even just `url.split('/')` as starting point in fact...
				name = url[position_in_url[-2]+1:position_in_url[-1]]
				return name