Changeset View
Standalone View
swh/lister/maven_central/lister.py
- This file was added.
# Copyright (C) 2018 the Software Heritage developers | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
from bs4 import BeautifulSoup | |||||
from .models import Maven_CentralModel | |||||
from swh.scheduler import utils | |||||
from swh.lister.core.simple_lister import SimpleLister | |||||
from swh.lister.core.lister_transports import ListerOnePageApiTransport | |||||
class Maven_CentralLister(ListerOnePageApiTransport, SimpleLister): | |||||
MODEL = Maven_CentralModel | |||||
LISTER_NAME = 'maven_central' | |||||
PAGE = 'http://central.maven.org/maven2/' | |||||
_packages = [] | |||||
def __init__(self, override_config=None): | |||||
ListerOnePageApiTransport .__init__(self) | |||||
SimpleLister.__init__(self, override_config=override_config) | |||||
def task_dict(self, origin_type, origin_url, **kwargs): | |||||
"""(Override) Return task format dict | |||||
douardda: Please do not put this '(Override)' in your docstrings (here and below). | |||||
This is overridden from the lister_base as more information is | |||||
needed for the ingestion task creation. | |||||
""" | |||||
_type = 'origin-update-%s' % origin_type | |||||
anlambertUnsubmitted Done Inline ActionsFYI, we are currently renaming the scheduler task names to something more rational (see T1508). _type = 'load-%s' % origin_type anlambert: FYI, we are currently renaming the scheduler task names to something more rational (see T1508). | |||||
_policy = 'recurring' | |||||
project_name = kwargs.get('name') | |||||
project_metadata_url = kwargs.get('html_url') | |||||
Done Inline ActionsI am a bit surprised by these 2 dict.get() here. Aren't these parameters mandatory for the loader task creation and execution to succeed? What is expected to happen if 'name' or 'html_url' are not found in the kwargs dict? douardda: I am a bit surprised by these 2 `dict.get()` here. Aren't these parameters mandatory for the… | |||||
return utils.create_task_dict( | |||||
_type, _policy, project_name, origin_url, | |||||
project_metadata_url=project_metadata_url) | |||||
Done Inline ActionsA detail, but I'm not sure I see any added value to use variables in this method. Could be: def task_dict(self, origin_type, origin_url, **kwargs): '''[snip]''' return utils.create_task_dict( 'load-%s' % origin_type, 'recurring', kwargs.get('name'), origin_url, project_metadata_url=kwargs.get('html_url')) douardda: A detail, but I'm not sure I see any added value to use variables in this method. Could be… | |||||
def list_packages(self, response): | |||||
"""(Override) List the actual maven central origins from the response. | |||||
""" | |||||
soup = BeautifulSoup(response.text, features="lxml") | |||||
groups = [] | |||||
for b in soup.find_all('a'): | |||||
groups.append(b.text) | |||||
Done Inline Actionsprefer the list comprehension form: groups = [b.text for b in soup.find_all('a')] douardda: prefer the list comprehension form:
```
groups = [b.text for b in soup.find_all('a')]
``` | |||||
for group in groups[1:-6]: | |||||
self.find_packages_recursively(group, self.PAGE) | |||||
Not Done Inline ActionsReading this, I'm a bit worried this code is very fragile... Where do these 1 and -6 come from? Why are they needed? douardda: Reading this, I'm a bit worried this code is very fragile... Where do these 1 and -6 come from? | |||||
return self._packages | |||||
def find_packages_recursively(self, package_name, url): | |||||
""" | |||||
Visits all the directories recursively and populate _packages variable | |||||
with all the packages present in different groups. | |||||
""" | |||||
self.PAGE = url+package_name | |||||
Not Done Inline Actionswhat? why are you keeping this transient state (currently visited page) in your instance (as the PAGE attribute)? This seems bad design to me. Where is this self.PAGE used elsewhere in the class? douardda: what? why are you keeping this transient state (currently visited page) in your instance (as… | |||||
response = self.safely_issue_request(1) | |||||
file_system = file_structure(response) | |||||
if 'maven-metadata.xml' in file_system: | |||||
package = { | |||||
'name': package_name[:-1], | |||||
'url': url+package_name, | |||||
'metadata_url': url+package_name + 'maven-metadata.xml' | |||||
} | |||||
self._packages.append(package) | |||||
return None | |||||
Not Done Inline Actionsdo not return None if the function/method is not expected to return anything, just use return. douardda: do not `return None` if the function/method is not expected to return anything, just use… | |||||
# Some packages don't have maven-metadata.xml file, to deal with those | |||||
# cases this loop checks of there is no more directory to visit in the | |||||
Not Done Inline Actionss/of/if/ I guess douardda: s/of/if/ I guess | |||||
# file system, hence it has reached the package file. | |||||
for files in file_system: | |||||
if files[-1] != '/': | |||||
previous_directory = special_name(url) | |||||
package = { | |||||
'name': previous_directory, | |||||
'url': url, | |||||
'metadata_url': None | |||||
} | |||||
self._packages.append(package) | |||||
return None | |||||
Not Done Inline ActionsI don't like this 'almost duplicated' code here. I believe this method can be simplified and improved. douardda: I don't like this 'almost duplicated' code here. I believe this method can be simplified and… | |||||
for directory in file_system: | |||||
self.find_packages_recursively(directory, url+package_name) | |||||
return None | |||||
def get_model_from_repo(self, repo): | |||||
"""(Override) Transform from repository representation to model | |||||
""" | |||||
return { | |||||
'uid': repo['name'], | |||||
'name': repo['name'], | |||||
'full_name': repo['name'], | |||||
'html_url': repo['metadata_url'], | |||||
'origin_url': repo['url'], | |||||
'origin_type': 'maven_central', | |||||
'description': None, | |||||
} | |||||
def transport_response_simplified(self, response): | |||||
"""(Override) Transform response to list for model manipulation | |||||
""" | |||||
return [self.get_model_from_repo(repo) for repo in response] | |||||
def file_structure(response): | |||||
''' | |||||
Lists all the files and folders present in the response | |||||
Args : | |||||
HTML response | |||||
Returns: | |||||
List of all the files and folders | |||||
''' | |||||
soup = BeautifulSoup(response.text, features="lxml") | |||||
files = [] | |||||
for row in soup.find_all('a'): | |||||
files.append(row.text) | |||||
return files[1:] | |||||
def special_name(url): | |||||
''' | |||||
Construct name of package from url for those packages which | |||||
do not have maven-metadata.xml file | |||||
''' | |||||
position_in_url = ([pos for pos, char in enumerate(url) if char == '/']) | |||||
Not Done Inline Actionsno need for parenthesis here. But more importantly, why no use re.findall()? douardda: no need for parenthesis here. But more importantly, why no use `re.findall()`? | |||||
Not Done Inline Actionsor even just url.split('/') as starting point in fact... douardda: or even just `url.split('/')` as starting point in fact... | |||||
name = url[position_in_url[-2]+1:position_in_url[-1]] | |||||
return name |
Please do not put this '(Override)' in your docstrings (here and below).