Page MenuHomeSoftware Heritage

D1497.diff
No OneTemporary

D1497.diff

diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
- `swh.lister.pypi`
- `swh.lister.npm`
- `swh.lister.phabricator`
+- `swh.lister.maven_central`
Dependencies
------------
@@ -177,6 +178,18 @@
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
```
+## lister-maven_central
+
+Once configured, you can execute a Maven Central lister using the following instructions in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.maven_central.tasks import maven_central_lister
+
+logging.basicConfig(level=logging.DEBUG)
+maven_central_lister()
+```
+
Licensing
---------
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
setuptools
xmltodict
iso8601
+beautifulsoup4
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -10,7 +10,7 @@
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator']
+ 'npm', 'phabricator', 'maven_central']
@click.command()
@@ -105,6 +105,11 @@
api_token='',
override_config=override_conf)
+ elif lister == 'maven_central':
+ from .maven_central.models import ModelBase
+ from .maven_central.lister import Maven_CentralLister
+ _lister = Maven_CentralLister(override_config=override_conf)
+
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@
'swh.lister.npm.tasks',
'swh.lister.pypi.tasks',
'swh.lister.phabricator.tasks',
+ 'swh.lister.maven_central.tasks',
]
diff --git a/swh/lister/maven_central/__init__.py b/swh/lister/maven_central/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/lister.py b/swh/lister/maven_central/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/lister.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from bs4 import BeautifulSoup
+
+from .models import Maven_CentralModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+
+class Maven_CentralLister(ListerOnePageApiTransport, SimpleLister):
+    """List the packages published on Maven Central.
+
+    The repository is crawled as a tree of HTML directory listings rooted
+    at ``PAGE``; every directory recognised as a package is recorded in
+    ``_packages``.
+
+    """
+    MODEL = Maven_CentralModel
+    LISTER_NAME = 'maven_central'
+    PAGE = 'http://central.maven.org/maven2/'
+    # Per-instance accumulator, created in __init__: a class-level list
+    # would be shared (and keep growing) across every lister instance.
+    _packages = None
+
+    def __init__(self, override_config=None):
+        self._packages = []
+        ListerOnePageApiTransport.__init__(self)
+        SimpleLister.__init__(self, override_config=override_config)
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """(Override) Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        Args:
+            origin_type: origin type, used to build the 'load-<type>'
+                task type
+            origin_url: URL of the origin to load
+            **kwargs: the 'name' and 'html_url' entries are read, the
+                latter being passed on as ``project_metadata_url``
+
+        """
+        _type = 'load-%s' % origin_type
+        _policy = 'recurring'
+        project_name = kwargs.get('name')
+        project_metadata_url = kwargs.get('html_url')
+        return utils.create_task_dict(
+            _type, _policy, project_name, origin_url,
+            project_metadata_url=project_metadata_url)
+
+    def list_packages(self, response):
+        """(Override) List the actual maven central origins from the response.
+
+        Args:
+            response: HTTP response holding the HTML index of ``PAGE``
+
+        Returns:
+            list of dicts with the keys 'name', 'url' and 'metadata_url'
+
+        """
+        soup = BeautifulSoup(response.text, features="lxml")
+        groups = [link.text for link in soup.find_all('a')]
+
+        # find_packages_recursively() repoints self.PAGE at every directory
+        # it visits. Remember the root once: reading self.PAGE inside the
+        # loop would hand every group after the first a stale, deep URL.
+        root_url = self.PAGE
+        # Start from a clean list so a second listing does not return the
+        # packages of the previous one again.
+        self._packages = []
+        try:
+            # NOTE(review): the slice presumably drops the leading parent
+            # link and six trailing non-group entries of the index page --
+            # confirm against the live listing.
+            for group in groups[1:-6]:
+                self.find_packages_recursively(group, root_url)
+        finally:
+            # Leave the lister pointing at the root again for later runs.
+            self.PAGE = root_url
+
+        return self._packages
+
+    def find_packages_recursively(self, package_name, url):
+        """Visit a directory and its sub-directories, recording packages.
+
+        Every package found is appended to ``self._packages``.
+
+        Args:
+            package_name: name of the directory entry to visit, relative
+                to ``url`` (directory names end with '/')
+            url: URL of the listing that contains ``package_name``
+
+        Returns:
+            None
+
+        """
+        directory_url = url + package_name
+        # PAGE is repointed before the request is issued; presumably the
+        # transport fetches whatever PAGE designates -- TODO confirm.
+        self.PAGE = directory_url
+        response = self.safely_issue_request(1)
+        # Links without any text are dropped: they are neither files nor
+        # directories, and recursing on '' would revisit this URL forever.
+        entries = [entry for entry in file_structure(response) if entry]
+
+        if 'maven-metadata.xml' in entries:
+            self._packages.append({
+                'name': package_name[:-1],  # strip the trailing '/'
+                'url': directory_url,
+                'metadata_url': directory_url + 'maven-metadata.xml',
+            })
+            return None
+
+        # Some packages don't have a maven-metadata.xml file. To deal with
+        # those cases, check if the listing holds anything that is not a
+        # directory: if so the package files have been reached, and the
+        # parent directory (``url``) is recorded as the package.
+        if any(not entry.endswith('/') for entry in entries):
+            self._packages.append({
+                'name': special_name(url),
+                'url': url,
+                'metadata_url': None,
+            })
+            return None
+
+        # Only directories here: keep descending.
+        for directory in entries:
+            self.find_packages_recursively(directory, directory_url)
+
+        return None
+
+    def get_model_from_repo(self, repo):
+        """(Override) Transform from repository representation to model
+
+        Args:
+            repo: dict with the keys 'name', 'url' and 'metadata_url'
+
+        Returns:
+            dict of model fields built from ``repo``
+
+        """
+        return {
+            'uid': repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['metadata_url'],
+            'origin_url': repo['url'],
+            'origin_type': 'maven_central',
+            'description': None,
+        }
+
+    def transport_response_simplified(self, response):
+        """(Override) Transform response to list for model manipulation
+
+        Args:
+            response: iterable of package dicts
+
+        Returns:
+            list with one model dict per package
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+
+def file_structure(response):
+    """Collect the entries of an HTML directory listing.
+
+    Args:
+        response: HTTP response whose ``text`` is the HTML page to parse
+
+    Returns:
+        list with the text of every ``<a>`` element of the page, in
+        document order, without the first one
+
+    """
+    anchors = BeautifulSoup(response.text, features="lxml").find_all('a')
+    names = [anchor.text for anchor in anchors]
+    return names[1:]
+
+
+def special_name(url):
+    """Build the name of a package that has no maven-metadata.xml file.
+
+    Args:
+        url: URL of the package directory
+
+    Returns:
+        the text found between the last two '/' characters of ``url``
+
+    Raises:
+        IndexError: if ``url`` holds fewer than two '/' characters
+
+    """
+    if url.count('/') < 2:
+        raise IndexError('list index out of range')
+    # Cut at the last '/', then keep what follows the new last '/'.
+    head = url.rpartition('/')[0]
+    return head.rpartition('/')[2]
diff --git a/swh/lister/maven_central/models.py b/swh/lister/maven_central/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/models.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class Maven_CentralModel(ModelBase):
+    """Database model of a package listed from Maven Central."""
+
+    __tablename__ = 'maven_central_repo'
+
+    # String identifier of the package; primary key of the table.
+    uid = Column(String, primary_key=True)
diff --git a/swh/lister/maven_central/tasks.py b/swh/lister/maven_central/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import Maven_CentralLister
+
+
+@app.task(name=__name__ + '.Maven_CentralListerTask')
+def maven_central_lister(**lister_args):
+    """Build a Maven Central lister from ``lister_args`` and run it."""
+    lister = Maven_CentralLister(**lister_args)
+    lister.run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+    """Answer 'OK'; lets callers check that the task is reachable."""
+    status = 'OK'
+    return status
diff --git a/swh/lister/maven_central/tests/__init__.py b/swh/lister/maven_central/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/tests/conftest.py b/swh/lister/maven_central/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/maven_central/tests/test_tasks.py b/swh/lister/maven_central/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    """The maven_central ping task answers 'OK'."""
+    # Target this package's own task: the pypi one (left over from the
+    # file this test was copied from) says nothing about maven_central.
+    res = swh_app.send_task(
+        'swh.lister.maven_central.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.maven_central.tasks.Maven_CentralLister')
+def test_lister(lister, swh_app, celery_session_worker):
+    """The lister task instantiates Maven_CentralLister and runs it once."""
+    # setup the mocked Maven_CentralLister; it is patched where the tasks
+    # module looks the name up, so the task gets the mock
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.maven_central.tasks.Maven_CentralListerTask')
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once_with()
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Dec 17 2024, 2:01 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3213976

Event Timeline