Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7122975
D1497.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D1497.diff
View Options
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
- `swh.lister.pypi`
- `swh.lister.npm`
- `swh.lister.phabricator`
+- `swh.lister.maven_central`
Dependencies
------------
@@ -177,6 +178,18 @@
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
```
+## lister-maven_central
+
+Once configured, you can execute a Maven Central lister using the following instructions in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.maven_central.tasks import maven_central_lister
+
+logging.basicConfig(level=logging.DEBUG)
+maven_central_lister()
+```
+
Licensing
---------
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
setuptools
xmltodict
iso8601
+beautifulsoup4
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -10,7 +10,7 @@
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
- 'npm', 'phabricator']
+ 'npm', 'phabricator', 'maven_central']
@click.command()
@@ -105,6 +105,11 @@
api_token='',
override_config=override_conf)
+ elif lister == 'maven_central':
+ from .maven_central.models import ModelBase
+ from .maven_central.lister import Maven_CentralLister
+ _lister = Maven_CentralLister(override_config=override_conf)
+
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@
'swh.lister.npm.tasks',
'swh.lister.pypi.tasks',
'swh.lister.phabricator.tasks',
+ 'swh.lister.maven_central.tasks',
]
diff --git a/swh/lister/maven_central/__init__.py b/swh/lister/maven_central/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/lister.py b/swh/lister/maven_central/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/lister.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from bs4 import BeautifulSoup
+
+from .models import Maven_CentralModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+
+class Maven_CentralLister(ListerOnePageApiTransport, SimpleLister):
+ MODEL = Maven_CentralModel
+ LISTER_NAME = 'maven_central'
+ PAGE = 'http://central.maven.org/maven2/'
+ _packages = []
+
+ def __init__(self, override_config=None):
+ ListerOnePageApiTransport .__init__(self)
+ SimpleLister.__init__(self, override_config=override_config)
+
+ def task_dict(self, origin_type, origin_url, **kwargs):
+ """(Override) Return task format dict
+
+ This is overridden from the lister_base as more information is
+ needed for the ingestion task creation.
+
+ """
+ _type = 'load-%s' % origin_type
+ _policy = 'recurring'
+ project_name = kwargs.get('name')
+ project_metadata_url = kwargs.get('html_url')
+ return utils.create_task_dict(
+ _type, _policy, project_name, origin_url,
+ project_metadata_url=project_metadata_url)
+
+ def list_packages(self, response):
+ """(Override) List the actual maven central origins from the response.
+
+ """
+ soup = BeautifulSoup(response.text, features="lxml")
+ groups = []
+ for b in soup.find_all('a'):
+ groups.append(b.text)
+
+ for group in groups[1:-6]:
+ self.find_packages_recursively(group, self.PAGE)
+
+ return self._packages
+
+ def find_packages_recursively(self, package_name, url):
+ """
+ Visits all the directories recursively and populate _packages variable
+ with all the packages present in different groups.
+ """
+ self.PAGE = url+package_name
+ response = self.safely_issue_request(1)
+ file_system = file_structure(response)
+
+ if 'maven-metadata.xml' in file_system:
+ package = {
+ 'name': package_name[:-1],
+ 'url': url+package_name,
+ 'metadata_url': url+package_name + 'maven-metadata.xml'
+ }
+ self._packages.append(package)
+ return None
+
+ # Some packages don't have maven-metadata.xml file, to deal with those
+ # cases this loop checks of there is no more directory to visit in the
+ # file system, hence it has reached the package file.
+ for files in file_system:
+ if files[-1] != '/':
+ previous_directory = special_name(url)
+ package = {
+ 'name': previous_directory,
+ 'url': url,
+ 'metadata_url': None
+ }
+ self._packages.append(package)
+ return None
+
+ for directory in file_system:
+ self.find_packages_recursively(directory, url+package_name)
+
+ return None
+
+ def get_model_from_repo(self, repo):
+ """(Override) Transform from repository representation to model
+
+ """
+ return {
+ 'uid': repo['name'],
+ 'name': repo['name'],
+ 'full_name': repo['name'],
+ 'html_url': repo['metadata_url'],
+ 'origin_url': repo['url'],
+ 'origin_type': 'maven_central',
+ 'description': None,
+ }
+
+ def transport_response_simplified(self, response):
+ """(Override) Transform response to list for model manipulation
+
+ """
+ return [self.get_model_from_repo(repo) for repo in response]
+
+
+def file_structure(response):
+ '''
+ Lists all the files and folders present in the response
+
+ Args :
+ HTML response
+
+ Returns:
+ List of all the files and folders
+ '''
+ soup = BeautifulSoup(response.text, features="lxml")
+ files = []
+ for row in soup.find_all('a'):
+ files.append(row.text)
+ return files[1:]
+
+
+def special_name(url):
+ '''
+ Construct name of package from url for those packages which
+ do not have maven-metadata.xml file
+ '''
+ position_in_url = ([pos for pos, char in enumerate(url) if char == '/'])
+ name = url[position_in_url[-2]+1:position_in_url[-1]]
+ return name
diff --git a/swh/lister/maven_central/models.py b/swh/lister/maven_central/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/models.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class Maven_CentralModel(ModelBase):
+ """a Maven Central repository representation
+
+ """
+ __tablename__ = 'maven_central_repo'
+
+ uid = Column(String, primary_key=True)
diff --git a/swh/lister/maven_central/tasks.py b/swh/lister/maven_central/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import Maven_CentralLister
+
+
+@app.task(name=__name__ + '.Maven_CentralListerTask')
+def maven_central_lister(**lister_args):
+ Maven_CentralLister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+ return 'OK'
diff --git a/swh/lister/maven_central/tests/__init__.py b/swh/lister/maven_central/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/tests/conftest.py b/swh/lister/maven_central/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/maven_central/tests/test_tasks.py b/swh/lister/maven_central/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+ res = swh_app.send_task(
+ 'swh.lister.pypi.tasks.ping')
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == 'OK'
+
+
+@patch('swh.lister.pypi.tasks.PyPILister')
+def test_lister(lister, swh_app, celery_session_worker):
+ # setup the mocked PypiLister
+ lister.return_value = lister
+ lister.run.return_value = None
+
+ res = swh_app.send_task(
+ 'swh.lister.pypi.tasks.PyPIListerTask')
+ assert res
+ res.wait()
+ assert res.successful()
+
+ lister.assert_called_once_with()
+ lister.db_last_index.assert_not_called()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 17 2024, 2:01 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3213976
Attached To
D1497: Maven Lister
Event Timeline
Log In to Comment