diff --git a/PKG-INFO b/PKG-INFO index 8e011fa3..0e524373 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,37 +1,37 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.89 +Version: 0.0.90 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-deposit Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-deposit/ Description: # swh-deposit This is [Software Heritage](https://www.softwareheritage.org)'s [SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server implementation, as well as a simple client to upload deposits on the server. **S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository **D**eposit) is an interoperability standard for digital file deposit. This implementation will permit interaction between a client (a repository) and a server (SWH repository) to permit deposits of software source code archives and associated metadata. The documentation is at ./docs/README-specification.md Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: server diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO index 8e011fa3..0e524373 100644 --- a/swh.deposit.egg-info/PKG-INFO +++ b/swh.deposit.egg-info/PKG-INFO @@ -1,37 +1,37 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.89 +Version: 0.0.90 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-deposit Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-deposit/ Description: # swh-deposit This is [Software Heritage](https://www.softwareheritage.org)'s [SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server implementation, as well as a simple client to upload deposits on the server. **S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository **D**eposit) is an interoperability standard for digital file deposit. This implementation will permit interaction between a client (a repository) and a server (SWH repository) to permit deposits of software source code archives and associated metadata. The documentation is at ./docs/README-specification.md Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: server diff --git a/swh/deposit/models.py b/swh/deposit/models.py index 0f37b938..04e86e6c 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -1,233 +1,240 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Generated from: # cd swh_deposit && \ # python3 -m manage inspectdb +import datetime from django.contrib.postgres.fields import JSONField, ArrayField from django.contrib.auth.models import User, UserManager from django.db import models from django.utils.timezone import now from .config import ( DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_REJECTED, ARCHIVE_TYPE, METADATA_TYPE, ) class Dbversion(models.Model): """Db version """ version = models.IntegerField(primary_key=True) release = models.DateTimeField(default=now, null=True) description = models.TextField(blank=True, null=True) class Meta: db_table = "dbversion" def __str__(self): return str( { "version": self.version, "release": self.release, "description": self.description, } ) """Possible status""" DEPOSIT_STATUS = [ (DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL), ("expired", "expired"), (DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED), ("loading", "loading"), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE), ] """Possible status and the detailed meaning.""" DEPOSIT_STATUS_DETAIL = { DEPOSIT_STATUS_PARTIAL: "Deposit is partially received. To finalize it, " "In-Progress header should be false", "expired": "Deposit has been there too long and is now " "deemed ready to be garbage collected", DEPOSIT_STATUS_DEPOSITED: "Deposit is ready for additional checks " "(tarball ok, metadata, etc...)", DEPOSIT_STATUS_VERIFIED: "Deposit is fully received, checked, and " "ready for loading", DEPOSIT_STATUS_REJECTED: "Deposit failed the checks", "loading": "Loading is ongoing on swh's side", DEPOSIT_STATUS_LOAD_SUCCESS: "The deposit has been successfully " "loaded into the Software Heritage archive", DEPOSIT_STATUS_LOAD_FAILURE: "The deposit loading into the " "Software Heritage archive failed", } class DepositClient(User): """Deposit client """ collections = ArrayField(models.IntegerField(), null=True) objects = UserManager() # type: ignore # this typing hint is due to a mypy/django-stubs limitation, # see https://github.com/typeddjango/django-stubs/issues/174 provider_url = models.TextField(null=False) domain = models.TextField(null=False) class Meta: db_table = "deposit_client" def __str__(self): return str( { "id": self.id, "collections": self.collections, "username": super().username, "domain": self.domain, "provider_url": self.provider_url, } ) class Deposit(models.Model): """Deposit reception table """ id = models.BigAutoField(primary_key=True) # First deposit reception date reception_date = models.DateTimeField(auto_now_add=True) # Date when the deposit is deemed complete and ready for loading complete_date = models.DateTimeField(null=True) # collection concerned by the deposit collection = models.ForeignKey("DepositCollection", models.DO_NOTHING) # Deposit's external identifier external_id = models.TextField() # Deposit client client = models.ForeignKey("DepositClient", models.DO_NOTHING) # SWH's loading result identifier swh_id = models.TextField(blank=True, null=True) swh_id_context = models.TextField(blank=True, null=True) # Deposit's status regarding loading status = models.TextField(choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL) status_detail = JSONField(null=True) # deposit can have one parent parent = models.ForeignKey("self", on_delete=models.PROTECT, null=True) check_task_id = models.TextField( blank=True, null=True, verbose_name="Scheduler's associated checking task id" ) load_task_id = models.TextField( blank=True, null=True, verbose_name="Scheduler's associated loading task id" ) class Meta: db_table = "deposit" def __str__(self): d = { "id": self.id, "reception_date": self.reception_date, "collection": self.collection.name, "external_id": self.external_id, "client": self.client.username, "status": self.status, } if self.status in (DEPOSIT_STATUS_REJECTED): d["status_detail"] = self.status_detail return str(d) @property def origin_url(self): return "%s/%s" % (self.client.provider_url.rstrip("/"), self.external_id) -def client_directory_path(instance, filename): - """Callable to upload archive in MEDIA_ROOT/user_/ +def client_directory_path(instance: "DepositRequest", filename: str) -> str: + """Callable to determine the upload archive path. This defaults to + MEDIA_ROOT/client_/%Y%m%d-%H%M%S.%f/. + + The format "%Y%m%d-%H%M%S.%f" is the reception date of the associated deposit + formatted using strftime. Args: - instance (DepositRequest): DepositRequest concerned by the upload - filename (str): Filename of the uploaded file + instance: DepositRequest concerned by the upload + filename: Filename of the uploaded file Returns: - A path to be prefixed by the MEDIA_ROOT to access physically - to the file uploaded. + The upload archive path. """ - return "client_{0}/{1}".format(instance.deposit.client.id, filename) + reception_date = instance.deposit.reception_date + assert isinstance(reception_date, datetime.datetime) + folder = reception_date.strftime("%Y%m%d-%H%M%S.%f") + return f"client_{instance.deposit.client.id}/{folder}/{filename}" REQUEST_TYPES = [(ARCHIVE_TYPE, ARCHIVE_TYPE), (METADATA_TYPE, METADATA_TYPE)] class DepositRequest(models.Model): """Deposit request associated to one deposit. """ id = models.BigAutoField(primary_key=True) # Deposit concerned by the request deposit = models.ForeignKey(Deposit, models.DO_NOTHING) date = models.DateTimeField(auto_now_add=True) # Deposit request information on the data to inject # this can be null when type is 'archive' metadata = JSONField(null=True) raw_metadata = models.TextField(null=True) # this can be null when type is 'metadata' archive = models.FileField(null=True, upload_to=client_directory_path) type = models.CharField(max_length=8, choices=REQUEST_TYPES, null=True) class Meta: db_table = "deposit_request" def __str__(self): meta = None if self.metadata: from json import dumps meta = dumps(self.metadata) archive_name = None if self.archive: archive_name = self.archive.name return str( { "id": self.id, "deposit": self.deposit, "metadata": meta, "archive": archive_name, } ) class DepositCollection(models.Model): id = models.BigAutoField(primary_key=True) # Human readable name for the collection type e.g HAL, arXiv, etc... name = models.TextField() class Meta: db_table = "deposit_collection" def __str__(self): return str({"id": self.id, "name": self.name}) diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py index 1fd4845e..3b02d7bf 100644 --- a/swh/deposit/tests/common.py +++ b/swh/deposit/tests/common.py @@ -1,141 +1,146 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib import os import re import tarfile import tempfile from swh.core import tarball def compute_info(archive_path): """Given a path, compute information on path. """ with open(archive_path, "rb") as f: length = 0 sha1sum = hashlib.sha1() md5sum = hashlib.md5() data = b"" for chunk in f: sha1sum.update(chunk) md5sum.update(chunk) length += len(chunk) data += chunk return { "dir": os.path.dirname(archive_path), "name": os.path.basename(archive_path), "path": archive_path, "length": length, "sha1sum": sha1sum.hexdigest(), "md5sum": md5sum.hexdigest(), "data": data, } def _compress(path, extension, dir_path): """Compress path according to extension """ if extension == "zip" or extension == "tar": return tarball.compress(path, extension, dir_path) elif "." in extension: split_ext = extension.split(".") if split_ext[0] != "tar": raise ValueError( "Development error, only zip or tar archive supported, " "%s not supported" % extension ) # deal with specific tar mode = split_ext[1] supported_mode = ["xz", "gz", "bz2"] if mode not in supported_mode: raise ValueError( "Development error, only %s supported, %s not supported" % (supported_mode, mode) ) files = tarball._ls(dir_path) with tarfile.open(path, "w:%s" % mode) as t: for fpath, fname in files: t.add(fpath, arcname=fname, recursive=False) return path def create_arborescence_archive( root_path, archive_name, filename, content, up_to_size=None, extension="zip" ): """Build an archive named archive_name in the root_path. This archive contains one file named filename with the content content. Args: root_path (str): Location path of the archive to create archive_name (str): Archive's name (without extension) filename (str): Archive's content is only one filename content (bytes): Content of the filename up_to_size (int | None): Fill in the blanks size to oversize or complete an archive's size extension (str): Extension of the archive to write (default is zip) Returns: dict with the keys: - dir: the directory of that archive - path: full path to the archive - sha1sum: archive's sha1sum - length: archive's length """ os.makedirs(root_path, exist_ok=True) archive_path_dir = tempfile.mkdtemp(dir=root_path) dir_path = os.path.join(archive_path_dir, archive_name) os.mkdir(dir_path) filepath = os.path.join(dir_path, filename) _length = len(content) count = 0 batch_size = 128 with open(filepath, "wb") as f: f.write(content) if up_to_size: # fill with blank content up to a given size count += _length while count < up_to_size: f.write(b"0" * batch_size) count += batch_size _path = "%s.%s" % (dir_path, extension) _path = _compress(_path, extension, dir_path) return compute_info(_path) def create_archive_with_archive(root_path, name, archive): """Create an archive holding another. """ invalid_archive_path = os.path.join(root_path, name) with tarfile.open(invalid_archive_path, "w:gz") as _archive: _archive.add(archive["path"], arcname=archive["name"]) return compute_info(invalid_archive_path) def check_archive(archive_name: str, archive_name_to_check: str): """Helper function to ensure archive_name is present within the archive_name_to_check. Raises: AssertionError if archive_name is not present within archive_name_to_check """ + ARCHIVE_FILEPATH_PATTERN = re.compile( + r"client_[0-9].*/[0-9]{8}-[0-9]{6}\.[0-9]{6}/[a-zA-Z0-9.].*" + ) + assert ARCHIVE_FILEPATH_PATTERN.match(archive_name_to_check) + if "." in archive_name: filename, extension = archive_name.split(".") pattern = re.compile(".*/%s.*\\.%s" % (filename, extension)) else: pattern = re.compile(".*/%s" % archive_name) assert pattern.match(archive_name_to_check) is not None diff --git a/swh/deposit/tests/test_common.py b/swh/deposit/tests/test_common.py index 89454aba..59548b80 100644 --- a/swh/deposit/tests/test_common.py +++ b/swh/deposit/tests/test_common.py @@ -1,26 +1,28 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.deposit.tests.common import check_archive def test_check_archive_helper(): # success for archive_name, archive_name_to_check in [ - ("filename0", "something/filename0"), - ("archive.zip", "client_1/archive_noisynoise.zip"), + ("filename0", "client_666/20200601-092624.421886/filename0.zip"), + ("archive", "client_007/20190601-092624.532978/archive"), ]: check_archive(archive_name, archive_name_to_check) # failures for archive_name, archive_name_to_check in [ + ("filename0", "something/filename0"), + ("archive.zip", "client_1/archive_noisynoise.zip"), ("filename0", "something-filename0"), ("archive.zip", "client_1_archive_noisynoise.zip"), ("reference", "irrelevant"), ]: with pytest.raises(AssertionError): check_archive(archive_name, archive_name_to_check) diff --git a/version.txt b/version.txt index 02267cdc..c5d8e33c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.89-0-g4b4cebc4 \ No newline at end of file +v0.0.90-0-gc586ff17 \ No newline at end of file