Commit 7fd97e25 authored by hynek

Merge branch 'feature/dhrep' into 'feature/tests'

Feature/dhrep

See merge request !10
parents d80a7940 f1be43cc
@@ -10,11 +10,10 @@ include:
variables:
DOCKER_HOST: tcp://docker:2375
DOCKER_DRIVER: overlay2
CONTAINER_TEST_IMAGE: $CI_REGISTRY_IMAGE/$CI_COMMIT_REF_SLUG:$CI_COMMIT_SHA
CONTAINER_RELEASE_IMAGE: $CI_REGISTRY_IMAGE:latest
DS_PIP_DEPENDENCY_PATH: requirements/production.txt
DARIAH_STORAGE_TOKEN: $DH_TOKEN
stages:
- build
@@ -40,17 +39,16 @@ build_develop:
- docker login -u gitlab-ci-token -p $CI_BUILD_TOKEN $CI_REGISTRY
- docker build -t $CONTAINER_TEST_IMAGE -f compose/local/django/Dockerfile .
- docker push $CONTAINER_TEST_IMAGE
tests:
stage: test
image: tiangolo/docker-with-compose
script:
- docker login -u gitlab-ci-token -p $CI_BUILD_TOKEN $CI_REGISTRY
- echo "Composing CI setup with $CONTAINER_TEST_IMAGE"
- docker-compose -f ci.yml build
# - docker-compose -f local.yml run --rm django pydocstyle
- docker-compose -f ci.yml run --rm django flake8
- docker-compose -f ci.yml run django coverage run -m pytest
- docker-compose -f ci.yml run django /bin/sh -c "./run_pytest"
- docker-compose -f ci.yml run --rm django coverage html
- docker-compose -f ci.yml run --rm django /bin/sh -c "cd docs && apk add make && make html"
- docker-compose -f ci.yml run django coverage report
@@ -60,14 +58,15 @@ tests:
- htmlcov
- docs/_build
expire_in: 5 days
allow_failure: true
except:
- master
code_quality:
stage: test
artifacts:
paths: [gl-code-quality-report.json]
except:
- master
dependency_scanning:
@@ -93,13 +92,13 @@ container_scanning:
create_release:
image: node:8
stage: release
script:
- npm install
- npx semantic-release
only:
- master
release_image:
stage: deploy
script:
......
@@ -37,4 +37,7 @@ RUN chmod +x /start
WORKDIR /app
# copy testing fixtures for dhrep tests
COPY ./discuss_data/dhrep/fixtures/* /app/discuss_data/media/
ENTRYPOINT ["/entrypoint"]
@@ -4,6 +4,6 @@ set -o errexit
set -o pipefail
set -o nounset
python manage.py makemigrations
python manage.py migrate
python manage.py runserver_plus 0.0.0.0:8000
@@ -24,6 +24,7 @@ urlpatterns = [
# path("accounts/", include("allauth.urls")),
# Your stuff: custom urls includes go here
path("dataset/", include("discuss_data.dddatasets.urls", namespace="dddatasets"),),
path("dhrep/", include("discuss_data.dhrep.urls", namespace="dhrep")),
path("shib/", include("shibboleth.urls", namespace="shibboleth")),
] + static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
......
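The included discuss_data.dhrep.urls module is referenced above but not shown in this diff; for orientation, a minimal sketch of such a module (all names below are hypothetical; only the app_name/namespace pairing is required by Django when include() passes namespace=):

    # discuss_data/dhrep/urls.py -- sketch only, actual contents not in this diff
    from django.urls import path
    from . import views  # hypothetical views module

    app_name = "dhrep"  # required so include(..., namespace="dhrep") resolves
    urlpatterns = [
        path("publish/<int:pk>/", views.publish, name="publish"),  # hypothetical route
    ]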
# from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class DhrepConfig(AppConfig):
name = "dhrep"
[
{
"model": "dddatasets.datafile",
"pk": 1,
"fields": {
"uuid": "13a7c83e-d710-4ba0-a3dc-95e64edf2b28",
"dataset": 1,
"file": "lehnt.png",
"data_file_type": "DAT",
"data_file_format": "unknown",
"data_file_size": 29059,
"content_type": "image/png",
"name": "lehnt.png",
"repository_file_id": "not set",
"repository": "dariah-repository"
}
},
{
"model": "dddatasets.datafile",
"pk": 2,
"fields": {
"uuid": "074fe617-f880-432c-9358-cd9e98b02ce4",
"dataset": 1,
"file": "fluegel.png",
"data_file_type": "DAT",
"data_file_format": "unknown",
"data_file_size": 21633,
"content_type": "image/png",
"name": "fluegel.png",
"repository_file_id": "not set",
"repository": "dariah-repository"
}
},
{
"model": "auth.group",
"pk": 1,
"fields": {
"name": "Moderators",
"uuid": "7713724d-2a6c-4fb8-94af-7d7cc988e12c",
"permissions": [
4,
5,
6,
7,
1,
2,
3
]
}
},
{
"model": "auth.group",
"pk": 2,
"fields": {
"name": "Editors",
"uuid": "7713724d-2a6c-4fb8-94af-7d7cc988e12c",
"permissions": [
4,
5,
6,
7,
1,
2,
3
]
}
},
{
"model": "ddusers.user",
"pk": 1,
"fields": {
"password": "!mguXOReJQ4C0VhaHGqu7Ccw4B6jCJjqEKWZ2hYBk",
"last_login": null,
"is_superuser": false,
"username": "AnonymousUser",
"first_name": "",
"last_name": "",
"email": "",
"is_staff": false,
"is_active": true,
"date_joined": "2020-01-16T11:34:43.682Z",
"uuid": "117c2a74-dec2-419f-a4a1-31bf61814c08",
"middle_name": "",
"academic_title": "",
"name_prefix": "",
"name_suffix": "",
"photo": "",
"research_profile_full": "",
"research_profile_short": "",
"access": "PUB",
"external_profile": "",
"groups": [],
"user_permissions": [],
"topics": [],
"receptive_tos": [],
"countries": [],
"publications": []
}
},
{
"model": "ddusers.user",
"pk": 2,
"fields": {
"password": "!opxW4BRsN8RvlViGuGgO2P6z5B9tHFv4wIf1lxiO",
"last_login": "2019-04-01T12:22:32.080Z",
"is_superuser": false,
"username": "test",
"first_name": "dd",
"last_name": "test",
"email": "test@discuss-data.net",
"is_staff": false,
"is_active": true,
"date_joined": "2019-09-03T10:06:08.698Z",
"uuid": "f0fc08c5-6f0c-45ab-9af3-8e67bcc5e6c7",
"middle_name": "",
"academic_title": "",
"name_prefix": "",
"name_suffix": "",
"photo": "",
"research_profile_full": "",
"research_profile_short": "",
"access": "PUB",
"external_profile": "",
"groups": [
1,
2
],
"user_permissions": [],
"topics": [],
"receptive_tos": [],
"countries": [],
"publications": []
}
},
{
"model": "dddatasets.datasetmanagementobject",
"pk": 1,
"fields": {
"uuid": "af8b9327-90de-48da-9a0b-5eead050489e",
"owner": 2,
"created_at": "2019-09-04T09:36:56.424Z",
"updated_at": "2019-09-04T09:36:56.477Z",
"doi": "",
"published": false,
"main_published_ds": null,
"groups": [
1,
2
]
}
},
{
"model": "dddatasets.dataset",
"pk": 1,
"fields": {
"uuid": "d5e80404-1e85-43e3-bed1-a89cc248e1dd",
"owner": 2,
"institution": null,
"title": "DataSet 1",
"subtitle": "",
"image": "",
"link": null,
"creators": "",
"data_repository": null,
"publication_date": "2019-09-04",
"date_of_data_creation_from": null,
"date_of_data_creation_to": null,
"date_of_data_creation_text": "",
"version": 1.0,
"description": "DataSet for testing",
"sources_of_data": "",
"time_period_text": "",
"time_period_from": null,
"time_period_to": null,
"license": null,
"related_dataset_text": "",
"institutional_affiliation": "",
"funding": "",
"dataset_management_object": 1,
"published": false,
"groups": [],
"categories": [],
"countries": [],
"datatypes": [],
"related_dataset": [],
"sponsors": []
}
}
]
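These fixtures supply the DataSet, DataFile, group and user objects the dhrep tests rely on (the Dockerfile hunk above copies the fixture files into /app/discuss_data/media/). For reference, such a fixture would typically be loaded via Django's loaddata; a sketch with the fixture path assumed:

    # e.g. in a test setup; the fixture filename is an assumption
    from django.core.management import call_command
    call_command("loaddata", "discuss_data/dhrep/fixtures/test_data.json")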
# middleware to use shibboleth with gunicorn, e.g. behind an apache http proxy
from shibboleth.middleware import ShibbolethRemoteUserMiddleware
class ProxyRemoteUserMiddleware(ShibbolethRemoteUserMiddleware):
header = "HTTP_REMOTE_USER"
# from django.db import models
# Create your models here.
""" Publish to the DARIAH-DE Repository
Repository Frontend docs:
https://repository.de.dariah.eu/doc/services/submodules/publikator/docs/
Usage:
job = publish.publish_dataset(token=token, dataset_id=1)
while True:
status = job.status()
# communicate progress to user ... -> status.progress
if status['finished']:
break
time.sleep(1)
"""
import json
import requests
from django.conf import settings
from django.core.files.base import ContentFile
from django.template import loader
from discuss_data.dddatasets.models import DataSet
from discuss_data.dhrep.storage import Storage
class Publish:
""" Methods for publication to the DARIAH-DE repository
"""
def __init__(self):
self._storage = Storage()
self._publish_url = settings.DARIAH_PUBLISH_URL
if not self._publish_url.endswith("/"):
self._publish_url += "/"
def publish_dataset(self, token, dataset_id):
""" Publish a DiscussData dataset to DARIAH-DE repository,
convenience method that calls copy_dataset_to_ownstorage() and publish()
Parameters:
token {str} -- authentication token
dataset_id {int} -- ID of DiscussData dataset
Returns:
PublishJob -- a PublishJob that contains all relevant information
"""
job = self.copy_dataset_to_ownstorage(token, dataset_id)
job.publish()
return job
def copy_dataset_to_ownstorage(self, token, dataset_id):
"""Copy all files of a DiscussData dataset to the DARIAH-DE ownstorage and
create collection rdf with metadata and links to the files. The storage-ID
of this collection can be used to trigger publication, which could also be
done with job.publish()
Arguments:
token {str} -- authentication token
dataset_id {int} -- ID of DiscussData dataset
Returns:
PublishJob -- a PublishJob which contains all relevant information
"""
dataset = DataSet.objects.get(id=dataset_id)
storage_id = self._storage.create(token)
job = PublishJob(token, storage_id, dataset_id)
datafiles = dataset.get_datafiles()
for datafile in datafiles:
# add storage_ids to datafile objects for
# usage with create_collection_rdf()
datafile_storage_id = self.upload_datafile(token, datafile)
job.files[datafile_storage_id] = {
"id": datafile.id,
"dataset_id": dataset_id,
"name": datafile.name,
"content_type": datafile.content_type,
"storage_id": datafile_storage_id,
}
turtle = self.create_collection_rdf(storage_id, dataset, job.files)
tfile = ContentFile(turtle)
tfile.file.content_type = "text/plain"
self._storage.update(token, storage_id, tfile)
return job
def publish(self, token, storage_id):
"""Publish a collection where the collection metadata rdf is located
at the storage_id.
Arguments:
token {str} -- authentication token
storage_id {str} -- storage_id of collection rdf
Raises:
PublishException: An error from publicator if HTTP-Status != 200
"""
response = requests.post(
self._publish_url + storage_id + "/publish",
headers={
"X-Storage-Token": token,
"Accept": "application/json",
"X-Transaction-ID": "BLUBBER",
},
)
if response.status_code != 200:
raise PublishException(
"Error starting publication process: "
+ response.text
+ " - "
+ str(response.status_code)
)
def status(self, token, storage_id):
"""get status from publish service for a given storage-id in publication pipeline.
better use job.status()
Arguments:
token {str} -- authentication token
storage_id {str} -- storage_id of collection rdf
Raises:
PublishException: An error from publicator if HTTP-Status != 200
Returns:
JSON -- publish status response as JSON object
"""
response = requests.get(
self._publish_url + storage_id + "/status",
headers={"X-Storage-Token": token, "Accept": "application/json"},
)
print("status-text: " + response.text)
print("status-status: " + str(response.status_code))
if response.status_code != 200:
raise PublishException(
"Error with publish status: "
+ response.text
+ " - "
+ str(response.status_code)
)
return json.loads(response.text)
def upload_datafile(self, token, datafile):
"""Upload a Django File to DARIAH-DE ownstorage
Arguments:
token {str} -- authentication token
datafile {File} -- File object from Django
Returns:
str -- storage_id from DARIAH-DE ownstorage
"""
tfile = ContentFile(datafile.file.read())
# TODO: possible without copying into memory?
tfile.file.content_type = datafile.content_type
storage_id = self._storage.create(token, tfile)
return storage_id
@staticmethod
def create_collection_rdf(storage_id, dataset, datafiles):
"""Create dariahrep collection rdf for a dataset
https://repository.de.dariah.eu/doc/services/submodules/kolibri/kolibri-dhpublish-service/docs/index.html
Arguments:
storage_id {str} -- dariahstorage id for the collection file, for self-reference
dataset {dddatasets.models.DataSet} -- the DataSet to generate rdf for
datafiles {dict} -- file metadata dicts, keyed by storage_id, for the files
contained in the collection, extended with their storage_id
Returns:
[str] -- RDF (turtle) to represent the given dataset
"""
turtle = loader.render_to_string(
"collection.ttl",
{"storage_id": storage_id, "dataset": dataset, "datafiles": datafiles},
)
return turtle
def get_publish_url(self):
"""Get the publish service url
Returns:
str -- the publish service url currently set
"""
return self._publish_url
class PublishException(Exception):
"""Thrown in case of problems with the publish service"""
class PublishJob:
"""PublishJob - keep track of status, files and their DOIs in a
Publish Process to the DARIAH-DE Repository
"""
def __init__(self, token, collection_id, dataset_id):
self.token = token
self.collection_id = collection_id
self.dataset_id = dataset_id
self.files = dict()
self.pid = ""
def remote_status(self):
"""Get the status of this PublishJob from DARIAH-DE publish service
Raises:
PublishException: An error from publicator if HTTP-Status != 200
Returns:
JSON -- publish status response as JSON object
"""
return publish.status(self.token, self.collection_id)
def publish(self):
"""Inititate publish process for this PublishJob
Raises:
PublishException: An error from publicator if HTTP-Status != 200
"""
return publish.publish(self.token, self.collection_id)
def status(self):
""" Status of this job this includes:
- files, storage-IDs and DOIS if already available
- progress (in percent)
- finished (true / false)
- publish-status answer from remote server
Returns:
dict -- finished/progress/file-mappings, ...
"""
status = self.remote_status()
# attach pid to files
if "publishObjects" in status:
for pobject in status["publishObjects"]:
# cut 'https://de.dariah.eu/storage/' from uri to get the storage_id
storage_id = pobject["uri"][29:]
if "pid" in pobject:
if storage_id == self.collection_id:
self.pid = pobject["pid"]
else:
self.files[storage_id]["pid"] = pobject["pid"]
return {
"finished": status["publishStatus"]["processStatus"] == "FINISHED",
"progress": status["publishStatus"]["progress"],
"collection": {
"dataset_id": self.dataset_id,
"storage_id": self.collection_id,
"pid": self.pid,
},
"files": self.files,
"remote_status": status,
}
# module-level Publish instance, used by PublishJob.publish() and PublishJob.remote_status()
publish = Publish()
""" Connection to DARIAH-DE storage
Storage API docs: http://hdl.handle.net/11858/00-1734-0000-0009-FEA1-D
"""
import requests
from django.conf import settings
class Storage:
"""
Methods to create and update data objects in DARIAH-DE storage
"""
def __init__(self):
storage_url = settings.DARIAH_STORAGE_LOCATION
if not storage_url.endswith("/"):
storage_url += "/"
self._storage_url = storage_url
def create(self, token, content=None):
"""Create a new object in dariahstorage
Arguments:
token {str} -- authentication token
content {ContentFile, optional} -- data to store
Raises:
Exception -- if response from storage had no 201 status code