Commit e2a0b8da authored by hynek

feat(dhrep): add dhrep package

Implement dhrep functionality: a DARIAH-DE storage client, a publish service client with job tracking, Shibboleth proxy middleware, a collection RDF template, and test fixtures.
parent d80a7940
# from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class DhrepConfig(AppConfig):
    # full dotted path, consistent with the imports used below (discuss_data.dhrep.*)
    name = "discuss_data.dhrep"
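A minimal sketch of enabling the app in a Django project; the settings excerpt is an assumption based on the package layout implied by the imports in this commit, not part of the commit itself:

# settings.py (hypothetical excerpt): register the app so Django picks up
# DhrepConfig and the app's templates (e.g. collection.ttl below).
INSTALLED_APPS = [
    # ...
    "discuss_data.dhrep",
]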
[
{
"model": "dddatasets.datafile",
"pk": 1,
"fields": {
"uuid": "13a7c83e-d710-4ba0-a3dc-95e64edf2b28",
"dataset": 1,
"file": "lehnt.png",
"data_file_type": "DAT",
"data_file_format": "unknown",
"data_file_size": 29059,
"content_type": "image/png",
"name": "lehnt.png",
"repository_file_id": "not set",
"repository": "dariah-repository"
}
},
{
"model": "dddatasets.datafile",
"pk": 2,
"fields": {
"uuid": "074fe617-f880-432c-9358-cd9e98b02ce4",
"dataset": 1,
"file": "fluegel.png",
"data_file_type": "DAT",
"data_file_format": "unknown",
"data_file_size": 21633,
"content_type": "image/png",
"name": "fluegel.png",
"repository_file_id": "not set",
"repository": "dariah-repository"
}
},
{
"model": "auth.group",
"pk": 3,
"fields": {
"name": "1_view",
"permissions": []
}
},
{
"model": "auth.group",
"pk": 4,
"fields": {
"name": "1_edit",
"permissions": []
}
},
{
"model": "auth.group",
"pk": 5,
"fields": {
"name": "1_admin",
"permissions": []
}
},
{
"model": "ddusers.user",
"pk": 1,
"fields": {
"password": "!PQdnCsg0t1VbaGndhvk8WnISK7kxYJaZFRguQvxx",
"last_login": null,
"is_superuser": false,
"username": "AnonymousUser",
"first_name": "",
"last_name": "",
"email": "",
"is_staff": false,
"is_active": true,
"date_joined": "2019-09-03T09:52:54.798Z",
"uuid": "a8e8d2d5-b8c2-4497-963d-85a5697306cd",
"middle_name": "",
"academic_title": "",
"name_prefix": "",
"name_suffix": "",
"photo": "",
"research_profile_full": "",
"research_profile_short": "",
"access": "PUB",
"external_profile": "",
"groups": [],
"user_permissions": [],
"topics": [],
"receptive_tos": [],
"countries": [],
"publications": []
}
},
{
"model": "ddusers.user",
"pk": 2,
"fields": {
"password": "!opxW4BRsN8RvlViGuGgO2P6z5B9tHFv4wIf1lxiO",
"last_login": "2019-04-01T12:22:32.080Z",
"is_superuser": false,
"username": "test",
"first_name": "dd",
"last_name": "test",
"email": "test@discuss-data.net",
"is_staff": false,
"is_active": true,
"date_joined": "2019-09-03T10:06:08.698Z",
"uuid": "f0fc08c5-6f0c-45ab-9af3-8e67bcc5e6c7",
"middle_name": "",
"academic_title": "",
"name_prefix": "",
"name_suffix": "",
"photo": "",
"research_profile_full": "",
"research_profile_short": "",
"access": "PUB",
"external_profile": "",
"groups": [
3,
4,
5
],
"user_permissions": [],
"topics": [],
"receptive_tos": [],
"countries": [],
"publications": []
}
},
{
"model": "dddatasets.datasetmanagementobject",
"pk": 1,
"fields": {
"uuid": "af8b9327-90de-48da-9a0b-5eead050489e",
"owner": 2,
"created_at": "2019-09-04T09:36:56.424Z",
"updated_at": "2019-09-04T09:36:56.477Z",
"doi": "",
"published": false,
"main_published_ds": null,
"groups": [
3,
4,
5
]
}
},
{
"model": "dddatasets.dataset",
"pk": 1,
"fields": {
"uuid": "d5e80404-1e85-43e3-bed1-a89cc248e1dd",
"owner": 2,
"institution": null,
"title": "DataSet 1",
"subtitle": "",
"image": "",
"link": null,
"creators": "",
"data_repository": null,
"publication_date": "2019-09-04",
"date_of_data_creation_from": null,
"date_of_data_creation_to": null,
"date_of_data_creation_text": "",
"version": 1.0,
"description": "DataSet for testing",
"sources_of_data": "",
"time_period_text": "",
"time_period_from": null,
"time_period_to": null,
"license": null,
"related_dataset_text": "",
"institutional_affiliation": "",
"funding": "",
"dataset_management_object": 1,
"published": false,
"groups": [],
"categories": [],
"countries": [],
"datatypes": [],
"related_dataset": [],
"sponsors": []
}
}
]
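To exercise the code against this data, the fixture can be loaded into a development database; the file name dhrep_test_data.json is hypothetical and stands for wherever the JSON above is saved:

# load the fixture above for local testing (file name is an assumption)
from django.core.management import call_command

call_command("loaddata", "dhrep_test_data.json")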
# Middleware for running django-shibboleth-remoteuser behind an HTTP proxy
# (e.g. Apache in front of Gunicorn): the authenticated user then arrives in
# the Remote-User request header (HTTP_REMOTE_USER) instead of the WSGI
# REMOTE_USER variable.
from shibboleth.middleware import ShibbolethRemoteUserMiddleware
class ProxyRemoteUserMiddleware(ShibbolethRemoteUserMiddleware):
header = "HTTP_REMOTE_USER"
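A sketch of the settings needed to activate this middleware; the module path discuss_data.dhrep.middleware is an assumption, and the backend class is the one shipped with django-shibboleth-remoteuser:

# settings.py (hypothetical excerpt)
MIDDLEWARE = [
    # ... after django.contrib.auth.middleware.AuthenticationMiddleware:
    "discuss_data.dhrep.middleware.ProxyRemoteUserMiddleware",
]

AUTHENTICATION_BACKENDS = [
    "shibboleth.backends.ShibbolethRemoteUserBackend",
]

Since the middleware trusts HTTP_REMOTE_USER, the proxy in front of Gunicorn must strip any client-supplied Remote-User header, or clients could spoof authentication.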
# from django.db import models
# Create your models here.
""" Publish to the DARIAH-DE Repository
Repository Frontend docs:
https://repository.de.dariah.eu/doc/services/submodules/publikator/docs/
Usage:
job = publish.publish_dataset(token=token, dataset_id=1)
while(True):
status = job.status()
# communicate progress to user ... -> status.progress
if status['finished']:
break
time.sleep(1)
"""
import logging

import requests
from django.conf import settings
from django.core.files.base import ContentFile
from django.template import loader

from discuss_data.dddatasets.models import DataSet
from discuss_data.dhrep.storage import Storage

logger = logging.getLogger(__name__)
class Publish:
""" Methods for publication to the DARIAH-DE repository
"""
def __init__(self):
self._storage = Storage()
self._publish_url = settings.DARIAH_PUBLISH_URL
if not self._publish_url.endswith("/"):
self._publish_url += "/"
def publish_dataset(self, token, dataset_id):
""" Publish a DiscussData dataset to DARIAH-DE repository,
convenience method that calls copy_dataset_to_ownstorage() and publish()
Parameters:
token {str} -- authentication token
dataset_id {int} -- ID of DiscussData dataset
Returns:
PublishJob -- a PublishJob that contains all relevant information
"""
job = self.copy_dataset_to_ownstorage(token, dataset_id)
job.publish()
return job
def copy_dataset_to_ownstorage(self, token, dataset_id):
"""Copy all files of a DiscussData dataset to the DARIAH-DE ownstorage and
create collection rdf with metadata and links to the files. The storage-ID
of this collection can be used to trigger publication, which could also be
done with job.publish()
Arguments:
token {str} -- authentication token
dataset_id {int} -- ID of DiscussData dataset
Returns:
PublishJob -- a PublishJob which contains all relevant information
"""
dataset = DataSet.objects.get(id=dataset_id)
storage_id = self._storage.create(token)
job = PublishJob(token, storage_id, dataset_id)
datafiles = dataset.get_datafiles()
for datafile in datafiles:
# add storage_ids to datafile objects for
# usage with create_collection_rdf()
datafile_storage_id = self.upload_datafile(token, datafile)
job.files[datafile_storage_id] = {
"id": datafile.id,
"dataset_id": dataset_id,
"name": datafile.name,
"content_type": datafile.content_type,
"storage_id": datafile_storage_id,
}
turtle = self.create_collection_rdf(storage_id, dataset, job.files)
tfile = ContentFile(turtle)
tfile.file.content_type = "text/plain"
self._storage.update(token, storage_id, tfile)
return job
def publish(self, token, storage_id):
"""Publish a collection where the collection metadata rdf is located
at the storage_id.
Arguments:
token {str} -- authentication token
storage_id {str} -- storage_id of collection rdf
Raises:
PublishException -- if the publish service does not return HTTP 200
"""
response = requests.post(
self._publish_url + storage_id + "/publish",
headers={
"X-Storage-Token": token,
"Accept": "application/json",
"X-Transaction-ID": "BLUBBER",
},
)
if response.status_code != 200:
raise PublishException(
"Error starting publication process: "
+ response.text
+ " - "
+ str(response.status_code)
)
def status(self, token, storage_id):
"""get status from publish service for a given storage-id in publication pipeline.
better use job.status()
Arguments:
token {str} -- authentication token
storage_id {str} -- storage_id of collection rdf
Raises:
PublishException -- if the publish service does not return HTTP 200

Returns:
    dict -- publish status response parsed from JSON
"""
response = requests.get(
self._publish_url + storage_id + "/status",
headers={"X-Storage-Token": token, "Accept": "application/json"},
)
print("status-text: " + response.text)
print("status-status: " + str(response.status_code))
if response.status_code != 200:
raise PublishException(
"Error with publish status: "
+ response.text
+ " - "
+ str(response.status_code)
)
return response.json()
def upload_datafile(self, token, datafile):
"""Upload a Django File to DARIAH-DE ownstorage
Arguments:
token {str} -- authentication token
datafile {File} -- File object from Django
Returns:
str -- storage_id from DARIAH-DE ownstorage
"""
tfile = ContentFile(datafile.file.read())
# TODO: possible without copying into memory?
tfile.file.content_type = datafile.content_type
storage_id = self._storage.create(token, tfile)
return storage_id
@staticmethod
def create_collection_rdf(storage_id, dataset, datafiles):
"""Create dariahrep collection rdf for a dataset
https://repository.de.dariah.eu/doc/services/submodules/kolibri/kolibri-dhpublish-service/docs/index.html
Arguments:
storage_id {str} -- dariahstorage id for the collection file, for self-reference
dataset {ddusers.model.DataSet} -- a ddusers DataSet to generate rdf for
datafiles {Array ddusers.model.DataFile} -- Array of ddusers.DataFile contained in
collection extended with their storage_id
Returns:
[str] -- RDF (turtle) to represent the given dataset
"""
turtle = loader.render_to_string(
"collection.ttl",
{"storage_id": storage_id, "dataset": dataset, "datafiles": datafiles},
)
return turtle
def get_publish_url(self):
"""Get the publish service url
Returns:
str -- the actual publish service url set
"""
return self._publish_url
class PublishException(Exception):
"""Thrown in case of problems with the publish service"""
class PublishJob:
"""PublishJob - keep track of status, files and their DOIs in a
Publish Process to the DARIAH-DE Repository
"""
def __init__(self, token, collection_id, dataset_id):
self.token = token
self.collection_id = collection_id
self.dataset_id = dataset_id
self.files = dict()
self.pid = ""
def remote_status(self):
"""Get the status of this PublishJob from DARIAH-DE publish service
Raises:
PublishException -- if the publish service does not return HTTP 200

Returns:
    dict -- publish status response parsed from JSON
"""
return publish.status(self.token, self.collection_id)
def publish(self):
"""Inititate publish process for this PublishJob
Raises:
PublishException: An error from publicator if HTTP-Status != 200
"""
return publish.publish(self.token, self.collection_id)
def status(self):
""" Status of this job this includes:
- files, storage-IDs and DOIS if already available
- progress (in percent)
- finished (true / false)
- publish-status answer from remote server
Returns:
dict -- finished/progress/file-mappings, ...
"""
status = self.remote_status()
# attach pid to files
if "publishObjects" in status:
for pobject in status["publishObjects"]:
# strip the base URI (e.g. 'https://de.dariah.eu/storage/')
# to get the bare storage_id
storage_id = pobject["uri"].rsplit("/", 1)[-1]
if "pid" in pobject:
if storage_id == self.collection_id:
self.pid = pobject["pid"]
else:
self.files[storage_id]["pid"] = pobject["pid"]
return {
"finished": status["publishStatus"]["processStatus"] == "FINISHED",
"progress": status["publishStatus"]["progress"],
"collection": {
"dataset_id": self.dataset_id,
"storage_id": self.collection_id,
"pid": self.pid,
},
"files": self.files,
"remote_status": status,
}
# module-level instance of Publish, also used by PublishJob
publish = Publish()
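An end-to-end sketch of driving this module, along the lines of the module docstring; the import path and the token are assumptions (in practice the token comes from the DARIAH-DE authentication flow):

import time

# hypothetical import path, assuming the package layout used in this commit
from discuss_data.dhrep.publish import publish

job = publish.publish_dataset(token="TOKEN", dataset_id=1)  # TOKEN is a placeholder
while True:
    status = job.status()
    print(status["progress"])  # e.g. feed a progress bar
    if status["finished"]:
        break
    time.sleep(1)
print(status["collection"]["pid"])  # PID of the published collection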
""" Connection to DARIAH-DE storage
Storage API docs: http://hdl.handle.net/11858/00-1734-0000-0009-FEA1-D
"""
import requests
from django.conf import settings
class Storage:
"""
Methods to create and update data objects in DARIAH-DE storage
"""
def __init__(self):
storage_url = settings.DARIAH_STORAGE_LOCATION
if not storage_url.endswith("/"):
storage_url += "/"
self._storage_url = storage_url
def create(self, token, content=None):
"""Create a new object in dariahstorage
Arguments:
token {str} -- authentication token
content {ContentFile, optional} -- data to store (optionally)
Raises:
Exception -- if response from storage had no 201 status code
Returns:
str -- the storage id of created object
"""
if content is not None:
data = content.read()
else:
data = None
response = requests.post(
self._storage_url,
headers={"Authorization": "bearer " + token, "Content-type": "text/plain"},
data=data,
)
if response.status_code != 201:
raise StorageException(
"Error creating new cdstar object: "
+ response.text
+ " - "
+ str(response.status_code)
)
storage_id = response.headers["location"].rsplit("/", 1)[-1]
return storage_id
def update(self, token, storage_id, content):
"""Update an object in dariahstorage
Arguments:
token {str} -- authentication token
storageid {str} -- the storage id
content {ContentFile} -- the data to store
Raises:
Exception -- if response from storage had no 201 status code
"""
response = requests.put(
self._storage_url + storage_id,
headers={
"Authorization": "bearer " + token,
"Content-type": content.file.content_type,
},
data=content.read(),
)
if response.status_code != 201:
raise StorageException(
"Error updating cdstar object "
+ storage_id
+ ": "
+ response.text
+ " - "
+ str(response.status_code)
)
def get_storage_url(self):
"""Get the storage url
Returns:
str -- the actual storage url set
"""
return self._storage_url
class StorageException(Exception):
"""Thrown in case of problems with the storage"""
# module-level instance of Storage
storage = Storage()
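A short sketch of using the Storage client directly; it assumes a configured Django settings module (DARIAH_STORAGE_LOCATION) and a hypothetical import path, and uses a placeholder token:

from django.core.files.base import ContentFile

from discuss_data.dhrep.storage import Storage  # hypothetical import path

storage = Storage()
storage_id = storage.create("TOKEN")  # TOKEN is a placeholder; creates an empty object
tfile = ContentFile(b"hello dariah")
tfile.file.content_type = "text/plain"  # update() reads this attribute for the Content-type header
storage.update("TOKEN", storage_id, tfile)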
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dariah: <http://de.dariah.eu/rdf/dataobjects/terms/> .
@prefix dariahstorage: <https://de.dariah.eu/storage/> .
dariahstorage:{{storage_id}}
a dariah:Collection ;
dc:title "{{dataset.title}}" ;
dc:creator "TODO: {{dataset.creators}}" ;
dc:rights "TODO" ;
dcterms:hasPart ( {% for id, file in datafiles.items %} dariahstorage:{{file.storage_id}} {% endfor %} ) .
{% for id, file in datafiles.items %}
dariahstorage:{{file.storage_id}}
a dariah:DataObject ;
dc:format "{{file.content_type}}" ;
dc:title "{{file.name}}" ;
dc:creator "TODO" ;
dc:rights "TODO" .
{% endfor %}