Commit 4800595e authored by Joerg-Holger Panzer's avatar Joerg-Holger Panzer
Browse files

Fix issue with METS structure and PDF conversion

For details see: https://gitlab.gwdg.de/digizeit/digizeit-app/-/issues/89
parent 7c08457f
Pipeline #303316 passed with stages
in 8 minutes and 20 seconds
......@@ -206,6 +206,14 @@ class WorkConverter
end
end
# Returns the smaller of two mutually comparable values.
# When both compare as equal (a <= b holds), the first argument is returned.
#
# @param a [Object] first candidate; must respond to `<=` with `b`
# @param b [Object] second candidate
# @return [Object] `a` if `a <= b`, otherwise `b`
def min(a, b)
  return a if a <= b

  b
end
# Returns the larger of two mutually comparable values.
# When both compare as equal (a >= b holds), the first argument is returned.
#
# @param a [Object] first candidate; must respond to `>=` with `b`
# @param b [Object] second candidate
# @return [Object] `a` if `a >= b`, otherwise `b`
def max(a, b)
  return a if a >= b

  b
end
# "log_id" : "1047098326_0002|LOG_0004"
def build_jobs_from_es(context, product, id, log, log_id)
request_logical_part = id != log
......@@ -219,76 +227,82 @@ class WorkConverter
# get start and end page index
#
#
body = if request_logical_part
{
query: {
"bool": {
"must": [
{ "match": { "id.keyword": id + '|' + log } }
]
}
},
"_source": %w[id start_page_index end_page_index]
}
else
{
query: {
"bool": {
"must": [
{ "match": { "id": id } },
{ "match": { "IsFirst": true } }
]
}
},
"_source": %w[id start_page_index end_page_index]
}
end
resp = client.search(
index: log_index,
body: body,
scroll: '1m',
size: 3000
)
# "id", "order", "page", "format", "start_page_index", "end_page_index"]
total = resp['hits']['total']
if total == 0
removeQueue(log_id)
@logger.error("[work_converter] Couldn't find #{id}|#{log} in index, conversion not possible")
return
end
start_page_index = resp['hits']['hits'].first['_source']['start_page_index']
end_page_index = resp['hits']['hits'].first['_source']['end_page_index']
# body = if request_logical_part
# {
# query: {
# "bool": {
# "must": [
# { "match": { "id.keyword": id + '|' + log } }
# ]
# }
# },
# "_source": %w[id start_page_index end_page_index]
# }
# else
# {
# query: {
# "bool": {
# "must": [
# { "match": { "id": id } },
# { "match": { "IsFirst": true } }
# ]
# }
# },
# "_source": %w[id start_page_index end_page_index]
# }
# end
# resp = client.search(
# index: log_index,
# body: body,
# scroll: '1m',
# size: 3000
# )
# # "id", "order", "page", "format", "start_page_index", "end_page_index"]
# total = resp['hits']['total']
# if total == 0
# removeQueue(log_id)
# @logger.error("[work_converter] Couldn't find #{id}|#{log} in index, conversion not possible")
# return
# end
# start_page_index = resp['hits']['hits'].first['_source']['start_page_index']
# end_page_index = resp['hits']['hits'].first['_source']['end_page_index']
# get page array
#
body = {
"query": {
# "bool": {
# "must": [
# {
# "match": { "work": id }
# },
# {
# "range": {
# "Index": {
# "gte": start_page_index,
# "lte": end_page_index,
# "boost": 2.0
# }
# }
# }
# ]
# }
"bool": {
"must": [
{
"match": { "work": id }
},
{
"range": {
"Index": {
"gte": start_page_index,
"lte": end_page_index,
"boost": 2.0
}
}
}
"should": [
{"match": {"log_id.keyword": id + '|' + log}},
{"match": {"structrun.parent_id.keyword": id + '|' + log}}
]
}
}
},
"sort": [
{
"order": { "order": 'asc' }
}
],
"_source": %w[work page format]
"_source": %w[work page format Index]
}
resp = client.search(
......@@ -309,11 +323,16 @@ class WorkConverter
image_format = resp['hits']['hits'].first['_source']['format']
pages = []
start_page_index = 0
end_page_index = 0
loop do
hits = resp.dig('hits', 'hits')
break if hits.empty?
hits.each do |hit|
start_page_index = min(start_page_index, hit['_source']['Index'])
end_page_index = max(end_page_index, hit['_source']['Index'])
pages << hit['_source']['page']
end
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment