diff --git a/src/Controller/Tei2SolrController.php b/src/Controller/Tei2SolrController.php
index c3cb1c5c74943b181d869e9892531a74cf7c1788..c8ee39cbdc3f328c39ee10fcafaa7eb636b15cb3 100755
--- a/src/Controller/Tei2SolrController.php
+++ b/src/Controller/Tei2SolrController.php
@@ -2,8 +2,12 @@
namespace App\Controller;
+use App\Service\PreProcessingService;
use App\Model\SolrDocument;
+use App\Service\EditedTextService;
+use App\Service\TranscriptionService;
use DOMDocument;
+use DOMElement;
use DOMXPath;
use League\Flysystem\Exception;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
@@ -18,10 +22,21 @@ class Tei2SolrController extends AbstractController
private Client $client;
private ?string $teiDir = null;
-
- public function __construct(Client $client)
+ private TranscriptionService $transcriptionService;
+ private EditedTextService $editedTextService;
+ private PreProcessingService $preProcessingService;
+
+ public function __construct(
+ Client $client,
+ PreProcessingService $preProcessingService,
+ TranscriptionService $transcriptionService,
+ EditedTextService $editedTextService
+ )
{
$this->client = $client;
+ $this->transcriptionService = $transcriptionService;
+ $this->editedTextService = $editedTextService;
+ $this->preProcessingService = $preProcessingService;
}
public function setConfigs(string $teiDir) {
@@ -628,9 +643,9 @@ class Tei2SolrController extends AbstractController
$graphics = $this->getGraphics($imageIds, $imageUrls);
$solrDocument = $this->getTextVersions($file, $graphics);
$transcription = $solrDocument->getTranscriptedText();
- $pagesTranscription = $solrDocument->getTranscriptedTextArr();
+ $pagesTranscription = $solrDocument->getPageLevelTranscriptedText();
$editedText = $solrDocument->getEditedText();
- $pagesEdited = $solrDocument->getEditedTextArr();
+ $pagesEdited = $solrDocument->getPageLevelEditedText();
$pagesGndsUuids = $solrDocument->getPagesGndsUuids();
$pagesNotesUuids = $solrDocument->getPagesNotesUuids();
$pagesSegs = $solrDocument->getPagesSegs();
@@ -763,9 +778,10 @@ class Tei2SolrController extends AbstractController
$childDoc->edited_text = $pagesEdited[$i - 1];
}
- if (isset($pagesGndsUuids[$i - 1]) && !empty(($pagesGndsUuids[$i - 1]))) {
- $childDoc->entities = array_values($pagesGndsUuids[$i - 1]);
- $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i - 1]);
+ if (isset($pagesGndsUuids[$i]) && !empty(($pagesGndsUuids[$i]))) {
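+ // $pagesGndsUuids is keyed by the page key assigned in getTextVersions() (keys > 0), so it is read with $i; the page text arrays are zero-based lists and are read with $i - 1.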
+ $childDoc->entities = array_values($pagesGndsUuids[$i]);
+ $childDoc->annotation_ids = array_keys($pagesGndsUuids[$i]);
}
if (isset($pagesNotesUuids[$i - 1]) && !empty(($pagesNotesUuids[$i - 1]))) {
@@ -920,355 +936,61 @@ class Tei2SolrController extends AbstractController
public function getTextVersions(string $filePath = './../data/gitlab/Z_1822-02-20_k.xml', array $graphics = []): SolrDocument
{
$doc = new DOMDocument();
- $doc->load($filePath);
+ $doc->load($filePath, LIBXML_NOBLANKS);
$xpath = new DOMXPath($doc);
$xpath->registerNamespace('tei', 'http://www.tei-c.org/ns/1.0');
$pagesNodes = $xpath->query('//tei:body');
+ /** @var DOMElement $body */
+ $body = $pagesNodes[0];
+ $pages = $this->preProcessingService->splitByPages($body);
- $allElements = [];
- foreach ($pagesNodes as $pagesNode) {
- $allElements = $this->getPagesNodes($pagesNode, $allElements);
- }
+ $pageLevelEditedText = [];
+ $pageLevelTranscriptedText = [];
- $editedTextArr = [];
- $transcriptedTextArr = [];
- $gndsUuids = [];
$pagesGndsUuids = [];
$pagesSegs = [];
$pagesNotesUuids = [];
$notesUuids = [];
$pagesSics = [];
- foreach ($allElements as $k => $elements) {
- $renditions = [];
- $myeditedText = '';
- $mytranscriptedText = '';
-
- foreach ($elements as $element) {
- if (isset($element->nodeName) && (
- ($element->nodeName === 'p' && $element->firstChild->nodeName !== 'address') ||
- ($element->nodeName === 'dateline' && 'closer' !== $element->parentNode->nodeName) ||
- $element->nodeName === 'address' ||
- $element->nodeName === 'closer' ||
- $element->nodeName === 'list' ||
- ($element->nodeName === 'signed' && 'closer' !== $element->parentNode->nodeName)
- )) {
-
- $n = 0;
- $liNumber = 1;
- $tText = '';
- $eText = '';
- $childElementsArr = [];
- $childElements = $this->getNodeChilds($element, $childElementsArr);
-
- foreach ($childElements as $childElement) {
- if ($childElement->nodeName === '#text'
- && ($childElement->parentNode->nodeName !== 'abbr' && ($childElement->nodeName !== 'del' || $childElement->parentNode->nodeName !== 'add'))
- && $childElement->parentNode->nodeName !== 'note' && $childElement->parentNode->nodeName !== 'seg'
- && $childElement->parentNode->nodeName !== 'ref'
- ) {
- if (isset($add) && !empty($add)) {
- $tText .= ' 〈' . $childElement->data . ' ' . $add;
- $eText .= ' ' . $childElement->data;
- $add = '';
- } elseif (isset($del) && !empty($del)) {
- $tText .= ' [' . $childElement->data . ' ' . $del;
- $del = '';
- } elseif (isset($li) && !empty($li) && !empty($childElement->data) && 'item' === $childElement->parentNode->nodeName) {
- if (isset($italic) && true === $italic) {
- $tText .= $li . $childElement->data . '';
- $eText .= $li . $childElement->data . '';
- $italic = false;
- } else {
- $tText .= $li . $childElement->data;
- $eText .= $li . $childElement->data;
- }
-
- $li = '';
- } elseif ('supplied' === $childElement->parentNode->nodeName) {
- $tText .= $childElement->data;
- $eText .= '' . $childElement->data . '';
- } elseif ('hi' === $childElement->parentNode->nodeName && isset($childElement->parentNode->attributes[0])) {
-
- $hi = explode(':', $childElement->parentNode->attributes[0]->value);
-
- if (isset($hi[1]) && !empty($hi[1]) && 'superscript' === $hi[1]) {
- $tText .= '' . $childElement->data . '';
- $eText .= $childElement->data;
- } elseif (isset($hi[1]) && !empty($hi[1]) && 'italic' === $hi[1]) {
- $tText .= '' . $childElement->data . '';
- $eText .= $childElement->data;
- $italic = true;
- }
- } elseif (isset($gnd) && true === $gnd) {
- $eText .= '';
- $tText .= '';
- $gnd = false;
- } elseif (isset($note) && true === $note) {
- $eText .= ' ' . $childElement->data;
- $tText .= ' ' . $childElement->data;
- $note = false;
- } elseif (isset($sic) && true === $sic) {
- $eText .= ' ';
- $tText .= ' ';
- $sic = false;
- } elseif (isset($abbr) && !empty($abbr)) {
- $tText .= $abbr;
- $abbr = '';
- }
- elseif (isset($expan) && !empty($expan)) {
- $eText .= $expan;
- $expan = '';
- } else {
- $eText .= $childElement->data;
- if (isset($renditions) && !empty($renditions)) {
- $classOpeningTag = '';
- $classEndTag = '';
-
- foreach ($renditions as $rendition) {
- if ('italic' === $rendition) {
- $class = 'i';
- } elseif ('underline' === $rendition) {
- $class = 'u';
- }
- if (isset($class) && !empty($class)) {
- $classOpeningTag .= '<' . $class . '>';
- $classEndTag .= '' . $class . '>';
- }
-
- $renditions = [];
- }
- }
-
- if (isset($classOpeningTag) && !empty($classOpeningTag)) {
- $tText .= $classOpeningTag;
- $classOpeningTag = '';
- }
-
- $tText .= $childElement->data;
-
- if (isset($classEndTag) && !empty($classEndTag)) {
- $tText .= $classEndTag;
- $classEndTag = '';
- }
- }
- } elseif ($childElement->nodeName === "seg") {
- $noteBibl = false;
- foreach ($childElement->childNodes as $childElementChildNode) {
- if ($childElementChildNode->nodeName === 'bibl') {
- $noteBibl = true;
- break;
- }
- }
-
- if (true === $noteBibl) {
- $segBiblTarget = $this->getSegBiblTarget($childElement);
- if (!empty($segBiblTarget)) {
- $segBiblTargetArr = explode('#', $segBiblTarget);
- $segTextBibl = str_replace('_', ' ', $segBiblTargetArr[1]);
- $segBiblTargetlink = $segBiblTargetArr[0];
- $segTextBibl = $segTextBibl.' ('.$segBiblTargetArr[0].')';
- $segText = $this->getElementText($childElement);
- }
- } else {
- $segText = $this->getElementText($childElement);
- }
-
- $noteUuid = $this->getUuid();
- $eText .= ''.$segText;
- $tText .= $segText;
- $note = true;
+ foreach ($pages as $key => $page) {
+ if ($key > 0) {
+ $transcriptedDoc = $this->transcriptionService->transformPage($page);
+ $pageLevelTranscriptedText[] = $transcriptedDoc->saveHTML();
- if (true === $noteBibl && !empty($segTextBibl)) {
- $pagesSegs[$k][$noteUuid] = trim($segTextBibl);
- } elseif ($noteBibl === false && !empty($segText)) {
- $pagesSegs[$k][$noteUuid] = trim($segText);
- }
- } elseif ($childElement->nodeName === "note") {
- $noteText = $this->getElementText($childElement);
- if ((isset($noteBibl) && false === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($noteText)) {
- $notesUuids[$noteUuid] = trim($noteText);
- $pagesNotesUuids[$k][$noteUuid] = trim($noteText);
- } elseif ((isset($noteBibl) && true === $noteBibl) && (isset($noteUuid) && !empty($noteUuid)) && !empty($segBiblTargetlink)) {
- $notesUuids[$noteUuid] = trim($segBiblTargetlink);
- $pagesNotesUuids[$k][$noteUuid] = trim(' ');
- }
- unset($noteUuid);
- } elseif ('item' === $childElement->nodeName) {
- if ($liNumber++ === 1) {
- $li = '- ';
- } else {
- $li = '
- ';
- }
- } elseif ('add' === $childElement->nodeName) {
- if ('rdg' === $childElement->parentNode->nodeName) {
- if (!empty($childElement->attributes)) {
- $pattern = '/^#[a-z_]*$/i';
- foreach ($childElement->attributes as $attribute) {
- if ('hand' === $attribute->nodeName) {
- if (strpos($attribute->nodeValue, 'scrb') !== false) {
- $add = $this->transformAddScrb($attribute->nodeValue);
- } else {
- $match = preg_match($pattern, $attribute->nodeValue, $matches);
- if ($match) {
- $add = 'erg. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '';
- }
- }
- }
- }
- } else {
- $add = 'erg.';
- }
-
- if (!empty($add)) {
- $add .= '〉 ';
- }
- }
- } elseif ('del' === $childElement->nodeName) {
- if ('rdg' === $childElement->parentNode->nodeName) {
- if (!empty($childElement->attributes)) {
- foreach ($childElement->attributes as $attribute) {
- if ('hand' === $attribute->nodeName) {
- if (strpos($attribute->nodeValue, 'scrb') !== false) {
- $valueArr = explode('scrb', $attribute->nodeValue);
- if (isset($valueArr[1])) {
- $valueArr = explode('_', ltrim($valueArr[1], '_'));
- if (count($valueArr) === 2) {
- $del = 'str Schrhd.' . $valueArr[0] . ' ' . $valueArr[1] . '';
- }
- }
- } else {
- $del = 'str. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '';
- }
- }
- }
- } else {
- $del = 'str.';
- }
-
- if (!empty($del)) {
- $del .= ']';
- }
- }
- } elseif ('abbr' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) {
- if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') {
- $abbr = $childElement->firstChild->data;;
- }
- } elseif ('expan' === $childElement->nodeName && 'choice' === $childElement->parentNode->nodeName) {
- if (isset($childElement->firstChild) && $childElement->firstChild->nodeName === '#text') {
- $expan = $childElement->firstChild->data;
- }
- } elseif ('handShift' === $childElement->nodeName && 'signed' === $childElement->parentNode->nodeName) {
- if (!empty($childElement->attributes)) {
- $pattern = '/^#[a-z_]*$/i';
- foreach ($childElement->attributes as $attribute) {
- if ('scribeRef' === $attribute->nodeName) {
- $match = preg_match($pattern, $attribute->nodeValue, $matches);
- if ($match) {
- $add = 'sign. ' . str_replace('_', ' ', trim($attribute->nodeValue, '#')) . '〉 ';
- }
- }
- }
- }
- } elseif ('lb' === $childElement->nodeName) {
- $tText .= '
';
- $eText .= ' ';
- } elseif ('addrLine' === $childElement->nodeName) {
- if ($n++ > 0) {
- $eText .= '
';
- $tText .= '
';
- }
- } elseif ('signed' === $childElement->nodeName) {
- $tText .= '
';
- $eText .= '
';
- $signedText = $this->getElementText($childElement);
- $tText .= $signedText;
- $eText .= $signedText;
- } elseif ('hi' === $childElement->nodeName) {
- foreach ($childElement->attributes as $attribute) {
- if (isset(explode(':', $attribute->value)[1])) {
- $rend = explode(':', $attribute->value)[1];
- if ('superscript' !== $rend && 'italic' !== $rend) {
- $renditions[] = explode(':', $attribute->value)[1];
- }
- }
- }
- } elseif ('name' === $childElement->nodeName && (isset($childElement->attributes[1]->value) && !empty($childElement->attributes[1]->value)) && (isset($childElement->attributes[0]->value) && !empty($childElement->attributes[0]->value))) {
- $entityName = $this->getEntityName($childElement);
- $uuid = $this->getUuid();
- if (str_contains($childElement->attributes[1]->value, 'gnd.')) {
- $gndsUuids[$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value);
- $pagesGndsUuids[$k][$uuid] = str_replace('gnd.', '', $childElement->attributes[1]->value);
- } elseif (str_contains($childElement->attributes[1]->value, 'gnd:')) {
- $gndsUuids[$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value);
- $pagesGndsUuids[$k][$uuid] = str_replace('gnd:', '', $childElement->attributes[1]->value);
- }
- if ('signed' === $childElement->parentNode->nodeName) {
- if (isset($childElement->parentNode->attributes[0])) {
- $hand = explode('_', trim($childElement->parentNode->attributes[0]->value, '#'));
- }
- $eText .= ''.$entityName;
- $tText .= '〈'.$entityName.' sign.';
- if (isset($hand[0]) && !empty($hand[0])) {
- $tText .= ' '.$hand[0];
- }
- if (isset($hand[1]) && !empty($hand[1])) {
- $tText .= ' '.$hand[1];
- }
- $tText .= ' 〉';
- } else {
- $eText .= ''.$entityName;
- $tText .= $entityName;
- }
- $gnd = true;
- } elseif ($childElement->nodeName === "sic") {
- $sicText = $this->getElementText($childElement);
- $sicUuid = $this->getUuid();
- $eText .= $this->createSpan($sicUuid, $sicText, $childElement->nodeName);
- $tText .= $sicText;
- $pagesSics[$k][$sicUuid] = trim($sicText);
- $sic = true;
- }
- }
+ $editedDoc = $this->editedTextService->transformPage($page);
+ $pagesGndsUuids[$key] = $this->editedTextService->getGndsUuids();
+ $pageLevelEditedText[] = $editedDoc->saveHTML();
- $eText = $this->convertSoftHyphenToHyphen($eText);
- $eText = $this->removeHyphen($eText);
- $myeditedText .= '' . $eText . '
';
- $mytranscriptedText .= '' . $tText . '
';
- } elseif ('pb' === $element->nodeName) {
- $imageHref = $this->getImageHref($element, $graphics);
- $tText = $imageHref;
- $eText = $imageHref;
- $myeditedText .= '' . $eText . '
';
- $mytranscriptedText .= '' . $tText . '
';
- }
+ $this->transcriptionService->clear();
+ $this->editedTextService->clear();
}
-
- $editedTextArr[] = $myeditedText;
- $transcriptedTextArr[] = $mytranscriptedText;
}
- $transcriptedText = '';
- foreach ($transcriptedTextArr as $mytranscriptedText) {
- $transcriptedText .= $mytranscriptedText;
+ $this->preProcessingService->clear();
+
+ // Flatten the per-page maps into one document-level uuid => GND id map.
+ // array_merge() with a single argument only re-indexes the outer array and
+ // keeps the page nesting; spreading the pages merges their entries instead.
+ $gndsUuids = array_merge(...array_values($pagesGndsUuids));
+
+ $documentLevelTranscriptedText = '';
+ foreach ($pageLevelTranscriptedText as $singlePageTranscriptedText) {
+ $documentLevelTranscriptedText .= $singlePageTranscriptedText;
}
- $editedText = '';
- foreach ($editedTextArr as $myeditedText) {
- $editedText .= $myeditedText;
+ $documentLevelEditedText = '';
+ foreach ($pageLevelEditedText as $singlePageEditedText) {
+ $documentLevelEditedText .= $singlePageEditedText;
}
$solrDocument = new SolrDocument();
- $solrDocument->setTranscriptedText($transcriptedText);
- $solrDocument->setTranscriptedTextArr($transcriptedTextArr);
- $solrDocument->setEditedText($editedText);
- $solrDocument->setEditedTextArr($editedTextArr);
+ $solrDocument->setTranscriptedText($documentLevelTranscriptedText);
+ $solrDocument->setPageLevelTranscriptedText($pageLevelTranscriptedText);
+ $solrDocument->setEditedText($documentLevelEditedText);
+ $solrDocument->setPageLevelEditedText($pageLevelEditedText);
$solrDocument->setGndsUuids($gndsUuids);
$solrDocument->setPagesGndsUuids($pagesGndsUuids);
$solrDocument->setPagesNotesUuids($pagesNotesUuids);
$solrDocument->setPagesSegs($pagesSegs);
-
$solrDocument->setPagesSics($pagesSics);
return $solrDocument;
@@ -1281,7 +1003,6 @@ class Tei2SolrController extends AbstractController
return $span;
}
-
private function getSegBiblTarget(\DOMElement $childElement): ?string
{
$elementChildsArr = [];
diff --git a/src/Model/SolrDocument.php b/src/Model/SolrDocument.php
index 98fd842b7e869d54d6f571f9a71b3e5f732d393d..b5150d6c1878dcda1f2ddddb662fc79617ebccd7 100755
--- a/src/Model/SolrDocument.php
+++ b/src/Model/SolrDocument.php
@@ -5,9 +5,9 @@ namespace App\Model;
class SolrDocument
{
private string $transcriptedText;
- private array $transcriptedTextArr;
+ private array $pageLevelTranscriptedText;
private string $editedText;
- private array $editedTextArr;
+ private array $pageLevelEditedText;
private array $gndsUuids;
private array $pagesGndsUuids;
private array $pagesNotesUuids;
@@ -26,14 +26,14 @@ class SolrDocument
return $this;
}
- public function getTranscriptedTextArr(): array
+ public function getPageLevelTranscriptedText(): array
{
- return $this->transcriptedTextArr;
+ return $this->pageLevelTranscriptedText;
}
- public function setTranscriptedTextArr(array $transcriptedTextArr): SolrDocument
+ public function setPageLevelTranscriptedText(array $pageLevelTranscriptedText): SolrDocument
{
- $this->transcriptedTextArr = $transcriptedTextArr;
+ $this->pageLevelTranscriptedText = $pageLevelTranscriptedText;
return $this;
}
@@ -50,14 +50,14 @@ class SolrDocument
return $this;
}
- public function getEditedTextArr(): array
+ public function getPageLevelEditedText(): array
{
- return $this->editedTextArr;
+ return $this->pageLevelEditedText;
}
- public function setEditedTextArr(array $editedTextArr): SolrDocument
+ public function setPageLevelEditedText(array $pageLevelEditedText): SolrDocument
{
- $this->editedTextArr = $editedTextArr;
+ $this->pageLevelEditedText = $pageLevelEditedText;
return $this;
}
diff --git a/src/Service/EditedTextService.php b/src/Service/EditedTextService.php
new file mode 100644
index 0000000000000000000000000000000000000000..b6c0fb8dc121b89dcf51207a075312202bb7ac63
--- /dev/null
+++ b/src/Service/EditedTextService.php
@@ -0,0 +1,90 @@
+setDoc(new DOMDocument());
+
+ /** @var DOMElement $element */
+ foreach ($page->childNodes as $element) {
+ $transformed = $this->transformElement($element);
+ if ($transformed) {
+ $this->appendChild($transformed);
+ }
+ }
+
+ // Later this should be packed in an object
+ return $this->getDoc();
+ }
+
+    /**
+     * Returns the GND identifiers collected since the last clear(), keyed by
+     * the UUID that handleName() wrote into the matching span's id attribute.
+     */
+    public function getGndsUuids(): array
+    {
+        return $this->gndsUuids;
+    }
+
+    /**
+     * Returns the notes map held by this service.
+     *
+     * NOTE(review): nothing in the visible part of this class writes to
+     * $notesUuids except clear(), so this presumably returns an empty array
+     * for now - confirm.
+     */
+    public function getNotesUuids(): array
+    {
+        return $this->notesUuids;
+    }
+
+    /**
+     * Recursively converts a TEI node and its descendants into HTML nodes of
+     * the current output document.
+     *
+     * A node named "foo" is dispatched to handleFoo() when such a method
+     * exists ("#text" resolves to handleText()); any other node becomes a
+     * plain span element. Children are transformed the same way and appended
+     * to the node produced for their parent.
+     *
+     * @param DOMNode $el source node taken from a pre-processed page
+     *
+     * @return DOMNode|null the HTML node, or null when a handler produced none
+     */
+    private function transformElement(DOMNode $el): ?DOMNode
+    {
+        // ucfirst() leaves "#text" untouched and trim() then drops the "#";
+        // method_exists() and dynamic calls are case-insensitive, so the
+        // resulting "handletext" still resolves to handleText().
+        $methodName = 'handle' . trim(ucfirst($el->nodeName), '#');
+        $htmlEl = method_exists($this, $methodName)
+            ? $this->{$methodName}($el)
+            : $this->span();
+
+        // A handler may opt out of producing output; skip its subtree then.
+        if ($htmlEl === null) {
+            return null;
+        }
+
+        if ($el->hasChildNodes()) {
+            foreach ($el->childNodes as $child) {
+                $transformed = $this->transformElement($child);
+                if ($transformed) {
+                    $htmlEl->appendChild($transformed);
+                }
+            }
+        }
+
+        return $htmlEl;
+    }
+
+    /**
+     * Converts a TEI name element into a span element.
+     *
+     * When the element's second attribute carries a GND reference, a fresh
+     * UUID is generated, stored as the span's id and registered in
+     * $this->gndsUuids together with the reference stripped of its prefix.
+     * Both the "gnd:" and the "gnd." notation are recognised, matching what
+     * the controller code replaced by this service accepted.
+     *
+     * NOTE(review): attributes are addressed by position (0 and 1), which
+     * assumes a fixed attribute order in the TEI source - confirm.
+     *
+     * @param DOMNode $el the TEI name element
+     *
+     * @return DOMNode the span element; without id/class if no GND reference was found
+     */
+    private function handleName(DOMNode $el): DOMNode
+    {
+        $htmlEl = $this->span();
+
+        $reference = $el->attributes[1]->value ?? null;
+        if ($reference === null) {
+            return $htmlEl;
+        }
+
+        foreach (['gnd:', 'gnd.'] as $prefix) {
+            if (!str_contains($reference, $prefix)) {
+                continue;
+            }
+
+            $uuid = $this->createUuid();
+            $this->gndsUuids[$uuid] = str_replace($prefix, '', $reference);
+            $htmlEl->setAttribute('id', $uuid);
+            if (isset($el->attributes[0]->value)) {
+                $htmlEl->setAttribute('class', $el->attributes[0]->value);
+            }
+
+            // One registration per element is enough.
+            break;
+        }
+
+        return $htmlEl;
+    }
+
+    /**
+     * Creates a random UUID used to identify a generated HTML element.
+     *
+     * Relies on uuid_create() and UUID_TYPE_RANDOM, which are not part of the
+     * PHP core.
+     * NOTE(review): presumably provided by the PECL uuid extension or a
+     * polyfill - confirm it is a declared requirement of the project.
+     */
+    public function createUuid(): string
+    {
+        return uuid_create(UUID_TYPE_RANDOM);
+    }
+
+    /**
+     * Resets the service between pages: delegates to the parent clear() and
+     * empties both UUID maps.
+     */
+    public function clear()
+    {
+        parent::clear();
+        $this->gndsUuids = $this->notesUuids = [];
+    }
+}
diff --git a/src/Service/HtmlService.php b/src/Service/HtmlService.php
new file mode 100644
index 0000000000000000000000000000000000000000..f07b4929b6505fa93aca9c26a4b7589d92da02bc
--- /dev/null
+++ b/src/Service/HtmlService.php
@@ -0,0 +1,61 @@
+doc;
+ }
+
+    /**
+     * Sets the document that the element factory methods of this class
+     * create their nodes in.
+     */
+    public function setDoc(DOMDocument $doc): void
+    {
+        $this->doc = $doc;
+    }
+
+    /**
+     * Appends a node directly to the current document.
+     *
+     * @return DOMNode the node as returned by DOMDocument::appendChild()
+     */
+    public function appendChild(DOMNode $el): DOMNode
+    {
+        $appended = $this->doc->appendChild($el);
+
+        return $appended;
+    }
+
+    /**
+     * Creates an empty p element owned by the current document.
+     */
+    public function p(): DOMElement
+    {
+        return $this->doc->createElement('p');
+    }
+
+    /**
+     * Creates an empty div element owned by the current document.
+     */
+    public function div(): DOMElement
+    {
+        return $this->doc->createElement('div');
+    }
+
+    /**
+     * Creates an empty span element owned by the current document.
+     */
+    public function span(): DOMElement
+    {
+        return $this->doc->createElement('span');
+    }
+
+    /**
+     * Creates a br element owned by the current document.
+     */
+    public function br(): DOMElement
+    {
+        return $this->doc->createElement('br');
+    }
+
+    /**
+     * Drops the reference to the current document.
+     *
+     * After this call the element factory methods cannot be used until
+     * setDoc() has supplied a new document.
+     */
+    public function clear()
+    {
+        $this->doc = null;
+    }
+
+    /**
+     * Produces a text node for a source node.
+     *
+     * For a "#text" node the new node carries the source's text content; for
+     * any other node an empty text node is returned. The node is created
+     * through the DOMText constructor, not through the current document.
+     */
+    public function handleText(DOMNode $el): DOMNode
+    {
+        return new DOMText($el->nodeName === '#text' ? $el->textContent : '');
+    }
+}
diff --git a/src/Service/PreProcessingService.php b/src/Service/PreProcessingService.php
new file mode 100644
index 0000000000000000000000000000000000000000..ffa82f12848453dc8e4c302300721448f48b4d48
--- /dev/null
+++ b/src/Service/PreProcessingService.php
@@ -0,0 +1,126 @@
+pages;
+ }
+
+    /**
+     * Splits the TEI body into one DOMDocument per page.
+     *
+     * The body is cloned node by node via checkPb(). Every pb element starts
+     * a new document, so index 0 holds whatever precedes the first page break
+     * and each later index holds the content following one pb.
+     *
+     * Each call starts from a clean page list, so pages left over from an
+     * earlier call that was not followed by clear() cannot leak into the result.
+     *
+     * @param DOMElement $body the TEI body element to split
+     *
+     * @return DOMDocument[] the page documents in document order
+     */
+    public function splitByPages(DOMElement $body): array
+    {
+        // The first, still empty page is also the first parent to append to.
+        $this->pages = [new DOMDocument()];
+        $this->lastParent = $this->pages[0];
+
+        // Start recursion
+        $this->checkPb($body);
+
+        return $this->pages;
+    }
+
+    /**
+     * Recursively clones a node into the current page and starts a new page
+     * whenever a pb child is encountered.
+     *
+     * Comment nodes are skipped. Text nodes are copied as text; every other
+     * node is re-created as an element with the same name and attributes.
+     * The clone is appended to $this->lastParent, which tracks the position
+     * in the page currently being built.
+     */
+    private function checkPb(DOMNode $el)
+    {
+        $nodeName = $el->nodeName;
+        if ($nodeName === '#comment') {
+            return;
+        }
+
+        $page = $this->getLastPage();
+        $clone = $nodeName === '#text'
+            ? $page->createTextNode($el->textContent)
+            : $this->cloneAttributes($el->attributes, $page->createElement($nodeName));
+
+        // The clone always goes below the node we are currently filling.
+        $this->lastParent->appendChild($clone);
+
+        if (!$el->hasChildNodes()) {
+            return;
+        }
+
+        // Children are appended to the clone, so it becomes the new parent.
+        $this->lastParent = $clone;
+
+        /** @var DOMElement $child */
+        foreach ($el->childNodes as $child) {
+            if ($child->nodeName !== 'pb') {
+                $this->checkPb($child);
+                continue;
+            }
+
+            // createNewPage() also moves lastParent into the new page.
+            $this->pages[] = $this->createNewPage($child);
+        }
+
+        // All children are done: step back up so the next sibling of $el is
+        // appended one level higher. At document level there is no parent.
+        $parentOfCurrent = $this->lastParent->parentNode;
+        if ($parentOfCurrent) {
+            $this->lastParent = $parentOfCurrent;
+        }
+    }
+
+    /**
+     * Creates the document for the page that starts at a pb element.
+     *
+     * The ancestors of the pb element, up to and including the body element,
+     * are re-created (name and attributes only) in a fresh DOMDocument so the
+     * content following the page break keeps its structural context.
+     *
+     * Side effect: $this->lastParent is moved to the innermost re-created
+     * ancestor, which is where later checkPb() calls continue appending.
+     */
+    private function createNewPage(DOMElement $pbEl): DOMDocument
+    {
+        $page = new DOMDocument();
+
+        // Walk upwards from the pb element and stop after the body element.
+        // Prepending each ancestor leaves the list ordered outermost first.
+        $ancestors = [];
+        for ($ancestor = $pbEl->parentNode; $ancestor->parentNode; $ancestor = $ancestor->parentNode) {
+            array_unshift($ancestors, $ancestor);
+
+            if ($ancestor->nodeName === 'body') {
+                break;
+            }
+        }
+
+        $attachTo = $page;
+        foreach ($ancestors as $ancestor) {
+            $copy = $this->cloneAttributes($ancestor->attributes, $page->createElement($ancestor->nodeName));
+            $attachTo = $attachTo->appendChild($copy);
+
+            // Ends up pointing at the deepest copy once the loop is finished.
+            $this->lastParent = $attachTo;
+        }
+
+        return $page;
+    }
+
+    /**
+     * Returns the page currently being filled, i.e. the last entry of
+     * $this->pages, or null when no page exists yet.
+     */
+    private function getLastPage(): ?DOMDocument
+    {
+        if (empty($this->pages)) {
+            return null;
+        }
+
+        return $this->pages[count($this->pages) - 1];
+    }
+
+    /**
+     * Forgets all pages collected so far.
+     *
+     * $this->lastParent is not reset here; splitByPages() assigns it again
+     * before recursing.
+     */
+    public function clear()
+    {
+        $this->pages = [];
+    }
+
+    /**
+     * Copies every attribute of a source node onto the given element and
+     * returns that element.
+     *
+     * NOTE(review): the copy uses $attr->name, which appears to be the local
+     * name; a prefixed attribute such as xml:id would then be written as
+     * "id" - confirm this is intended.
+     */
+    private function cloneAttributes($attributes, $clone)
+    {
+        foreach ($attributes as $attribute) {
+            $name = $attribute->name;
+            $value = $attribute->value;
+            $clone->setAttribute($name, $value);
+        }
+
+        return $clone;
+    }
+}
diff --git a/src/Service/TranscriptionService.php b/src/Service/TranscriptionService.php
new file mode 100644
index 0000000000000000000000000000000000000000..a4aa4fbf8c729c6efaa7c35b34bcd9068a14723d
--- /dev/null
+++ b/src/Service/TranscriptionService.php
@@ -0,0 +1,66 @@
+setDoc(new DOMDocument());
+
+ /** @var DOMElement $element */
+ foreach ($page->childNodes as $element) {
+ $this->appendChild($this->transformElement($element));
+ }
+
+ return $this->getDoc();
+ }
+
+    /**
+     * Recursively converts a TEI node and its descendants into HTML nodes of
+     * the current output document.
+     *
+     * A node named "foo" is dispatched to handleFoo() when such a method
+     * exists ("#text" resolves to handleText()); any other node becomes a
+     * p element. Transformed children are appended to their parent's node.
+     */
+    private function transformElement(DOMNode $el): DOMNode
+    {
+        $handler = 'handle' . trim(ucfirst($el->nodeName), '#');
+        $htmlEl = method_exists($this, $handler)
+            ? $this->{$handler}($el)
+            : $this->p();
+
+        if (!$el->hasChildNodes()) {
+            return $htmlEl;
+        }
+
+        foreach ($el->childNodes as $child) {
+            $htmlEl->appendChild($this->transformElement($child));
+        }
+
+        return $htmlEl;
+    }
+
+    /**
+     * Maps a TEI lb element to a br element. The source element itself is
+     * not inspected.
+     */
+    private function handleLb(DOMElement $el): DOMNode
+    {
+        return $this->br();
+    }
+
+    /**
+     * Maps a TEI pb element to a div element. The source element itself is
+     * not inspected.
+     *
+     * NOTE(review): the page splitting shown in PreProcessingService consumes
+     * pb elements, so this handler is presumably not reached for
+     * pre-processed pages - confirm.
+     */
+    private function handlePb(DOMElement $el): DOMNode
+    {
+        return $this->div();
+    }
+
+    /**
+     * Maps a TEI p element to an HTML p element. Attributes of the source
+     * element are not carried over.
+     */
+    private function handleP(DOMElement $el): DOMNode
+    {
+        return $this->p();
+    }
+
+    /**
+     * Maps a TEI div element to an HTML div element. Attributes of the
+     * source element are not carried over.
+     */
+    private function handleDiv(DOMElement $el): DOMNode
+    {
+        return $this->div();
+    }
+
+    /**
+     * Maps a source span element to an HTML span element. Attributes of the
+     * source element are not carried over.
+     */
+    private function handleSpan(DOMElement $el): DOMNode
+    {
+        return $this->span();
+    }
+}