Merge pull request #12 from MRodz/master

added further preprocessing for whitespaces

Merge pull request #12 from MRodz/master
0ba4a38d · uwe · GitHub · a69dfd44 · c1d871df · 0ba4a38d
Unverified Commit 0ba4a38d authored 7 years ago by uwe Committed by GitHub 7 years ago
--- a/modules/intermediate_format/markerset.xqm
+++ b/modules/intermediate_format/markerset.xqm
 xquery version "3.0";
-(:~  
+(:~
 : MARKERSET Module ("markerset", "http://bdn.edition.de/intermediate_format/markerset")
 : *******************************************************************************************
 : This module is a helper module and defines functions to collect and construct reading markers
@@ -14,32 +14,32 @@ declare default element namespace "http://www.tei-c.org/ns/1.0";

 (:############################# Modules Functions #############################:)

-(:~  
+(:~
 : markerset:collect-markers()
 : This function collects markers for a given reading.
 : It distinguishes tei:lem and tei:rdg. In case of tei:lem it collects all following sibling tei:rdgs. In case of tei:rdg it collects the reading itself.
 :
 : @param $reading the reading node to collect readings for
 : @return node() representing a markerset of readings for the given node
- : 
+ :
 : @version 2.0 (2018-01-29)
 : @status working
 : @author Uwe Sikora
 :)
 declare function markerset:collect-markers
    ( $reading as node()* ) as item() {
-    
+
    let $markers := (
        if ($reading[self::lem]) then (
            attribute {"count"}{count($reading/following-sibling::rdg)},
            for $sibling in $reading/following-sibling::rdg
            return(
                element {name($sibling)} {
-                    $sibling/@*, 
+                    $sibling/@*,
                    attribute {"context"}{"lem"}
                }
            )
-        ) 
+        )
        else if ($reading[self::rdg]) then (
            element {name($reading)} {
                $reading/@*,
@@ -56,24 +56,24 @@ declare function markerset:collect-markers
 };


-(:~  
+(:~
 : markerset:merge-markers()
 : This function merges markers in a given set by the same type. It orders the merged markers according to an explicit ordering.
 :
 : @param $markerset node() including the markers that should be merged
 : @return node()* representing the merged markerset
- : 
+ :
 : @version 2.0 (2018-01-29)
 : @status working
 : @author Uwe Sikora
 :)
 declare function markerset:merge-markers
    ( $markerset as node()* ) as item()* {
-    
+
    let $order := ("om","ppl", "ptl", "pp", "pt" , "v")
    let $reading-types := distinct-values( $markerset[self::rdg or self::lem]/string(@type) )
-        
-    return (   
+
+    return (
        attribute {"order"}{distinct-values( ($order, $reading-types) ) },
        for $type in distinct-values( ($order, $reading-types) )
        let $rdgs := $markerset[@type = $type]
@@ -86,14 +86,14 @@ declare function markerset:merge-markers
                    attribute type {$type}
                }
            ) else ()
-            
+
    )
 };


-(:~  
+(:~
 : markerset:marker()
- : Constructor function which creates the marker element with name, mark-type and references 
+ : Constructor function which creates the marker element with name, mark-type and references
 :
 : @param $name The name of the marker element
 : @param $type The mark type e.g. open or close
@@ -105,19 +105,20 @@ declare function markerset:merge-markers
 :)
 declare function markerset:marker
-    ($name as xs:string, $type as xs:string, $reading as node()) as element(){
-
-    element {$name} {
+    ($name as xs:string, $type as xs:string, $reading as node()) as element()? {
+    if($type = 'open' and data($reading/@type) = 'v' and $reading/@context = 'rdg')
+    then () (: no opening marker for type "v" readings in "rdg" context; the return type is optional for this reason :)
+    else (element {$name} {
        (:attribute bdnp_parent {$node/parent::node()/name()}, :)
        attribute wit { replace(data($reading/@wit), '#', '') },
        attribute type { data($reading/@type) },
        attribute ref { data($reading/@id) },
        attribute mark { $type },
        attribute context { $reading/@context }
-    }
+    })
 };


-(:~  
+(:~
 : markerset:construct-marker-from-markerset
 : Helping function to construct markers for a sequence of markersets
 :
@@ -131,9 +132,9 @@ declare function markerset:marker
 :)
 declare function markerset:construct-marker-from-markerset
    ( $name as xs:string, $marker-type as xs:string, $marker-set as node()* ) as item()* {
-    
+    (: build one marker per child node of every markerset that is passed in :)
    for $marker in $marker-set/node()
    return (
        markerset:marker($name, $marker-type, $marker)
    )
-};
\ No newline at end of file
+};
--- a/modules/intermediate_format/preprocessing.xqm
+++ b/modules/intermediate_format/preprocessing.xqm
 xquery version "3.0";
-(:~  
+(:~
 : PREPROCESSING Module ("pre", "http://bdn.edition.de/intermediate_format/preprocessing")
 : *******************************************************************************************
 : This module contains the preprocessing routines for the intermediate format
 :
 : It imports the whitespace handling helper module to perform some whitespace handling during the preprocessing
- 
+
 : @version 2.0 (2018-01-29)
 : @status working
 : @author Uwe Sikora
@@ -18,20 +18,20 @@ declare default element namespace "http://www.tei-c.org/ns/1.0";

 (:############################# Modules Functions #############################:)

-(:~  
+(:~
 : pre:preprocessing-textNode
 : preprocessing function which converts each text() into an XML node "textNode". This function is an experimental fallback solution and not the main preprocessing routine!
 :
 : @param $nodes the nodes to be converted
 : @return item()* representing the converted node
- : 
+ :
 : @version 1.2 (2017-10-15)
 : @status working
 : @author Uwe Sikora
 :)
 declare function pre:preprocessing-textNode
    ($nodes as node()*) as item()* {
-    
+
    for $node in $nodes
    return
        typeswitch($node)
@@ -44,7 +44,7 @@ declare function pre:preprocessing-textNode
                    }
                )
            )
-            
+
            case element(TEI) return (
                element{$node/name()}{
                    $node/@*,
@@ -54,7 +54,7 @@ declare function pre:preprocessing-textNode
                    }
                }
            )
-            
+
            case element(lem) return (
                element{$node/name()}{
                    $node/@*,
@@ -62,7 +62,7 @@ declare function pre:preprocessing-textNode
                    pre:preprocessing-textNode($node/node())
                }
            )
-            
+
            case element(rdg) return (
                element{$node/name()}{
                    $node/@*,
@@ -70,7 +70,7 @@ declare function pre:preprocessing-textNode
                    pre:preprocessing-textNode($node/node())
                }
            )
-            
+
            case element(note) return (
                if ($node[@type eq "editorial"]) then (
                ) else (
@@ -80,8 +80,8 @@ declare function pre:preprocessing-textNode
                    }
                )
            )
-            
-            default return ( 
+
+            default return (
                element{$node/name()}{
                    $node/@*,
                    pre:preprocessing-textNode($node/node())
@@ -90,7 +90,7 @@ declare function pre:preprocessing-textNode
 };


-(:~  
+(:~
 : pre:default-element
 : function that serves as the default element constructor for the preprocessing conversion.
 : It is more or less a copy function, copying the element's name and its attributes and recursively passing the conversion on to its child nodes
@@ -98,7 +98,7 @@ declare function pre:preprocessing-textNode
 : @param $node the node to be copied
 : @param $recursive-function the recursive function as some kind of call back to the main conversion
 : @return item()* representing the converted node
- : 
+ :
 : @version 1.0 (2018-01-31)
 : @note Would be great if $recursive-function would be a real function and not a node-sequence (TO-DO)
 : @status working
@@ -106,39 +106,49 @@ declare function pre:preprocessing-textNode
 :)
 declare function pre:default-element
    ( $node as node(), $recursive-function as node()* ) as item()* {
-
-    element{$node/name()}{
-        $node/@*,
-        $recursive-function
-    }
+    let $following-node := $node/following-sibling::node()[1] (: next sibling of any node kind :)
+    let $following-sibling := $node/following-sibling::*[1] (: next sibling element :)
+    return
+        element{$node/name()}{
+            $node/@*,
+            (: add break-after when the next node is whitespace-only and the next element is one of the listed kinds; index elements never get it :)
+            (if($following-node[matches(., "[\s\n\r\t]") and normalize-space(.) = ""]
+            and $following-sibling[self::ref or self::app or self::hi or self::bibl or self::foreign
+            or self::choice or self::milestone or self::persName or self::index or self::seg]
+            and not($node[self::index]))
+            then attribute {"break-after"}{"yes"}
+            else ()),
+            $recursive-function
+        }
 };


-(:~  
+(:~
 : pre:preprocessing
 : main preprocessing function.
 :
 : @param $nodes the nodes to be converted
 : @return item()* representing the converted node
- : 
+ :
 : @version 2.0 (2018-02-01)
 : @status working
 : @author Uwe Sikora
 :)
 declare function pre:preprocessing
    ($nodes as node()*) as item()* {
-    
+
    for $node in $nodes
    return
        typeswitch($node)
            case processing-instruction() return ()
-            
+
            case text() return (
                whitespace:text($node, "&#160;")
            )
-            
+
            case comment() return ()
-            
+
            case element(TEI) return (
                element{$node/name()}{
                    $node/@*,
@@ -150,19 +160,19 @@ declare function pre:preprocessing
                    }
                }
            )
-            
+
            case element(teiHeader) return ( $node )
-            
+
            case element(div) return (
                if ($node[@type = 'section-group']) then (
                    pre:preprocessing($node/node())
-                ) 
+                )
                else (
                    pre:default-element( $node, pre:preprocessing($node/node()) )
                )
-                
+
            )
-            
+
            case element(lem) return (
                element{$node/name()}{
                    $node/@*,
@@ -170,7 +180,7 @@ declare function pre:preprocessing
                    pre:preprocessing($node/node())
                }
            )
-            
+
            case element(rdg) return (
                element{$node/name()}{
                    $node/@*,
@@ -178,87 +188,84 @@ declare function pre:preprocessing
                    pre:preprocessing($node/node())
                }
            )
-            
+
            case element(note) return (
                if ( $node[@type != "editorial"] or $node[ not(@type) ] ) then (
                    pre:default-element( $node, pre:preprocessing($node/node()) )
                ) else ( )
            )
-            
+
            case element(pb) return (
                let $preceding-sibling := $node/preceding-sibling::node()[1]
                let $following-sibling := $node/following-sibling::node()[1]
-                let $first := $node = $node/parent::node()/node()[not(self::text() and normalize-space(self::node()) = '')][1]
-                let $ignore := ("docAuthor", "app", "index", "seg", "bibl")
                return
                    element {$node/name()}{
                        $node/@*,
-                         
-                        if ( 
+
+                        if (
                            ( $preceding-sibling[self::text() and not(normalize-space(.) = '')] and ends-with($preceding-sibling, " ") = false() )
                            and
                            ( $following-sibling[self::text() and not(normalize-space(.) = '')] and starts-with($following-sibling, " ") = false() )
-                        ) then ( attribute {"break"}{"no"} ) 
-                        
-                        (:else if ( 
-                                ( $preceeding-sibling[self::text() and not(normalize-space(.) = '')] and ends-with($preceeding-sibling, " ") = true() )
-                                and
-                                ( $following-sibling[self::text() and not(normalize-space(.) = '')] and starts-with($following-sibling, " ") = true() )
-                            ) then ( attribute {"clear"}{"left"} ) :)
-                            
-                        else if ( $following-sibling[self::docAuthor or self::app or self::index or self::seg or self::bibl] ) then (
-                            attribute {"break"}{"yes"}
-                        )
-                        
-                        else if ( 
-                            ( $preceding-sibling[self::text()][matches(., "[\s\n\r\t]") and normalize-space(.) = ""] )
+                        ) then ( attribute {"break"}{"no"} )
+                        else if (
+                            ( $preceding-sibling[matches(., "[\s\n\r\t]") and normalize-space(.) = ""] )
                            and
-                            ( $following-sibling[self::text()][matches(., "[\s\n\r\t]") and normalize-space(.) = ""] )
+                            ( $following-sibling[matches(., "[\s\n\r\t]") and normalize-space(.) = ""] )
                        ) then (
-                            attribute {"break"}{"yes"}
+                            attribute {"break-before"}{"yes"},
+                            attribute {"break-after"}{"yes"}
+                        )
+                        else if (
+                            $preceding-sibling[matches(., "[\s\n\r\t]") and normalize-space(.) = ""]
+                        ) then (
+                            attribute {"break-before"}{"yes"}
+                        )
+                        else if (
+                            $following-sibling[matches(., "[\s\n\r\t]") and normalize-space(.) = ""]
+                        ) then (
+                            attribute {"break-after"}{"yes"}
                        )
-                        
                        else ( )
                    }
            )
-            
+
            case element(hi) return (
-                if($node[@rend = 'right-aligned' or @rend = 'center-aligned']) then(
+            if($node[@rend = 'right-aligned' or @rend = 'center-aligned']) then(
                    element {'aligned'} {
                        $node/@*,
                        pre:preprocessing($node/node())
-                    } 
+                    }
                )
                else (
                    pre:default-element( $node, pre:preprocessing($node/node()) )
                )
            )
-            
+
            case element(seg) return (
                if($node[@type = 'item']) then(
                    element {'item'} {
                        $node/@*[name() != 'type'],
                        pre:preprocessing($node/node())
-                    } 
+                    }
                )
                else if($node[@type = 'head']) then(
                    element {'head'} {
                        $node/@*[name() != 'type'],
                        pre:preprocessing($node/node())
-                    } 
+                    }
                )
                else if($node[@type = 'row']) then(
                    element {'row'} {
                        $node/@*[name() != 'type'],
                        pre:preprocessing($node/node())
-                    } 
+                    }
                )
                else (
                    pre:default-element( $node, pre:preprocessing($node/node()) )
                )
            )
-            
-            default return ( 
+
+            default return (
                pre:default-element( $node, pre:preprocessing($node/node()) )
            )
-};
\ No newline at end of file
+};
--- a/modules/intermediate_format/whitespace-handling.xqm
+++ b/modules/intermediate_format/whitespace-handling.xqm
 xquery version "3.0";
-(:~  
+(:~
 : WHITESPACE Module ("whitespace", "http://bdn.edition.de/intermediate_format/whitespace_handling")
 : *******************************************************************************************
 : This module contains the functions to handle different whitespace operations on text
@@ -9,12 +9,14 @@ xquery version "3.0";
 : @author Uwe Sikora
 :)
 module namespace whitespace="http://bdn.edition.de/intermediate_format/whitespace_handling";
+import module namespace pre="http://bdn.edition.de/intermediate_format/preprocessing" at "preprocessing.xqm";
+
 declare default element namespace "http://www.tei-c.org/ns/1.0";


 (:############################# Modules Functions #############################:)

-(:~ 
+(:~
 : whitespace:text()
 : This function handles whitespace in defined text() nodes
 :
@@ -28,26 +30,26 @@ declare default element namespace "http://www.tei-c.org/ns/1.0";
 :)
 declare function whitespace:text
    ( $text as text()*, $escape-char as xs:string? ) as text()* {
-    
+
-    let $normalized := normalize-space($text)
+    (: whitespace-only text is dropped unless it is exactly one blank; all other text is passed to whitespace:escape-text :)
    let $whitespace-node := $text[matches(., "[\s\n\r\t]") and normalize-space(.) = ""]
    let $single-whitespace-between-nodes := $text = ' '
-    return 
+    return
        if ( not($whitespace-node) or $single-whitespace-between-nodes) then (
-            
+
            if ($escape-char) then (
-                whitespace:escape-text($text, "#") 
+                whitespace:escape-text($text, "@") (: NOTE(review): only the presence of $escape-char is tested, the literal "@" is what gets passed on; confirm this is intended :)
            ) else ( whitespace:escape-text($text, " ") )
-            
-        ) 
+
+        )
        else ()
-    
+
 };


-(:~ 
+(:~
 : whitespace:escape-text()
- : This function replaces whitespaces in a text() 
+ : This function replaces whitespaces in a text()
 : with a defined preservation character
 :
 : @param $text the text-node to be converted