added pandoc.py

529a88a0 · Jake · 39f0b334 · 529a88a0 · 529a88a0 · 529a88a0
Verified Commit 529a88a0 authored 2 years ago by Jake
--- a/fgs/page.py
+++ b/fgs/page.py
@@ -5,7 +5,6 @@ class Page:
    #raw = None
    #metadata = None
    #content = None
-    #toc = None
    #title = None
    #category = None
    #slug = None
@@ -18,13 +17,12 @@ class Page:
    #template = None
    

-    def __init__(self, filename, subpath, raw, metadata, content, toc, title, category, slug, lang, date_created, date_modified, authors, last_modification_author, status, config):
+    def __init__(self, filename, subpath, raw, metadata, content, title, category, slug, lang, date_created, date_modified, authors, last_modification_author, status, config):
        self.filename =  filename
        self.subpath =  subpath
        self.raw =  raw
        self.metadata =  metadata
        self.content =  content
-        self.toc =  toc
        self.title =  title
        self.category =  category
        self.slug =  slug

--- a/fgs/pandoc.py
+++ b/fgs/pandoc.py
+#!/usr/bin/env python3
+
+import validators
+
+from typing import List, Dict
+
+import subprocess
+import json
+import sys
+
+import os
+
+def run_pandoc(source, base="markdown", extensions=None, extra_args=None):
+    """Run pandoc on *source* and parse its JSON AST into Element objects.
+
+    Parameters:
+        source: document text; it is UTF-8 encoded and fed to pandoc on stdin.
+        base: pandoc input format name that the extension flags are appended to.
+        extensions: either a list of extension names (entries starting with
+            '#' are skipped, entries already starting with '+' or '-' are used
+            verbatim, other non-empty entries get a '+' prefix) or a dict
+            mapping extension name -> options, where a truthy "ignore" skips
+            the entry and a falsy "enabled" turns the flag into '-'.
+        extra_args: additional command line arguments appended to the call.
+
+    Returns:
+        A tuple ``(blocks, contentmetadata)``. ``blocks`` is the JSON *string*
+        produced by serialising the parsed top-level blocks with
+        ElementEncoder; ``contentmetadata`` is a dict with an empty
+        "toc_list" and a "toc_count" of 0 (the TOC is not built yet).
+
+    Raises:
+        Exception: if pandoc exits with a non-zero status or reports a
+            pandoc-api-version other than 1.22.2.
+    """
+    to = "json"
+    # None instead of mutable literals as defaults; behaves like the old [].
+    if extensions is None:
+        extensions = []
+    if extra_args is None:
+        extra_args = []
+
+    ext_parts = []
+    if isinstance(extensions, list):
+        for ext in extensions:
+            if ext.startswith('#'):
+                continue # commented-out entry
+            if ext.startswith(('+', '-')):
+                ext_parts.append(ext)
+            elif ext:
+                ext_parts.append('+' + ext)
+    elif isinstance(extensions, dict):
+        for ext_key, ext in extensions.items():
+            # TODO catch 'illegal' ext_keys (containing spaces for example)
+            if "ignore" in ext and ext["ignore"]:
+                continue
+            flag = '+'
+            if "enabled" in ext and not ext["enabled"]:
+                flag = '-'
+            ext_parts.append(flag + ext_key)
+    ext_str = "".join(ext_parts)
+
+    pandoc_bin = "pandoc"
+    args = [pandoc_bin, "-f", base + ext_str, "-t", to] + list(extra_args)
+    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    out, _ = p.communicate(source.encode('utf-8', errors='strict'))
+    # Fail with a clear message instead of an opaque JSON decode error on
+    # whatever pandoc left on stdout.
+    if p.returncode != 0:
+        raise Exception("pandoc exited with non-zero status", p.returncode, args)
+    out_dict = json.loads(out.decode('utf-8'))
+
+    if out_dict["pandoc-api-version"] != [1, 22, 2]:
+        raise Exception("Unsupported pandoc-api version", out_dict["pandoc-api-version"])
+
+    # parse blocks; register entries marked as TODO yield None here, which is
+    # serialised as null below
+    blocks = [
+        parse_from_register(block_parsing_register, raw_block)
+        for raw_block in out_dict['blocks']
+    ]
+
+    contentmetadata = {
+        "toc_list": [],
+        "toc_count": 0,
+    }
+
+    # TODO TOC
+    #contentmetadata["toc"] = build_toc(n["toc_list"].copy())
+
+    blocks = json.dumps(blocks, cls=ElementEncoder) # TODO use json.loads() otherwise this is just a string
+
+    return (blocks, contentmetadata)
+
+def parse_from_register(reg: dict, h: dict):
+    """Instantiate and fill the element registered for one raw pandoc node.
+
+    *h* is a raw pandoc node with its type under 't' and an optional payload
+    under 'c'. *reg* maps pandoc type names either to a handler class, or to a
+    dict holding a 'handler' class plus the 'etype' passed to it. A dict entry
+    with a truthy "TODO" prints a warning to stderr and yields None.
+
+    Raises Exception when the pandoc type is not present in *reg*.
+    """
+    pandoc_type = h['t']
+    if pandoc_type not in reg:
+        raise Exception("pandoc type not in register", pandoc_type, h)
+    entry = reg[pandoc_type]
+
+    if not isinstance(entry, dict):
+        # plain entry: the class itself is the handler
+        element = entry()
+    elif "TODO" in entry and entry["TODO"]:
+        print("Warning: entry is marked as TODO: ",pandoc_type, entry,file=sys.stderr)
+        return None
+    else:
+        element = entry['handler'](entry['etype'])
+
+    # nodes without a payload are parsed with None
+    element.parse(h.get('c'))
+
+    return element
+
+
+class ElementEncoder(json.JSONEncoder):
+    """JSON encoder that serialises Element objects through their export map."""
+    def default(self, obj):
+        # Anything that is not an Element is left to the stock encoder.
+        if not isinstance(obj, Element):
+            return super().default(obj)
+        # export maps attribute name -> name used in the JSON output
+        return {name: getattr(obj, key) for key, name in obj.export.items()}
+
+class Element():
+    """Base class of every parsed pandoc AST node.
+
+    Subclasses override parse_internal() and set attributes on themselves;
+    parse() registers every attribute that appeared during parse_internal()
+    in ``export``, the attribute-name -> output-name map that ElementEncoder
+    serialises. 'etype' is exported as "type" and 'eclass' as "class".
+    """
+    def __init__(self, etype = None):
+        if etype is not None:
+            self.etype = etype
+        self.children = []
+        self.export = {}
+
+        self.export_key('etype', 'type')
+        self.export_key('eclass', 'class')
+        self.export_key('children')
+
+    def addChild(self, child):
+        """Append *child* to this element's children."""
+        self.children.append(child)
+
+    def parse_internal(self, pandocraw):
+        """Fill the element from its raw pandoc payload; must be overridden."""
+        raise Exception("parse_internal not overridden: ", self)
+
+    def parse(self, pandocraw):
+        """Run parse_internal() and export every attribute it newly created."""
+        prevkeys = set(dir(self))
+        self.parse_internal(pandocraw)
+        for key in dir(self):
+            if key not in prevkeys:
+                self.export_key(key)
+
+    def export_key(self, key, name=None):
+        """Export attribute *key* under *name* (defaults to *key*)."""
+        if name is None:
+            name = key
+        self.export[key] = name
+
+    def parse_blocks(self, raw_blocks):
+        """Parse a list of raw block nodes; raises Exception if not a list."""
+        if not isinstance(raw_blocks, list):
+            raise Exception("raw_blocks is not a list: ", raw_blocks)
+        return [self.parse_block(raw_block) for raw_block in raw_blocks]
+
+    def parse_inlines(self, raw_inlines):
+        """Parse a list of raw inline nodes; raises Exception if not a list."""
+        if not isinstance(raw_inlines, list):
+            raise Exception("raw_inlines is not a list: ", raw_inlines)
+        return [self.parse_inline(raw_inline) for raw_inline in raw_inlines]
+
+    def parse_block(self, raw_block):
+        """Parse one raw block node, add it as a child and return it.
+
+        The result (and the added child) is None when the block's register
+        entry is marked as TODO.
+        """
+        res = parse_from_register(block_parsing_register, raw_block)
+        self.addChild(res)
+        return res
+
+    def parse_inline(self, raw_inline):
+        """Parse one raw inline node, add it as a child and return it."""
+        res = parse_from_register(inline_parsing_register, raw_inline)
+        self.addChild(res)
+        return res
+
+    def parse_attr(self, raw_attr):
+        """Convert a pandoc Attr ``[id, [classes], key-values]`` into a dict.
+
+        Returns ``{'id': ..., 'classes': [...], 'extra': {key: value}}`` with
+        empty strings mapped to None. Key-values are accepted both as the
+        list of ``[key, value]`` pairs found in pandoc's JSON output and as a
+        flat ``[key1, value1, key2, value2]`` list. They are kept under
+        'extra' so that an attribute named "id" or "classes" cannot overwrite
+        the fields above.
+        """
+        #print("called parse_attr: ", raw_attr)
+        res = {}
+        res['id'] = self.parse_text(raw_attr[0])
+        res['classes'] = [self.parse_text(c) for c in raw_attr[1]]
+
+        raw_kv = raw_attr[2]
+        if all(isinstance(pair, (list, tuple)) and len(pair) == 2 for pair in raw_kv):
+            pairs = raw_kv
+        else:
+            # flat form: consume the list two items at a time
+            it = iter(raw_kv)
+            pairs = zip(it, it)
+        res['extra'] = {key: self.parse_text(value) for key, value in pairs}
+
+        return res
+
+    def parse_text(self, raw_text):
+        """Return *raw_text* unchanged, or None when it is empty."""
+        if len(raw_text) > 0:
+            return raw_text
+        return None
+
+    def parse_int(self, raw_num):
+        """Return *raw_num* unchanged (no conversion implemented yet)."""
+        return raw_num # TODO
+
+    def parse_target(self, raw_target): # For URLs
+        """Convert a pandoc target ``[url, title]`` into a dict."""
+        return {
+            'url': self.parse_text(raw_target[0]),
+            'title': self.parse_text(raw_target[1]),
+        }
+
+    def parse_code(self, code):
+        """Return the code text (None if empty) together with its lines."""
+        return {
+            "code": self.parse_text(code),
+            "code_lines": code.splitlines(),
+        }
+
+    def parse_enum(self, mapping, enum):
+        """Translate a payload-less pandoc enum ``{"t": name}`` via *mapping*.
+
+        Raises Exception when *enum* has any key besides "t" or when its name
+        is missing from *mapping*.
+        """
+        if len(enum.keys()) != 1 or "t" not in enum:
+            raise Exception("enum is not a valid enum", enum, mapping)
+        enum = enum["t"]
+        if enum not in mapping:
+            raise Exception("enum not found in mapping")
+        return mapping[enum]
+
+    def update(self, d): # Like dict.update()
+        """Set every key of *d* as an attribute on this element."""
+        for key, value in d.items():
+            setattr(self, key, value)
+
+
+
+################################ BLOCK #########################################
+
+# Common base of all block-level elements; eclass is what Element exports
+# under the "class" key.
+class Block(Element):
+    eclass = "block"
+
+
+# Heading block: stores the heading level, its attributes and the parsed
+# inline content.
+class BlockHeader(Block): # Int Attr [Inline]
+    etype = "header"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [level, attr, inlines]
+        self.level = self.parse_int(pandocraw[0])
+        self.attr = self.parse_attr(pandocraw[1])
+        self.content = self.parse_inlines(pandocraw[2])
+
+# Raw block: keeps the format name and the raw text as given by pandoc
+# (empty strings become None via parse_text).
+class BlockRaw(Block): # Format Text
+    etype = "rawblock"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [format, text]
+        self.format = self.parse_text(pandocraw[0])
+        self.raw = self.parse_text(pandocraw[1])
+
+class BlockList(Block): # [[Block]]
+    """Shared base of the list blocks; provides list item parsing."""
+    def parse_listitems(self, rawitems):
+        """Parse every list item (itself a list of raw blocks).
+
+        Returns a dict with the parsed 'items' and their 'count'.
+        """
+        parsed_items = [self.parse_blocks(raw_item) for raw_item in rawitems]
+        return {'items': parsed_items, 'count': len(parsed_items)}
+
+# Bullet list: the payload is the list of items itself.
+class BlockBulletList(BlockList): # [[Block]]
+    etype = "bulletlist"
+    def parse_internal(self, pandocraw):
+        # sets the 'items' and 'count' attributes returned by parse_listitems
+        self.update(self.parse_listitems(pandocraw))
+
+class BlockOrderedList(BlockList): # ListAttributes [[Block]]
+    """Ordered list: list attributes (start, style, delim) plus the items."""
+    etype = "orderedlist"
+
+    def parse_internal(self, pandocraw):
+        # payload is read positionally as [list attributes, items]
+        self.update(self.parse_orderedlist_attr(pandocraw[0]))
+        self.update(self.parse_listitems(pandocraw[1]))
+
+    def parse_orderedlist_attr(self, attrs: list):
+        """Translate ``[start, style enum, delimiter enum]`` into a dict.
+
+        The pandoc enum names are mapped to the lower-case names below;
+        parse_enum raises Exception for names missing from these tables.
+        """
+        style_names = {
+            "DefaultStyle": "default",
+            "Example": "example",
+            "Decimal": "decimal",
+            "LowerRoman": "lower_roman",
+            "UpperRoman": "upper_roman",
+            "LowerAlpha": "lower_alpha",
+            "UpperAlpha": "upper_alpha",
+        }
+        delim_names = {
+            "DefaultDelim": "default",
+            "Period": "period",
+            "OneParen": "one_parenthesis",
+            "TwoParens": "two_parentheses",
+        }
+        return {
+            "start": attrs[0],
+            "style": self.parse_enum(style_names, attrs[1]),
+            "delim": self.parse_enum(delim_names, attrs[2]),
+        }
+
+# Block quote: the payload is a list of raw blocks, parsed into content.
+class BlockQuote(Block): # [Block]
+    etype = "blockquote"
+    def parse_internal(self, pandocraw):
+        self.content = self.parse_blocks(pandocraw)
+        # TODO add name, color, time
+
+# Plain block: the payload is a list of raw inlines, parsed into content.
+class BlockPlain(Block): # [Inline]
+    etype = "plain"
+    def parse_internal(self, pandocraw):
+        self.content = self.parse_inlines(pandocraw)
+
+# Paragraph: the payload is a list of raw inlines, parsed into content.
+class BlockParagraph(Block): # [Inline]
+    etype = "paragraph"
+    def parse_internal(self, pandocraw):
+        self.content = self.parse_inlines(pandocraw)
+
+# Code block: attributes plus the code text.
+class BlockCode(Block): # Attr Text
+    etype = "codeblock"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [attr, code]
+        self.attr = self.parse_attr(pandocraw[0])
+        # sets the 'code' and 'code_lines' attributes returned by parse_code
+        self.update(self.parse_code(pandocraw[1]))
+
+# Horizontal rule: nothing is read from the payload.
+class BlockHorizontalRule(Block): #
+    etype = "horizontalrule"
+    def parse_internal(self, pandocraw):
+        pass
+
+# Generic block container (registered for pandoc's "Div"): attributes plus
+# nested blocks.
+class BlockContainer(Block): # Attr [Block]
+    # a div
+    etype = "blockcontainer"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [attr, blocks]
+        self.attr = self.parse_attr(pandocraw[0])
+        self.content = self.parse_blocks(pandocraw[1])
+        # TODO handle alerts
+
+############################## INLINE #########################################
+
+# Common base of all inline elements; eclass is what Element exports under
+# the "class" key.
+class Inline(Element):
+    eclass = "inline"
+
+# Inter-word space: nothing is read from the payload.
+class InlineSpace(Inline): #
+    etype = "space"
+    def parse_internal(self, pandocraw):
+        pass
+
+# Line break: nothing is read from the payload.
+class InlineLineBreak(Inline): #
+    etype = "linebreak"
+    def parse_internal(self, pandocraw):
+        pass
+
+# Soft break: nothing is read from the payload.
+class InlineSoftBreak(Inline): #
+    etype = "softbreak"
+    def parse_internal(self, pandocraw):
+        pass
+
+# Text run: the payload is the string itself (an empty string becomes None).
+class InlineString(Inline): # Text
+    etype = "string"
+    def parse_internal(self, pandocraw):
+        self.text = self.parse_text(pandocraw)
+        # TODO handle abbreviations
+
+# Generic inline wrapper whose payload is just a list of inlines; the etype
+# comes from the register entry (strong, emph, underline, ...).
+class InlineSimple(Inline): # [Inline]
+    def __init__(self, etype):
+        # assigned directly; Element.__init__ is then called without an etype
+        # and therefore leaves this value untouched
+        self.etype = etype
+        super().__init__()
+
+    def parse_internal(self, pandocraw):
+        self.content = self.parse_inlines(pandocraw)
+
+# Hyperlink: attributes, the parsed link text and the target.
+class InlineLink(Inline): # Attr [Inline] Target
+    etype = "link"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [attr, inlines, target]
+        self.attr = self.parse_attr(pandocraw[0])
+        self.content = self.parse_inlines(pandocraw[1])
+        # sets the 'url' and 'title' attributes returned by parse_target
+        self.update(self.parse_target(pandocraw[2]))
+
+# Image: attributes, the parsed inlines stored as 'alt', and the target.
+class InlineImage(Inline): # Attr [Inline] Target
+    etype = "image"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [attr, inlines, target]
+        self.attr = self.parse_attr(pandocraw[0])
+        self.alt = self.parse_inlines(pandocraw[1])
+        # sets the 'url' and 'title' attributes returned by parse_target
+        self.update(self.parse_target(pandocraw[2]))
+
+class InlineQuoted(Inline): # QuoteType [Inline]
+    """Quoted text: the quote type ("single"/"double") plus the quoted inlines.
+
+    The second payload field is a list of inline nodes, not a plain string,
+    so it is parsed with parse_inlines() rather than passed through raw.
+    """
+    etype = "quoted"
+    def parse_internal(self, pandocraw):
+        self.quotetype = self.parse_enum({"SingleQuote": "single", "DoubleQuote": "double"}, pandocraw[0])
+        # 'content' matches the attribute name used by the other inline
+        # containers
+        self.content = self.parse_inlines(pandocraw[1])
+        # kept so that consumers reading the previously exported 'text' key
+        # still find the quoted inlines there
+        self.text = self.content
+
+# Math: the math type mapped to "display"/"inline" plus the math source text.
+class InlineMath(Inline): # MathType Text
+    etype = "math"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [math type enum, text]
+        self.mathtype = self.parse_enum({"DisplayMath": "display", "InlineMath": "inline"}, pandocraw[0])
+        self.math = self.parse_text(pandocraw[1])
+
+# Inline code: attributes plus the code text.
+class InlineCode(Inline): # Attr Text
+    etype = "code"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [attr, code]
+        self.attr = self.parse_attr(pandocraw[0])
+        # sets the 'code' and 'code_lines' attributes returned by parse_code
+        self.update(self.parse_code(pandocraw[1]))
+
+# Generic inline container (registered for pandoc's "Span"): attributes plus
+# nested inlines.
+class InlineContainer(Inline): # Attr [Inline]
+    etype = "inlinecontainer"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [attr, inlines]
+        self.attr = self.parse_attr(pandocraw[0])
+        self.content = self.parse_inlines(pandocraw[1])
+        # TODO handle emojis
+
+# Footnote (registered for pandoc's "Note"): the payload is a list of raw
+# blocks, parsed into content.
+class InlineFootnote(Inline): # [Block]
+    etype = "footnote"
+    def parse_internal(self, pandocraw):
+        self.content = self.parse_blocks(pandocraw)
+        # TODO add footnote ids # TODO add back references # TODO handle duplicates
+
+class InlineRaw(Inline): # Format Text
+    """Raw inline: keeps the format name and the raw text (empty -> None)."""
+    # Own type name, mirroring BlockRaw's "rawblock"; the previous value
+    # "inlinecontainer" made raw inlines indistinguishable from
+    # InlineContainer in the exported output.
+    etype = "rawinline"
+    def parse_internal(self, pandocraw):
+        # pandocraw is read positionally as [format, text]
+        self.format = self.parse_text(pandocraw[0])
+        self.raw = self.parse_text(pandocraw[1])
+
+
+# Maps pandoc inline type names (the "t" field of a raw node) to their
+# handlers, as consumed by parse_from_register: either a class that is
+# instantiated without arguments, or a dict whose "handler" class is
+# instantiated with the given "etype".
+inline_parsing_register = {
+    "Space"      : InlineSpace,
+    "Str"        : InlineString,
+    "Strong"     :{"handler" : InlineSimple, "etype":"strong"     },
+    "Emph"       :{"handler" : InlineSimple, "etype":"emph"       },
+    "Underline"  :{"handler" : InlineSimple, "etype":"underline"  },
+    "Strikeout"  :{"handler" : InlineSimple, "etype":"strikeout"  },
+    "Superscript":{"handler" : InlineSimple, "etype":"superscript"},
+    "Subscript"  :{"handler" : InlineSimple, "etype":"subscript"  },
+    "SmallCaps"  :{"handler" : InlineSimple, "etype":"smallcaps"  },
+    "Link"       : InlineLink,
+    "Image"      : InlineImage,
+    "Quoted"     : InlineQuoted,
+    "Math"       : InlineMath,
+    "Code"       : InlineCode,
+    "Span"       : InlineContainer,
+    "RawInline"  : InlineRaw,
+    "Note"       : InlineFootnote,
+    #"Cite"       :{"type":"citation","TODO": True, "c" : [] }, # [Citation] [Inline] # TODO find file that triggers Cite
+    "SoftBreak"  : InlineSoftBreak,
+    "LineBreak"  : InlineLineBreak,
+}
+# Maps pandoc block type names (the "t" field of a raw node) to their
+# handler classes, as consumed by parse_from_register. Entries of the form
+# {"TODO": True} are not implemented yet: parse_from_register prints a
+# warning for them and returns None. Types missing from this dict make
+# parse_from_register raise.
+block_parsing_register = {
+    "Plain"         : BlockPlain,
+    "Para"          : BlockParagraph,
+    "BlockQuote"    : BlockQuote,
+    "BulletList"    : BlockBulletList,
+    "RawBlock"      : BlockRaw,
+    "Header"        : BlockHeader,
+    "CodeBlock"     : BlockCode,
+    "Div"           : BlockContainer,
+    "OrderedList"   : BlockOrderedList,
+    "HorizontalRule": BlockHorizontalRule,
+    "Table"         : {"TODO": True,}, # Attr Caption [ColSpec] TableHead [TableBody] TableFoot
+    "DefinitionList": {"TODO": True,}, # [([Inline], [[Block]])]
+    #"LineBlock"     :{"type":"lineblock",     "TODO": True, "c" : [] }, # [[Inline]] # TODO find file that triggers LineBlock
+    #"Null"          :{"type":"nothing" }, # TODO find file that triggers Null
+}
+
--- a/fgs/reader.py
+++ b/fgs/reader.py
 import page
+import pandoc

 import frontmatter

@@ -24,22 +25,19 @@ class MarkdownReader:
        print("parsing file: ", path, subpath)

        f = open(path)
-        rawcontent = f.read()
+        rawfile = f.read()
        f.close()
-        metadata, _ = frontmatter.parse(rawcontent)
+        metadata, rawcontent = frontmatter.parse(rawfile)

        #print(metadata)

        category_name = self.get_category_name(metadata, subpath)

        # content
-        content = self.run_pandoc(rawcontent, self.config['pandoc']['base'], self.config['pandoc']['extensions'], "html5")
+        content, contentmetadata = pandoc.run_pandoc(source=rawcontent, base=self.config['pandoc']['base'], extensions=self.config['pandoc']['extensions'])
+        metadata.update(contentmetadata) # merge content specific metadata into metadata
        #print(content)

-        # TOC
-        toc = self.run_pandoc(rawcontent, self.config['pandoc']['base'], self.config['pandoc']['extensions'], "html5", ["--template", "./pandoc_toc.html", "--toc", "--toc-depth", str(self.config['toc_depth'])])
-        #print((toc))
-    
        # title
        if 'title' not in metadata:
            raise Exception("File is missing title in metadata: ", path, subpath)
@@ -117,10 +115,9 @@ class MarkdownReader:
        p = page.Page(
                filename,
                subpath,
-                rawcontent,
+                rawfile,
                metadata,
                content,
-                toc,
                title,
                category_name,
                slug,
@@ -182,37 +179,3 @@ class MarkdownReader:
        return out_str


-    def run_pandoc(self, source, base="markdown", extensions=[], to="json", extra_args=[]):
-        ext_str = ""
-        if isinstance(extensions, list):
-            for ext in extensions:
-                if ext.startswith('#'):
-                    continue
-                if ext.startswith('+') or ext.startswith('-'):
-                    ext_str = ext_str + ext
-                elif len(ext) > 0:
-                    ext_str = ext_str + '+' + ext
-        elif isinstance(extensions, dict):
-            for ext_key in extensions:
-                # TODO catch 'illegal' ext_keys (containing spaces for example)
-                ext = extensions[ext_key]
-                if "ignore" in ext and ext["ignore"]:
-                    continue
-                flag='+'
-                if "enabled" in ext and not ext["enabled"]:
-                    flag='-'
-                ext_str = ext_str + flag + ext_key
-    
-        #print(ext_str)
-        pandoc_bin = "pandoc"
-        args = [pandoc_bin, "-f", base + ext_str, "-t", to] + extra_args
-        p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-        out, _ = p.communicate(source.encode('utf-8', errors='strict'))
-        out_str = out.decode('utf-8')
-        #print("----------------------")
-        #print(out_str)
-        #print("----------------------")
-        #json_dict = json.loads(out.decode('utf-8'))
-        return out_str
-
-