Skip to content
Snippets Groups Projects
Verified Commit 529a88a0 authored by Jake's avatar Jake :speech_balloon:
Browse files

added pandoc.py

parent 39f0b334
No related branches found
No related tags found
No related merge requests found
......@@ -5,7 +5,6 @@ class Page:
#raw = None
#metadata = None
#content = None
#toc = None
#title = None
#category = None
#slug = None
......@@ -18,13 +17,12 @@ class Page:
#template = None
def __init__(self, filename, subpath, raw, metadata, content, toc, title, category, slug, lang, date_created, date_modified, authors, last_modification_author, status, config):
def __init__(self, filename, subpath, raw, metadata, content, title, category, slug, lang, date_created, date_modified, authors, last_modification_author, status, config):
self.filename = filename
self.subpath = subpath
self.raw = raw
self.metadata = metadata
self.content = content
self.toc = toc
self.title = title
self.category = category
self.slug = slug
......
#!/usr/bin/env python3
import validators
from typing import List, Dict
import subprocess
import json
import sys
import os
# The only pandoc JSON API version whose AST layout this module understands.
_SUPPORTED_PANDOC_API_VERSION = [1, 22, 2]


def _pandoc_extension_suffix(extensions):
    """Build the ``+ext``/``-ext`` suffix that is appended to the input format.

    ``extensions`` may be a list of names (entries starting with ``#`` are
    skipped, entries already starting with ``+``/``-`` are used verbatim and
    every other non-empty entry is enabled with ``+``) or a dict mapping
    the extension name to an options dict (``ignore`` skips the entry,
    ``enabled: False`` turns it into ``-name``). Anything else yields "".
    """
    parts = []
    if isinstance(extensions, list):
        for ext in extensions:
            if ext.startswith('#'):
                continue
            if ext.startswith(('+', '-')):
                parts.append(ext)
            elif len(ext) > 0:
                parts.append('+' + ext)
    elif isinstance(extensions, dict):
        for ext_key, ext in extensions.items():
            # TODO catch 'illegal' ext_keys (containing spaces for example)
            if "ignore" in ext and ext["ignore"]:
                continue
            flag = '+'
            if "enabled" in ext and not ext["enabled"]:
                flag = '-'
            parts.append(flag + ext_key)
    return "".join(parts)


def run_pandoc(source, base="markdown", extensions=None, extra_args=None):
    """Convert ``source`` with pandoc and parse the resulting JSON AST.

    Args:
        source: document text; it is sent to pandoc on stdin as UTF-8.
        base: pandoc input format the extension flags are appended to.
        extensions: list or dict of extensions (see
            ``_pandoc_extension_suffix``); ``None`` means no extensions.
        extra_args: additional command line arguments for pandoc.

    Returns:
        A tuple ``(blocks, contentmetadata)``. ``blocks`` is the JSON *string*
        of the parsed top-level blocks, ``contentmetadata`` a dict holding the
        (currently always empty) ``toc_list`` and ``toc_count``.

    Raises:
        Exception: if pandoc exits with a non-zero status, if it reports an
            unsupported pandoc-api version, or if a block type is unknown.
    """
    to = "json"
    pandoc_bin = "pandoc"
    args = [pandoc_bin, "-f", base + _pandoc_extension_suffix(extensions), "-t", to]
    args += list(extra_args) if extra_args else []

    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    out, _ = p.communicate(source.encode('utf-8', errors='strict'))
    # Without this check a failed run only shows up as a JSON decode error on
    # the empty output; pandoc's own message has already gone to stderr.
    if p.returncode != 0:
        raise Exception("pandoc exited with a non-zero status", p.returncode, args)

    out_dict = json.loads(out.decode('utf-8'))
    api_version = out_dict["pandoc-api-version"]
    if api_version != _SUPPORTED_PANDOC_API_VERSION:
        raise Exception("Unsupported pandoc-api version", api_version)

    # parse blocks
    blocks = [
        parse_from_register(block_parsing_register, raw_block)
        for raw_block in out_dict['blocks']
    ]

    contentmetadata = {"toc_list": [], "toc_count": 0}
    # TODO TOC
    #contentmetadata["toc"] = build_toc(n["toc_list"].copy())

    blocks = json.dumps(blocks, cls=ElementEncoder) # TODO use json.loads() otherwise this is just a string
    return (blocks, contentmetadata)
def parse_from_register(reg: dict, h: dict):
    """Instantiate and parse the element registered for a raw pandoc node.

    ``h`` is a raw pandoc node: its ``'t'`` key selects the entry in ``reg``
    and its optional ``'c'`` key is the payload handed to the element's
    ``parse`` method (``None`` when absent).

    A register entry is either a class that is called without arguments, or a
    dict with a ``'handler'`` class that is called with the entry's ``'etype'``.
    Dict entries flagged with a truthy ``"TODO"`` only emit a warning on
    stderr and make this function return ``None``.

    Raises:
        Exception: if the pandoc type is not present in ``reg``.
    """
    pandoc_type = h['t']
    if pandoc_type not in reg:
        raise Exception("pandoc type not in register", pandoc_type, h)
    entry = reg[pandoc_type]
    payload = h['c'] if 'c' in h else None

    if not isinstance(entry, dict):
        element = entry()
    elif "TODO" in entry and entry["TODO"]:
        print("Warning: entry is marked as TODO: ", pandoc_type, entry, file=sys.stderr)
        return None
    else:
        element = entry['handler'](entry['etype'])

    element.parse(payload)
    return element
class ElementEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialise ``Element`` instances."""

    def default(self, obj):
        """Turn an ``Element`` into a dict of its exported attributes.

        The element's ``export`` mapping (attribute name -> output key) decides
        which attributes are written and under which key. Every other object
        is delegated to the base encoder.
        """
        if not isinstance(obj, Element):
            return super().default(obj)
        return {name: getattr(obj, key) for key, name in obj.export.items()}
class Element():
    """Base class of every parsed pandoc AST node (blocks and inlines).

    Each instance keeps a ``children`` list of the elements it parsed and an
    ``export`` mapping (attribute name -> output key) that lists the
    attributes to serialise.
    """

    def __init__(self, etype = None):
        """Create an empty element; ``etype`` overrides the class-level type tag."""
        if etype is not None:
            self.etype = etype
        self.children = []
        self.export = {}
        self.export_key('etype', 'type')
        self.export_key('eclass', 'class')
        self.export_key('children')

    def addChild(self, child):
        """Append ``child`` to this element's children."""
        self.children.append(child)

    def parse_internal(self, pandocraw):
        """Parse the pandoc payload; must be overridden by concrete elements."""
        raise Exception("parse_internal not overridden: ", self)

    def parse(self, pandocraw):
        """Run ``parse_internal`` and export every attribute it newly created."""
        # Comparing dir() before and after is how new attributes are detected,
        # so subclasses do not have to register them by hand.
        known_keys = set(dir(self))
        self.parse_internal(pandocraw)
        for key in dir(self):
            if key not in known_keys:
                self.export_key(key)

    def export_key(self, key, name=None):
        """Mark attribute ``key`` for export under ``name`` (default: ``key``)."""
        if name is None:
            name = key
        self.export[key] = name

    def parse_blocks(self, raw_blocks):
        """Parse a list of raw blocks; raises if ``raw_blocks`` is not a list."""
        if not isinstance(raw_blocks, list):
            raise Exception("raw_blocks is not a list: ", raw_blocks)
        return [self.parse_block(raw_block) for raw_block in raw_blocks]

    def parse_inlines(self, raw_inlines):
        """Parse a list of raw inlines; raises if ``raw_inlines`` is not a list."""
        if not isinstance(raw_inlines, list):
            raise Exception("raw_inlines is not a list: ", raw_inlines)
        return [self.parse_inline(raw_inline) for raw_inline in raw_inlines]

    def parse_block(self, raw_block):
        """Parse one raw block, add it as a child and return it."""
        res = parse_from_register(block_parsing_register, raw_block)
        self.addChild(res)
        return res

    def parse_inline(self, raw_inline):
        """Parse one raw inline, add it as a child and return it."""
        res = parse_from_register(inline_parsing_register, raw_inline)
        self.addChild(res)
        return res

    def parse_attr(self, raw_attr):
        """Convert a pandoc ``Attr`` into ``{'id', 'classes', 'extra'}``.

        ``raw_attr`` is ``[identifier, [class, ...], key_values]``. Pandoc's
        JSON writes ``key_values`` as a list of ``[key, value]`` pairs; a flat
        ``[key1, value1, key2, value2]`` list is accepted as well. The
        key/value attributes end up in ``extra`` so that they cannot overwrite
        ``id`` or ``classes``. Empty strings are normalised to ``None``.
        """
        #print("called parse_attr: ", raw_attr)
        res = {
            'id': self.parse_text(raw_attr[0]),
            'classes': [self.parse_text(c) for c in raw_attr[1]],
        }
        raw_pairs = raw_attr[2]
        if all(isinstance(pair, (list, tuple)) for pair in raw_pairs):
            pairs = [(pair[0], pair[1]) for pair in raw_pairs]
        else:
            # convert [ "key1", "value1", "key2", "value2" ] to pairs
            it = iter(raw_pairs)
            pairs = zip(it, it)
        res['extra'] = {key: self.parse_text(value) for key, value in pairs}
        return res

    def parse_text(self, raw_text):
        """Return ``raw_text`` unchanged, or ``None`` when it is empty."""
        if len(raw_text) > 0:
            return raw_text
        return None

    def parse_int(self, raw_num):
        """Return ``raw_num`` unchanged (no validation yet)."""
        return raw_num # TODO

    def parse_target(self, raw_target): # For URLs
        """Convert a ``[url, title]`` target into ``{'url', 'title'}``."""
        return {
            'url': self.parse_text(raw_target[0]),
            'title': self.parse_text(raw_target[1]),
        }

    def parse_code(self, code):
        """Return the code text plus the same text split into lines."""
        return {
            "code": self.parse_text(code),
            "code_lines": code.splitlines(),
        }

    def parse_enum(self, mapping, enum):
        """Translate a pandoc enum ``{"t": name}`` through ``mapping``.

        Raises:
            Exception: if ``enum`` is not a dict with the single key ``"t"``
                or if its name is missing from ``mapping``.
        """
        if len(enum.keys()) != 1 or "t" not in enum:
            raise Exception("enum is not a valid enum", enum, mapping)
        enum = enum["t"]
        if enum not in mapping:
            raise Exception("enum not found in mapping")
        return mapping[enum]

    def update(self, d): # Like dict.update()
        """Set every key of ``d`` as an attribute on this element."""
        for key, value in d.items():
            setattr(self, key, value)
################################ BLOCK #########################################
class Block(Element):
    """Common base of all block-level elements; tags them with class "block"."""

    eclass = "block"
class BlockHeader(Block):
    """Header block. Pandoc payload: ``Int Attr [Inline]``."""

    etype = "header"

    def parse_internal(self, pandocraw):
        """Store the heading level, its attributes and its inline content."""
        raw_level, raw_attr, raw_inlines = pandocraw[0], pandocraw[1], pandocraw[2]
        self.level = self.parse_int(raw_level)
        self.attr = self.parse_attr(raw_attr)
        self.content = self.parse_inlines(raw_inlines)
class BlockRaw(Block):
    """Raw block. Pandoc payload: ``Format Text``."""

    etype = "rawblock"

    def parse_internal(self, pandocraw):
        """Store the format name as ``format`` and the raw text as ``raw``."""
        raw_format, raw_body = pandocraw[0], pandocraw[1]
        self.format = self.parse_text(raw_format)
        self.raw = self.parse_text(raw_body)
class BlockList(Block):
    """Shared base of the list blocks. List items arrive as ``[[Block]]``."""

    def parse_listitems(self, rawitems):
        """Parse every item's blocks.

        Returns a dict with ``items`` (one list of parsed blocks per list
        item) and ``count`` (the number of items).
        """
        parsed_items = [self.parse_blocks(item_blocks) for item_blocks in rawitems]
        return {'items': parsed_items, 'count': len(parsed_items)}
class BlockBulletList(BlockList):
    """Bullet list block. Pandoc payload: ``[[Block]]``."""

    etype = "bulletlist"

    def parse_internal(self, pandocraw):
        """Parse the list items and store ``items``/``count`` on the element."""
        list_fields = self.parse_listitems(pandocraw)
        self.update(list_fields)
class BlockOrderedList(BlockList):
    """Ordered list block. Pandoc payload: ``ListAttributes [[Block]]``."""

    etype = "orderedlist"

    # pandoc number style name -> exported style name
    _NUMBER_STYLES = {
        "DefaultStyle": "default",
        "Example": "example",
        "Decimal": "decimal",
        "LowerRoman": "lower_roman",
        "UpperRoman": "upper_roman",
        "LowerAlpha": "lower_alpha",
        "UpperAlpha": "upper_alpha",
    }

    # pandoc number delimiter name -> exported delimiter name
    _NUMBER_DELIMS = {
        "DefaultDelim": "default",
        "Period": "period",
        "OneParen": "one_parenthesis",
        "TwoParens": "two_parentheses",
    }

    def parse_internal(self, pandocraw):
        """Store the list attributes first, then the parsed list items."""
        raw_list_attr, raw_items = pandocraw[0], pandocraw[1]
        self.update(self.parse_orderedlist_attr(raw_list_attr))
        self.update(self.parse_listitems(raw_items))

    def parse_orderedlist_attr(self, attrs: list):
        """Convert ``[start, style, delim]`` into ``{'start', 'style', 'delim'}``.

        ``start`` is passed through unchanged; ``style`` and ``delim`` are
        pandoc enums that are translated to their exported names.
        """
        return {
            "start": attrs[0],
            "style": self.parse_enum(self._NUMBER_STYLES, attrs[1]),
            "delim": self.parse_enum(self._NUMBER_DELIMS, attrs[2]),
        }
class BlockQuote(Block):
    """Block quote. Pandoc payload: ``[Block]``."""

    etype = "blockquote"

    def parse_internal(self, pandocraw):
        """Parse the quoted blocks and store them as ``content``."""
        quoted_blocks = self.parse_blocks(pandocraw)
        self.content = quoted_blocks
        # TODO add name, color, time
class BlockPlain(Block):
    """Plain block. Pandoc payload: ``[Inline]``."""

    etype = "plain"

    def parse_internal(self, pandocraw):
        """Parse the inline list and store it as ``content``."""
        parsed_inlines = self.parse_inlines(pandocraw)
        self.content = parsed_inlines
class BlockParagraph(Block):
    """Paragraph block. Pandoc payload: ``[Inline]``."""

    etype = "paragraph"

    def parse_internal(self, pandocraw):
        """Parse the paragraph's inlines and store them as ``content``."""
        parsed_inlines = self.parse_inlines(pandocraw)
        self.content = parsed_inlines
class BlockCode(Block):
    """Code block. Pandoc payload: ``Attr Text``."""

    etype = "codeblock"

    def parse_internal(self, pandocraw):
        """Store the attributes plus the ``code``/``code_lines`` fields."""
        raw_attr, raw_code = pandocraw[0], pandocraw[1]
        self.attr = self.parse_attr(raw_attr)
        self.update(self.parse_code(raw_code))
class BlockHorizontalRule(Block):
    """Horizontal rule block; it carries no payload."""

    etype = "horizontalrule"

    def parse_internal(self, pandocraw):
        """Nothing to parse for a horizontal rule."""
class BlockContainer(Block):
    """Block container (a div). Pandoc payload: ``Attr [Block]``."""

    etype = "blockcontainer"

    def parse_internal(self, pandocraw):
        """Store the container's attributes and its parsed child blocks."""
        raw_attr, raw_blocks = pandocraw[0], pandocraw[1]
        self.attr = self.parse_attr(raw_attr)
        self.content = self.parse_blocks(raw_blocks)
        # TODO handle alerts
############################## INLINE #########################################
class Inline(Element):
    """Common base of all inline elements; tags them with class "inline"."""

    eclass = "inline"
class InlineSpace(Inline):
    """Inter-word space; it carries no payload."""

    etype = "space"

    def parse_internal(self, pandocraw):
        """Nothing to parse for a space."""
class InlineLineBreak(Inline):
    """Line break; it carries no payload."""

    etype = "linebreak"

    def parse_internal(self, pandocraw):
        """Nothing to parse for a line break."""
class InlineSoftBreak(Inline):
    """Soft break; it carries no payload."""

    etype = "softbreak"

    def parse_internal(self, pandocraw):
        """Nothing to parse for a soft break."""
class InlineString(Inline):
    """Text string. Pandoc payload: ``Text``."""

    etype = "string"

    def parse_internal(self, pandocraw):
        """Store the string as ``text`` (``None`` when it is empty)."""
        text = self.parse_text(pandocraw)
        self.text = text
        # TODO handle abbreviations
class InlineSimple(Inline):
    """Inline that only wraps other inlines. Pandoc payload: ``[Inline]``.

    One class serves several pandoc types; the concrete type tag is supplied
    by the caller through ``etype``.
    """

    def __init__(self, etype):
        """Initialise the element and tag it with the given ``etype``."""
        super().__init__()
        self.etype = etype

    def parse_internal(self, pandocraw):
        """Parse the wrapped inlines and store them as ``content``."""
        wrapped_inlines = self.parse_inlines(pandocraw)
        self.content = wrapped_inlines
class InlineLink(Inline):
    """Hyperlink. Pandoc payload: ``Attr [Inline] Target``."""

    etype = "link"

    def parse_internal(self, pandocraw):
        """Store attributes, link text (``content``) and the ``url``/``title``."""
        raw_attr, raw_inlines, raw_target = pandocraw[0], pandocraw[1], pandocraw[2]
        self.attr = self.parse_attr(raw_attr)
        self.content = self.parse_inlines(raw_inlines)
        self.update(self.parse_target(raw_target))
class InlineImage(Inline):
    """Image. Pandoc payload: ``Attr [Inline] Target``."""

    etype = "image"

    def parse_internal(self, pandocraw):
        """Store attributes, the inline list as ``alt`` and the ``url``/``title``."""
        raw_attr, raw_inlines, raw_target = pandocraw[0], pandocraw[1], pandocraw[2]
        self.attr = self.parse_attr(raw_attr)
        self.alt = self.parse_inlines(raw_inlines)
        self.update(self.parse_target(raw_target))
class InlineQuoted(Inline):
    """Quoted text. Pandoc payload: ``QuoteType [Inline]``."""

    etype = "quoted"

    def parse_internal(self, pandocraw):
        """Store the quote kind as ``quotetype`` and the quoted inlines as ``content``."""
        self.quotetype = self.parse_enum({"SingleQuote": "single", "DoubleQuote": "double"}, pandocraw[0])
        # The second field is a list of inlines, not a text value. Treating it
        # as text stored the raw, unparsed pandoc nodes on the element, so it
        # is parsed like the content of every other inline container.
        self.content = self.parse_inlines(pandocraw[1])
class InlineMath(Inline):
    """Math formula. Pandoc payload: ``MathType Text``."""

    etype = "math"

    # pandoc math type name -> exported math type name
    _MATH_TYPES = {"DisplayMath": "display", "InlineMath": "inline"}

    def parse_internal(self, pandocraw):
        """Store the math kind as ``mathtype`` and the formula text as ``math``."""
        raw_kind, raw_formula = pandocraw[0], pandocraw[1]
        self.mathtype = self.parse_enum(self._MATH_TYPES, raw_kind)
        self.math = self.parse_text(raw_formula)
class InlineCode(Inline):
    """Inline code. Pandoc payload: ``Attr Text``."""

    etype = "code"

    def parse_internal(self, pandocraw):
        """Store the attributes plus the ``code``/``code_lines`` fields."""
        raw_attr, raw_code = pandocraw[0], pandocraw[1]
        self.attr = self.parse_attr(raw_attr)
        self.update(self.parse_code(raw_code))
class InlineContainer(Inline):
    """Inline container. Pandoc payload: ``Attr [Inline]``."""

    etype = "inlinecontainer"

    def parse_internal(self, pandocraw):
        """Store the container's attributes and its parsed child inlines."""
        raw_attr, raw_inlines = pandocraw[0], pandocraw[1]
        self.attr = self.parse_attr(raw_attr)
        self.content = self.parse_inlines(raw_inlines)
        # TODO handle emojis
class InlineFootnote(Inline):
    """Footnote. Pandoc payload: ``[Block]``."""

    etype = "footnote"

    def parse_internal(self, pandocraw):
        """Parse the footnote's blocks and store them as ``content``."""
        note_blocks = self.parse_blocks(pandocraw)
        self.content = note_blocks
        # TODO add footnote ids # TODO add back references # TODO handle duplicates
class InlineRaw(Inline):
    """Raw inline. Pandoc payload: ``Format Text``."""

    # Own type tag, mirroring "rawblock" on the block side. The previous
    # value "inlinecontainer" was shared with InlineContainer, which made raw
    # inlines indistinguishable from spans in the exported data.
    etype = "rawinline"

    def parse_internal(self, pandocraw):
        """Store the format name as ``format`` and the raw text as ``raw``."""
        self.format = self.parse_text(pandocraw[0])
        self.raw = self.parse_text(pandocraw[1])
# Maps a pandoc inline type name to its handler: either an element class, or a
# dict naming the handler class together with the etype it is created with.
inline_parsing_register = {
    "Space": InlineSpace,
    "Str": InlineString,
    "Strong": {"handler": InlineSimple, "etype": "strong"},
    "Emph": {"handler": InlineSimple, "etype": "emph"},
    "Underline": {"handler": InlineSimple, "etype": "underline"},
    "Strikeout": {"handler": InlineSimple, "etype": "strikeout"},
    "Superscript": {"handler": InlineSimple, "etype": "superscript"},
    "Subscript": {"handler": InlineSimple, "etype": "subscript"},
    "SmallCaps": {"handler": InlineSimple, "etype": "smallcaps"},
    "Link": InlineLink,
    "Image": InlineImage,
    "Quoted": InlineQuoted,
    "Math": InlineMath,
    "Code": InlineCode,
    "Span": InlineContainer,
    "RawInline": InlineRaw,
    "Note": InlineFootnote,
    # Not handled yet:
    #"Cite" :{"type":"citation","TODO": True, "c" : [] }, # [Citation] [Inline] # TODO find file that triggers Cite
    "SoftBreak": InlineSoftBreak,
    "LineBreak": InlineLineBreak,
}
# Maps a pandoc block type name to its handler class. Entries flagged
# {"TODO": True} are recognised but not parsed yet.
block_parsing_register = {
    "Plain": BlockPlain,
    "Para": BlockParagraph,
    "BlockQuote": BlockQuote,
    "BulletList": BlockBulletList,
    "RawBlock": BlockRaw,
    "Header": BlockHeader,
    "CodeBlock": BlockCode,
    "Div": BlockContainer,
    "OrderedList": BlockOrderedList,
    "HorizontalRule": BlockHorizontalRule,
    "Table": {"TODO": True},  # Attr Caption [ColSpec] TableHead [TableBody] TableFoot
    "DefinitionList": {"TODO": True},  # [([Inline], [[Block]])]
    # Not handled yet:
    #"LineBlock" :{"type":"lineblock", "TODO": True, "c" : [] }, # [[Inline]] # TODO find file that triggers LineBlock
    #"Null" :{"type":"nothing" }, # TODO find file that triggers Null
}
import page
import pandoc
import frontmatter
......@@ -24,22 +25,19 @@ class MarkdownReader:
print("parsing file: ", path, subpath)
f = open(path)
rawcontent = f.read()
rawfile = f.read()
f.close()
metadata, _ = frontmatter.parse(rawcontent)
metadata, rawcontent = frontmatter.parse(rawfile)
#print(metadata)
category_name = self.get_category_name(metadata, subpath)
# content
content = self.run_pandoc(rawcontent, self.config['pandoc']['base'], self.config['pandoc']['extensions'], "html5")
content, contentmetadata = pandoc.run_pandoc(source=rawcontent, base=self.config['pandoc']['base'], extensions=self.config['pandoc']['extensions'])
metadata.update(contentmetadata) # merge content specific metadata into metadata
#print(content)
# TOC
toc = self.run_pandoc(rawcontent, self.config['pandoc']['base'], self.config['pandoc']['extensions'], "html5", ["--template", "./pandoc_toc.html", "--toc", "--toc-depth", str(self.config['toc_depth'])])
#print((toc))
# title
if 'title' not in metadata:
raise Exception("File is missing title in metadata: ", path, subpath)
......@@ -117,10 +115,9 @@ class MarkdownReader:
p = page.Page(
filename,
subpath,
rawcontent,
rawfile,
metadata,
content,
toc,
title,
category_name,
slug,
......@@ -182,37 +179,3 @@ class MarkdownReader:
return out_str
def run_pandoc(self, source, base="markdown", extensions=None, to="json", extra_args=None):
    """Run pandoc on ``source`` and return its output as text.

    Args:
        source: document text; it is sent to pandoc on stdin as UTF-8.
        base: pandoc input format the extension flags are appended to.
        extensions: a list of extension names (entries starting with ``#``
            are skipped, entries starting with ``+``/``-`` are used verbatim,
            other non-empty entries are enabled with ``+``) or a dict mapping
            the extension name to options (``ignore`` skips the entry,
            ``enabled: False`` disables it). ``None`` means no extensions.
        to: pandoc output format.
        extra_args: additional command line arguments for pandoc.

    Returns:
        pandoc's standard output decoded as UTF-8.

    Raises:
        Exception: if pandoc exits with a non-zero status.
    """
    ext_parts = []
    if isinstance(extensions, list):
        for ext in extensions:
            if ext.startswith('#'):
                continue
            if ext.startswith(('+', '-')):
                ext_parts.append(ext)
            elif len(ext) > 0:
                ext_parts.append('+' + ext)
    elif isinstance(extensions, dict):
        for ext_key, ext in extensions.items():
            # TODO catch 'illegal' ext_keys (containing spaces for example)
            if "ignore" in ext and ext["ignore"]:
                continue
            flag = '+'
            if "enabled" in ext and not ext["enabled"]:
                flag = '-'
            ext_parts.append(flag + ext_key)
    ext_str = "".join(ext_parts)
    #print(ext_str)

    pandoc_bin = "pandoc"
    args = [pandoc_bin, "-f", base + ext_str, "-t", to]
    args += list(extra_args) if extra_args else []

    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    out, _ = p.communicate(source.encode('utf-8', errors='strict'))
    # A failed conversion must not be passed on as (empty) page content.
    if p.returncode != 0:
        raise Exception("pandoc exited with a non-zero status", p.returncode, args)
    return out.decode('utf-8')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment