import page
import frontmatter
from datetime import datetime
from dateutil import parser as dtparser
import subprocess
import os


class MarkdownReader:
    """Parse a markdown file into a ``page.Page``.

    Combines three sources of information: the front matter of the file
    (via ``frontmatter.parse``), the output of the ``pandoc`` executable
    (rendered content and table of contents) and the output of ``git log``
    for the file (dates and authors).

    The ``config`` mapping passed to the constructor is read for the keys
    ``pandoc.base``, ``pandoc.extensions``, ``toc_depth``, ``lang.default``,
    ``lang.supported`` and ``default_status``.
    """

    def __init__(self, config):
        """Store ``config`` for later lookups; no validation is done here."""
        self.config = config

    def read_and_parse_file(self, path, subpath):
        """Read the markdown file at ``path`` and build a ``page.Page``.

        Args:
            path: Path of the file; must end in ``.md``.
            subpath: Sequence of directory names below the content root.
                At most one element is accepted; when exactly one is given
                it is the fallback category name.

        Returns:
            The ``page.Page`` constructed from the collected values.

        Raises:
            Exception: If the file is not a ``.md`` file, lies too deep in
                the directory structure, has no ``title`` in its metadata,
                uses an unsupported language, yields an empty or non-string
                slug, or has an invalid status.
        """
        if not path.endswith(".md"):
            raise Exception("can only parse markdown files: ", path)
        elif len(subpath) > 1:
            raise Exception("markdown file is too deep in directory structure: ", path, subpath)

        print("parsing file: ", path, subpath)

        # The context manager closes the handle even when read() raises.
        with open(path) as f:
            rawcontent = f.read()

        metadata, _ = frontmatter.parse(rawcontent)

        category_name = self.get_category_name(metadata, subpath)

        # content
        content = self.run_pandoc(
            rawcontent,
            self.config['pandoc']['base'],
            self.config['pandoc']['extensions'],
            "html5")

        # TOC: same conversion, but rendered through the TOC-only template.
        toc = self.run_pandoc(
            rawcontent,
            self.config['pandoc']['base'],
            self.config['pandoc']['extensions'],
            "html5",
            ["--template", "./pandoc_toc.html",
             "--toc", "--toc-depth", str(self.config['toc_depth'])])

        # title
        if 'title' not in metadata:
            raise Exception("File is missing title in metadata: ", path, subpath)
        title = metadata['title']

        # basename() instead of splitting on '/' so that platform-specific
        # path separators are handled as well.
        filename = os.path.basename(path)

        slug, lang = self._resolve_slug_and_lang(filename, metadata, path, subpath)
        date_created, date_modified = self._resolve_dates(path, metadata)
        authors, last_modification_author = self._resolve_authors(path)
        status = self._resolve_status(metadata)

        # TODO summary

        return page.Page(
            filename,
            subpath,
            rawcontent,
            metadata,
            content,
            toc,
            title,
            category_name,
            slug,
            lang,
            date_created,
            date_modified,
            authors,
            last_modification_author,
            status,
            self.config)

    def _resolve_slug_and_lang(self, filename, metadata, path, subpath):
        """Derive ``(slug, lang)`` from the file name and the metadata.

        ``<slug>.md`` and ``<slug>.<lang>.md`` file names are recognised;
        ``slug`` / ``lang`` entries in ``metadata`` take precedence. A
        missing language falls back to ``config['lang']['default']``.
        """
        name_parts = filename.split('.')[:-1]  # drop the "md" extension

        slug = None
        lang = None
        if len(name_parts) < 1:
            raise Exception("filename is empty?", path, subpath)
        elif len(name_parts) == 1:
            slug = name_parts[0]
        elif len(name_parts) == 2:
            slug, lang = name_parts
        # With more than two parts nothing is derived from the file name;
        # the slug then has to come from the metadata, otherwise
        # secure_slug() below rejects the None value.

        if 'slug' in metadata:
            slug = metadata['slug']
        if 'lang' in metadata:
            lang = metadata['lang']
        if lang is None:
            lang = self.config['lang']['default']

        if not self.is_supported_lang(lang):
            raise Exception("language is not supported: ", lang)

        return self.secure_slug(slug), lang

    def _resolve_dates(self, path, metadata):
        """Return ``(date_created, date_modified)`` for the file.

        Precedence, lowest to highest: the current time, the dates reported
        by ``git log`` for the file, and the ``date`` / ``modified``
        metadata entries.
        """
        # One shared fallback so both values are identical when git knows
        # nothing about the file.
        # NOTE(review): this fallback is a naive datetime, while dates
        # parsed from git's iso-strict output carry a UTC offset -- confirm
        # that downstream code copes with both.
        now = datetime.now()
        date_created = now
        date_modified = now

        date_changes = self.run_git(
            path, "log",
            ["--follow", "--format=%ad", "--date", "iso-strict"]).splitlines()
        if date_changes:
            # The first line is taken as the latest change, the last line
            # as the earliest one.
            date_modified = datetime.fromisoformat(date_changes[0])
            date_created = datetime.fromisoformat(date_changes[-1])

        if 'date' in metadata:
            date_created = dtparser.parse(metadata['date'])
        if 'modified' in metadata:
            date_modified = dtparser.parse(metadata['modified'])

        return date_created, date_modified

    def _resolve_authors(self, path):
        """Return ``(authors, last_modification_author)`` from ``git log``.

        ``authors`` holds one ``extract_author`` tuple per distinct log
        line, in log order. ``last_modification_author`` is the tuple for
        the first log line, or ``None`` when git reports nothing.
        """
        # TODO author from metadata
        authors_raw = self.run_git(
            path, "log",
            ["--follow", "--format=%aE@%aN", "--use-mailmap"]).splitlines()

        authors = []
        seen = set()  # set membership keeps de-duplication linear
        for author_raw in authors_raw:
            if author_raw not in seen:
                authors.append(self.extract_author(author_raw))
                seen.add(author_raw)

        if authors_raw:
            last_modification_author = self.extract_author(authors_raw[0])
        else:
            last_modification_author = None
        return authors, last_modification_author

    def _resolve_status(self, metadata):
        """Return the page status from ``metadata`` or the configured default.

        Raises:
            Exception: If the status is not one of the valid values.
        """
        status = self.config['default_status']
        if 'status' in metadata:
            status = metadata['status']
        valid_status = ["published", "draft", "hidden"]
        if status not in valid_status:
            raise Exception("invalid status '", status, "' must be one of ", valid_status)
        return status

    def extract_author(self, raw):
        """Split an ``<email>@<name>`` string into its three parts.

        ``raw`` is expected in the shape produced by the git format string
        ``%aE@%aN`` used in this class, i.e. ``local@domain@name``.

        Returns:
            A tuple ``(local_part, domain, name)``. Any further ``@`` stays
            part of ``name``; ``name`` is empty when ``raw`` contains only
            one ``@``.

        Raises:
            IndexError: If ``raw`` contains no ``@`` at all.
        """
        parts = raw.split('@', 2)
        author_local_part = parts[0]
        author_domain = parts[1]
        author_name = parts[2] if len(parts) > 2 else ''
        return (author_local_part, author_domain, author_name)

    def is_supported_lang(self, lang):
        """Return True if ``lang`` is a string listed in ``config['lang']['supported']``."""
        return isinstance(lang, str) and lang in self.config['lang']['supported']

    def secure_slug(self, slug):
        """Lower-case ``slug`` and drop everything except ``a-z`` and ``0-9``.

        Raises:
            Exception: If ``slug`` is not a string or nothing is left after
                filtering.
        """
        if not isinstance(slug, str):
            raise Exception("slug is not a string: '", slug, "'")

        # The slug is lower-cased first, so upper-case letters never reach
        # the filter and need no whitelist entries.
        whitelist = "abcdefghijklmnopqrstuvwxyz0123456789"
        res = "".join(c for c in slug.lower() if c in whitelist)
        if not res:
            raise Exception("slug is empty")
        return res

    def get_category_name(self, metadata, subpath):
        """Return the category: metadata entry, else the single subpath
        element, else ``'misc'``."""
        if 'category' in metadata:
            return metadata['category']
        elif len(subpath) == 1:
            return subpath[0]
        else:
            return 'misc'

    def run_git(self, path, subcmd, extra_args):
        """Run ``git <subcmd> <extra_args> -- <file>`` and return its stdout.

        The command runs in the directory containing the resolved ``path``
        and receives only the file's base name. The exit status is not
        inspected, so a failing git call simply yields whatever it wrote to
        stdout.

        Args:
            path: Path of the file to query.
            subcmd: Git sub-command, e.g. ``"log"``.
            extra_args: List of additional command line arguments.

        Returns:
            The captured stdout decoded as UTF-8.
        """
        real_path = os.path.realpath(path)
        filename = os.path.basename(real_path)
        dir_path = os.path.dirname(real_path)

        git_bin = "git"
        args = [git_bin, subcmd] + extra_args + ["--", filename]
        # Empty input gives git a closed stdin pipe, as before.
        proc = subprocess.run(args, input=b"", stdout=subprocess.PIPE, cwd=dir_path)
        return proc.stdout.decode('utf-8')

    def run_pandoc(self, source, base="markdown", extensions=None, to="json", extra_args=None):
        """Convert ``source`` with the ``pandoc`` executable.

        Args:
            source: Input text; sent UTF-8 encoded on pandoc's stdin.
            base: Name of the input format.
            extensions: Format extensions appended to ``base``. Either a
                list of names (entries starting with ``#`` and empty
                entries are skipped; a missing ``+``/``-`` prefix defaults
                to ``+``) or a dict mapping the name to a collection of
                options, where a truthy ``"ignore"`` skips the entry and a
                falsy ``"enabled"`` turns it into ``-name``. ``None`` or
                any other type means no extensions.
            to: Name of the output format.
            extra_args: Additional command line arguments; ``None`` means
                none.

        Returns:
            Pandoc's stdout decoded as UTF-8. The exit status is not
            inspected.
        """
        if extra_args is None:
            extra_args = []

        ext_parts = []
        if isinstance(extensions, list):
            for ext in extensions:
                if ext.startswith('#'):
                    continue
                if ext.startswith(('+', '-')):
                    ext_parts.append(ext)
                elif len(ext) > 0:
                    ext_parts.append('+' + ext)
        elif isinstance(extensions, dict):
            for ext_key, ext in extensions.items():
                # TODO catch 'illegal' ext_keys (containing spaces for example)
                if "ignore" in ext and ext["ignore"]:
                    continue
                flag = '+'
                if "enabled" in ext and not ext["enabled"]:
                    flag = '-'
                ext_parts.append(flag + ext_key)
        ext_str = "".join(ext_parts)

        pandoc_bin = "pandoc"
        args = [pandoc_bin, "-f", base + ext_str, "-t", to] + extra_args
        proc = subprocess.run(
            args,
            input=source.encode('utf-8', errors='strict'),
            stdout=subprocess.PIPE)
        return proc.stdout.decode('utf-8')