"""Markdown filters with mistune Used from markdown.py """ # Copyright (c) IPython Development Team. # Distributed under the terms of the Modified BSD License. import base64 import mimetypes import os import re from functools import partial try: from html import escape html_escape = partial(escape, quote=False) except ImportError: # Python 2 from cgi import escape as html_escape import bs4 import nbconvert.filters._mistune as mistune from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import get_lexer_by_name from pygments.util import ClassNotFound from nbconvert.filters.strings import add_anchor class InvalidNotebook(Exception): pass class MathBlockGrammar(mistune.BlockGrammar): """This defines a single regex comprised of the different patterns that identify math content spanning multiple lines. These are used by the MathBlockLexer. """ multi_math_str = "|".join( [r"^\$\$.*?\$\$", r"^\\\\\[.*?\\\\\]", r"^\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}"] ) multiline_math = re.compile(multi_math_str, re.DOTALL) class MathBlockLexer(mistune.BlockLexer): """This acts as a pass-through to the MathInlineLexer. It is needed in order to avoid other block level rules splitting math sections apart. """ default_rules = ["multiline_math"] + mistune.BlockLexer.default_rules def __init__(self, rules=None, **kwargs): if rules is None: rules = MathBlockGrammar() super().__init__(rules, **kwargs) def parse_multiline_math(self, m): """Add token to pass through mutiline math.""" self.tokens.append({"type": "multiline_math", "text": m.group(0)}) class MathInlineGrammar(mistune.InlineGrammar): """This defines different ways of declaring math objects that should be passed through to mathjax unaffected. These are used by the MathInlineLexer. """ inline_math = re.compile(r"^\$(.+?)\$|^\\\\$(.+?)\\\\$", re.DOTALL) block_math = re.compile(r"^\$\$(.*?)\$\$|^\\\\\[(.*?)\\\\\]", re.DOTALL) latex_environment = re.compile(r"^\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}", re.DOTALL) text = re.compile(r"^[\s\S]+?(?=[\\%s\n" % mistune.escape(code) formatter = HtmlFormatter() return highlight(code, lexer, formatter) def block_html(self, html): embed_images = self.options.get("embed_images", False) if embed_images: html = self._html_embed_images(html) return super().block_html(html) def inline_html(self, html): embed_images = self.options.get("embed_images", False) if embed_images: html = self._html_embed_images(html) return super().inline_html(html) def header(self, text, level, raw=None): html = super().header(text, level, raw=raw) if self.options.get("exclude_anchor_links"): return html anchor_link_text = self.options.get("anchor_link_text", "¶") return add_anchor(html, anchor_link_text=anchor_link_text) def escape_html(self, text): return html_escape(text) def block_math(self, text): return "$$%s$$" % self.escape_html(text) def latex_environment(self, name, text): name = self.escape_html(name) text = self.escape_html(text) return rf"\begin{{{name}}}{text}\end{{{name}}}" def inline_math(self, text): return "$%s$" % self.escape_html(text) def image(self, src, title, text): """Rendering a image with title and text. :param src: source link of the image. :param title: title text of the image. :param text: alt text of the image. """ attachments = self.options.get("attachments", {}) attachment_prefix = "attachment:" embed_images = self.options.get("embed_images", False) if src.startswith(attachment_prefix): name = src[len(attachment_prefix) :] if name not in attachments: raise InvalidNotebook(f"missing attachment: {name}") attachment = attachments[name] # we choose vector over raster, and lossless over lossy preferred_mime_types = ["image/svg+xml", "image/png", "image/jpeg"] for preferred_mime_type in preferred_mime_types: if preferred_mime_type in attachment: break else: # otherwise we choose the first mimetype we can find preferred_mime_type = list(attachment.keys())[0] mime_type = preferred_mime_type data = attachment[mime_type] src = "data:" + mime_type + ";base64," + data elif embed_images: base64_url = self._src_to_base64(src) if base64_url is not None: src = base64_url return super().image(src, title, text) def _src_to_base64(self, src): """Turn the source file into a base64 url. :param src: source link of the file. :return: the base64 url or None if the file was not found. """ path = self.options.get("path", "") src_path = os.path.join(path, src) if not os.path.exists(src_path): return None with open(src_path, "rb") as fobj: mime_type = mimetypes.guess_type(src_path)[0] base64_data = base64.b64encode(fobj.read()) base64_data = base64_data.replace(b"\n", b"").decode("ascii") return f"data:{mime_type};base64,{base64_data}" def _html_embed_images(self, html): parsed_html = bs4.BeautifulSoup(html, features="html.parser") imgs = parsed_html.find_all("img") # Replace img tags's sources by base64 dataurls for img in imgs: if "src" not in img.attrs: continue base64_url = self._src_to_base64(img.attrs["src"]) if base64_url is not None: img.attrs["src"] = base64_url return str(parsed_html) def markdown2html_mistune(source): """Convert a markdown string to HTML using mistune""" return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source)