# :Id: $Id: latex2mathml.py 9536 2024-02-01 13:04:22Z milde $
# :Copyright: © 2005 Jens Jørgen Mortensen [1]_
#             © 2010, 2021, 2024 Günter Milde.
#
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# .. [1] the original `rst2mathml.py` in `sandbox/jensj/latex_math`

"""Convert LaTeX maths code into presentational MathML.

This module is provisional:
the API is not settled and may change with any minor Docutils version.
"""

# Usage:
#
# >>> from latex2mathml import *

import re
import unicodedata

from docutils.utils.math import (MathError, mathalphabet2unichar,
                                 tex2unichar, toplevel_code)
from docutils.utils.math.mathml_elements import (
    math, mtable, mrow, mtr, mtd, menclose, mphantom, msqrt, mi, mn, mo,
    mtext, msub, msup, msubsup, munder, mover, munderover, mroot, mfrac,
    mspace, MathRow)


# Character data
# --------------

# LaTeX math macro to Unicode mappings.
# Character categories.

# identifiers -> <mi>

# Compatibility mapping: \hbar resembles italic ħ
# "unicode-math" unifies \hbar and \hslash to ℏ.
letters = {'hbar': 'ℏ'}
letters.update(tex2unichar.mathalpha)

ordinary = tex2unichar.mathord  # Miscellaneous symbols

# special case: Capital Greek letters: (upright in TeX style)
greek_capitals = {
    'Phi': '\u03a6', 'Xi': '\u039e', 'Sigma': '\u03a3',
    'Psi': '\u03a8', 'Delta': '\u0394', 'Theta': '\u0398',
    'Upsilon': '\u03d2', 'Pi': '\u03a0', 'Omega': '\u03a9',
    'Gamma': '\u0393', 'Lambda': '\u039b'}

# functions -> <mi>
functions = {
    # functions with a space in the name
    'liminf': 'lim\u202finf',
    'limsup': 'lim\u202fsup',
    'injlim': 'inj\u202flim',
    'projlim': 'proj\u202flim',
    # embellished function names (see handle_cmd() below)
    'varlimsup': 'lim',
    'varliminf': 'lim',
    'varprojlim': 'lim',
    'varinjlim': 'lim',
    # custom function name
    'operatorname': None,
}
# plain function names map to themselves
functions.update((name, name) for name in
                 ('arccos', 'arcsin', 'arctan', 'arg', 'cos',
                  'cosh', 'cot', 'coth', 'csc', 'deg',
                  'det', 'dim', 'exp', 'gcd', 'hom',
                  'ker', 'lg', 'ln', 'log', 'Pr',
                  'sec', 'sin', 'sinh', 'tan', 'tanh'))
# Function with limits: 'lim', 'sup', 'inf', 'max', 'min':
# use <mo> to allow "movablelimits" attribute (see below).
# modulo operator/arithmetic
modulo_functions = {
    # cmdname: (binary, named, parentheses, padding)
    'bmod': (True, True, False, '0.278em'),  # a mod n
    'pmod': (False, True, True, '0.444em'),  # a (mod n)
    'mod': (False, True, False, '0.667em'),  # a mod n
    'pod': (False, False, True, '0.444em'),  # a (n)
}

# "mathematical alphabets": map identifiers to the corresponding
# characters from the "Mathematical Alphanumeric Symbols" block
math_alphabets = {
    # 'cmdname': 'mathvariant value'  # package
    'mathbb': 'double-struck',  # amssymb
    'mathbf': 'bold',
    'mathbfit': 'bold-italic',  # isomath
    'mathcal': 'script',
    'mathfrak': 'fraktur',  # amssymb
    'mathit': 'italic',
    'mathrm': 'normal',
    'mathscr': 'script',  # mathrsfs et al
    'mathsf': 'sans-serif',
    'mathbfsfit': 'sans-serif-bold-italic',  # unicode-math
    'mathsfbfit': 'sans-serif-bold-italic',  # isomath
    'mathsfit': 'sans-serif-italic',  # isomath
    'mathtt': 'monospace',
    # unsupported: bold-fraktur
    #              bold-script
    #              bold-sans-serif
}

# operator, fence, or separator -> <mo>

stretchables = {
    # extensible delimiters allowed in left/right cmds
    'backslash': '\\',
    'uparrow': '\u2191',  # ↑ UPWARDS ARROW
    'downarrow': '\u2193',  # ↓ DOWNWARDS ARROW
    'updownarrow': '\u2195',  # ↕ UP DOWN ARROW
    'Uparrow': '\u21d1',  # ⇑ UPWARDS DOUBLE ARROW
    'Downarrow': '\u21d3',  # ⇓ DOWNWARDS DOUBLE ARROW
    'Updownarrow': '\u21d5',  # ⇕ UP DOWN DOUBLE ARROW
    'lmoustache': '\u23b0',  # ⎰ … CURLY BRACKET SECTION
    'rmoustache': '\u23b1',  # ⎱ … LEFT CURLY BRACKET SECTION
    'arrowvert': '\u23d0',  # ⏐ VERTICAL LINE EXTENSION
    'bracevert': '\u23aa',  # ⎪ CURLY BRACKET EXTENSION
    'lvert': '|',  # left |
    'lVert': '\u2016',  # left ‖
    'rvert': '|',  # right |
    'rVert': '\u2016',  # right ‖
    'Arrowvert': '\u2016',  # ‖
}
stretchables.update(tex2unichar.mathfence)
stretchables.update(tex2unichar.mathopen)  # Braces
stretchables.update(tex2unichar.mathclose)  # Braces

# >>> print(' '.join(sorted(set(stretchables.values()))))
# [ \ ] { | } ‖ ↑ ↓ ↕ ⇑ ⇓ ⇕ ⌈ ⌉ ⌊ ⌋ ⌜ ⌝ ⌞ ⌟ ⎪ ⎰ ⎱ ⏐ ⟅ ⟆ ⟦ ⟧ ⟨ ⟩ ⟮ ⟯ ⦇ ⦈

operators = {
    # negated symbols without pre-composed Unicode character
    'nleqq': '\u2266\u0338',  # ≦̸
    'ngeqq': '\u2267\u0338',  # ≧̸
    'nleqslant': '\u2a7d\u0338',  # ⩽̸
    'ngeqslant': '\u2a7e\u0338',  # ⩾̸
    'ngtrless': '\u2277\u0338',  # txfonts
    'nlessgtr': '\u2276\u0338',  # txfonts
    'nsubseteqq': '\u2AC5\u0338',  # ⫅̸
    'nsupseteqq': '\u2AC6\u0338',  # ⫆̸
    # compatibility definitions:
    'centerdot': '\u2B1D',  # BLACK VERY SMALL SQUARE | mathbin
    'varnothing': '\u2300',  # ⌀ DIAMETER SIGN | empty set
    'varpropto': '\u221d',  # ∝ PROPORTIONAL TO | sans serif
    'triangle': '\u25B3',  # WHITE UP-POINTING TRIANGLE | mathord
    'triangledown': '\u25BD',  # WHITE DOWN-POINTING TRIANGLE | mathord
    # alias commands:
    'dotsb': '\u22ef',  # ⋯ with binary operators/relations
    'dotsc': '\u2026',  # … with commas
    'dotsi': '\u22ef',  # ⋯ with integrals
    'dotsm': '\u22ef',  # ⋯ multiplication dots
    'dotso': '\u2026',  # … other dots
    # functions with movable limits (requires <mo>)
    'lim': 'lim',
    'sup': 'sup',
    'inf': 'inf',
    'max': 'max',
    'min': 'min',
}
operators.update(tex2unichar.mathbin)  # Binary symbols
operators.update(tex2unichar.mathrel)  # Relation symbols, arrow symbols
operators.update(tex2unichar.mathpunct)  # Punctuation
operators.update(tex2unichar.mathop)  # Variable-sized symbols
operators.update(stretchables)


# special cases

thick_operators = {
    # style='font-weight: bold;'
    'thicksim': '\u223C',  # ∼
    'thickapprox': '\u2248',  # ≈
}

small_operators = {
    # mathsize='75%'
    'shortmid': '\u2223',  # ∣
    'shortparallel': '\u2225',  # ∥
    'nshortmid': '\u2224',  # ∤
    'nshortparallel': '\u2226',  # ∦
    'smallfrown': '\u2322',  # ⌢ FROWN
    'smallsmile': '\u2323',  # ⌣ SMILE
    'smallint': '\u222b',  # ∫ INTEGRAL
}

# Operators and functions with limits above/below in display formulas
# and in index position inline (movablelimits=True)
movablelimits = ('bigcap', 'bigcup', 'bigodot', 'bigoplus', 'bigotimes',
                 'bigsqcup', 'biguplus', 'bigvee', 'bigwedge',
                 'coprod', 'intop', 'ointop', 'prod', 'sum',
                 'lim', 'max', 'min', 'sup', 'inf')
# Depending on settings, integrals may also be in this category.
# (e.g. if "amsmath" is loaded with option "intlimits", see
# http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf)
# movablelimits.extend(('fint', 'iiiint', 'iiint', 'iint', 'int', 'oiint',
#                       'oint', 'ointctrclockwise', 'sqint',
#                       'varointclockwise',))

# horizontal space -> <mspace>

spaces = {'qquad': '2em',  # two \quad
          'quad': '1em',  # 18 mu
          'thickspace': '0.2778em',  # 5mu = 5/18em
          ';': '0.2778em',  # 5mu thickspace
          ' ': '0.25em',  # inter word space
          '\n': '0.25em',  # inter word space
          'medspace': '0.2222em',  # 4mu = 2/9em
          ':': '0.2222em',  # 4mu medspace
          'thinspace': '0.1667em',  # 3mu = 1/6em
          ',': '0.1667em',  # 3mu thinspace
          'negthinspace': '-0.1667em',  # -3mu = -1/6em
          '!': '-0.1667em',  # negthinspace
          'negmedspace': '-0.2222em',  # -4mu = -2/9em
          'negthickspace': '-0.2778em',  # -5mu = -5/18em
          }

# accents: -> <mo stretchy="false"> in <mover>
accents = {
    # TeX: spacing   combining
    'acute': '´',  # '\u0301'
    'bar': 'ˉ',  # '\u0304'
    'breve': '˘',  # '\u0306'
    'check': 'ˇ',  # '\u030C'
    'dot': '˙',  # '\u0307'
    'ddot': '¨',  # '\u0308'
    'dddot': '˙˙˙',  # '\u20DB'  # or … ?
    'ddddot': '˙˙˙˙',  # '\u20DC'  # or ¨¨ ?
    'grave': '`',  # '\u0300'
    'hat': 'ˆ',  # '\u0302'
    'mathring': '˚',  # '\u030A'
    'tilde': '~',  # '\u0303'  # tilde ~ or small tilde ˜?
    'vec': '→',  # '\u20d7'  # → too heavy, use scriptlevel="+1"
}

# limits etc. -> <mo> in <mover> or <munder>
over = {
    # TeX: (char, offset-correction/em)
    'overbrace': ('\u23DE', -0.2),  # DejaVu Math -0.6
    'overleftarrow': ('\u2190', -0.2),
    'overleftrightarrow': ('\u2194', -0.2),
    'overline': ('_', -0.2),  # \u2012 does not stretch
    'overrightarrow': ('\u2192', -0.2),
    'widehat': ('^', -0.5),
    'widetilde': ('~', -0.3),
}
under = {'underbrace': ('\u23DF', 0.1),  # DejaVu Math -0.7
         'underleftarrow': ('\u2190', -0.2),
         'underleftrightarrow': ('\u2194', -0.2),
         'underline': ('_', -0.8),
         'underrightarrow': ('\u2192', -0.2),
         }

# Character translations
# ----------------------
# characters with preferred alternative in mathematical use
# cf.
# https://www.w3.org/TR/MathML3/chapter7.html#chars.anomalous
anomalous_chars = {'-': '\u2212',  # HYPHEN-MINUS -> MINUS SIGN
                   ':': '\u2236',  # COLON -> RATIO
                   '~': '\u00a0',  # NO-BREAK SPACE
                   }

# blackboard bold (Greek characters not working with "mathvariant" (Firefox 78)
mathbb = {'Γ': '\u213E',  # ℾ
          'Π': '\u213F',  # ℿ
          'Σ': '\u2140',  # ⅀
          'γ': '\u213D',  # ℽ
          'π': '\u213C',  # ℼ
          }

# Matrix environments
matrices = {
    # name: fences
    'matrix': ('', ''),
    'smallmatrix': ('', ''),  # smaller, see begin_environment()!
    'pmatrix': ('(', ')'),
    'bmatrix': ('[', ']'),
    'Bmatrix': ('{', '}'),
    'vmatrix': ('|', '|'),
    'Vmatrix': ('\u2016', '\u2016'),  # ‖
    'aligned': ('', ''),
    'cases': ('{', ''),
}

layout_styles = {
    'displaystyle': {'displaystyle': True, 'scriptlevel': 0},
    'textstyle': {'displaystyle': False, 'scriptlevel': 0},
    'scriptstyle': {'displaystyle': False, 'scriptlevel': 1},
    'scriptscriptstyle': {'displaystyle': False, 'scriptlevel': 2},
}
# See also https://www.w3.org/TR/MathML3/chapter3.html#presm.scriptlevel

fractions = {
    # name: attributes
    'frac': {},
    'cfrac': {'displaystyle': True, 'scriptlevel': 0,
              'class': 'cfrac'},  # in LaTeX with padding
    'dfrac': layout_styles['displaystyle'],
    'tfrac': layout_styles['textstyle'],
    'binom': {'linethickness': 0},
    'dbinom': layout_styles['displaystyle'] | {'linethickness': 0},
    'tbinom': layout_styles['textstyle'] | {'linethickness': 0},
}

delimiter_sizes = ['', '1.2em', '1.623em', '2.047em', '2.470em']

bigdelimiters = {'left': 0,
                 'right': 0,
                 'bigl': 1,
                 'bigr': 1,
                 'Bigl': 2,
                 'Bigr': 2,
                 'biggl': 3,
                 'biggr': 3,
                 'Biggl': 4,
                 'Biggr': 4,
                 }


# LaTeX to MathML translation
# ---------------------------

# auxiliary functions
# ~~~~~~~~~~~~~~~~~~~

def tex_cmdname(string):
    """Return leading TeX command name and remainder of `string`.

    A command name is a run of ASCII letters (spaces and newlines following
    it are dropped) or, failing that, the single first character.
    Both parts are empty strings if `string` is empty.

    >>> tex_cmdname('mymacro2')  # up to first non-letter
    ('mymacro', '2')
    >>> tex_cmdname('name 2')  # strip trailing whitespace
    ('name', '2')
    >>> tex_cmdname('_2')  # single non-letter character
    ('_', '2')

    """
    m = re.match(r'([a-zA-Z]+)[ \n]*(.*)', string, re.DOTALL)
    if m is None:
        # no leading letters: take one (or zero) characters; always matches
        m = re.match(r'(.?)(.*)', string, re.DOTALL)
    return m.group(1), m.group(2)

# Test:
#
# >>> tex_cmdname('name\nnext')  # strip trailing whitespace, also newlines
# ('name', 'next')
# >>> tex_cmdname('name_2')  # first non-letter terminates
# ('name', '_2')
# >>> tex_cmdname('name_2\nnext line')  # line-break allowed
# ('name', '_2\nnext line')
# >>> tex_cmdname(' next')  # leading whitespace is returned
# (' ', 'next')
# >>> tex_cmdname('1 2')  # whitespace after non-letter is kept
# ('1', ' 2')
# >>> tex_cmdname('1\n2\t3')  # whitespace after non-letter is kept
# ('1', '\n2\t3')
# >>> tex_cmdname('')  # empty string
# ('', '')


def tex_number(string):
    """Return leading number literal and remainder of `string`.

    The literal may contain "." and "," separators but must end in a digit.
    Return ``('', string)`` if `string` does not start with a number.

    >>> tex_number('123.4')
    ('123.4', '')

    """
    m = re.match(r'([0-9.,]*[0-9]+)(.*)', string, re.DOTALL)
    if m is None:
        return '', string
    return m.group(1), m.group(2)

# Test:
#
# >>> tex_number(' 23.4b')  # leading whitespace -> no number
# ('', ' 23.4b')
# >>> tex_number('23,400/2')  # comma separator included
# ('23,400', '/2')
# >>> tex_number('23. 4/2')  # trailing separator not included
# ('23', '. 4/2')
# >>> tex_number('4, 2')  # trailing separator not included
# ('4', ', 2')
# >>> tex_number('1 000.4')
# ('1', ' 000.4')


def tex_token(string):
    r"""Return first simple TeX token and remainder of `string`.

    A token is a command (backslash + letters; following whitespace is
    skipped), a one-character command (backslash + any character),
    or the first character. Both parts are empty if `string` is empty.

    >>> tex_token('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token('or first character')
    ('o', 'r first character')

    """
    m = re.match(r"""((?P<cmd>\\[a-zA-Z]+)\s*  # TeX command, skip whitespace
                      |(?P<chcmd>\\.)          # one-character TeX command
                      |(?P<ch>.?))             # first character (or empty)
                     (?P<remainder>.*$)        # remaining part of string
                 """, string, re.VERBOSE | re.DOTALL)
    cmd, chcmd, ch, remainder = m.group('cmd', 'chcmd', 'ch', 'remainder')
    # only one of the alternatives matched, the others are None
    return cmd or chcmd or ch, remainder

# Test:
#
# >>> tex_token('{opening bracket of group}')
# ('{', 'opening bracket of group}')
# >>> tex_token('\\skip whitespace after macro name')
# ('\\skip', 'whitespace after macro name')
# >>> tex_token('. but not after single char')
# ('.', ' but not after single char')
# >>> tex_token('')  # empty string.
# ('', '')
# >>> tex_token('\{escaped bracket')
# ('\\{', 'escaped bracket')


def tex_group(string):
    """Return first TeX group or token and remainder of `string`.

    If `string` starts with "{", return the content of the group up to the
    matching "}" (without the outer brackets); brackets escaped with a
    backslash are not counted.
    Otherwise return the first character and the rest of `string`.

    Raise `MathError` if an opening bracket is never closed.

    >>> tex_group('{first group} returned without brackets')
    ('first group', ' returned without brackets')

    """
    if not string.startswith('{'):
        # special case: there is no group, return first token and remainder
        return string[:1], string[1:]
    nest_level = 0  # level of {{nested} groups}
    escape = False  # the next character is escaped (\)
    for split_index, c in enumerate(string, start=1):
        if escape:
            escape = False
        elif c == '\\':
            escape = True
        elif c == '{':
            nest_level += 1
        elif c == '}':
            nest_level -= 1
        if nest_level == 0:
            break
    else:
        raise MathError('Group without closing bracket!')
    return string[1:split_index-1], string[split_index:]

# >>> tex_group('{} empty group')
# ('', ' empty group')
# >>> tex_group('{group with {nested} group} ')
# ('group with {nested} group', ' ')
# >>> tex_group('{group with {nested group}} at the end')
# ('group with {nested group}', ' at the end')
# >>> tex_group('{{group} {with {{complex }nesting}} constructs}')
# ('{group} {with {{complex }nesting}} constructs', '')
# >>> tex_group('{group with \\{escaped\\} brackets}')
# ('group with \\{escaped\\} brackets', '')
# >>> tex_group('{group followed by closing bracket}} from outer group')
# ('group followed by closing bracket', '} from outer group')
# >>> tex_group('No group? Return first character.')
# ('N', 'o group? Return first character.')
# >>> tex_group(' {also whitespace}')
# (' ', '{also whitespace}')


def tex_token_or_group(string):
    r"""Return first TeX group or token and remainder of `string`.

    Like `tex_token()`, but if the first token is an opening bracket,
    return the complete group (see `tex_group()`).

    >>> tex_token_or_group('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token_or_group('first character')
    ('f', 'irst character')
    >>> tex_token_or_group(' also whitespace')
    (' ', 'also whitespace')
    >>> tex_token_or_group('{first group} keep rest')
    ('first group', ' keep rest')

    """
    arg, remainder = tex_token(string)
    if arg == '{':
        arg, remainder = tex_group(string.lstrip())
    return arg, remainder

# >>> tex_token_or_group('\{no group but left bracket')
# ('\\{', 'no group but left bracket')


def tex_optarg(string):
    """Return optional argument and remainder.

    The optional argument is a leading "[…]" group (optionally preceded by
    whitespace) without nested square brackets. Return ``('', string)`` if
    there is none.

    Raise `MathError` if `string` starts with "[" but no well-formed
    optional argument can be extracted.

    >>> tex_optarg('[optional argument] returned without brackets')
    ('optional argument', ' returned without brackets')
    >>> tex_optarg('{empty string, if there is no optional arg}')
    ('', '{empty string, if there is no optional arg}')

    """
    m = re.match(r"""\s*                         # leading whitespace
                 \[(?P<optarg>(\\]|[^\[\]])*)\]  # [group] without nested groups
                 (?P<remainder>.*$)
                 """, string, re.VERBOSE | re.DOTALL)
    if m is not None:
        return m.group('optarg'), m.group('remainder')
    if string.startswith('['):
        raise MathError(f'Could not extract optional argument from "{string}"!')
    return '', string

# Test:
# >>> tex_optarg(' [optional argument] after whitespace')
# ('optional argument', ' after whitespace')
# >>> tex_optarg('[missing right bracket')
# Traceback (most recent call last):
#     ...
# docutils.utils.math.MathError: Could not extract optional argument from "[missing right bracket"!
# >>> tex_optarg('[group with [nested group]]')
# Traceback (most recent call last):
#     ...
# docutils.utils.math.MathError: Could not extract optional argument from "[group with [nested group]]"!
def parse_latex_math(root, source):
    r"""Append MathML conversion of `source` to `root` and return `root`.

    `root` is a MathML element (the initial "insertion point"),
    `source` is a string with LaTeX math code.

    Raise `MathError` for unsupported characters, unbalanced braces
    followed by more input, or an incomplete last element.

    >>> parse_latex_math(math(), r'\alpha')
    math(mi('α'))
    >>> parse_latex_math(mrow(), r'x_{n}')
    mrow(msub(mi('x'), mi('n')))

    """
    string = source  # not-yet handled part of source
    node = root  # the current "insertion point"
    # Loop over `string` while changing it.
    while len(string) > 0:
        # Take off first character:
        c, string = string[0], string[1:]
        if c in ' \n':
            continue  # whitespace is ignored in LaTeX math mode
        if c == '\\':  # start of a LaTeX macro
            cmdname, string = tex_cmdname(string)
            node, string = handle_cmd(cmdname, node, string)
        elif c in "_^":
            node = handle_script_or_limit(node, c)
        elif c == '{':
            if isinstance(node, MathRow) and node.nchildren == 1:
                # LaTeX takes one arg, MathML node accepts a group
                node.nchildren = None  # allow appending until closed by '}'
            else:  # wrap group in an <mrow>
                new_node = mrow()
                node.append(new_node)
                node = new_node
        elif c == '}':
            node = node.close()
        elif c == '&':
            new_node = mtd()
            node.close().append(new_node)
            node = new_node
        elif c.isalpha():
            node = node.append(mi(c))
        elif c.isdigit():
            number, string = tex_number(string)
            node = node.append(mn(c+number))
        elif c in anomalous_chars:
            # characters with a special meaning in LaTeX math mode
            # fix spacing before "unary" minus.
            attributes = {}
            if c == '-' and len(node):
                previous_node = node[-1]
                if (previous_node.text and previous_node.text in '([='
                    or previous_node.get('class') == 'mathopen'):
                    attributes['form'] = 'prefix'
            node = node.append(mo(anomalous_chars[c], **attributes))
        elif c in "/()[]|":
            node = node.append(mo(c, stretchy=False))
        elif c in "+*=<>,.!?`';@":
            node = node.append(mo(c))
        else:
            raise MathError(f'Unsupported character: "{c}"!')
            # TODO: append as <mi>?
        if node is None:
            # there is no insertion point left (the code above may return
            # None, e.g. from `close()`)
            if not string:
                return root  # ignore unbalanced braces
            raise MathError(f'No insertion point for "{string}". '
                            f'Unbalanced braces in "{source[:-len(string)]}"?')
    if node.nchildren and len(node) < node.nchildren:
        raise MathError('Last node missing children. Source incomplete?')
    return root

# Test:

# >>> parse_latex_math(math(), '')
# math()
# >>> parse_latex_math(math(), ' \\sqrt{ \\alpha}')
# math(msqrt(mi('α')))
# >>> parse_latex_math(math(), '23.4x')
# math(mn('23.4'), mi('x'))
# >>> parse_latex_math(math(), '\\sqrt 2 \\ne 3')
# math(msqrt(mn('2')), mo('≠'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt{2 + 3} < 10')
# math(msqrt(mn('2'), mo('+'), mn('3'), nchildren=3), mo('<'), mn('10'))
# >>> parse_latex_math(math(), '\\sqrt[3]{2 + 3}')
# math(mroot(mrow(mn('2'), mo('+'), mn('3'), nchildren=3), mn('3')))
# >>> parse_latex_math(math(), '\max_x')  # function takes limits
# math(munder(mo('max', movablelimits='true'), mi('x')))
# >>> parse_latex_math(math(), 'x^j_i')  # ensure correct order: base, sub, sup
# math(msubsup(mi('x'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), '\int^j_i')  # ensure correct order
# math(msubsup(mo('∫'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), 'x_{\\alpha}')
# math(msub(mi('x'), mi('α')))
# >>> parse_latex_math(math(), 'x_\\text{in}')
# math(msub(mi('x'), mtext('in')))
# >>> parse_latex_math(math(), '2⌘')
# Traceback (most recent call last):
# docutils.utils.math.MathError: Unsupported character: "⌘"!
# >>> parse_latex_math(math(), '23}x')  # doctest: +ELLIPSIS
# Traceback (most recent call last):
#     ...
# docutils.utils.math.MathError: ... Unbalanced braces in "23}"?
# >>> parse_latex_math(math(), '\\frac{2}')
# Traceback (most recent call last):
#     ...
# docutils.utils.math.MathError: Last node missing children. Source incomplete?


def handle_cmd(name, node, string):  # noqa: C901 TODO make this less complex
    r"""Process LaTeX command `name` followed by `string`.

    Append result to `node`.
    If needed, parse `string` for command argument.
    Return new current node and remainder of `string`:

    >>> handle_cmd('hbar', math(), r' \frac')
    (math(mi('ℏ')), ' \\frac')
    >>> handle_cmd('hspace', math(), r'{1ex} (x)')
    (math(mspace(width='1ex')), ' (x)')

    Raise `MathError` for unknown commands and unsupported arguments.

    """

    # Token elements
    # ==============

    # identifier -> <mi>

    if name in letters:
        new_node = mi(letters[name])
        if name in greek_capitals:
            # upright in "TeX style" but MathML sets them italic ("ISO style").
            # CSS styling does not change the font style in Firefox 78.
            # Use 'mathvariant="normal"'?
            new_node.set('class', 'capital-greek')
        node = node.append(new_node)
        return node, string

    if name in ordinary:
        # <mi mathvariant="normal"> well supported by Chromium but
        # Firefox 115.5.0 puts additional space around the symbol, e.g.
        # <mi mathvariant="normal">∂</mi><mi>t</mi> looks like ∂ t, not ∂t
        # return node.append(mi(ordinary[name], mathvariant='normal')), string
        return node.append(mi(ordinary[name])), string

    if name in functions:
        # use <mi> followed by invisible function applicator character
        # (see https://www.w3.org/TR/MathML3/chapter3.html#presm.mi)
        if name == 'operatorname':
            # custom function name, e.g. ``\operatorname{abs}(x)``
            # TODO: \operatorname* -> with limits
            arg, string = tex_token_or_group(string)
            new_node = mi(arg, mathvariant='normal')
        else:
            new_node = mi(functions[name])
        # embellished function names:
        if name == 'varliminf':  # \underline\lim
            new_node = munder(new_node, mo('_'))
        elif name == 'varlimsup':  # \overline\lim
            new_node = mover(new_node, mo('¯'), accent=False)
        elif name == 'varprojlim':  # \underleftarrow\lim
            new_node = munder(new_node, mo('\u2190'))
        elif name == 'varinjlim':  # \underrightarrow\lim
            new_node = munder(new_node, mo('\u2192'))

        node = node.append(new_node)
        # add ApplyFunction when appropriate (not \sin^2(x), say)
        # cf. https://www.w3.org/TR/MathML3/chapter3.html#presm.mi
        if string and string[0] not in ('^', '_'):
            node = node.append(mo('\u2061'))  # FUNCTION APPLICATION (invisible)
        return node, string

    if name in modulo_functions:
        (binary, named, parentheses, padding) = modulo_functions[name]
        if binary:
            node = node.append(mo('mod', lspace=padding, rspace=padding))
            return node, string
        # left padding
        if node.in_block():
            padding = '1em'
        node = node.append(mspace(width=padding))
        if parentheses:
            node = node.append(mo('(', stretchy=False))
        if named:
            node = node.append(mi('mod'))
            node = node.append(mspace(width='0.333em'))
        arg, string = tex_token_or_group(string)
        node = parse_latex_math(node, arg)
        if parentheses:
            node = node.append(mo(')', stretchy=False))
        return node, string

    # font changes or mathematical alphanumeric characters

    if name in ('boldsymbol', 'pmb'):  # \pmb is "poor mans bold"
        new_node = mrow(CLASS='boldsymbol')
        node.append(new_node)
        return new_node, string

    if name in math_alphabets:
        return handle_math_alphabet(name, node, string)

    # operator, fence, or separator -> <mo>

    if name == 'colon':  # trailing punctuation, not binary relation
        node = node.append(mo(':', form='postfix', lspace='0', rspace='0.28em'))
        return node, string

    if name == 'idotsint':  # AMS shortcut for ∫···∫
        node = parse_latex_math(node, r'\int\dotsi\int')
        return node, string

    if name in thick_operators:
        node = node.append(mo(thick_operators[name], style='font-weight: bold'))
        return node, string

    if name in small_operators:
        node = node.append(mo(small_operators[name], mathsize='75%'))
        return node, string

    if name in operators:
        attributes = {}
        if name in movablelimits and string and string[0] in ' _^':
            attributes['movablelimits'] = True
        elif name in ('lvert', 'lVert'):
            attributes['class'] = 'mathopen'
        node = node.append(mo(operators[name], **attributes))
        return node, string

    if name in bigdelimiters:
        delimiter_attributes = {}
        size = delimiter_sizes[bigdelimiters[name]]
        delimiter, string = tex_token_or_group(string)
        # Compare against whole delimiters: a substring test would also
        # accept a missing delimiter ('') or a group like '()'.
        if delimiter not in ('(', ')', '[', ']', '/', '|', '.'):
            try:
                delimiter = stretchables[delimiter.lstrip('\\')]
            except KeyError:
                raise MathError(f'Unsupported "\\{name}" delimiter '
                                f'"{delimiter}"!')
        if size:
            delimiter_attributes['maxsize'] = size
            delimiter_attributes['minsize'] = size
            delimiter_attributes['symmetric'] = True
        if name == 'left' or name.endswith('l'):
            row = mrow()
            node.append(row)
            node = row
        if delimiter != '.':  # '.' stands for "empty delimiter"
            node.append(mo(delimiter, **delimiter_attributes))
        if name == 'right' or name.endswith('r'):
            node = node.close()
        return node, string

    if name == 'not':
        # negation: LaTeX just overlays next symbol with "/".
        arg, string = tex_token(string)
        if arg == '{':
            # move the command into the group and let the caller re-parse
            return node, '{\\not ' + string
        if arg.startswith('\\'):  # LaTeX macro
            try:
                arg = operators[arg[1:]]
            except KeyError:
                raise MathError(rf'"\not" cannot negate: "{arg}"!')
        arg = unicodedata.normalize('NFC', arg+'\u0338')
        node = node.append(mo(arg))
        return node, string

    # arbitrary text (usually comments) -> <mtext>
    if name in ('text', 'mbox', 'textrm'):
        arg, string = tex_token_or_group(string)
        parts = arg.split('$')  # extract inline math
        for i, part in enumerate(parts):
            if i % 2 == 0:  # i is even
                # LaTeX keeps whitespace in, e.g., ``\text{ foo }``,
                # <mtext> displays only internal whitespace.
                # → replace marginal whitespace with NBSP
                part = re.sub('(^[ \n]|[ \n]$)', '\u00a0', part)
                node = node.append(mtext(part))
            else:
                parse_latex_math(node, part)
        return node, string

    # horizontal space -> <mspace>
    if name in spaces:
        node = node.append(mspace(width='%s'%spaces[name]))
        return node, string

    if name in ('hspace', 'mspace'):
        arg, string = tex_group(string)
        if arg.endswith('mu'):
            # unit "mu" (1mu=1/18em) not supported by MathML
            try:
                arg = '%sem' % (float(arg[:-2])/18)
            except ValueError:
                # report malformed lengths like other input errors
                raise MathError(f'Invalid length "{arg}" for "\\{name}"!')
        node = node.append(mspace(width='%s'%arg))
        return node, string

    if name == 'phantom':
        new_node = mphantom()
        node.append(new_node)
        return new_node, string

    if name == 'boxed':
        # CSS padding is broken in Firefox 115.6.0esr
        # therefore we still need the deprecated <menclose> element
        new_node = menclose(notation='box', CLASS='boxed')
        node.append(new_node)
        return new_node, string

    # Complex elements (Layout schemata)
    # ==================================

    if name == 'sqrt':
        radix, string = tex_optarg(string)
        if radix:
            indexnode = mrow()
            new_node = mroot(indexnode, switch=True)
            parse_latex_math(indexnode, radix)
            indexnode.close()
        else:
            new_node = msqrt()
        node.append(new_node)
        return new_node, string

    if name in fractions:
        attributes = fractions[name]
        if name == 'cfrac':
            optarg, string = tex_optarg(string)
            optargs = {'l': 'left', 'r': 'right'}
            if optarg in optargs:
                # copy: don't modify the shared entry in `fractions`
                attributes = attributes.copy()
                attributes['numalign'] = optargs[optarg]  # "numalign" is deprecated
                attributes['class'] += ' numalign-' + optargs[optarg]
        new_node = frac = mfrac(**attributes)
        if name.endswith('binom'):
            new_node = mrow(mo('('), new_node, mo(')'), CLASS='binom')
            new_node.nchildren = 3
        node.append(new_node)
        return frac, string

    if name == '\\':  # end of a row
        entry = mtd()
        new_node = mtr(entry)
        node.close().close().append(new_node)
        return entry, string

    if name in accents:
        accent_node = mo(accents[name], stretchy=False)
        # mi() would be simpler, but semantically wrong
        # --- https://w3c.github.io/mathml-core/#operator-fence-separator-or-accent-mo
        if name == 'vec':
            accent_node.set('scriptlevel', '+1')  # scale down arrow
        new_node = mover(accent_node, accent=True, switch=True)
        node.append(new_node)
        return new_node, string

    if name in over:
        # set "accent" to False (otherwise dots on i and j are dropped)
        # but to True on accent node get "textstyle" (full size) symbols on top
        new_node = mover(mo(over[name][0], accent=True),
                         switch=True, accent=False)
        node.append(new_node)
        return new_node, string

    if name == 'overset':
        new_node = mover(switch=True)
        node.append(new_node)
        return new_node, string

    if name in under:
        new_node = munder(mo(under[name][0]), switch=True)
        node.append(new_node)
        return new_node, string

    if name == 'underset':
        new_node = munder(switch=True)
        node.append(new_node)
        return new_node, string

    if name in ('xleftarrow', 'xrightarrow'):
        subscript, string = tex_optarg(string)
        base = mo(operators['long'+name[1:]])
        if subscript:
            new_node = munderover(base)
            sub_node = parse_latex_math(mrow(), subscript)
            if len(sub_node) == 1:
                sub_node = sub_node[0]
            new_node.append(sub_node)
        else:
            new_node = mover(base)
        node.append(new_node)
        return new_node, string

    if name in layout_styles:  # 'displaystyle', 'textstyle', ...
        if len(node) > 0:
            raise MathError(rf'Declaration "\{name}" must be first command '
                            'in a group!')
        for k, v in layout_styles[name].items():
            node.set(k, v)
        return node, string

    if name.endswith('limits'):
        arg, remainder = tex_token(string)
        # `arg` is empty at the end of input; an empty string is "in" any
        # string, so test for it explicitly.
        if arg and arg in '_^':  # else ignore
            string = remainder
            node = handle_script_or_limit(node, arg, limits=name)
        return node, string

    # Environments

    if name == 'begin':
        return begin_environment(node, string)

    if name == 'end':
        return end_environment(node, string)

    raise MathError(rf'Unknown LaTeX command "\{name}".')

# >>> handle_cmd('left', math(), '[a\\right]')
# (mrow(mo('[')), 'a\\right]')
# >>> handle_cmd('left', math(), '. a)')  # empty \left
# (mrow(), ' a)')
# >>> handle_cmd('left', math(), '\\uparrow a)')  # cmd
# (mrow(mo('↑')), 'a)')
# >>> handle_cmd('not', math(), '\\equiv \\alpha)')  # cmd
# (math(mo('≢')), '\\alpha)')
# >>> handle_cmd('text', math(), '{ for } i>0')  # group
# (math(mtext('\xa0for\xa0')), ' i>0')
# >>> handle_cmd('text', math(), '{B}T')  # group
# (math(mtext('B')), 'T')
# >>> handle_cmd('text', math(), '{number of apples}}')  # group
# (math(mtext('number of apples')), '}')
# >>> handle_cmd('text', math(), 'i \\sin(x)')  # single char
# (math(mtext('i')), ' \\sin(x)')
# >>> handle_cmd(' ', math(), ' next')  # inter word space
# (math(mspace(width='0.25em')), ' next')
# >>> handle_cmd('\n', math(), '\nnext')  # inter word space
# (math(mspace(width='0.25em')), '\nnext')
# >>> handle_cmd('sin', math(), '(\\alpha)')
# (math(mi('sin'), mo('\u2061')), '(\\alpha)')
# >>> handle_cmd('sin', math(), ' \\alpha')
# (math(mi('sin'), mo('\u2061')), ' \\alpha')
# >>> handle_cmd('operatorname', math(), '{abs}(x)')
# (math(mi('abs', mathvariant='normal'), mo('\u2061')), '(x)')
# >>> handle_cmd('overline', math(), '{981}')
# (mover(mo('_', accent='true'), switch=True, accent='false'), '{981}')
# >>> handle_cmd('bar', math(), '{x}')
# (mover(mo('ˉ', stretchy='false'), switch=True, accent='true'), '{x}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha]{10}')
# (munderover(mo('⟵'), mi('α')), '{10}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha=5]{10}')
# (munderover(mo('⟵'), mrow(mi('α'), mo('='), mn('5'))), '{10}')
# >>> handle_cmd('left', math(), '< a)')
# Traceback (most recent call last):
# docutils.utils.math.MathError: Unsupported "\left" delimiter "<"!
# >>> handle_cmd('not', math(), '{< b} c')  # LaTeX ignores the braces, too.
# (math(), '{\\not < b} c')


def handle_math_alphabet(name, node, string):
    """Process a "mathematical alphabet" command (``\\mathbb`` etc.).

    Parse the command's argument (first token or group of `string`),
    append the result to `node` and replace characters that have an entry
    in the corresponding `mathalphabet2unichar` mapping.
    Return new current node and remainder of `string`.
    """
    attributes = {}
    if name == 'mathscr':
        attributes['class'] = 'mathscr'
    arg, string = tex_token_or_group(string)
    # Shortcut for text arg like \mathrm{out} with more than one letter:
    if name == 'mathrm' and arg.isalpha() and len(arg) > 1:
        node = node.append(mi(arg))  # <mi> defaults to "normal" font
        return node, string
    # Parse into an <mrow>
    container = mrow(**attributes)
    node.append(container)
    parse_latex_math(container, arg)
    # some alphabets share a character mapping
    key = name.replace('mathscr', 'mathcal').replace('mathbfsfit', 'mathsfbfit')
    a2ch = getattr(mathalphabet2unichar, key, {})
    for subnode in container.iter():
        if isinstance(subnode, mn):
            # a number may consist of more than one digit
            subnode.text = ''.join(a2ch.get(ch, ch) for ch in subnode.text)
        elif isinstance(subnode, mi):
            # don't convert multi-letter identifiers (functions)
            subnode.text = a2ch.get(subnode.text, subnode.text)
            if name == 'mathrm' and subnode.text.isalpha():
                subnode.set('mathvariant', 'normal')
    return container.close(), string

# >>> handle_math_alphabet('mathrm', math(), '\\alpha')
# (math(mi('α', mathvariant='normal')), '')
# >>> handle_math_alphabet('mathbb', math(), '{R} = 3')
# (math(mi('ℝ')), ' = 3')
# >>> handle_math_alphabet('mathcal', math(), '{F = 3}')
# (math(mrow(mi('ℱ'), mo('='), mn('3'), nchildren=3)), '')
# >>> handle_math_alphabet('mathrm', math(), '{out} = 3')  # drop <mrow>
# (math(mi('out')), ' = 3')
#
# Single letters in \mathrm require "mathvariant='normal'":
# >>> handle_math_alphabet('mathrm', math(), '{V = 3}')  # doctest: +ELLIPSIS
# (math(mrow(mi('V', mathvariant='normal'), mo('='), mn('3'), ...)), '')


def handle_script_or_limit(node, c, limits=''):
    """Append script or limit element to `node`.

    The last child of `node` is removed and becomes the base of the new
    element. `c` must be '_' or '^'. `limits` is the name of a preceding
    ``\\…limits`` command or empty.
    Return the new element.
    """
    # NOTE(review): assumes `node` has at least one child -- confirm what
    # `pop()` does for an empty node.
    child = node.pop()
    if limits == 'limits':
        child.set('movablelimits', 'false')
    elif (limits == 'movablelimits'
          or getattr(child, 'text', '') in movablelimits):
        child.set('movablelimits', 'true')

    if c == '_':
        if isinstance(child, mover):
            new_node = munderover(*child, switch=True)
        elif isinstance(child, msup):
            new_node = msubsup(*child, switch=True)
        elif (limits in ('limits', 'movablelimits')
              or limits == '' and child.get('movablelimits', None)):
            new_node = munder(child)
        else:
            new_node = msub(child)
    elif c == '^':
        if isinstance(child, munder):
            new_node = munderover(*child)
        elif isinstance(child, msub):
            new_node = msubsup(*child)
        elif (limits in ('limits', 'movablelimits')
              or limits == '' and child.get('movablelimits', None)):
            new_node = mover(child)
        else:
            new_node = msup(child)
    node.append(new_node)
    return new_node


def begin_environment(node, string):
    """Start the environment named in the first group of `string`.

    Append a table (for names in `matrices` with a left delimiter: wrapped
    in a row starting with that delimiter) to `node`.
    Return the first table cell as new current node and the remainder of
    `string`. Raise `MathError` for unsupported environments.
    """
    name, string = tex_group(string)
    if name in matrices:
        left_delimiter = matrices[name][0]
        attributes = {}
        if left_delimiter:
            wrapper = mrow(mo(left_delimiter))
            if name == 'cases':
                wrapper = mrow(mo(left_delimiter, rspace='0.17em'))
                attributes['columnalign'] = 'left'
                attributes['class'] = 'cases'
            node.append(wrapper)
            node = wrapper
        elif name == 'smallmatrix':
            attributes['rowspacing'] = '0.02em'
            attributes['columnspacing'] = '0.333em'
            attributes['scriptlevel'] = '1'
        elif name == 'aligned':
            attributes['class'] = 'ams-align'
        # TODO: array, aligned & alignedat take an optional [t], [b], or [c].
        entry = mtd()
        node.append(mtable(mtr(entry), **attributes))
        node = entry
    else:
        raise MathError(f'Environment "{name}" not supported!')
    return node, string


def end_environment(node, string):
    """Close the environment named in the first group of `string`.

    Close the current table cell, row, and table, append the right
    delimiter (if the environment has one) and close the wrapping row.
    Return new current node and remainder of `string`.
    Raise `MathError` for unsupported environments.
    """
    name, string = tex_group(string)
    if name in matrices:
        node = node.close().close().close()  # close: mtd, mtr, mtable
        right_delimiter = matrices[name][1]
        if right_delimiter:
            node = node.append(mo(right_delimiter))
            node = node.close()
        elif name == 'cases':
            # "cases" has a left delimiter only, close the wrapping row
            node = node.close()
    else:
        raise MathError(f'Environment "{name}" not supported!')
    return node, string


# Return the number of "equation_columns" in `code_lines`. cf.
# "alignat" in http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf
def tex_equation_columns(rows):
    """Return the number of "equation columns" in `rows`.

    `rows` is an iterable of strings (lines of LaTeX code).
    The result is 0 if no row contains a non-escaped "&"
    (or if `rows` is empty).
    """
    tabs = _max_column_separators(rows)
    if tabs == 0:
        return 0
    return tabs // 2 + 1

# >>> tex_equation_columns(['a = b'])
# 0
# >>> tex_equation_columns(['a &= b'])
# 1
# >>> tex_equation_columns(['a &= b & a \in S'])
# 2
# >>> tex_equation_columns(['a &= b & c &= d'])
# 2


def _max_column_separators(rows):
    r"""Return the maximal number of non-escaped "&" in an item of `rows`.

    Escaped ampersands (``\&``) are not counted.
    Return 0 if `rows` is empty.
    """
    return max((row.count('&') - row.count(r'\&') for row in rows),
               default=0)


# Return dictionary with attributes to style an <mtable> as align environment:
# Not used with HTML. Replaced by CSS rule for "mtable.ams-align" in
# "minimal.css" as "columnalign" is disregarded by Chromium and webkit.
def align_attributes(rows):
    """Return a dictionary of table attributes emulating "align".

    `rows` is an iterable of strings (lines of LaTeX code).
    "columnalign" and "columnspacing" are only set if there is
    at least one non-escaped "&" in one of the rows.
    """
    atts = {'class': 'ams-align', 'displaystyle': True}
    # get maximal number of non-escaped "next column" markup characters:
    tabs = _max_column_separators(rows)
    if tabs:
        # columns alternate right/left, column gaps alternate 0/2em
        aligns = ['right', 'left'] * tabs
        spacing = ['0', '2em'] * tabs
        atts['columnalign'] = ' '.join(aligns[:tabs+1])
        atts['columnspacing'] = ' '.join(spacing[:tabs])
    return atts

# >>> align_attributes(['a = b'])
# {'class': 'ams-align', 'displaystyle': True}
# >>> align_attributes(['a &= b'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left', 'columnspacing': '0'}
# >>> align_attributes(['a &= b & a \in S'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right', 'columnspacing': '0 2em'}
# >>> align_attributes(['a &= b & c &= d'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left', 'columnspacing': '0 2em 0'}
# >>> align_attributes([r'a &= b & c &= d \& e'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left', 'columnspacing': '0 2em 0'}
# >>> align_attributes([r'a &= b & c &= d & e'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left right', 'columnspacing': '0 2em 0 2em'}


def tex2mathml(tex_math, as_block=False):
    """Return string with MathML code corresponding to `tex_math`.

    Set `as_block` to ``True`` for displayed formulas.
    """
    # Set up tree
    math_tree = math(xmlns='http://www.w3.org/1998/Math/MathML')
    node = math_tree
    if as_block:
        math_tree.set('display', 'block')
        # NOTE(review): `toplevel_code()` presumably masks nested code so
        # that only top-level line breaks are counted -- confirm.
        rows = toplevel_code(tex_math).split(r'\\')
        if len(rows) > 1:
            # emulate "align*" environment with a math table
            node = mtd()
            math_tree.append(mtable(mtr(node), CLASS='ams-align',
                                    displaystyle=True))
    parse_latex_math(node, tex_math)
    math_tree.indent_xml()
    return math_tree.toxml()

# NOTE(review): the element tags in the expected output below were
# reconstructed; confirm by running the doctests.
#
# >>> print(tex2mathml('3'))
# <math xmlns="http://www.w3.org/1998/Math/MathML">
#   <mn>3</mn>
# </math>
# >>> print(tex2mathml('3', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mn>3</mn>
# </math>
# >>> print(tex2mathml(r'a & b \\ c & d', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mtable class="ams-align" displaystyle="true">
#     <mtr>
#       <mtd>
#         <mi>a</mi>
#       </mtd>
#       <mtd>
#         <mi>b</mi>
#       </mtd>
#     </mtr>
#     <mtr>
#       <mtd>
#         <mi>c</mi>
#       </mtd>
#       <mtd>
#         <mi>d</mi>
#       </mtd>
#     </mtr>
#   </mtable>
# </math>
# >>> print(tex2mathml(r'a \\ b', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mtable class="ams-align" displaystyle="true">
#     <mtr>
#       <mtd>
#         <mi>a</mi>
#       </mtd>
#     </mtr>
#     <mtr>
#       <mtd>
#         <mi>b</mi>
#       </mtd>
#     </mtr>
#   </mtable>
# </math>


# TODO: look up more symbols from tr25, e.g.
#
#
# Table 2.8 Using Vertical Line or Solidus Overlay
#   some of the negated forms of mathematical relations that can only be
#   encoded by using either U+0338 COMBINING LONG SOLIDUS OVERLAY or U+20D2
#   COMBINING LONG VERTICAL LINE OVERLAY. (For issues with using 0338 in
#   MathML, see Section 3.2.7, Combining Marks.)
#
# Table 2.9 Variants of Mathematical Symbols using VS1?
#
# Sequence      Description
# 0030 + VS1    DIGIT ZERO - short diagonal stroke form
# 2205 + VS1    EMPTY SET - zero with long diagonal stroke overlay form
# 2229 + VS1    INTERSECTION - with serifs
# 222A + VS1    UNION - with serifs
# 2268 + VS1    LESS-THAN BUT NOT EQUAL TO - with vertical stroke
# 2269 + VS1    GREATER-THAN BUT NOT EQUAL TO - with vertical stroke
# 2272 + VS1    LESS-THAN OR EQUIVALENT TO - following the slant of the lower leg
# 2273 + VS1    GREATER-THAN OR EQUIVALENT TO - following the slant of the lower leg
# 228A + VS1    SUBSET OF WITH NOT EQUAL TO - variant with stroke through bottom members
# 228B + VS1    SUPERSET OF WITH NOT EQUAL TO - variant with stroke through bottom members
# 2293 + VS1    SQUARE CAP - with serifs
# 2294 + VS1    SQUARE CUP - with serifs
# 2295 + VS1    CIRCLED PLUS - with white rim
# 2297 + VS1    CIRCLED TIMES - with white rim
# 229C + VS1    CIRCLED EQUALS - equal sign inside and touching the circle
# 22DA + VS1    LESS-THAN slanted EQUAL TO OR GREATER-THAN
# 22DB + VS1    GREATER-THAN slanted EQUAL TO OR LESS-THAN
# 2A3C + VS1    INTERIOR PRODUCT - tall variant with narrow foot
# 2A3D + VS1    RIGHTHAND INTERIOR PRODUCT - tall variant with narrow foot
# 2A9D + VS1    SIMILAR OR LESS-THAN - following the slant of the upper leg
# 2A9E + VS1    SIMILAR OR GREATER-THAN - following the slant of the upper leg
# 2AAC + VS1    SMALLER THAN OR slanted EQUAL
# 2AAD + VS1    LARGER THAN OR slanted EQUAL
# 2ACB + VS1    SUBSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members
# 2ACC + VS1    SUPERSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members