test-kivy-app/kivy_venv/lib/python3.11/site-packages/docutils/utils/math/latex2mathml.py

1253 lines
46 KiB
Python
Raw Normal View History

2024-09-15 12:12:16 +00:00
# :Id: $Id: latex2mathml.py 9536 2024-02-01 13:04:22Z milde $
# :Copyright: © 2005 Jens Jørgen Mortensen [1]_
# © 2010, 2021, 2024 Günter Milde.
#
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.
# This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# .. [1] the original `rst2mathml.py` in `sandbox/jensj/latex_math`
"""Convert LaTex maths code into presentational MathML.
This module is provisional:
the API is not settled and may change with any minor Docutils version.
"""
# Usage:
#
# >>> from latex2mathml import *
import re
import unicodedata
from docutils.utils.math import (MathError, mathalphabet2unichar,
tex2unichar, toplevel_code)
from docutils.utils.math.mathml_elements import (
math, mtable, mrow, mtr, mtd, menclose, mphantom, msqrt, mi, mn, mo,
mtext, msub, msup, msubsup, munder, mover, munderover, mroot, mfrac,
mspace, MathRow)
# Character data
# --------------
# LaTeX math macro to Unicode mappings.
# Character categories.
# identifiers -> <mi>
letters = {'hbar': ''} # Compatibility mapping: \hbar resembles italic ħ
# "unicode-math" unifies \hbar and \hslash to ℏ.
letters.update(tex2unichar.mathalpha)
ordinary = tex2unichar.mathord # Miscellaneous symbols
# special case: Capital Greek letters: (upright in TeX style)
greek_capitals = {
'Phi': '\u03a6', 'Xi': '\u039e', 'Sigma': '\u03a3',
'Psi': '\u03a8', 'Delta': '\u0394', 'Theta': '\u0398',
'Upsilon': '\u03d2', 'Pi': '\u03a0', 'Omega': '\u03a9',
'Gamma': '\u0393', 'Lambda': '\u039b'}
# functions -> <mi>
functions = {
# functions with a space in the name
'liminf': 'lim\u202finf',
'limsup': 'lim\u202fsup',
'injlim': 'inj\u202flim',
'projlim': 'proj\u202flim',
# embellished function names (see handle_cmd() below)
'varlimsup': 'lim',
'varliminf': 'lim',
'varprojlim': 'lim',
'varinjlim': 'lim',
# custom function name
'operatorname': None,
}
functions.update((name, name) for name in
('arccos', 'arcsin', 'arctan', 'arg', 'cos',
'cosh', 'cot', 'coth', 'csc', 'deg',
'det', 'dim', 'exp', 'gcd', 'hom',
'ker', 'lg', 'ln', 'log', 'Pr',
'sec', 'sin', 'sinh', 'tan', 'tanh'))
# Function with limits: 'lim', 'sup', 'inf', 'max', 'min':
# use <mo> to allow "movablelimits" attribute (see below).
# modulo operator/arithmetic
modulo_functions = {
# cmdname: (binary, named, parentheses, padding)
'bmod': (True, True, False, '0.278em'), # a mod n
'pmod': (False, True, True, '0.444em'), # a (mod n)
'mod': (False, True, False, '0.667em'), # a mod n
'pod': (False, False, True, '0.444em'), # a (n)
}
# "mathematical alphabets": map identifiers to the corresponding
# characters from the "Mathematical Alphanumeric Symbols" block
math_alphabets = {
# 'cmdname': 'mathvariant value' # package
'mathbb': 'double-struck', # amssymb
'mathbf': 'bold',
'mathbfit': 'bold-italic', # isomath
'mathcal': 'script',
'mathfrak': 'fraktur', # amssymb
'mathit': 'italic',
'mathrm': 'normal',
'mathscr': 'script', # mathrsfs et al
'mathsf': 'sans-serif',
'mathbfsfit': 'sans-serif-bold-italic', # unicode-math
'mathsfbfit': 'sans-serif-bold-italic', # isomath
'mathsfit': 'sans-serif-italic', # isomath
'mathtt': 'monospace',
# unsupported: bold-fraktur
# bold-script
# bold-sans-serif
}
# operator, fence, or separator -> <mo>
stretchables = {
# extensible delimiters allowed in left/right cmds
'backslash': '\\',
'uparrow': '\u2191', # ↑ UPWARDS ARROW
'downarrow': '\u2193', # ↓ DOWNWARDS ARROW
'updownarrow': '\u2195', # ↕ UP DOWN ARROW
'Uparrow': '\u21d1', # ⇑ UPWARDS DOUBLE ARROW
'Downarrow': '\u21d3', # ⇓ DOWNWARDS DOUBLE ARROW
'Updownarrow': '\u21d5', # ⇕ UP DOWN DOUBLE ARROW
'lmoustache': '\u23b0', # ⎰ … CURLY BRACKET SECTION
'rmoustache': '\u23b1', # ⎱ … LEFT CURLY BRACKET SECTION
'arrowvert': '\u23d0', # ⏐ VERTICAL LINE EXTENSION
'bracevert': '\u23aa', # ⎪ CURLY BRACKET EXTENSION
'lvert': '|', # left |
'lVert': '\u2016', # left ‖
'rvert': '|', # right |
'rVert': '\u2016', # right ‖
'Arrowvert': '\u2016', # ‖
}
stretchables.update(tex2unichar.mathfence)
stretchables.update(tex2unichar.mathopen) # Braces
stretchables.update(tex2unichar.mathclose) # Braces
# >>> print(' '.join(sorted(set(stretchables.values()))))
# [ \ ] { | } ‖ ↑ ↓ ↕ ⇑ ⇓ ⇕ ⌈ ⌉ ⌊ ⌋ ⌜ ⌝ ⌞ ⌟ ⎪ ⎰ ⎱ ⏐ ⟅ ⟆ ⟦ ⟧ ⟨ ⟩ ⟮ ⟯ ⦇ ⦈
operators = {
# negated symbols without pre-composed Unicode character
'nleqq': '\u2266\u0338', # ≦̸
'ngeqq': '\u2267\u0338', # ≧̸
'nleqslant': '\u2a7d\u0338', # ⩽̸
'ngeqslant': '\u2a7e\u0338', # ⩾̸
'ngtrless': '\u2277\u0338', # txfonts
'nlessgtr': '\u2276\u0338', # txfonts
'nsubseteqq': '\u2AC5\u0338', # ⫅̸
'nsupseteqq': '\u2AC6\u0338', # ⫆̸
# compatibility definitions:
'centerdot': '\u2B1D', # BLACK VERY SMALL SQUARE | mathbin
'varnothing': '\u2300', # ⌀ DIAMETER SIGN | empty set
'varpropto': '\u221d', # ∝ PROPORTIONAL TO | sans serif
'triangle': '\u25B3', # WHITE UP-POINTING TRIANGLE | mathord
'triangledown': '\u25BD', # WHITE DOWN-POINTING TRIANGLE | mathord
# alias commands:
'dotsb': '\u22ef', # ⋯ with binary operators/relations
'dotsc': '\u2026', # … with commas
'dotsi': '\u22ef', # ⋯ with integrals
'dotsm': '\u22ef', # ⋯ multiplication dots
'dotso': '\u2026', # … other dots
# functions with movable limits (requires <mo>)
'lim': 'lim',
'sup': 'sup',
'inf': 'inf',
'max': 'max',
'min': 'min',
}
operators.update(tex2unichar.mathbin) # Binary symbols
operators.update(tex2unichar.mathrel) # Relation symbols, arrow symbols
operators.update(tex2unichar.mathpunct) # Punctuation
operators.update(tex2unichar.mathop) # Variable-sized symbols
operators.update(stretchables)
# special cases
thick_operators = {
# style='font-weight: bold;'
'thicksim': '\u223C', #
'thickapprox': '\u2248', # ≈
}
small_operators = {
# mathsize='75%'
'shortmid': '\u2223', #
'shortparallel': '\u2225', # ∥
'nshortmid': '\u2224', # ∤
'nshortparallel': '\u2226', # ∦
'smallfrown': '\u2322', # ⌢ FROWN
'smallsmile': '\u2323', # ⌣ SMILE
'smallint': '\u222b', # ∫ INTEGRAL
}
# Operators and functions with limits above/below in display formulas
# and in index position inline (movablelimits=True)
movablelimits = ('bigcap', 'bigcup', 'bigodot', 'bigoplus', 'bigotimes',
'bigsqcup', 'biguplus', 'bigvee', 'bigwedge',
'coprod', 'intop', 'ointop', 'prod', 'sum',
'lim', 'max', 'min', 'sup', 'inf')
# Depending on settings, integrals may also be in this category.
# (e.g. if "amsmath" is loaded with option "intlimits", see
# http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf)
# movablelimits.extend(('fint', 'iiiint', 'iiint', 'iint', 'int', 'oiint',
# 'oint', 'ointctrclockwise', 'sqint',
# 'varointclockwise',))
# horizontal space -> <mspace>
spaces = {'qquad': '2em', # two \quad
'quad': '1em', # 18 mu
'thickspace': '0.2778em', # 5mu = 5/18em
';': '0.2778em', # 5mu thickspace
' ': '0.25em', # inter word space
'\n': '0.25em', # inter word space
'medspace': '0.2222em', # 4mu = 2/9em
':': '0.2222em', # 4mu medspace
'thinspace': '0.1667em', # 3mu = 1/6em
',': '0.1667em', # 3mu thinspace
'negthinspace': '-0.1667em', # -3mu = -1/6em
'!': '-0.1667em', # negthinspace
'negmedspace': '-0.2222em', # -4mu = -2/9em
'negthickspace': '-0.2778em', # -5mu = -5/18em
}
# accents: -> <mo stretchy="false"> in <mover>
accents = {
# TeX: spacing combining
'acute': '´', # '\u0301'
'bar': 'ˉ', # '\u0304'
'breve': '˘', # '\u0306'
'check': 'ˇ', # '\u030C'
'dot': '˙', # '\u0307'
'ddot': '¨', # '\u0308'
'dddot': '˙˙˙', # '\u20DB' # or … ?
'ddddot': '˙˙˙˙', # '\u20DC' # or ¨¨ ?
'grave': '`', # '\u0300'
'hat': 'ˆ', # '\u0302'
'mathring': '˚', # '\u030A'
'tilde': '~', # '\u0303' # tilde ~ or small tilde ˜?
'vec': '', # '\u20d7' # → too heavy, use scriptlevel="+1"
}
# limits etc. -> <mo> in <mover> or <munder>
over = {
# TeX: (char, offset-correction/em)
'overbrace': ('\u23DE', -0.2), # DejaVu Math -0.6
'overleftarrow': ('\u2190', -0.2),
'overleftrightarrow': ('\u2194', -0.2),
'overline': ('_', -0.2), # \u2012 does not stretch
'overrightarrow': ('\u2192', -0.2),
'widehat': ('^', -0.5),
'widetilde': ('~', -0.3),
}
under = {'underbrace': ('\u23DF', 0.1), # DejaVu Math -0.7
'underleftarrow': ('\u2190', -0.2),
'underleftrightarrow': ('\u2194', -0.2),
'underline': ('_', -0.8),
'underrightarrow': ('\u2192', -0.2),
}
# Character translations
# ----------------------
# characters with preferred alternative in mathematical use
# cf. https://www.w3.org/TR/MathML3/chapter7.html#chars.anomalous
anomalous_chars = {'-': '\u2212', # HYPHEN-MINUS -> MINUS SIGN
':': '\u2236', # COLON -> RATIO
'~': '\u00a0', # NO-BREAK SPACE
}
# blackboard bold (Greek characters not working with "mathvariant" (Firefox 78)
mathbb = {'Γ': '\u213E', # ℾ
'Π': '\u213F', # ℿ
'Σ': '\u2140', # ⅀
'γ': '\u213D', #
'π': '\u213C', # ℼ
}
# Matrix environments
matrices = {
# name: fences
'matrix': ('', ''),
'smallmatrix': ('', ''), # smaller, see begin_environment()!
'pmatrix': ('(', ')'),
'bmatrix': ('[', ']'),
'Bmatrix': ('{', '}'),
'vmatrix': ('|', '|'),
'Vmatrix': ('\u2016', '\u2016'), # ‖
'aligned': ('', ''),
'cases': ('{', ''),
}
layout_styles = {
'displaystyle': {'displaystyle': True, 'scriptlevel': 0},
'textstyle': {'displaystyle': False, 'scriptlevel': 0},
'scriptstyle': {'displaystyle': False, 'scriptlevel': 1},
'scriptscriptstyle': {'displaystyle': False, 'scriptlevel': 2},
}
# See also https://www.w3.org/TR/MathML3/chapter3.html#presm.scriptlevel
fractions = {
# name: attributes
'frac': {},
'cfrac': {'displaystyle': True, 'scriptlevel': 0,
'class': 'cfrac'}, # in LaTeX with padding
'dfrac': layout_styles['displaystyle'],
'tfrac': layout_styles['textstyle'],
'binom': {'linethickness': 0},
'dbinom': layout_styles['displaystyle'] | {'linethickness': 0},
'tbinom': layout_styles['textstyle'] | {'linethickness': 0},
}
delimiter_sizes = ['', '1.2em', '1.623em', '2.047em', '2.470em']
bigdelimiters = {'left': 0,
'right': 0,
'bigl': 1,
'bigr': 1,
'Bigl': 2,
'Bigr': 2,
'biggl': 3,
'biggr': 3,
'Biggl': 4,
'Biggr': 4,
}
# LaTeX to MathML translation
# ---------------------------
# auxiliary functions
# ~~~~~~~~~~~~~~~~~~~
def tex_cmdname(string):
"""Return leading TeX command name and remainder of `string`.
>>> tex_cmdname('mymacro2') # up to first non-letter
('mymacro', '2')
>>> tex_cmdname('name 2') # strip trailing whitespace
('name', '2')
>>> tex_cmdname('_2') # single non-letter character
('_', '2')
"""
m = re.match(r'([a-zA-Z]+)[ \n]*(.*)', string, re.DOTALL)
if m is None:
m = re.match(r'(.?)(.*)', string, re.DOTALL)
return m.group(1), m.group(2)
# Test:
#
# >>> tex_cmdname('name\nnext') # strip trailing whitespace, also newlines
# ('name', 'next')
# >>> tex_cmdname('name_2') # first non-letter terminates
# ('name', '_2')
# >>> tex_cmdname('name_2\nnext line') # line-break allowed
# ('name', '_2\nnext line')
# >>> tex_cmdname(' next') # leading whitespace is returned
# (' ', 'next')
# >>> tex_cmdname('1 2') # whitespace after non-letter is kept
# ('1', ' 2')
# >>> tex_cmdname('1\n2\t3') # whitespace after non-letter is kept
# ('1', '\n2\t3')
# >>> tex_cmdname('') # empty string
# ('', '')
def tex_number(string):
"""Return leading number literal and remainder of `string`.
>>> tex_number('123.4')
('123.4', '')
"""
m = re.match(r'([0-9.,]*[0-9]+)(.*)', string, re.DOTALL)
if m is None:
return '', string
return m.group(1), m.group(2)
# Test:
#
# >>> tex_number(' 23.4b') # leading whitespace -> no number
# ('', ' 23.4b')
# >>> tex_number('23,400/2') # comma separator included
# ('23,400', '/2')
# >>> tex_number('23. 4/2') # trailing separator not included
# ('23', '. 4/2')
# >>> tex_number('4, 2') # trailing separator not included
# ('4', ', 2')
# >>> tex_number('1 000.4')
# ('1', ' 000.4')
def tex_token(string):
"""Return first simple TeX token and remainder of `string`.
>>> tex_token('\\command{without argument}')
('\\command', '{without argument}')
>>> tex_token('or first character')
('o', 'r first character')
"""
m = re.match(r"""((?P<cmd>\\[a-zA-Z]+)\s* # TeX command, skip whitespace
|(?P<chcmd>\\.) # one-character TeX command
|(?P<ch>.?)) # first character (or empty)
(?P<remainder>.*$) # remaining part of string
""", string, re.VERBOSE | re.DOTALL)
cmd, chcmd, ch, remainder = m.group('cmd', 'chcmd', 'ch', 'remainder')
return cmd or chcmd or ch, remainder
# Test:
#
# >>> tex_token('{opening bracket of group}')
# ('{', 'opening bracket of group}')
# >>> tex_token('\\skip whitespace after macro name')
# ('\\skip', 'whitespace after macro name')
# >>> tex_token('. but not after single char')
# ('.', ' but not after single char')
# >>> tex_token('') # empty string.
# ('', '')
# >>> tex_token('\{escaped bracket')
# ('\\{', 'escaped bracket')
def tex_group(string):
"""Return first TeX group or token and remainder of `string`.
>>> tex_group('{first group} returned without brackets')
('first group', ' returned without brackets')
"""
split_index = 0
nest_level = 0 # level of {{nested} groups}
escape = False # the next character is escaped (\)
if not string.startswith('{'):
# special case: there is no group, return first token and remainder
return string[:1], string[1:]
for c in string:
split_index += 1
if escape:
escape = False
elif c == '\\':
escape = True
elif c == '{':
nest_level += 1
elif c == '}':
nest_level -= 1
if nest_level == 0:
break
else:
raise MathError('Group without closing bracket!')
return string[1:split_index-1], string[split_index:]
# >>> tex_group('{} empty group')
# ('', ' empty group')
# >>> tex_group('{group with {nested} group} ')
# ('group with {nested} group', ' ')
# >>> tex_group('{group with {nested group}} at the end')
# ('group with {nested group}', ' at the end')
# >>> tex_group('{{group} {with {{complex }nesting}} constructs}')
# ('{group} {with {{complex }nesting}} constructs', '')
# >>> tex_group('{group with \\{escaped\\} brackets}')
# ('group with \\{escaped\\} brackets', '')
# >>> tex_group('{group followed by closing bracket}} from outer group')
# ('group followed by closing bracket', '} from outer group')
# >>> tex_group('No group? Return first character.')
# ('N', 'o group? Return first character.')
# >>> tex_group(' {also whitespace}')
# (' ', '{also whitespace}')
def tex_token_or_group(string):
"""Return first TeX group or token and remainder of `string`.
>>> tex_token_or_group('\\command{without argument}')
('\\command', '{without argument}')
>>> tex_token_or_group('first character')
('f', 'irst character')
>>> tex_token_or_group(' also whitespace')
(' ', 'also whitespace')
>>> tex_token_or_group('{first group} keep rest')
('first group', ' keep rest')
"""
arg, remainder = tex_token(string)
if arg == '{':
arg, remainder = tex_group(string.lstrip())
return arg, remainder
# >>> tex_token_or_group('\{no group but left bracket')
# ('\\{', 'no group but left bracket')
def tex_optarg(string):
"""Return optional argument and remainder.
>>> tex_optarg('[optional argument] returned without brackets')
('optional argument', ' returned without brackets')
>>> tex_optarg('{empty string, if there is no optional arg}')
('', '{empty string, if there is no optional arg}')
"""
m = re.match(r"""\s* # leading whitespace
\[(?P<optarg>(\\]|[^\[\]]|\\])*)\] # [group] without nested groups
(?P<remainder>.*$)
""", string, re.VERBOSE | re.DOTALL)
if m is None and not string.startswith('['):
return '', string
try:
return m.group('optarg'), m.group('remainder')
except AttributeError:
raise MathError(f'Could not extract optional argument from "{string}"!')
# Test:
# >>> tex_optarg(' [optional argument] after whitespace')
# ('optional argument', ' after whitespace')
# >>> tex_optarg('[missing right bracket')
# Traceback (most recent call last):
# ...
# docutils.utils.math.MathError: Could not extract optional argument from "[missing right bracket"!
# >>> tex_optarg('[group with [nested group]]')
# Traceback (most recent call last):
# ...
# docutils.utils.math.MathError: Could not extract optional argument from "[group with [nested group]]"!
def parse_latex_math(root, source):
"""Append MathML conversion of `string` to `node` and return it.
>>> parse_latex_math(math(), r'\alpha')
math(mi('α'))
>>> parse_latex_math(mrow(), r'x_{n}')
mrow(msub(mi('x'), mi('n')))
"""
# Normalize white-space:
string = source # not-yet handled part of source
node = root # the current "insertion point"
# Loop over `string` while changing it.
while len(string) > 0:
# Take off first character:
c, string = string[0], string[1:]
if c in ' \n':
continue # whitespace is ignored in LaTeX math mode
if c == '\\': # start of a LaTeX macro
cmdname, string = tex_cmdname(string)
node, string = handle_cmd(cmdname, node, string)
elif c in "_^":
node = handle_script_or_limit(node, c)
elif c == '{':
if isinstance(node, MathRow) and node.nchildren == 1:
# LaTeX takes one arg, MathML node accepts a group
node.nchildren = None # allow appending until closed by '}'
else: # wrap group in an <mrow>
new_node = mrow()
node.append(new_node)
node = new_node
elif c == '}':
node = node.close()
elif c == '&':
new_node = mtd()
node.close().append(new_node)
node = new_node
elif c.isalpha():
node = node.append(mi(c))
elif c.isdigit():
number, string = tex_number(string)
node = node.append(mn(c+number))
elif c in anomalous_chars:
# characters with a special meaning in LaTeX math mode
# fix spacing before "unary" minus.
attributes = {}
if c == '-' and len(node):
previous_node = node[-1]
if (previous_node.text and previous_node.text in '([='
or previous_node.get('class') == 'mathopen'):
attributes['form'] = 'prefix'
node = node.append(mo(anomalous_chars[c], **attributes))
elif c in "/()[]|":
node = node.append(mo(c, stretchy=False))
elif c in "+*=<>,.!?`';@":
node = node.append(mo(c))
else:
raise MathError(f'Unsupported character: "{c}"!')
# TODO: append as <mi>?
if node is None:
if not string:
return root # ignore unbalanced braces
raise MathError(f'No insertion point for "{string}". '
f'Unbalanced braces in "{source[:-len(string)]}"?')
if node.nchildren and len(node) < node.nchildren:
raise MathError('Last node missing children. Source incomplete?')
return root
# Test:
# >>> parse_latex_math(math(), '')
# math()
# >>> parse_latex_math(math(), ' \\sqrt{ \\alpha}')
# math(msqrt(mi('α')))
# >>> parse_latex_math(math(), '23.4x')
# math(mn('23.4'), mi('x'))
# >>> parse_latex_math(math(), '\\sqrt 2 \\ne 3')
# math(msqrt(mn('2')), mo('≠'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt{2 + 3} < 10')
# math(msqrt(mn('2'), mo('+'), mn('3'), nchildren=3), mo('<'), mn('10'))
# >>> parse_latex_math(math(), '\\sqrt[3]{2 + 3}')
# math(mroot(mrow(mn('2'), mo('+'), mn('3'), nchildren=3), mn('3')))
# >>> parse_latex_math(math(), '\max_x') # function takes limits
# math(munder(mo('max', movablelimits='true'), mi('x')))
# >>> parse_latex_math(math(), 'x^j_i') # ensure correct order: base, sub, sup
# math(msubsup(mi('x'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), '\int^j_i') # ensure correct order
# math(msubsup(mo('∫'), mi('i'), mi('j')))
# >>> parse_latex_math(math(), 'x_{\\alpha}')
# math(msub(mi('x'), mi('α')))
# >>> parse_latex_math(math(), 'x_\\text{in}')
# math(msub(mi('x'), mtext('in')))
# >>> parse_latex_math(math(), '2⌘')
# Traceback (most recent call last):
# docutils.utils.math.MathError: Unsupported character: "⌘"!
# >>> parse_latex_math(math(), '23}x') # doctest: +ELLIPSIS
# Traceback (most recent call last):
# ...
# docutils.utils.math.MathError: ... Unbalanced braces in "23}"?
# >>> parse_latex_math(math(), '\\frac{2}')
# Traceback (most recent call last):
# ...
# docutils.utils.math.MathError: Last node missing children. Source incomplete?
def handle_cmd(name, node, string): # noqa: C901 TODO make this less complex
"""Process LaTeX command `name` followed by `string`.
Append result to `node`.
If needed, parse `string` for command argument.
Return new current node and remainder of `string`:
>>> handle_cmd('hbar', math(), r' \frac')
(math(mi('')), ' \\frac')
>>> handle_cmd('hspace', math(), r'{1ex} (x)')
(math(mspace(width='1ex')), ' (x)')
"""
# Token elements
# ==============
# identifier -> <mi>
if name in letters:
new_node = mi(letters[name])
if name in greek_capitals:
# upright in "TeX style" but MathML sets them italic ("ISO style").
# CSS styling does not change the font style in Firefox 78.
# Use 'mathvariant="normal"'?
new_node.set('class', 'capital-greek')
node = node.append(new_node)
return node, string
if name in ordinary:
# <mi mathvariant="normal"> well supported by Chromium but
# Firefox 115.5.0 puts additional space around the symbol, e.g.
# <mi mathvariant="normal">∂</mi><mi>t</mi> looks like ∂ t, not ∂t
# return node.append(mi(ordinary[name], mathvariant='normal')), string
return node.append(mi(ordinary[name])), string
if name in functions:
# use <mi> followed by invisible function applicator character
# (see https://www.w3.org/TR/MathML3/chapter3.html#presm.mi)
if name == 'operatorname':
# custom function name, e.g. ``\operatorname{abs}(x)``
# TODO: \operatorname* -> with limits
arg, string = tex_token_or_group(string)
new_node = mi(arg, mathvariant='normal')
else:
new_node = mi(functions[name])
# embellished function names:
if name == 'varliminf': # \underline\lim
new_node = munder(new_node, mo('_'))
elif name == 'varlimsup': # \overline\lim
new_node = mover(new_node, mo('¯'), accent=False)
elif name == 'varprojlim': # \underleftarrow\lim
new_node = munder(new_node, mo('\u2190'))
elif name == 'varinjlim': # \underrightarrow\lim
new_node = munder(new_node, mo('\u2192'))
node = node.append(new_node)
# add ApplyFunction when appropriate (not \sin^2(x), say)
# cf. https://www.w3.org/TR/MathML3/chapter3.html#presm.mi
if string and string[0] not in ('^', '_'):
node = node.append(mo('\u2061')) # &ApplyFunction;
return node, string
if name in modulo_functions:
(binary, named, parentheses, padding) = modulo_functions[name]
if binary:
node = node.append(mo('mod', lspace=padding, rspace=padding))
return node, string
# left padding
if node.in_block():
padding = '1em'
node = node.append(mspace(width=padding))
if parentheses:
node = node.append(mo('(', stretchy=False))
if named:
node = node.append(mi('mod'))
node = node.append(mspace(width='0.333em'))
arg, string = tex_token_or_group(string)
node = parse_latex_math(node, arg)
if parentheses:
node = node.append(mo(')', stretchy=False))
return node, string
# font changes or mathematical alphanumeric characters
if name in ('boldsymbol', 'pmb'): # \pmb is "poor mans bold"
new_node = mrow(CLASS='boldsymbol')
node.append(new_node)
return new_node, string
if name in math_alphabets:
return handle_math_alphabet(name, node, string)
# operator, fence, or separator -> <mo>
if name == 'colon': # trailing punctuation, not binary relation
node = node.append(mo(':', form='postfix', lspace='0', rspace='0.28em'))
return node, string
if name == 'idotsint': # AMS shortcut for ∫︀···∫︀
node = parse_latex_math(node, r'\int\dotsi\int')
return node, string
if name in thick_operators:
node = node.append(mo(thick_operators[name], style='font-weight: bold'))
return node, string
if name in small_operators:
node = node.append(mo(small_operators[name], mathsize='75%'))
return node, string
if name in operators:
attributes = {}
if name in movablelimits and string and string[0] in ' _^':
attributes['movablelimits'] = True
elif name in ('lvert', 'lVert'):
attributes['class'] = 'mathopen'
node = node.append(mo(operators[name], **attributes))
return node, string
if name in bigdelimiters:
delimiter_attributes = {}
size = delimiter_sizes[bigdelimiters[name]]
delimiter, string = tex_token_or_group(string)
if delimiter not in '()[]/|.':
try:
delimiter = stretchables[delimiter.lstrip('\\')]
except KeyError:
raise MathError(f'Unsupported "\\{name}" delimiter '
f'"{delimiter}"!')
if size:
delimiter_attributes['maxsize'] = size
delimiter_attributes['minsize'] = size
delimiter_attributes['symmetric'] = True
if name == 'left' or name.endswith('l'):
row = mrow()
node.append(row)
node = row
if delimiter != '.': # '.' stands for "empty delimiter"
node.append(mo(delimiter, **delimiter_attributes))
if name == 'right' or name.endswith('r'):
node = node.close()
return node, string
if name == 'not':
# negation: LaTeX just overlays next symbol with "/".
arg, string = tex_token(string)
if arg == '{':
return node, '{\\not ' + string
if arg.startswith('\\'): # LaTeX macro
try:
arg = operators[arg[1:]]
except KeyError:
raise MathError(rf'"\not" cannot negate: "{arg}"!')
arg = unicodedata.normalize('NFC', arg+'\u0338')
node = node.append(mo(arg))
return node, string
# arbitrary text (usually comments) -> <mtext>
if name in ('text', 'mbox', 'textrm'):
arg, string = tex_token_or_group(string)
parts = arg.split('$') # extract inline math
for i, part in enumerate(parts):
if i % 2 == 0: # i is even
# LaTeX keeps whitespace in, e.g., ``\text{ foo }``,
# <mtext> displays only internal whitespace.
# → replace marginal whitespace with NBSP
part = re.sub('(^[ \n]|[ \n]$)', '\u00a0', part)
node = node.append(mtext(part))
else:
parse_latex_math(node, part)
return node, string
# horizontal space -> <mspace>
if name in spaces:
node = node.append(mspace(width='%s'%spaces[name]))
return node, string
if name in ('hspace', 'mspace'):
arg, string = tex_group(string)
if arg.endswith('mu'):
# unit "mu" (1mu=1/18em) not supported by MathML
arg = '%sem' % (float(arg[:-2])/18)
node = node.append(mspace(width='%s'%arg))
return node, string
if name == 'phantom':
new_node = mphantom()
node.append(new_node)
return new_node, string
if name == 'boxed':
# CSS padding is broken in Firefox 115.6.0esr
# therefore we still need the deprecated <menclose> element
new_node = menclose(notation='box', CLASS='boxed')
node.append(new_node)
return new_node, string
# Complex elements (Layout schemata)
# ==================================
if name == 'sqrt':
radix, string = tex_optarg(string)
if radix:
indexnode = mrow()
new_node = mroot(indexnode, switch=True)
parse_latex_math(indexnode, radix)
indexnode.close()
else:
new_node = msqrt()
node.append(new_node)
return new_node, string
if name in fractions:
attributes = fractions[name]
if name == 'cfrac':
optarg, string = tex_optarg(string)
optargs = {'l': 'left', 'r': 'right'}
if optarg in optargs:
attributes = attributes.copy()
attributes['numalign'] = optargs[optarg] # "numalign" is deprecated
attributes['class'] += ' numalign-' + optargs[optarg]
new_node = frac = mfrac(**attributes)
if name.endswith('binom'):
new_node = mrow(mo('('), new_node, mo(')'), CLASS='binom')
new_node.nchildren = 3
node.append(new_node)
return frac, string
if name == '\\': # end of a row
entry = mtd()
new_node = mtr(entry)
node.close().close().append(new_node)
return entry, string
if name in accents:
accent_node = mo(accents[name], stretchy=False)
# mi() would be simpler, but semantically wrong
# --- https://w3c.github.io/mathml-core/#operator-fence-separator-or-accent-mo
if name == 'vec':
accent_node.set('scriptlevel', '+1') # scale down arrow
new_node = mover(accent_node, accent=True, switch=True)
node.append(new_node)
return new_node, string
if name in over:
# set "accent" to False (otherwise dots on i and j are dropped)
# but to True on accent node get "textstyle" (full size) symbols on top
new_node = mover(mo(over[name][0], accent=True),
switch=True, accent=False)
node.append(new_node)
return new_node, string
if name == 'overset':
new_node = mover(switch=True)
node.append(new_node)
return new_node, string
if name in under:
new_node = munder(mo(under[name][0]), switch=True)
node.append(new_node)
return new_node, string
if name == 'underset':
new_node = munder(switch=True)
node.append(new_node)
return new_node, string
if name in ('xleftarrow', 'xrightarrow'):
subscript, string = tex_optarg(string)
base = mo(operators['long'+name[1:]])
if subscript:
new_node = munderover(base)
sub_node = parse_latex_math(mrow(), subscript)
if len(sub_node) == 1:
sub_node = sub_node[0]
new_node.append(sub_node)
else:
new_node = mover(base)
node.append(new_node)
return new_node, string
if name in layout_styles: # 'displaystyle', 'textstyle', ...
if len(node) > 0:
raise MathError(rf'Declaration "\{name}" must be first command '
'in a group!')
for k, v in layout_styles[name].items():
node.set(k, v)
return node, string
if name.endswith('limits'):
arg, remainder = tex_token(string)
if arg in '_^': # else ignore
string = remainder
node = handle_script_or_limit(node, arg, limits=name)
return node, string
# Environments
if name == 'begin':
return begin_environment(node, string)
if name == 'end':
return end_environment(node, string)
raise MathError(rf'Unknown LaTeX command "\{name}".')
# >>> handle_cmd('left', math(), '[a\\right]')
# (mrow(mo('[')), 'a\\right]')
# >>> handle_cmd('left', math(), '. a)') # empty \left
# (mrow(), ' a)')
# >>> handle_cmd('left', math(), '\\uparrow a)') # cmd
# (mrow(mo('↑')), 'a)')
# >>> handle_cmd('not', math(), '\\equiv \\alpha)') # cmd
# (math(mo('≢')), '\\alpha)')
# >>> handle_cmd('text', math(), '{ for } i>0') # group
# (math(mtext('\xa0for\xa0')), ' i>0')
# >>> handle_cmd('text', math(), '{B}T') # group
# (math(mtext('B')), 'T')
# >>> handle_cmd('text', math(), '{number of apples}}') # group
# (math(mtext('number of apples')), '}')
# >>> handle_cmd('text', math(), 'i \\sin(x)') # single char
# (math(mtext('i')), ' \\sin(x)')
# >>> handle_cmd(' ', math(), ' next') # inter word space
# (math(mspace(width='0.25em')), ' next')
# >>> handle_cmd('\n', math(), '\nnext') # inter word space
# (math(mspace(width='0.25em')), '\nnext')
# >>> handle_cmd('sin', math(), '(\\alpha)')
# (math(mi('sin'), mo('\u2061')), '(\\alpha)')
# >>> handle_cmd('sin', math(), ' \\alpha')
# (math(mi('sin'), mo('\u2061')), ' \\alpha')
# >>> handle_cmd('operatorname', math(), '{abs}(x)')
# (math(mi('abs', mathvariant='normal'), mo('\u2061')), '(x)')
# >>> handle_cmd('overline', math(), '{981}')
# (mover(mo('_', accent='true'), switch=True, accent='false'), '{981}')
# >>> handle_cmd('bar', math(), '{x}')
# (mover(mo('ˉ', stretchy='false'), switch=True, accent='true'), '{x}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha]{10}')
# (munderover(mo('⟵'), mi('α')), '{10}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha=5]{10}')
# (munderover(mo('⟵'), mrow(mi('α'), mo('='), mn('5'))), '{10}')
# >>> handle_cmd('left', math(), '< a)')
# Traceback (most recent call last):
# docutils.utils.math.MathError: Unsupported "\left" delimiter "<"!
# >>> handle_cmd('not', math(), '{< b} c') # LaTeX ignores the braces, too.
# (math(), '{\\not < b} c')
def handle_math_alphabet(name, node, string):
attributes = {}
if name == 'mathscr':
attributes['class'] = 'mathscr'
arg, string = tex_token_or_group(string)
# Shortcut for text arg like \mathrm{out} with more than one letter:
if name == 'mathrm' and arg.isalpha() and len(arg) > 1:
node = node.append(mi(arg)) # <mi> defaults to "normal" font
return node, string
# Parse into an <mrow>
container = mrow(**attributes)
node.append(container)
parse_latex_math(container, arg)
key = name.replace('mathscr', 'mathcal').replace('mathbfsfit', 'mathsfbfit')
a2ch = getattr(mathalphabet2unichar, key, {})
for subnode in container.iter():
if isinstance(subnode, mn):
# a number may consist of more than one digit
subnode.text = ''.join(a2ch.get(ch, ch) for ch in subnode.text)
elif isinstance(subnode, mi):
# don't convert multi-letter identifiers (functions)
subnode.text = a2ch.get(subnode.text, subnode.text)
if name == 'mathrm' and subnode.text.isalpha():
subnode.set('mathvariant', 'normal')
return container.close(), string
# >>> handle_math_alphabet('mathrm', math(), '\\alpha')
# (math(mi('α', mathvariant='normal')), '')
# >>> handle_math_alphabet('mathbb', math(), '{R} = 3')
# (math(mi('')), ' = 3')
# >>> handle_math_alphabet('mathcal', math(), '{F = 3}')
# (math(mrow(mi(''), mo('='), mn('3'), nchildren=3)), '')
# >>> handle_math_alphabet('mathrm', math(), '{out} = 3') # drop <mrow>
# (math(mi('out')), ' = 3')
#
# Single letters in \mathrm require "mathvariant='normal'":
# >>> handle_math_alphabet('mathrm', math(), '{V = 3}') # doctest: +ELLIPSIS
# (math(mrow(mi('V', mathvariant='normal'), mo('='), mn('3'), ...)), '')
def handle_script_or_limit(node, c, limits=''):
"""Append script or limit element to `node`."""
child = node.pop()
if limits == 'limits':
child.set('movablelimits', 'false')
elif (limits == 'movablelimits'
or getattr(child, 'text', '') in movablelimits):
child.set('movablelimits', 'true')
if c == '_':
if isinstance(child, mover):
new_node = munderover(*child, switch=True)
elif isinstance(child, msup):
new_node = msubsup(*child, switch=True)
elif (limits in ('limits', 'movablelimits')
or limits == '' and child.get('movablelimits', None)):
new_node = munder(child)
else:
new_node = msub(child)
elif c == '^':
if isinstance(child, munder):
new_node = munderover(*child)
elif isinstance(child, msub):
new_node = msubsup(*child)
elif (limits in ('limits', 'movablelimits')
or limits == '' and child.get('movablelimits', None)):
new_node = mover(child)
else:
new_node = msup(child)
node.append(new_node)
return new_node
def begin_environment(node, string):
name, string = tex_group(string)
if name in matrices:
left_delimiter = matrices[name][0]
attributes = {}
if left_delimiter:
wrapper = mrow(mo(left_delimiter))
if name == 'cases':
wrapper = mrow(mo(left_delimiter, rspace='0.17em'))
attributes['columnalign'] = 'left'
attributes['class'] = 'cases'
node.append(wrapper)
node = wrapper
elif name == 'smallmatrix':
attributes['rowspacing'] = '0.02em'
attributes['columnspacing'] = '0.333em'
attributes['scriptlevel'] = '1'
elif name == 'aligned':
attributes['class'] = 'ams-align'
# TODO: array, aligned & alignedat take an optional [t], [b], or [c].
entry = mtd()
node.append(mtable(mtr(entry), **attributes))
node = entry
else:
raise MathError(f'Environment "{name}" not supported!')
return node, string
def end_environment(node, string):
name, string = tex_group(string)
if name in matrices:
node = node.close().close().close() # close: mtd, mdr, mtable
right_delimiter = matrices[name][1]
if right_delimiter:
node = node.append(mo(right_delimiter))
node = node.close()
elif name == 'cases':
node = node.close()
else:
raise MathError(f'Environment "{name}" not supported!')
return node, string
# Return the number of "equation_columns" in `code_lines`. cf. "alignat"
# in http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf
def tex_equation_columns(rows):
tabs = max(row.count('&') - row.count(r'\&') for row in rows)
if tabs == 0:
return 0
return int(tabs/2 + 1)
# >>> tex_equation_columns(['a = b'])
# 0
# >>> tex_equation_columns(['a &= b'])
# 1
# >>> tex_equation_columns(['a &= b & a \in S'])
# 2
# >>> tex_equation_columns(['a &= b & c &= d'])
# 2
# Return dictionary with attributes to style an <mtable> as align environment:
# Not used with HTML. Replaced by CSS rule for "mtable.ams-align" in
# "minimal.css" as "columnalign" is disregarded by Chromium and webkit.
def align_attributes(rows):
atts = {'class': 'ams-align',
'displaystyle': True}
# get maximal number of non-escaped "next column" markup characters:
tabs = max(row.count('&') - row.count(r'\&') for row in rows)
if tabs:
aligns = ['right', 'left'] * tabs
spacing = ['0', '2em'] * tabs
atts['columnalign'] = ' '.join(aligns[:tabs+1])
atts['columnspacing'] = ' '.join(spacing[:tabs])
return atts
# >>> align_attributes(['a = b'])
# {'class': 'ams-align', 'displaystyle': True}
# >>> align_attributes(['a &= b'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left', 'columnspacing': '0'}
# >>> align_attributes(['a &= b & a \in S'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right', 'columnspacing': '0 2em'}
# >>> align_attributes(['a &= b & c &= d'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left', 'columnspacing': '0 2em 0'}
# >>> align_attributes([r'a &= b & c &= d \& e'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left', 'columnspacing': '0 2em 0'}
# >>> align_attributes([r'a &= b & c &= d & e'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left right', 'columnspacing': '0 2em 0 2em'}
def tex2mathml(tex_math, as_block=False):
"""Return string with MathML code corresponding to `tex_math`.
Set `as_block` to ``True`` for displayed formulas.
"""
# Set up tree
math_tree = math(xmlns='http://www.w3.org/1998/Math/MathML')
node = math_tree
if as_block:
math_tree.set('display', 'block')
rows = toplevel_code(tex_math).split(r'\\')
if len(rows) > 1:
# emulate "align*" environment with a math table
node = mtd()
math_tree.append(mtable(mtr(node), CLASS='ams-align',
displaystyle=True))
parse_latex_math(node, tex_math)
math_tree.indent_xml()
return math_tree.toxml()
# >>> print(tex2mathml('3'))
# <math xmlns="http://www.w3.org/1998/Math/MathML">
# <mn>3</mn>
# </math>
# >>> print(tex2mathml('3', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
# <mn>3</mn>
# </math>
# >>> print(tex2mathml(r'a & b \\ c & d', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
# <mtable class="ams-align" displaystyle="true">
# <mtr>
# <mtd>
# <mi>a</mi>
# </mtd>
# <mtd>
# <mi>b</mi>
# </mtd>
# </mtr>
# <mtr>
# <mtd>
# <mi>c</mi>
# </mtd>
# <mtd>
# <mi>d</mi>
# </mtd>
# </mtr>
# </mtable>
# </math>
# >>> print(tex2mathml(r'a \\ b', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
# <mtable class="ams-align" displaystyle="true">
# <mtr>
# <mtd>
# <mi>a</mi>
# </mtd>
# </mtr>
# <mtr>
# <mtd>
# <mi>b</mi>
# </mtd>
# </mtr>
# </mtable>
# </math>
# TODO: look up more symbols from tr25, e.g.
#
#
# Table 2.8 Using Vertical Line or Solidus Overlay
# some of the negated forms of mathematical relations that can only be
# encoded by using either U+0338 COMBINING LONG SOLIDUS OVERLAY or U+20D2
# COMBINING LONG VERTICAL LINE OVERLAY . (For issues with using 0338 in
# MathML, see Section 3.2.7, Combining Marks.
#
# Table 2.9 Variants of Mathematical Symbols using VS1?
#
# Sequence Description
# 0030 + VS1 DIGIT ZERO - short diagonal stroke form
# 2205 + VS1 EMPTY SET - zero with long diagonal stroke overlay form
# 2229 + VS1 INTERSECTION - with serifs
# 222A + VS1 UNION - with serifs
# 2268 + VS1 LESS-THAN BUT NOT EQUAL TO - with vertical stroke
# 2269 + VS1 GREATER-THAN BUT NOT EQUAL TO - with vertical stroke
# 2272 + VS1 LESS-THAN OR EQUIVALENT TO - following the slant of the lower leg
# 2273 + VS1 GREATER-THAN OR EQUIVALENT TO - following the slant of the lower leg
# 228A + VS1 SUBSET OF WITH NOT EQUAL TO - variant with stroke through bottom members
# 228B + VS1 SUPERSET OF WITH NOT EQUAL TO - variant with stroke through bottom members
# 2293 + VS1 SQUARE CAP - with serifs
# 2294 + VS1 SQUARE CUP - with serifs
# 2295 + VS1 CIRCLED PLUS - with white rim
# 2297 + VS1 CIRCLED TIMES - with white rim
# 229C + VS1 CIRCLED EQUALS - equal sign inside and touching the circle
# 22DA + VS1 LESS-THAN slanted EQUAL TO OR GREATER-THAN
# 22DB + VS1 GREATER-THAN slanted EQUAL TO OR LESS-THAN
# 2A3C + VS1 INTERIOR PRODUCT - tall variant with narrow foot
# 2A3D + VS1 RIGHTHAND INTERIOR PRODUCT - tall variant with narrow foot
# 2A9D + VS1 SIMILAR OR LESS-THAN - following the slant of the upper leg
# 2A9E + VS1 SIMILAR OR GREATER-THAN - following the slant of the upper leg
# 2AAC + VS1 SMALLER THAN OR slanted EQUAL
# 2AAD + VS1 LARGER THAN OR slanted EQUAL
# 2ACB + VS1 SUBSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members
# 2ACC + VS1 SUPERSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members