469 lines
16 KiB
Python
469 lines
16 KiB
Python
|
"""
|
||
|
pygments.lexers.rdf
|
||
|
~~~~~~~~~~~~~~~~~~~
|
||
|
|
||
|
Lexers for semantic web and RDF query languages and markup.
|
||
|
|
||
|
:copyright: Copyright 2006-2024 by the Pygments team, see AUTHORS.
|
||
|
:license: BSD, see LICENSE for details.
|
||
|
"""
|
||
|
|
||
|
import re
|
||
|
|
||
|
from pygments.lexer import RegexLexer, bygroups, default
|
||
|
from pygments.token import Keyword, Punctuation, String, Number, Operator, \
|
||
|
Generic, Whitespace, Name, Literal, Comment, Text
|
||
|
|
||
|
# Names exported by ``from pygments.lexers.rdf import *``: the three lexer
# classes defined in this module.
__all__ = ['SparqlLexer', 'TurtleLexer', 'ShExCLexer']
|
||
|
|
||
|
|
||
|
class SparqlLexer(RegexLexer):
    """
    Lexer for SPARQL query language.

    The class-level string constants below are regular-expression fragments
    that are concatenated into the rules of ``tokens``.  Names ending in
    ``_GRP`` hold the *inside* of a character class (no brackets); the
    matching name without the suffix is the bracketed character class.
    """
    name = 'SPARQL'
    aliases = ['sparql']
    filenames = ['*.rq', '*.sparql']
    mimetypes = ['application/sparql-query']
    url = 'https://www.w3.org/TR/sparql11-query'
    version_added = '2.0'

    # character group definitions ::

    PN_CHARS_BASE_GRP = ('a-zA-Z'
                         '\u00c0-\u00d6'
                         '\u00d8-\u00f6'
                         '\u00f8-\u02ff'
                         '\u0370-\u037d'
                         '\u037f-\u1fff'
                         '\u200c-\u200d'
                         '\u2070-\u218f'
                         '\u2c00-\u2fef'
                         '\u3001-\ud7ff'
                         '\uf900-\ufdcf'
                         '\ufdf0-\ufffd')

    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')

    PN_CHARS_GRP = (PN_CHARS_U_GRP +
                    r'\-' +
                    r'0-9' +
                    '\u00b7' +
                    '\u0300-\u036f' +
                    '\u203f-\u2040')

    HEX_GRP = '0-9A-Fa-f'

    # Characters that may follow a backslash inside a local name.  This is
    # the same set ShExCLexer uses: it contains the apostrophe and neither a
    # space nor a double quote.
    PN_LOCAL_ESC_CHARS_GRP = r"_~.\-!$&'()*+,;=/?#@%"

    # terminal productions ::

    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'

    PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'

    PN_CHARS = '[' + PN_CHARS_GRP + ']'

    HEX = '[' + HEX_GRP + ']'

    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'

    IRIREF = r'<(?:[^<>"{}|^`\\\x00-\x20])*>'

    BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
                       '.]*' + PN_CHARS + ')?'

    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'

    VARNAME = '[0-9' + PN_CHARS_U_GRP + '][' + PN_CHARS_U_GRP + \
              '0-9\u00b7\u0300-\u036f\u203f-\u2040]*'

    PERCENT = '%' + HEX + HEX

    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS

    # NOTE: PLX is a bare alternation; it is only ever embedded inside an
    # enclosing group (see PN_LOCAL), never used on its own.
    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'

    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
                PN_CHARS_GRP + ':]|' + PLX + '))?')

    EXPONENT = r'[eE][+-]?\d+'

    # Lexer token definitions ::

    tokens = {
        'root': [
            (r'\s+', Text),
            # keywords ::
            (r'(?i)(select|construct|describe|ask|where|filter|group\s+by|minus|'
             r'distinct|reduced|from\s+named|from|order\s+by|desc|asc|limit|'
             r'offset|values|bindings|load|into|clear|drop|create|add|move|copy|'
             r'insert\s+data|delete\s+data|delete\s+where|with|delete|insert|'
             r'using\s+named|using|graph|default|named|all|optional|service|'
             r'silent|bind|undef|union|not\s+in|in|as|having|to|prefix|base)\b', Keyword),
            (r'(a)\b', Keyword),
            # IRIs ::
            ('(' + IRIREF + ')', Name.Label),
            # blank nodes ::
            ('(' + BLANK_NODE_LABEL + ')', Name.Label),
            #  # variables ::
            ('[?$]' + VARNAME, Name.Variable),
            # prefixed names ::
            (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
             bygroups(Name.Namespace, Punctuation, Name.Tag)),
            # function names ::
            (r'(?i)(str|lang|langmatches|datatype|bound|iri|uri|bnode|rand|abs|'
             r'ceil|floor|round|concat|strlen|ucase|lcase|encode_for_uri|'
             r'contains|strstarts|strends|strbefore|strafter|year|month|day|'
             r'hours|minutes|seconds|timezone|tz|now|uuid|struuid|md5|sha1|sha256|sha384|'
             r'sha512|coalesce|if|strlang|strdt|sameterm|isiri|isuri|isblank|'
             r'isliteral|isnumeric|regex|substr|replace|exists|not\s+exists|'
             r'count|sum|min|max|avg|sample|group_concat|separator)\b',
             Name.Function),
            # boolean literals ::
            (r'(true|false)', Keyword.Constant),
            # double literals ::
            (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + ')', Number.Float),
            # decimal literals ::
            (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
            # integer literals ::
            (r'[+\-]?\d+', Number.Integer),
            # operators ::
            (r'(\|\||&&|=|\*|\-|\+|/|!=|<=|>=|!|<|>)', Operator),
            # punctuation characters ::
            (r'[(){}.;,:^\[\]]', Punctuation),
            # line comments ::
            (r'#[^\n]*', Comment),
            # strings ::
            # The triple-quoted openers must be tried before the one-character
            # openers, otherwise '"""' would be read as an empty string.
            (r'"""', String, 'triple-double-quoted-string'),
            (r'"', String, 'single-double-quoted-string'),
            (r"'''", String, 'triple-single-quoted-string'),
            (r"'", String, 'single-single-quoted-string'),
        ],
        'triple-double-quoted-string': [
            (r'"""', String, 'end-of-string'),
            # Stop in front of the closing delimiter: a plain ``[^\\]+`` also
            # matches quotes and would swallow the '"""' and everything
            # after it up to the next backslash.
            (r'(?:(?!""")[^\\])+', String),
            (r'\\', String, 'string-escape'),
        ],
        'single-double-quoted-string': [
            (r'"', String, 'end-of-string'),
            (r'[^"\\\n]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'triple-single-quoted-string': [
            (r"'''", String, 'end-of-string'),
            # Same delimiter guard as for the '"""' form above.
            (r"(?:(?!''')[^\\])+", String),
            (r'\\', String.Escape, 'string-escape'),
        ],
        'single-single-quoted-string': [
            (r"'", String, 'end-of-string'),
            (r"[^'\\\n]+", String),
            (r'\\', String, 'string-escape'),
        ],
        # Entered right after a backslash inside any string state; consumes
        # the remainder of the escape and returns to that string state.
        'string-escape': [
            (r'u' + HEX + '{4}', String.Escape, '#pop'),
            (r'U' + HEX + '{8}', String.Escape, '#pop'),
            (r'.', String.Escape, '#pop'),
        ],
        # Entered after a closing quote; optionally consumes a language tag
        # or the '^^' datatype marker, then pops both this state and the
        # string state beneath it.
        'end-of-string': [
            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
             bygroups(Operator, Name.Function), '#pop:2'),
            (r'\^\^', Operator, '#pop:2'),
            default('#pop:2'),
        ],
    }
|
||
|
|
||
|
|
||
|
class TurtleLexer(RegexLexer):
    """
    Lexer for Turtle data language.

    The class-level string constants below are regular-expression fragments
    that are concatenated into the rules of ``tokens``.  Names ending in
    ``_GRP`` hold the *inside* of a character class (no brackets); the
    matching name without the suffix is the bracketed character class.
    """
    name = 'Turtle'
    aliases = ['turtle']
    filenames = ['*.ttl']
    mimetypes = ['text/turtle', 'application/x-turtle']
    url = 'https://www.w3.org/TR/turtle'
    version_added = '2.1'

    # character group definitions ::
    PN_CHARS_BASE_GRP = ('a-zA-Z'
                         '\u00c0-\u00d6'
                         '\u00d8-\u00f6'
                         '\u00f8-\u02ff'
                         '\u0370-\u037d'
                         '\u037f-\u1fff'
                         '\u200c-\u200d'
                         '\u2070-\u218f'
                         '\u2c00-\u2fef'
                         '\u3001-\ud7ff'
                         '\uf900-\ufdcf'
                         '\ufdf0-\ufffd')

    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')

    PN_CHARS_GRP = (PN_CHARS_U_GRP +
                    r'\-' +
                    r'0-9' +
                    '\u00b7' +
                    '\u0300-\u036f' +
                    '\u203f-\u2040')

    PN_CHARS = '[' + PN_CHARS_GRP + ']'

    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'

    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'

    HEX_GRP = '0-9A-Fa-f'

    HEX = '[' + HEX_GRP + ']'

    PERCENT = '%' + HEX + HEX

    # Characters that may follow a backslash inside a local name.  This is
    # the same set ShExCLexer uses: it contains the apostrophe and neither a
    # space nor a double quote.
    PN_LOCAL_ESC_CHARS_GRP = r"_~.\-!$&'()*+,;=/?#@%"

    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'

    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS

    # NOTE: PLX is a bare alternation; it is only ever embedded inside an
    # enclosing group (see PN_LOCAL), never used on its own.
    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'

    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
                PN_CHARS_GRP + ':]|' + PLX + '))?')

    # Fragments substituted into the rules below with ``str.format``; each
    # one carries its own capturing group.
    patterns = {
        'PNAME_NS': r'((?:[a-zA-Z][\w-]*)?\:)',  # Simplified character range
        'IRIREF': r'(<[^<>"{}|^`\\\x00-\x20]*>)'
    }

    tokens = {
        'root': [
            (r'\s+', Text),

            # Base / prefix
            (r'(@base|BASE)(\s+){IRIREF}(\s*)(\.?)'.format(**patterns),
             bygroups(Keyword, Whitespace, Name.Variable, Whitespace,
                      Punctuation)),
            (r'(@prefix|PREFIX)(\s+){PNAME_NS}(\s+){IRIREF}(\s*)(\.?)'.format(**patterns),
             bygroups(Keyword, Whitespace, Name.Namespace, Whitespace,
                      Name.Variable, Whitespace, Punctuation)),

            # The shorthand predicate 'a'
            (r'(?<=\s)a(?=\s)', Keyword.Type),

            # IRIREF
            (r'{IRIREF}'.format(**patterns), Name.Variable),

            # PrefixedName
            (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
             bygroups(Name.Namespace, Punctuation, Name.Tag)),

            # BlankNodeLabel
            # The optional tail is non-capturing: only three groups are
            # consumed by bygroups().
            (r'(_)(:)([' + PN_CHARS_U_GRP + r'0-9](?:[' + PN_CHARS_GRP + r'.]*' + PN_CHARS + ')?)',
             bygroups(Name.Namespace, Punctuation, Name.Tag)),

            # Comment ('*' rather than '+' so that a bare '#' is a comment too)
            (r'#[^\n]*', Comment),

            (r'\b(true|false)\b', Literal),
            # Doubles are tried before decimals and integers so that the
            # mantissa of e.g. '1.5E3' is not split off as its own token.
            # Both 'e' and 'E' introduce the exponent.
            (r'[+\-]?(?:\d+\.\d*|\.?\d+)[eE][+\-]?\d+', Number.Float),
            (r'[+\-]?\d*\.\d+', Number.Float),
            (r'[+\-]?\d+', Number.Integer),
            (r'[\[\](){}.;,:^]', Punctuation),

            # The triple-quoted openers must be tried before the one-character
            # openers, otherwise '"""' would be read as an empty string.
            (r'"""', String, 'triple-double-quoted-string'),
            (r'"', String, 'single-double-quoted-string'),
            (r"'''", String, 'triple-single-quoted-string'),
            (r"'", String, 'single-single-quoted-string'),
        ],
        'triple-double-quoted-string': [
            (r'"""', String, 'end-of-string'),
            # Stop in front of the closing delimiter: a plain ``[^\\]+`` also
            # matches quotes and would swallow the '"""' and everything
            # after it up to the next backslash.
            (r'(?:(?!""")[^\\])+', String),
            (r'\\', String, 'string-escape'),
        ],
        'single-double-quoted-string': [
            (r'"', String, 'end-of-string'),
            (r'[^"\\\n]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'triple-single-quoted-string': [
            (r"'''", String, 'end-of-string'),
            # Same delimiter guard as for the '"""' form above.
            (r"(?:(?!''')[^\\])+", String),
            (r'\\', String, 'string-escape'),
        ],
        'single-single-quoted-string': [
            (r"'", String, 'end-of-string'),
            (r"[^'\\\n]+", String),
            (r'\\', String, 'string-escape'),
        ],
        # Entered right after a backslash inside any string state; consumes
        # one character and returns to that string state.
        'string-escape': [
            (r'.', String, '#pop'),
        ],
        # Entered after a closing quote; optionally consumes a language tag
        # or a '^^<iri>' datatype, then pops both this state and the string
        # state beneath it.
        'end-of-string': [
            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
             bygroups(Operator, Generic.Emph), '#pop:2'),

            (r'(\^\^){IRIREF}'.format(**patterns), bygroups(Operator, Generic.Emph), '#pop:2'),

            default('#pop:2'),

        ],
    }

    # Turtle and Tera Term macro files share the same file extension
    # but each has a recognizable and distinct syntax.
    def analyse_text(text):
        """
        Return 0.80 when *text* starts, after optional whitespace, with one
        of the directives ``@base``, ``BASE``, ``@prefix`` or ``PREFIX``
        followed by a space.  Otherwise fall through and return ``None``.
        """
        if re.search(r'^\s*(?:@base|BASE|@prefix|PREFIX) ', text):
            return 0.80
|
||
|
|
||
|
|
||
|
class ShExCLexer(RegexLexer):
    """
    Lexer for ShExC shape expressions language syntax.

    The class-level string constants below are regular-expression fragments
    that are concatenated into the rules of ``tokens``.  Names ending in
    ``_GRP`` hold the *inside* of a character class (no brackets); the
    matching name without the suffix is the bracketed character class.
    """
    name = 'ShExC'
    aliases = ['shexc', 'shex']
    filenames = ['*.shex']
    mimetypes = ['text/shex']
    url = 'https://shex.io/shex-semantics/#shexc'
    version_added = ''

    # character group definitions ::

    PN_CHARS_BASE_GRP = ('a-zA-Z'
                         '\u00c0-\u00d6'
                         '\u00d8-\u00f6'
                         '\u00f8-\u02ff'
                         '\u0370-\u037d'
                         '\u037f-\u1fff'
                         '\u200c-\u200d'
                         '\u2070-\u218f'
                         '\u2c00-\u2fef'
                         '\u3001-\ud7ff'
                         '\uf900-\ufdcf'
                         '\ufdf0-\ufffd')

    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')

    PN_CHARS_GRP = (PN_CHARS_U_GRP +
                    r'\-' +
                    r'0-9' +
                    '\u00b7' +
                    '\u0300-\u036f' +
                    '\u203f-\u2040')

    HEX_GRP = '0-9A-Fa-f'

    PN_LOCAL_ESC_CHARS_GRP = r"_~.\-!$&'()*+,;=/?#@%"

    # terminal productions ::

    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'

    PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'

    PN_CHARS = '[' + PN_CHARS_GRP + ']'

    HEX = '[' + HEX_GRP + ']'

    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'

    # The part of a \uXXXX / \UXXXXXXXX escape after the backslash; reused
    # by the 'string-escape' state, which is entered once the backslash has
    # already been consumed.
    UCHAR_NO_BACKSLASH = '(?:u' + HEX + '{4}|U' + HEX + '{8})'

    UCHAR = r'\\' + UCHAR_NO_BACKSLASH

    IRIREF = r'<(?:[^\x00-\x20<>"{}|^`\\]|' + UCHAR + ')*>'

    BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
                       '.]*' + PN_CHARS + ')?'

    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'

    PERCENT = '%' + HEX + HEX

    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS

    # NOTE: PLX is a bare alternation; it is only ever embedded inside an
    # enclosing group (see PN_LOCAL), never used on its own.
    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'

    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
                PN_CHARS_GRP + ':]|' + PLX + '))?')

    EXPONENT = r'[eE][+-]?\d+'

    # Lexer token definitions ::

    tokens = {
        'root': [
            (r'\s+', Text),
            # keywords ::
            (r'(?i)(base|prefix|start|external|'
             r'literal|iri|bnode|nonliteral|length|minlength|maxlength|'
             r'mininclusive|minexclusive|maxinclusive|maxexclusive|'
             r'totaldigits|fractiondigits|'
             r'closed|extra)\b', Keyword),
            (r'(a)\b', Keyword),
            # IRIs ::
            ('(' + IRIREF + ')', Name.Label),
            # blank nodes ::
            ('(' + BLANK_NODE_LABEL + ')', Name.Label),
            # prefixed names ::
            (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + ')?',
             bygroups(Name.Namespace, Punctuation, Name.Tag)),
            # boolean literals ::
            (r'(true|false)', Keyword.Constant),
            # double literals ::
            (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + ')', Number.Float),
            # decimal literals ::
            (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
            # integer literals ::
            (r'[+\-]?\d+', Number.Integer),
            # operators ::
            (r'[@|$&=*+?^\-~]', Operator),
            # operator keywords ::
            (r'(?i)(and|or|not)\b', Operator.Word),
            # punctuation characters ::
            (r'[(){}.;,:^\[\]]', Punctuation),
            # line comments ::
            (r'#[^\n]*', Comment),
            # strings ::
            # The triple-quoted openers must be tried before the one-character
            # openers, otherwise '"""' would be read as an empty string.
            (r'"""', String, 'triple-double-quoted-string'),
            (r'"', String, 'single-double-quoted-string'),
            (r"'''", String, 'triple-single-quoted-string'),
            (r"'", String, 'single-single-quoted-string'),
        ],
        'triple-double-quoted-string': [
            (r'"""', String, 'end-of-string'),
            # Stop in front of the closing delimiter: a plain ``[^\\]+`` also
            # matches quotes and would swallow the '"""' and everything
            # after it up to the next backslash.
            (r'(?:(?!""")[^\\])+', String),
            (r'\\', String, 'string-escape'),
        ],
        'single-double-quoted-string': [
            (r'"', String, 'end-of-string'),
            (r'[^"\\\n]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'triple-single-quoted-string': [
            (r"'''", String, 'end-of-string'),
            # Same delimiter guard as for the '"""' form above.
            (r"(?:(?!''')[^\\])+", String),
            (r'\\', String.Escape, 'string-escape'),
        ],
        'single-single-quoted-string': [
            (r"'", String, 'end-of-string'),
            (r"[^'\\\n]+", String),
            (r'\\', String, 'string-escape'),
        ],
        # Entered right after a backslash inside any string state; consumes
        # the remainder of the escape and returns to that string state.
        'string-escape': [
            (UCHAR_NO_BACKSLASH, String.Escape, '#pop'),
            (r'.', String.Escape, '#pop'),
        ],
        # Entered after a closing quote; optionally consumes a language tag
        # or the '^^' datatype marker, then pops both this state and the
        # string state beneath it.
        'end-of-string': [
            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
             bygroups(Operator, Name.Function), '#pop:2'),
            (r'\^\^', Operator, '#pop:2'),
            default('#pop:2'),
        ],
    }
|