first commit

This commit is contained in:
Yura 2024-09-15 15:12:16 +03:00
commit 417e54da96
5696 changed files with 900003 additions and 0 deletions

View file

@ -0,0 +1,25 @@
cdef class Action:
cdef perform(self, token_stream, text)
cpdef same_as(self, other)
cdef class Return(Action):
cdef object value
cdef perform(self, token_stream, text)
cpdef same_as(self, other)
cdef class Call(Action):
cdef object function
cdef perform(self, token_stream, text)
cpdef same_as(self, other)
cdef class Begin(Action):
cdef object state_name
cdef perform(self, token_stream, text)
cpdef same_as(self, other)
cdef class Ignore(Action):
cdef perform(self, token_stream, text)
cdef class Text(Action):
cdef perform(self, token_stream, text)

View file

@ -0,0 +1,110 @@
# cython: auto_pickle=False
#=======================================================================
#
# Python Lexical Analyser
#
# Actions for use in token specifications
#
#=======================================================================
class Action(object):
def perform(self, token_stream, text):
pass # abstract
def same_as(self, other):
return self is other
class Return(Action):
"""
Internal Plex action which causes |value| to
be returned as the value of the associated token
"""
def __init__(self, value):
self.value = value
def perform(self, token_stream, text):
return self.value
def same_as(self, other):
return isinstance(other, Return) and self.value == other.value
def __repr__(self):
return "Return(%s)" % repr(self.value)
class Call(Action):
"""
Internal Plex action which causes a function to be called.
"""
def __init__(self, function):
self.function = function
def perform(self, token_stream, text):
return self.function(token_stream, text)
def __repr__(self):
return "Call(%s)" % self.function.__name__
def same_as(self, other):
return isinstance(other, Call) and self.function is other.function
class Begin(Action):
"""
Begin(state_name) is a Plex action which causes the Scanner to
enter the state |state_name|. See the docstring of Plex.Lexicon
for more information.
"""
def __init__(self, state_name):
self.state_name = state_name
def perform(self, token_stream, text):
token_stream.begin(self.state_name)
def __repr__(self):
return "Begin(%s)" % self.state_name
def same_as(self, other):
return isinstance(other, Begin) and self.state_name == other.state_name
class Ignore(Action):
"""
IGNORE is a Plex action which causes its associated token
to be ignored. See the docstring of Plex.Lexicon for more
information.
"""
def perform(self, token_stream, text):
return None
def __repr__(self):
return "IGNORE"
IGNORE = Ignore()
#IGNORE.__doc__ = Ignore.__doc__
class Text(Action):
"""
TEXT is a Plex action which causes the text of a token to
be returned as the value of the token. See the docstring of
Plex.Lexicon for more information.
"""
def perform(self, token_stream, text):
return text
def __repr__(self):
return "TEXT"
TEXT = Text()
#TEXT.__doc__ = Text.__doc__

View file

@ -0,0 +1,164 @@
#=======================================================================
#
# Python Lexical Analyser
#
# Converting NFA to DFA
#
#=======================================================================
from __future__ import absolute_import
from . import Machines
from .Machines import LOWEST_PRIORITY
from .Transitions import TransitionMap
def nfa_to_dfa(old_machine, debug=None):
"""
Given a nondeterministic Machine, return a new equivalent
Machine which is deterministic.
"""
# We build a new machine whose states correspond to sets of states
# in the old machine. Initially we add a new state corresponding to
# the epsilon-closure of each initial old state. Then we give transitions
# to each new state which are the union of all transitions out of any
# of the corresponding old states. The new state reached on a given
# character is the one corresponding to the set of states reachable
# on that character from any of the old states. As new combinations of
# old states are created, new states are added as needed until closure
# is reached.
new_machine = Machines.FastMachine()
state_map = StateMap(new_machine)
# Seed the process using the initial states of the old machine.
# Make the corresponding new states into initial states of the new
# machine with the same names.
for (key, old_state) in old_machine.initial_states.items():
new_state = state_map.old_to_new(epsilon_closure(old_state))
new_machine.make_initial_state(key, new_state)
# Tricky bit here: we add things to the end of this list while we're
# iterating over it. The iteration stops when closure is achieved.
for new_state in new_machine.states:
transitions = TransitionMap()
for old_state in state_map.new_to_old(new_state):
for event, old_target_states in old_state.transitions.items():
if event and old_target_states:
transitions.add_set(event, set_epsilon_closure(old_target_states))
for event, old_states in transitions.items():
new_machine.add_transitions(new_state, event, state_map.old_to_new(old_states))
if debug:
debug.write("\n===== State Mapping =====\n")
state_map.dump(debug)
return new_machine
def set_epsilon_closure(state_set):
"""
Given a set of states, return the union of the epsilon
closures of its member states.
"""
result = {}
for state1 in state_set:
for state2 in epsilon_closure(state1):
result[state2] = 1
return result
def epsilon_closure(state):
"""
Return the set of states reachable from the given state
by epsilon moves.
"""
# Cache the result
result = state.epsilon_closure
if result is None:
result = {}
state.epsilon_closure = result
add_to_epsilon_closure(result, state)
return result
def add_to_epsilon_closure(state_set, state):
"""
Recursively add to |state_set| states reachable from the given state
by epsilon moves.
"""
if not state_set.get(state, 0):
state_set[state] = 1
state_set_2 = state.transitions.get_epsilon()
if state_set_2:
for state2 in state_set_2:
add_to_epsilon_closure(state_set, state2)
class StateMap(object):
"""
Helper class used by nfa_to_dfa() to map back and forth between
sets of states from the old machine and states of the new machine.
"""
new_machine = None # Machine
old_to_new_dict = None # {(old_state,...) : new_state}
new_to_old_dict = None # {id(new_state) : old_state_set}
def __init__(self, new_machine):
self.new_machine = new_machine
self.old_to_new_dict = {}
self.new_to_old_dict = {}
def old_to_new(self, old_state_set):
"""
Return the state of the new machine corresponding to the
set of old machine states represented by |state_set|. A new
state will be created if necessary. If any of the old states
are accepting states, the new state will be an accepting state
with the highest priority action from the old states.
"""
key = self.make_key(old_state_set)
new_state = self.old_to_new_dict.get(key, None)
if not new_state:
action = self.highest_priority_action(old_state_set)
new_state = self.new_machine.new_state(action)
self.old_to_new_dict[key] = new_state
self.new_to_old_dict[id(new_state)] = old_state_set
#for old_state in old_state_set.keys():
#new_state.merge_actions(old_state)
return new_state
def highest_priority_action(self, state_set):
best_action = None
best_priority = LOWEST_PRIORITY
for state in state_set:
priority = state.action_priority
if priority > best_priority:
best_action = state.action
best_priority = priority
return best_action
# def old_to_new_set(self, old_state_set):
# """
# Return the new state corresponding to a set of old states as
# a singleton set.
# """
# return {self.old_to_new(old_state_set):1}
def new_to_old(self, new_state):
"""Given a new state, return a set of corresponding old states."""
return self.new_to_old_dict[id(new_state)]
def make_key(self, state_set):
"""
Convert a set of states into a uniquified
sorted tuple suitable for use as a dictionary key.
"""
lst = list(state_set)
lst.sort()
return tuple(lst)
def dump(self, file):
from .Transitions import state_set_str
for new_state in self.new_machine.states:
old_state_set = self.new_to_old_dict[id(new_state)]
file.write(" State %s <-- %s\n" % (
new_state['number'], state_set_str(old_state_set)))

View file

@ -0,0 +1,54 @@
#=======================================================================
#
# Python Lexical Analyser
#
# Exception classes
#
#=======================================================================
class PlexError(Exception):
message = ""
class PlexTypeError(PlexError, TypeError):
pass
class PlexValueError(PlexError, ValueError):
pass
class InvalidRegex(PlexError):
pass
class InvalidToken(PlexError):
def __init__(self, token_number, message):
PlexError.__init__(self, "Token number %d: %s" % (token_number, message))
class InvalidScanner(PlexError):
pass
class AmbiguousAction(PlexError):
message = "Two tokens with different actions can match the same string"
def __init__(self):
pass
class UnrecognizedInput(PlexError):
scanner = None
position = None
state_name = None
def __init__(self, scanner, state_name):
self.scanner = scanner
self.position = scanner.get_position()
self.state_name = state_name
def __str__(self):
return ("'%s', line %d, char %d: Token not recognised in state %r" % (
self.position + (self.state_name,)))

View file

@ -0,0 +1,200 @@
#=======================================================================
#
# Python Lexical Analyser
#
# Lexical Analyser Specification
#
#=======================================================================
from __future__ import absolute_import
import types
from . import Actions
from . import DFA
from . import Errors
from . import Machines
from . import Regexps
# debug_flags for Lexicon constructor
DUMP_NFA = 1
DUMP_DFA = 2
class State(object):
"""
This class is used as part of a Plex.Lexicon specification to
introduce a user-defined state.
Constructor:
State(name, token_specifications)
"""
name = None
tokens = None
def __init__(self, name, tokens):
self.name = name
self.tokens = tokens
class Lexicon(object):
"""
Lexicon(specification) builds a lexical analyser from the given
|specification|. The specification consists of a list of
specification items. Each specification item may be either:
1) A token definition, which is a tuple:
(pattern, action)
The |pattern| is a regular axpression built using the
constructors defined in the Plex module.
The |action| is the action to be performed when this pattern
is recognised (see below).
2) A state definition:
State(name, tokens)
where |name| is a character string naming the state,
and |tokens| is a list of token definitions as
above. The meaning and usage of states is described
below.
Actions
-------
The |action| in a token specication may be one of three things:
1) A function, which is called as follows:
function(scanner, text)
where |scanner| is the relevant Scanner instance, and |text|
is the matched text. If the function returns anything
other than None, that value is returned as the value of the
token. If it returns None, scanning continues as if the IGNORE
action were specified (see below).
2) One of the following special actions:
IGNORE means that the recognised characters will be treated as
white space and ignored. Scanning will continue until
the next non-ignored token is recognised before returning.
TEXT causes the scanned text itself to be returned as the
value of the token.
3) Any other value, which is returned as the value of the token.
States
------
At any given time, the scanner is in one of a number of states.
Associated with each state is a set of possible tokens. When scanning,
only tokens associated with the current state are recognised.
There is a default state, whose name is the empty string. Token
definitions which are not inside any State definition belong to
the default state.
The initial state of the scanner is the default state. The state can
be changed in one of two ways:
1) Using Begin(state_name) as the action of a token.
2) Calling the begin(state_name) method of the Scanner.
To change back to the default state, use '' as the state name.
"""
machine = None # Machine
tables = None # StateTableMachine
def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
if not isinstance(specifications, list):
raise Errors.InvalidScanner("Scanner definition is not a list")
if timings:
from .Timing import time
total_time = 0.0
time1 = time()
nfa = Machines.Machine()
default_initial_state = nfa.new_initial_state('')
token_number = 1
for spec in specifications:
if isinstance(spec, State):
user_initial_state = nfa.new_initial_state(spec.name)
for token in spec.tokens:
self.add_token_to_machine(
nfa, user_initial_state, token, token_number)
token_number += 1
elif isinstance(spec, tuple):
self.add_token_to_machine(
nfa, default_initial_state, spec, token_number)
token_number += 1
else:
raise Errors.InvalidToken(
token_number,
"Expected a token definition (tuple) or State instance")
if timings:
time2 = time()
total_time = total_time + (time2 - time1)
time3 = time()
if debug and (debug_flags & 1):
debug.write("\n============= NFA ===========\n")
nfa.dump(debug)
dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
if timings:
time4 = time()
total_time = total_time + (time4 - time3)
if debug and (debug_flags & 2):
debug.write("\n============= DFA ===========\n")
dfa.dump(debug)
if timings:
timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
timings.write("TOTAL : %5.2f\n" % total_time)
self.machine = dfa
def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
try:
(re, action_spec) = self.parse_token_definition(token_spec)
# Disabled this -- matching empty strings can be useful
#if re.nullable:
# raise Errors.InvalidToken(
# token_number, "Pattern can match 0 input symbols")
if isinstance(action_spec, Actions.Action):
action = action_spec
else:
try:
action_spec.__call__
except AttributeError:
action = Actions.Return(action_spec)
else:
action = Actions.Call(action_spec)
final_state = machine.new_state()
re.build_machine(machine, initial_state, final_state,
match_bol=1, nocase=0)
final_state.set_action(action, priority=-token_number)
except Errors.PlexError as e:
raise e.__class__("Token number %d: %s" % (token_number, e))
def parse_token_definition(self, token_spec):
if not isinstance(token_spec, tuple):
raise Errors.InvalidToken("Token definition is not a tuple")
if len(token_spec) != 2:
raise Errors.InvalidToken("Wrong number of items in token definition")
pattern, action = token_spec
if not isinstance(pattern, Regexps.RE):
raise Errors.InvalidToken("Pattern is not an RE instance")
return (pattern, action)
def get_initial_state(self, name):
return self.machine.get_initial_state(name)

View file

@ -0,0 +1,255 @@
#=======================================================================
#
# Python Lexical Analyser
#
# Classes for building NFAs and DFAs
#
#=======================================================================
from __future__ import absolute_import
import sys
from .Transitions import TransitionMap
try:
from sys import maxsize as maxint
except ImportError:
from sys import maxint
try:
unichr
except NameError:
unichr = chr
LOWEST_PRIORITY = -maxint
class Machine(object):
"""A collection of Nodes representing an NFA or DFA."""
states = None # [Node]
next_state_number = 1
initial_states = None # {(name, bol): Node}
def __init__(self):
self.states = []
self.initial_states = {}
def __del__(self):
#print "Destroying", self ###
for state in self.states:
state.destroy()
def new_state(self):
"""Add a new state to the machine and return it."""
s = Node()
n = self.next_state_number
self.next_state_number = n + 1
s.number = n
self.states.append(s)
return s
def new_initial_state(self, name):
state = self.new_state()
self.make_initial_state(name, state)
return state
def make_initial_state(self, name, state):
self.initial_states[name] = state
def get_initial_state(self, name):
return self.initial_states[name]
def dump(self, file):
file.write("Plex.Machine:\n")
if self.initial_states is not None:
file.write(" Initial states:\n")
for (name, state) in sorted(self.initial_states.items()):
file.write(" '%s': %d\n" % (name, state.number))
for s in self.states:
s.dump(file)
class Node(object):
"""A state of an NFA or DFA."""
transitions = None # TransitionMap
action = None # Action
action_priority = None # integer
number = 0 # for debug output
epsilon_closure = None # used by nfa_to_dfa()
def __init__(self):
# Preinitialise the list of empty transitions, because
# the nfa-to-dfa algorithm needs it
#self.transitions = {'':[]}
self.transitions = TransitionMap()
self.action_priority = LOWEST_PRIORITY
def destroy(self):
#print "Destroying", self ###
self.transitions = None
self.action = None
self.epsilon_closure = None
def add_transition(self, event, new_state):
self.transitions.add(event, new_state)
def link_to(self, state):
"""Add an epsilon-move from this state to another state."""
self.add_transition('', state)
def set_action(self, action, priority):
"""Make this an accepting state with the given action. If
there is already an action, choose the action with highest
priority."""
if priority > self.action_priority:
self.action = action
self.action_priority = priority
def get_action(self):
return self.action
def get_action_priority(self):
return self.action_priority
def is_accepting(self):
return self.action is not None
def __str__(self):
return "State %d" % self.number
def dump(self, file):
# Header
file.write(" State %d:\n" % self.number)
# Transitions
# self.dump_transitions(file)
self.transitions.dump(file)
# Action
action = self.action
priority = self.action_priority
if action is not None:
file.write(" %s [priority %d]\n" % (action, priority))
def __lt__(self, other):
return self.number < other.number
class FastMachine(object):
"""
FastMachine is a deterministic machine represented in a way that
allows fast scanning.
"""
initial_states = None # {state_name:state}
states = None # [state] where state = {event:state, 'else':state, 'action':Action}
next_number = 1 # for debugging
new_state_template = {
'': None, 'bol': None, 'eol': None, 'eof': None, 'else': None
}
def __init__(self):
self.initial_states = {}
self.states = []
def __del__(self):
for state in self.states:
state.clear()
def new_state(self, action=None):
number = self.next_number
self.next_number = number + 1
result = self.new_state_template.copy()
result['number'] = number
result['action'] = action
self.states.append(result)
return result
def make_initial_state(self, name, state):
self.initial_states[name] = state
def add_transitions(self, state, event, new_state, maxint=maxint):
if type(event) is tuple:
code0, code1 = event
if code0 == -maxint:
state['else'] = new_state
elif code1 != maxint:
while code0 < code1:
state[unichr(code0)] = new_state
code0 += 1
else:
state[event] = new_state
def get_initial_state(self, name):
return self.initial_states[name]
def dump(self, file):
file.write("Plex.FastMachine:\n")
file.write(" Initial states:\n")
for name, state in sorted(self.initial_states.items()):
file.write(" %s: %s\n" % (repr(name), state['number']))
for state in self.states:
self.dump_state(state, file)
def dump_state(self, state, file):
# Header
file.write(" State %d:\n" % state['number'])
# Transitions
self.dump_transitions(state, file)
# Action
action = state['action']
if action is not None:
file.write(" %s\n" % action)
def dump_transitions(self, state, file):
chars_leading_to_state = {}
special_to_state = {}
for (c, s) in state.items():
if len(c) == 1:
chars = chars_leading_to_state.get(id(s), None)
if chars is None:
chars = []
chars_leading_to_state[id(s)] = chars
chars.append(c)
elif len(c) <= 4:
special_to_state[c] = s
ranges_to_state = {}
for state in self.states:
char_list = chars_leading_to_state.get(id(state), None)
if char_list:
ranges = self.chars_to_ranges(char_list)
ranges_to_state[ranges] = state
ranges_list = ranges_to_state.keys()
ranges_list.sort()
for ranges in ranges_list:
key = self.ranges_to_string(ranges)
state = ranges_to_state[ranges]
file.write(" %s --> State %d\n" % (key, state['number']))
for key in ('bol', 'eol', 'eof', 'else'):
state = special_to_state.get(key, None)
if state:
file.write(" %s --> State %d\n" % (key, state['number']))
def chars_to_ranges(self, char_list):
char_list.sort()
i = 0
n = len(char_list)
result = []
while i < n:
c1 = ord(char_list[i])
c2 = c1
i += 1
while i < n and ord(char_list[i]) == c2 + 1:
i += 1
c2 += 1
result.append((chr(c1), chr(c2)))
return tuple(result)
def ranges_to_string(self, range_list):
return ','.join(map(self.range_to_string, range_list))
def range_to_string(self, range_tuple):
(c1, c2) = range_tuple
if c1 == c2:
return repr(c1)
else:
return "%s..%s" % (repr(c1), repr(c2))

View file

@ -0,0 +1,576 @@
#=======================================================================
#
# Python Lexical Analyser
#
# Regular Expressions
#
#=======================================================================
from __future__ import absolute_import
import types
try:
from sys import maxsize as maxint
except ImportError:
from sys import maxint
from . import Errors
#
# Constants
#
BOL = 'bol'
EOL = 'eol'
EOF = 'eof'
nl_code = ord('\n')
#
# Helper functions
#
def chars_to_ranges(s):
"""
Return a list of character codes consisting of pairs
[code1a, code1b, code2a, code2b,...] which cover all
the characters in |s|.
"""
char_list = list(s)
char_list.sort()
i = 0
n = len(char_list)
result = []
while i < n:
code1 = ord(char_list[i])
code2 = code1 + 1
i += 1
while i < n and code2 >= ord(char_list[i]):
code2 += 1
i += 1
result.append(code1)
result.append(code2)
return result
def uppercase_range(code1, code2):
"""
If the range of characters from code1 to code2-1 includes any
lower case letters, return the corresponding upper case range.
"""
code3 = max(code1, ord('a'))
code4 = min(code2, ord('z') + 1)
if code3 < code4:
d = ord('A') - ord('a')
return (code3 + d, code4 + d)
else:
return None
def lowercase_range(code1, code2):
"""
If the range of characters from code1 to code2-1 includes any
upper case letters, return the corresponding lower case range.
"""
code3 = max(code1, ord('A'))
code4 = min(code2, ord('Z') + 1)
if code3 < code4:
d = ord('a') - ord('A')
return (code3 + d, code4 + d)
else:
return None
def CodeRanges(code_list):
"""
Given a list of codes as returned by chars_to_ranges, return
an RE which will match a character in any of the ranges.
"""
re_list = [CodeRange(code_list[i], code_list[i + 1]) for i in range(0, len(code_list), 2)]
return Alt(*re_list)
def CodeRange(code1, code2):
"""
CodeRange(code1, code2) is an RE which matches any character
with a code |c| in the range |code1| <= |c| < |code2|.
"""
if code1 <= nl_code < code2:
return Alt(RawCodeRange(code1, nl_code),
RawNewline,
RawCodeRange(nl_code + 1, code2))
else:
return RawCodeRange(code1, code2)
#
# Abstract classes
#
class RE(object):
"""RE is the base class for regular expression constructors.
The following operators are defined on REs:
re1 + re2 is an RE which matches |re1| followed by |re2|
re1 | re2 is an RE which matches either |re1| or |re2|
"""
nullable = 1 # True if this RE can match 0 input symbols
match_nl = 1 # True if this RE can match a string ending with '\n'
str = None # Set to a string to override the class's __str__ result
def build_machine(self, machine, initial_state, final_state,
match_bol, nocase):
"""
This method should add states to |machine| to implement this
RE, starting at |initial_state| and ending at |final_state|.
If |match_bol| is true, the RE must be able to match at the
beginning of a line. If nocase is true, upper and lower case
letters should be treated as equivalent.
"""
raise NotImplementedError("%s.build_machine not implemented" %
self.__class__.__name__)
def build_opt(self, m, initial_state, c):
"""
Given a state |s| of machine |m|, return a new state
reachable from |s| on character |c| or epsilon.
"""
s = m.new_state()
initial_state.link_to(s)
initial_state.add_transition(c, s)
return s
def __add__(self, other):
return Seq(self, other)
def __or__(self, other):
return Alt(self, other)
def __str__(self):
if self.str:
return self.str
else:
return self.calc_str()
def check_re(self, num, value):
if not isinstance(value, RE):
self.wrong_type(num, value, "Plex.RE instance")
def check_string(self, num, value):
if type(value) != type(''):
self.wrong_type(num, value, "string")
def check_char(self, num, value):
self.check_string(num, value)
if len(value) != 1:
raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
"Expected a string of length 1, got: %s" % (
num, self.__class__.__name__, repr(value)))
def wrong_type(self, num, value, expected):
if type(value) == types.InstanceType:
got = "%s.%s instance" % (
value.__class__.__module__, value.__class__.__name__)
else:
got = type(value).__name__
raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
"(expected %s, got %s" % (
num, self.__class__.__name__, expected, got))
#
# Primitive RE constructors
# -------------------------
#
# These are the basic REs from which all others are built.
#
## class Char(RE):
## """
## Char(c) is an RE which matches the character |c|.
## """
## nullable = 0
## def __init__(self, char):
## self.char = char
## self.match_nl = char == '\n'
## def build_machine(self, m, initial_state, final_state, match_bol, nocase):
## c = self.char
## if match_bol and c != BOL:
## s1 = self.build_opt(m, initial_state, BOL)
## else:
## s1 = initial_state
## if c == '\n' or c == EOF:
## s1 = self.build_opt(m, s1, EOL)
## if len(c) == 1:
## code = ord(self.char)
## s1.add_transition((code, code+1), final_state)
## if nocase and is_letter_code(code):
## code2 = other_case_code(code)
## s1.add_transition((code2, code2+1), final_state)
## else:
## s1.add_transition(c, final_state)
## def calc_str(self):
## return "Char(%s)" % repr(self.char)
def Char(c):
"""
Char(c) is an RE which matches the character |c|.
"""
if len(c) == 1:
result = CodeRange(ord(c), ord(c) + 1)
else:
result = SpecialSymbol(c)
result.str = "Char(%s)" % repr(c)
return result
class RawCodeRange(RE):
"""
RawCodeRange(code1, code2) is a low-level RE which matches any character
with a code |c| in the range |code1| <= |c| < |code2|, where the range
does not include newline. For internal use only.
"""
nullable = 0
match_nl = 0
range = None # (code, code)
uppercase_range = None # (code, code) or None
lowercase_range = None # (code, code) or None
def __init__(self, code1, code2):
self.range = (code1, code2)
self.uppercase_range = uppercase_range(code1, code2)
self.lowercase_range = lowercase_range(code1, code2)
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
if match_bol:
initial_state = self.build_opt(m, initial_state, BOL)
initial_state.add_transition(self.range, final_state)
if nocase:
if self.uppercase_range:
initial_state.add_transition(self.uppercase_range, final_state)
if self.lowercase_range:
initial_state.add_transition(self.lowercase_range, final_state)
def calc_str(self):
return "CodeRange(%d,%d)" % (self.code1, self.code2)
class _RawNewline(RE):
"""
RawNewline is a low-level RE which matches a newline character.
For internal use only.
"""
nullable = 0
match_nl = 1
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
if match_bol:
initial_state = self.build_opt(m, initial_state, BOL)
s = self.build_opt(m, initial_state, EOL)
s.add_transition((nl_code, nl_code + 1), final_state)
RawNewline = _RawNewline()
class SpecialSymbol(RE):
"""
SpecialSymbol(sym) is an RE which matches the special input
symbol |sym|, which is one of BOL, EOL or EOF.
"""
nullable = 0
match_nl = 0
sym = None
def __init__(self, sym):
self.sym = sym
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
# Sequences 'bol bol' and 'bol eof' are impossible, so only need
# to allow for bol if sym is eol
if match_bol and self.sym == EOL:
initial_state = self.build_opt(m, initial_state, BOL)
initial_state.add_transition(self.sym, final_state)
class Seq(RE):
"""Seq(re1, re2, re3...) is an RE which matches |re1| followed by
|re2| followed by |re3|..."""
def __init__(self, *re_list):
nullable = 1
for i, re in enumerate(re_list):
self.check_re(i, re)
nullable = nullable and re.nullable
self.re_list = re_list
self.nullable = nullable
i = len(re_list)
match_nl = 0
while i:
i -= 1
re = re_list[i]
if re.match_nl:
match_nl = 1
break
if not re.nullable:
break
self.match_nl = match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
re_list = self.re_list
if len(re_list) == 0:
initial_state.link_to(final_state)
else:
s1 = initial_state
n = len(re_list)
for i, re in enumerate(re_list):
if i < n - 1:
s2 = m.new_state()
else:
s2 = final_state
re.build_machine(m, s1, s2, match_bol, nocase)
s1 = s2
match_bol = re.match_nl or (match_bol and re.nullable)
def calc_str(self):
return "Seq(%s)" % ','.join(map(str, self.re_list))
class Alt(RE):
"""Alt(re1, re2, re3...) is an RE which matches either |re1| or
|re2| or |re3|..."""
def __init__(self, *re_list):
self.re_list = re_list
nullable = 0
match_nl = 0
nullable_res = []
non_nullable_res = []
i = 1
for re in re_list:
self.check_re(i, re)
if re.nullable:
nullable_res.append(re)
nullable = 1
else:
non_nullable_res.append(re)
if re.match_nl:
match_nl = 1
i += 1
self.nullable_res = nullable_res
self.non_nullable_res = non_nullable_res
self.nullable = nullable
self.match_nl = match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
for re in self.nullable_res:
re.build_machine(m, initial_state, final_state, match_bol, nocase)
if self.non_nullable_res:
if match_bol:
initial_state = self.build_opt(m, initial_state, BOL)
for re in self.non_nullable_res:
re.build_machine(m, initial_state, final_state, 0, nocase)
def calc_str(self):
return "Alt(%s)" % ','.join(map(str, self.re_list))
class Rep1(RE):
"""Rep1(re) is an RE which matches one or more repetitions of |re|."""
def __init__(self, re):
self.check_re(1, re)
self.re = re
self.nullable = re.nullable
self.match_nl = re.match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
s1 = m.new_state()
s2 = m.new_state()
initial_state.link_to(s1)
self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)
s2.link_to(s1)
s2.link_to(final_state)
def calc_str(self):
return "Rep1(%s)" % self.re
class SwitchCase(RE):
"""
SwitchCase(re, nocase) is an RE which matches the same strings as RE,
but treating upper and lower case letters according to |nocase|. If
|nocase| is true, case is ignored, otherwise it is not.
"""
re = None
nocase = None
def __init__(self, re, nocase):
self.re = re
self.nocase = nocase
self.nullable = re.nullable
self.match_nl = re.match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
self.re.build_machine(m, initial_state, final_state, match_bol,
self.nocase)
def calc_str(self):
if self.nocase:
name = "NoCase"
else:
name = "Case"
return "%s(%s)" % (name, self.re)
#
# Composite RE constructors
# -------------------------
#
# These REs are defined in terms of the primitive REs.
#
Empty = Seq()
Empty.__doc__ = \
"""
Empty is an RE which matches the empty string.
"""
Empty.str = "Empty"
def Str1(s):
"""
Str1(s) is an RE which matches the literal string |s|.
"""
result = Seq(*tuple(map(Char, s)))
result.str = "Str(%s)" % repr(s)
return result
def Str(*strs):
"""
Str(s) is an RE which matches the literal string |s|.
Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
"""
if len(strs) == 1:
return Str1(strs[0])
else:
result = Alt(*tuple(map(Str1, strs)))
result.str = "Str(%s)" % ','.join(map(repr, strs))
return result
def Any(s):
"""
Any(s) is an RE which matches any character in the string |s|.
"""
#result = apply(Alt, tuple(map(Char, s)))
result = CodeRanges(chars_to_ranges(s))
result.str = "Any(%s)" % repr(s)
return result
def AnyBut(s):
"""
AnyBut(s) is an RE which matches any character (including
newline) which is not in the string |s|.
"""
ranges = chars_to_ranges(s)
ranges.insert(0, -maxint)
ranges.append(maxint)
result = CodeRanges(ranges)
result.str = "AnyBut(%s)" % repr(s)
return result
AnyChar = AnyBut("")
AnyChar.__doc__ = \
"""
AnyChar is an RE which matches any single character (including a newline).
"""
AnyChar.str = "AnyChar"
def Range(s1, s2=None):
"""
Range(c1, c2) is an RE which matches any single character in the range
|c1| to |c2| inclusive.
Range(s) where |s| is a string of even length is an RE which matches
any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
"""
if s2:
result = CodeRange(ord(s1), ord(s2) + 1)
result.str = "Range(%s,%s)" % (s1, s2)
else:
ranges = []
for i in range(0, len(s1), 2):
ranges.append(CodeRange(ord(s1[i]), ord(s1[i + 1]) + 1))
result = Alt(*ranges)
result.str = "Range(%s)" % repr(s1)
return result
def Opt(re):
"""
Opt(re) is an RE which matches either |re| or the empty string.
"""
result = Alt(re, Empty)
result.str = "Opt(%s)" % re
return result
def Rep(re):
"""
Rep(re) is an RE which matches zero or more repetitions of |re|.
"""
result = Opt(Rep1(re))
result.str = "Rep(%s)" % re
return result
def NoCase(re):
"""
NoCase(re) is an RE which matches the same strings as RE, but treating
upper and lower case letters as equivalent.
"""
return SwitchCase(re, nocase=1)
def Case(re):
"""
Case(re) is an RE which matches the same strings as RE, but treating
upper and lower case letters as distinct, i.e. it cancels the effect
of any enclosing NoCase().
"""
return SwitchCase(re, nocase=0)
#
# RE Constants
#
Bol = Char(BOL)
Bol.__doc__ = \
"""
Bol is an RE which matches the beginning of a line.
"""
Bol.str = "Bol"
Eol = Char(EOL)
Eol.__doc__ = \
"""
Eol is an RE which matches the end of a line.
"""
Eol.str = "Eol"
Eof = Char(EOF)
Eof.__doc__ = \
"""
Eof is an RE which matches the end of the file.
"""
Eof.str = "Eof"

View file

@ -0,0 +1,50 @@
from __future__ import absolute_import
import cython
from Cython.Plex.Actions cimport Action
cdef class Scanner:
cdef public lexicon
cdef public stream
cdef public name
cdef public unicode buffer
cdef public Py_ssize_t buf_start_pos
cdef public Py_ssize_t next_pos
cdef public Py_ssize_t cur_pos
cdef public Py_ssize_t cur_line
cdef public Py_ssize_t cur_line_start
cdef public Py_ssize_t start_pos
cdef public Py_ssize_t start_line
cdef public Py_ssize_t start_col
cdef public text
cdef public initial_state # int?
cdef public state_name
cdef public list queue
cdef public bint trace
cdef public cur_char
cdef public long input_state
cdef public level
@cython.final
@cython.locals(input_state=long)
cdef next_char(self)
@cython.locals(action=Action)
cpdef tuple read(self)
@cython.final
cdef tuple scan_a_token(self)
##cdef tuple position(self) # used frequently by Parsing.py
@cython.final
@cython.locals(cur_pos=Py_ssize_t, cur_line=Py_ssize_t, cur_line_start=Py_ssize_t,
input_state=long, next_pos=Py_ssize_t, state=dict,
buf_start_pos=Py_ssize_t, buf_len=Py_ssize_t, buf_index=Py_ssize_t,
trace=bint, discard=Py_ssize_t, data=unicode, buffer=unicode)
cdef run_machine_inlined(self)
@cython.final
cdef begin(self, state)
@cython.final
cdef produce(self, value, text = *)

View file

@ -0,0 +1,338 @@
# cython: auto_pickle=False
#=======================================================================
#
# Python Lexical Analyser
#
#
# Scanning an input stream
#
#=======================================================================
from __future__ import absolute_import
import cython
cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)
from . import Errors
from .Regexps import BOL, EOL, EOF
NOT_FOUND = object()
class Scanner(object):
"""
A Scanner is used to read tokens from a stream of characters
using the token set specified by a Plex.Lexicon.
Constructor:
Scanner(lexicon, stream, name = '')
See the docstring of the __init__ method for details.
Methods:
See the docstrings of the individual methods for more
information.
read() --> (value, text)
Reads the next lexical token from the stream.
position() --> (name, line, col)
Returns the position of the last token read using the
read() method.
begin(state_name)
Causes scanner to change state.
produce(value [, text])
Causes return of a token value to the caller of the
Scanner.
"""
# lexicon = None # Lexicon
# stream = None # file-like object
# name = ''
# buffer = ''
# buf_start_pos = 0 # position in input of start of buffer
# next_pos = 0 # position in input of next char to read
# cur_pos = 0 # position in input of current char
# cur_line = 1 # line number of current char
# cur_line_start = 0 # position in input of start of current line
# start_pos = 0 # position in input of start of token
# start_line = 0 # line number of start of token
# start_col = 0 # position in line of start of token
# text = None # text of last token read
# initial_state = None # Node
# state_name = '' # Name of initial state
# queue = None # list of tokens to be returned
# trace = 0
def __init__(self, lexicon, stream, name='', initial_pos=None):
"""
Scanner(lexicon, stream, name = '')
|lexicon| is a Plex.Lexicon instance specifying the lexical tokens
to be recognised.
|stream| can be a file object or anything which implements a
compatible read() method.
|name| is optional, and may be the name of the file being
scanned or any other identifying string.
"""
self.trace = 0
self.buffer = u''
self.buf_start_pos = 0
self.next_pos = 0
self.cur_pos = 0
self.cur_line = 1
self.start_pos = 0
self.start_line = 0
self.start_col = 0
self.text = None
self.state_name = None
self.lexicon = lexicon
self.stream = stream
self.name = name
self.queue = []
self.initial_state = None
self.begin('')
self.next_pos = 0
self.cur_pos = 0
self.cur_line_start = 0
self.cur_char = BOL
self.input_state = 1
if initial_pos is not None:
self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]
def read(self):
"""
Read the next lexical token from the stream and return a
tuple (value, text), where |value| is the value associated with
the token as specified by the Lexicon, and |text| is the actual
string read from the stream. Returns (None, '') on end of file.
"""
queue = self.queue
while not queue:
self.text, action = self.scan_a_token()
if action is None:
self.produce(None)
self.eof()
else:
value = action.perform(self, self.text)
if value is not None:
self.produce(value)
result = queue[0]
del queue[0]
return result
def scan_a_token(self):
"""
Read the next input sequence recognised by the machine
and return (text, action). Returns ('', None) on end of
file.
"""
self.start_pos = self.cur_pos
self.start_line = self.cur_line
self.start_col = self.cur_pos - self.cur_line_start
action = self.run_machine_inlined()
if action is not None:
if self.trace:
print("Scanner: read: Performing %s %d:%d" % (
action, self.start_pos, self.cur_pos))
text = self.buffer[
self.start_pos - self.buf_start_pos:
self.cur_pos - self.buf_start_pos]
return (text, action)
else:
if self.cur_pos == self.start_pos:
if self.cur_char is EOL:
self.next_char()
if self.cur_char is None or self.cur_char is EOF:
return (u'', None)
raise Errors.UnrecognizedInput(self, self.state_name)
def run_machine_inlined(self):
"""
Inlined version of run_machine for speed.
"""
state = self.initial_state
cur_pos = self.cur_pos
cur_line = self.cur_line
cur_line_start = self.cur_line_start
cur_char = self.cur_char
input_state = self.input_state
next_pos = self.next_pos
buffer = self.buffer
buf_start_pos = self.buf_start_pos
buf_len = len(buffer)
b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
None, 0, 0, 0, u'', 0, 0
trace = self.trace
while 1:
if trace: #TRACE#
print("State %d, %d/%d:%s -->" % ( #TRACE#
state['number'], input_state, cur_pos, repr(cur_char))) #TRACE#
# Begin inlined self.save_for_backup()
#action = state.action #@slow
action = state['action'] #@fast
if action is not None:
b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
# End inlined self.save_for_backup()
c = cur_char
#new_state = state.new_state(c) #@slow
new_state = state.get(c, NOT_FOUND) #@fast
if new_state is NOT_FOUND: #@fast
new_state = c and state.get('else') #@fast
if new_state:
if trace: #TRACE#
print("State %d" % new_state['number']) #TRACE#
state = new_state
# Begin inlined: self.next_char()
if input_state == 1:
cur_pos = next_pos
# Begin inlined: c = self.read_char()
buf_index = next_pos - buf_start_pos
if buf_index < buf_len:
c = buffer[buf_index]
next_pos += 1
else:
discard = self.start_pos - buf_start_pos
data = self.stream.read(0x1000)
buffer = self.buffer[discard:] + data
self.buffer = buffer
buf_start_pos += discard
self.buf_start_pos = buf_start_pos
buf_len = len(buffer)
buf_index -= discard
if data:
c = buffer[buf_index]
next_pos += 1
else:
c = u''
# End inlined: c = self.read_char()
if c == u'\n':
cur_char = EOL
input_state = 2
elif not c:
cur_char = EOL
input_state = 4
else:
cur_char = c
elif input_state == 2:
cur_char = u'\n'
input_state = 3
elif input_state == 3:
cur_line += 1
cur_line_start = cur_pos = next_pos
cur_char = BOL
input_state = 1
elif input_state == 4:
cur_char = EOF
input_state = 5
else: # input_state = 5
cur_char = u''
# End inlined self.next_char()
else: # not new_state
if trace: #TRACE#
print("blocked") #TRACE#
# Begin inlined: action = self.back_up()
if b_action is not None:
(action, cur_pos, cur_line, cur_line_start,
cur_char, input_state, next_pos) = \
(b_action, b_cur_pos, b_cur_line, b_cur_line_start,
b_cur_char, b_input_state, b_next_pos)
else:
action = None
break # while 1
# End inlined: action = self.back_up()
self.cur_pos = cur_pos
self.cur_line = cur_line
self.cur_line_start = cur_line_start
self.cur_char = cur_char
self.input_state = input_state
self.next_pos = next_pos
if trace: #TRACE#
if action is not None: #TRACE#
print("Doing %s" % action) #TRACE#
return action
def next_char(self):
input_state = self.input_state
if self.trace:
print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos))
if input_state == 1:
self.cur_pos = self.next_pos
c = self.read_char()
if c == u'\n':
self.cur_char = EOL
self.input_state = 2
elif not c:
self.cur_char = EOL
self.input_state = 4
else:
self.cur_char = c
elif input_state == 2:
self.cur_char = u'\n'
self.input_state = 3
elif input_state == 3:
self.cur_line += 1
self.cur_line_start = self.cur_pos = self.next_pos
self.cur_char = BOL
self.input_state = 1
elif input_state == 4:
self.cur_char = EOF
self.input_state = 5
else: # input_state = 5
self.cur_char = u''
if self.trace:
print("--> [%d] %d %r" % (input_state, self.cur_pos, self.cur_char))
def position(self):
"""
Return a tuple (name, line, col) representing the location of
the last token read using the read() method. |name| is the
name that was provided to the Scanner constructor; |line|
is the line number in the stream (1-based); |col| is the
position within the line of the first character of the token
(0-based).
"""
return (self.name, self.start_line, self.start_col)
def get_position(self):
"""Python accessible wrapper around position(), only for error reporting.
"""
return self.position()
def begin(self, state_name):
"""Set the current state of the scanner to the named state."""
self.initial_state = (
self.lexicon.get_initial_state(state_name))
self.state_name = state_name
def produce(self, value, text=None):
"""
Called from an action procedure, causes |value| to be returned
as the token value from read(). If |text| is supplied, it is
returned in place of the scanned text.
produce() can be called more than once during a single call to an action
procedure, in which case the tokens are queued up and returned one
at a time by subsequent calls to read(), until the queue is empty,
whereupon scanning resumes.
"""
if text is None:
text = self.text
self.queue.append((value, text))
def eof(self):
"""
Override this method if you want something to be done at
end of file.
"""

View file

@ -0,0 +1,23 @@
#
# Get time in platform-dependent way
#
from __future__ import absolute_import
import os
from sys import platform, exit, stderr
if platform == 'mac':
import MacOS
def time():
return MacOS.GetTicks() / 60.0
timekind = "real"
elif hasattr(os, 'times'):
def time():
t = os.times()
return t[0] + t[1]
timekind = "cpu"
else:
stderr.write(
"Don't know how to get time on platform %s\n" % repr(platform))
exit(1)

View file

@ -0,0 +1,158 @@
#=======================================================================
#
# Python Lexical Analyser
#
# Traditional Regular Expression Syntax
#
#=======================================================================
from __future__ import absolute_import
from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
from .Errors import PlexError
class RegexpSyntaxError(PlexError):
pass
def re(s):
"""
Convert traditional string representation of regular expression |s|
into Plex representation.
"""
return REParser(s).parse_re()
class REParser(object):
def __init__(self, s):
self.s = s
self.i = -1
self.end = 0
self.next()
def parse_re(self):
re = self.parse_alt()
if not self.end:
self.error("Unexpected %s" % repr(self.c))
return re
def parse_alt(self):
"""Parse a set of alternative regexps."""
re = self.parse_seq()
if self.c == '|':
re_list = [re]
while self.c == '|':
self.next()
re_list.append(self.parse_seq())
re = Alt(*re_list)
return re
def parse_seq(self):
"""Parse a sequence of regexps."""
re_list = []
while not self.end and not self.c in "|)":
re_list.append(self.parse_mod())
return Seq(*re_list)
def parse_mod(self):
"""Parse a primitive regexp followed by *, +, ? modifiers."""
re = self.parse_prim()
while not self.end and self.c in "*+?":
if self.c == '*':
re = Rep(re)
elif self.c == '+':
re = Rep1(re)
else: # self.c == '?'
re = Opt(re)
self.next()
return re
def parse_prim(self):
"""Parse a primitive regexp."""
c = self.get()
if c == '.':
re = AnyBut("\n")
elif c == '^':
re = Bol
elif c == '$':
re = Eol
elif c == '(':
re = self.parse_alt()
self.expect(')')
elif c == '[':
re = self.parse_charset()
self.expect(']')
else:
if c == '\\':
c = self.get()
re = Char(c)
return re
def parse_charset(self):
"""Parse a charset. Does not include the surrounding []."""
char_list = []
invert = 0
if self.c == '^':
invert = 1
self.next()
if self.c == ']':
char_list.append(']')
self.next()
while not self.end and self.c != ']':
c1 = self.get()
if self.c == '-' and self.lookahead(1) != ']':
self.next()
c2 = self.get()
for a in range(ord(c1), ord(c2) + 1):
char_list.append(chr(a))
else:
char_list.append(c1)
chars = ''.join(char_list)
if invert:
return AnyBut(chars)
else:
return Any(chars)
def next(self):
"""Advance to the next char."""
s = self.s
i = self.i = self.i + 1
if i < len(s):
self.c = s[i]
else:
self.c = ''
self.end = 1
def get(self):
if self.end:
self.error("Premature end of string")
c = self.c
self.next()
return c
def lookahead(self, n):
"""Look ahead n chars."""
j = self.i + n
if j < len(self.s):
return self.s[j]
else:
return ''
def expect(self, c):
"""
Expect to find character |c| at current position.
Raises an exception otherwise.
"""
if self.c == c:
self.next()
else:
self.error("Missing %s" % repr(c))
def error(self, mess):
"""Raise exception to signal syntax error in regexp."""
raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
repr(self.s), self.i, mess))

View file

@ -0,0 +1,251 @@
#
# Plex - Transition Maps
#
# This version represents state sets directly as dicts for speed.
#
from __future__ import absolute_import
try:
from sys import maxsize as maxint
except ImportError:
from sys import maxint
class TransitionMap(object):
"""
A TransitionMap maps an input event to a set of states.
An input event is one of: a range of character codes,
the empty string (representing an epsilon move), or one
of the special symbols BOL, EOL, EOF.
For characters, this implementation compactly represents
the map by means of a list:
[code_0, states_0, code_1, states_1, code_2, states_2,
..., code_n-1, states_n-1, code_n]
where |code_i| is a character code, and |states_i| is a
set of states corresponding to characters with codes |c|
in the range |code_i| <= |c| <= |code_i+1|.
The following invariants hold:
n >= 1
code_0 == -maxint
code_n == maxint
code_i < code_i+1 for i in 0..n-1
states_0 == states_n-1
Mappings for the special events '', BOL, EOL, EOF are
kept separately in a dictionary.
"""
map = None # The list of codes and states
special = None # Mapping for special events
def __init__(self, map=None, special=None):
if not map:
map = [-maxint, {}, maxint]
if not special:
special = {}
self.map = map
self.special = special
#self.check() ###
def add(self, event, new_state,
TupleType=tuple):
"""
Add transition to |new_state| on |event|.
"""
if type(event) is TupleType:
code0, code1 = event
i = self.split(code0)
j = self.split(code1)
map = self.map
while i < j:
map[i + 1][new_state] = 1
i += 2
else:
self.get_special(event)[new_state] = 1
def add_set(self, event, new_set,
TupleType=tuple):
"""
Add transitions to the states in |new_set| on |event|.
"""
if type(event) is TupleType:
code0, code1 = event
i = self.split(code0)
j = self.split(code1)
map = self.map
while i < j:
map[i + 1].update(new_set)
i += 2
else:
self.get_special(event).update(new_set)
def get_epsilon(self,
none=None):
"""
Return the mapping for epsilon, or None.
"""
return self.special.get('', none)
def iteritems(self,
len=len):
"""
Return the mapping as an iterable of ((code1, code2), state_set) and
(special_event, state_set) pairs.
"""
result = []
map = self.map
else_set = map[1]
i = 0
n = len(map) - 1
code0 = map[0]
while i < n:
set = map[i + 1]
code1 = map[i + 2]
if set or else_set:
result.append(((code0, code1), set))
code0 = code1
i += 2
for event, set in self.special.items():
if set:
result.append((event, set))
return iter(result)
items = iteritems
# ------------------- Private methods --------------------
def split(self, code,
len=len, maxint=maxint):
"""
Search the list for the position of the split point for |code|,
inserting a new split point if necessary. Returns index |i| such
that |code| == |map[i]|.
"""
# We use a funky variation on binary search.
map = self.map
hi = len(map) - 1
# Special case: code == map[-1]
if code == maxint:
return hi
# General case
lo = 0
# loop invariant: map[lo] <= code < map[hi] and hi - lo >= 2
while hi - lo >= 4:
# Find midpoint truncated to even index
mid = ((lo + hi) // 2) & ~1
if code < map[mid]:
hi = mid
else:
lo = mid
# map[lo] <= code < map[hi] and hi - lo == 2
if map[lo] == code:
return lo
else:
map[hi:hi] = [code, map[hi - 1].copy()]
#self.check() ###
return hi
def get_special(self, event):
"""
Get state set for special event, adding a new entry if necessary.
"""
special = self.special
set = special.get(event, None)
if not set:
set = {}
special[event] = set
return set
# --------------------- Conversion methods -----------------------
def __str__(self):
map_strs = []
map = self.map
n = len(map)
i = 0
while i < n:
code = map[i]
if code == -maxint:
code_str = "-inf"
elif code == maxint:
code_str = "inf"
else:
code_str = str(code)
map_strs.append(code_str)
i += 1
if i < n:
map_strs.append(state_set_str(map[i]))
i += 1
special_strs = {}
for event, set in self.special.items():
special_strs[event] = state_set_str(set)
return "[%s]+%s" % (
','.join(map_strs),
special_strs
)
# --------------------- Debugging methods -----------------------
def check(self):
"""Check data structure integrity."""
if not self.map[-3] < self.map[-1]:
print(self)
assert 0
def dump(self, file):
map = self.map
i = 0
n = len(map) - 1
while i < n:
self.dump_range(map[i], map[i + 2], map[i + 1], file)
i += 2
for event, set in self.special.items():
if set:
if not event:
event = 'empty'
self.dump_trans(event, set, file)
def dump_range(self, code0, code1, set, file):
if set:
if code0 == -maxint:
if code1 == maxint:
k = "any"
else:
k = "< %s" % self.dump_char(code1)
elif code1 == maxint:
k = "> %s" % self.dump_char(code0 - 1)
elif code0 == code1 - 1:
k = self.dump_char(code0)
else:
k = "%s..%s" % (self.dump_char(code0),
self.dump_char(code1 - 1))
self.dump_trans(k, set, file)
def dump_char(self, code):
if 0 <= code <= 255:
return repr(chr(code))
else:
return "chr(%d)" % code
def dump_trans(self, key, set, file):
file.write(" %s --> %s\n" % (key, self.dump_set(set)))
def dump_set(self, set):
return state_set_str(set)
#
# State set manipulation functions
#
#def merge_state_sets(set1, set2):
# for state in set2.keys():
# set1[state] = 1
def state_set_str(set):
return "[%s]" % ','.join(["S%d" % state.number for state in set])

View file

@ -0,0 +1,39 @@
#=======================================================================
#
# Python Lexical Analyser
#
#=======================================================================
"""
The Plex module provides lexical analysers with similar capabilities
to GNU Flex. The following classes and functions are exported;
see the attached docstrings for more information.
Scanner For scanning a character stream under the
direction of a Lexicon.
Lexicon For constructing a lexical definition
to be used by a Scanner.
Str, Any, AnyBut, AnyChar, Seq, Alt, Opt, Rep, Rep1,
Bol, Eol, Eof, Empty
Regular expression constructors, for building pattern
definitions for a Lexicon.
State For defining scanner states when creating a
Lexicon.
TEXT, IGNORE, Begin
Actions for associating with patterns when
creating a Lexicon.
"""
from __future__ import absolute_import
from .Actions import TEXT, IGNORE, Begin
from .Lexicons import Lexicon, State
from .Regexps import RE, Seq, Alt, Rep1, Empty, Str, Any, AnyBut, AnyChar, Range
from .Regexps import Opt, Rep, Bol, Eol, Eof, Case, NoCase
from .Scanners import Scanner