#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================

from __future__ import absolute_import

from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
from .Errors import PlexError


class RegexpSyntaxError(PlexError):
    """Raised by REParser.error() when a regexp string cannot be parsed."""
    pass


def re(s):
    """
    Translate the traditional regular expression string |s| into its
    Plex representation by running an REParser over it.
    """
    parser = REParser(s)
    return parser.parse_re()


class REParser(object):
    """
    Recursive-descent parser for traditional regular expression syntax.

    Parser state:
      s    -- the string being parsed
      i    -- index of the current character within s
      c    -- the current character, or '' once the input is exhausted
      end  -- 1 once the input is exhausted, 0 before that
    """

    def __init__(self, s):
        self.s = s
        self.i = -1
        self.end = 0
        # Load the first character (or flag end of input for an empty string).
        self.next()

    def parse_re(self):
        """Parse the complete string; any unconsumed input is an error."""
        result = self.parse_alt()
        if self.end:
            return result
        # Something (e.g. an unmatched ')') stopped the parse early.
        self.error("Unexpected %s" % repr(self.c))
        return result

    def parse_alt(self):
        """Parse one or more sequences separated by '|'."""
        first = self.parse_seq()
        if self.c != '|':
            # No alternation present: return the single sequence unwrapped.
            return first
        branches = [first]
        while self.c == '|':
            self.next()
            branches.append(self.parse_seq())
        return Alt(*branches)

    def parse_seq(self):
        """Parse consecutive modified primitives up to '|', ')' or the end."""
        items = []
        while not (self.end or self.c in "|)"):
            items.append(self.parse_mod())
        return Seq(*items)

    def parse_mod(self):
        """Parse a primitive and wrap it for each trailing *, + or ?."""
        node = self.parse_prim()
        while not self.end and self.c in "*+?":
            # '*' -> Rep, '+' -> Rep1, and the remaining case '?' -> Opt.
            wrapper = {'*': Rep, '+': Rep1}.get(self.c, Opt)
            node = wrapper(node)
            self.next()
        return node

    def parse_prim(self):
        """Parse a single primitive regexp."""
        c = self.get()
        if c == '.':
            return AnyBut("\n")
        if c == '^':
            return Bol
        if c == '$':
            return Eol
        if c == '(':
            group = self.parse_alt()
            self.expect(')')
            return group
        if c == '[':
            charset = self.parse_charset()
            self.expect(']')
            return charset
        if c == '\\':
            # A backslash makes the following character a literal.
            c = self.get()
        return Char(c)

    def parse_charset(self):
        """Parse the inside of a character set; the caller handles the []."""
        members = []
        negated = self.c == '^'
        if negated:
            self.next()
        if self.c == ']':
            # A ']' in first position is a member, not the terminator.
            members.append(']')
            self.next()
        while not self.end and self.c != ']':
            low = self.get()
            if self.c == '-' and self.lookahead(1) != ']':
                # Range "low-high": add every character between the two,
                # inclusive.  A '-' directly before ']' is left to be read
                # as an ordinary member on the next iteration.
                self.next()
                high = self.get()
                members.extend(
                    chr(code) for code in range(ord(low), ord(high) + 1))
            else:
                members.append(low)
        chars = ''.join(members)
        return AnyBut(chars) if negated else Any(chars)

    def next(self):
        """Step to the following character, flagging end of input."""
        self.i += 1
        if self.i < len(self.s):
            self.c = self.s[self.i]
        else:
            self.c = ''
            self.end = 1

    def get(self):
        """Return the current character and advance; error at end of input."""
        if self.end:
            self.error("Premature end of string")
        current = self.c
        self.next()
        return current

    def lookahead(self, n):
        """Return the character n places past the current one, or ''."""
        pos = self.i + n
        return self.s[pos] if pos < len(self.s) else ''

    def expect(self, c):
        """
        Consume character |c| if it is the current character; otherwise
        signal a syntax error.
        """
        if self.c != c:
            self.error("Missing %s" % repr(c))
        else:
            self.next()

    def error(self, mess):
        """Raise RegexpSyntaxError reporting |mess| at the current position."""
        detail = "Syntax error in regexp %s at position %d: %s" % (
            repr(self.s), self.i, mess)
        raise RegexpSyntaxError(detail)