pyPEG.py
changeset 0 76005e62091d
child 25 cb4a7f8b230d
child 31 d3dddb80d1f5
equal deleted inserted replaced
-1:000000000000 0:76005e62091d
       
     1 # YPL parser 1.5
       
     2 
       
     3 # written by VB.
       
     4 
       
     5 import re
       
     6 import sys, codecs
       
     7 import exceptions
       
     8 
       
     9 class keyword(unicode): pass
       
    10 class code(unicode): pass
       
    11 class ignore(object):
       
    12     def __init__(self, regex_text, *args):
       
    13         self.regex = re.compile(regex_text, *args)
       
    14 
       
    15 class _and(object):
       
    16     def __init__(self, something):
       
    17         self.obj = something
       
    18 
       
    19 class _not(_and): pass
       
    20 
       
    21 class Name(unicode):
       
    22     def __init__(self, *args):
       
    23         self.line = 0
       
    24         self.file = u""
       
    25 
       
    26 class Symbol(list):
       
    27     def __init__(self, name, what):
       
    28         self.__name__ = name
       
    29         self.append(name)
       
    30         self.what = what
       
    31         self.append(what)
       
    32     def __call__(self):
       
    33         return self.what
       
    34     def __unicode__(self):
       
    35         return u'Symbol(' + repr(self.__name__) + ', ' + repr(self.what) + u')'
       
    36     def __repr__(self):
       
    37         return unicode(self)
       
    38 
       
# One run of word characters; parser.parseLine() uses it to read a whole word
# before comparing it with a keyword pattern.
word_regex = re.compile(ur"\w+")
# Everything up to the end of the current line ('.' does not match a newline
# unless re.DOTALL is given).
rest_regex = re.compile(ur".*")

# Set to True to make parseLine() report attempted and matched rules on
# stderr; the trace code is additionally guarded by __debug__.
print_trace = False
       
    43 
       
    44 def u(text):
       
    45     if isinstance(text, exceptions.BaseException):
       
    46         text = text.args[0]
       
    47     if type(text) is unicode:
       
    48         return text
       
    49     if isinstance(text, str):
       
    50         if sys.stdin.encoding:
       
    51             return codecs.decode(text, sys.stdin.encoding)
       
    52         else:
       
    53             return codecs.decode(text, "utf-8")
       
    54     return unicode(text)
       
    55 
       
    56 def skip(skipper, text, skipWS, skipComments):
       
    57     if skipWS:
       
    58         t = text.lstrip()
       
    59     else:
       
    60         t = text
       
    61     if skipComments:
       
    62         try:
       
    63             while True:
       
    64                 skip, t = skipper.parseLine(t, skipComments, [], skipWS, None)
       
    65                 if skipWS:
       
    66                     t = t.lstrip()
       
    67         except: pass
       
    68     return t
       
    69 
       
    70 class parser(object):
       
    71     def __init__(self, another = False, p = False):
       
    72         self.restlen = -1 
       
    73         if not(another):
       
    74             self.skipper = parser(True, p)
       
    75             self.skipper.packrat = p
       
    76         else:
       
    77             self.skipper = self
       
    78         self.lines = None
       
    79         self.textlen = 0
       
    80         self.memory = {}
       
    81         self.packrat = p
       
    82 
       
    83     # parseLine():
       
    84     #   textline:       text to parse
       
    85     #   pattern:        pyPEG language description
       
    86     #   resultSoFar:    parsing result so far (default: blank list [])
       
    87     #   skipWS:         Flag if whitespace should be skipped (default: True)
       
    88     #   skipComments:   Python functions returning pyPEG for matching comments
       
    89     #   
       
    90     #   returns:        pyAST, textrest
       
    91     #
       
    92     #   raises:         SyntaxError(reason) if textline is detected not being in language
       
    93     #                   described by pattern
       
    94     #
       
    95     #                   SyntaxError(reason) if pattern is an illegal language description
       
    96 
       
    97     def parseLine(self, textline, pattern, resultSoFar = [], skipWS = True, skipComments = None):
       
    98         name = None
       
    99         _textline = textline
       
   100         _pattern = pattern
       
   101 
       
   102         def R(result, text):
       
   103             if __debug__:
       
   104                 if print_trace:
       
   105                     try:
       
   106                         if _pattern.__name__ != "comment":
       
   107                             sys.stderr.write(u"match: " + _pattern.__name__ + u"\n")
       
   108                     except: pass
       
   109 
       
   110             if self.restlen == -1:
       
   111                 self.restlen = len(text)
       
   112             else:
       
   113                 self.restlen = min(self.restlen, len(text))
       
   114             res = resultSoFar
       
   115             if name and result:
       
   116                 name.line = self.lineNo()
       
   117                 res.append(Symbol(name, result))
       
   118             elif name:
       
   119                 name.line = self.lineNo()
       
   120                 res.append(Symbol(name, []))
       
   121             elif result:
       
   122                 if type(result) is type([]):
       
   123                     res.extend(result)
       
   124                 else:
       
   125                     res.extend([result])
       
   126             if self.packrat:
       
   127                 self.memory[(len(_textline), id(_pattern))] = (res, text)
       
   128             return res, text
       
   129 
       
   130         def syntaxError():
       
   131             if self.packrat:
       
   132                 self.memory[(len(_textline), id(_pattern))] = False
       
   133             raise SyntaxError()
       
   134 
       
   135         if self.packrat:
       
   136             try:
       
   137                 result = self.memory[(len(textline), id(pattern))]
       
   138                 if result:
       
   139                     return result
       
   140                 else:
       
   141                     raise SyntaxError()
       
   142             except: pass
       
   143 
       
   144         if callable(pattern):
       
   145             if __debug__:
       
   146                 if print_trace:
       
   147                     try:
       
   148                         if pattern.__name__ != "comment":
       
   149                             sys.stderr.write(u"testing with " + pattern.__name__ + u": " + textline[:40] + u"\n")
       
   150                     except: pass
       
   151 
       
   152             if pattern.__name__[0] != "_":
       
   153                 name = Name(pattern.__name__)
       
   154 
       
   155             pattern = pattern()
       
   156             if callable(pattern):
       
   157                 pattern = (pattern,)
       
   158 
       
   159         text = skip(self.skipper, textline, skipWS, skipComments)
       
   160 
       
   161         pattern_type = type(pattern)
       
   162 
       
   163         if pattern_type is str or pattern_type is unicode:
       
   164             if text[:len(pattern)] == pattern:
       
   165                 text = skip(self.skipper, text[len(pattern):], skipWS, skipComments)
       
   166                 return R(None, text)
       
   167             else:
       
   168                 syntaxError()
       
   169 
       
   170         elif pattern_type is keyword:
       
   171             m = word_regex.match(text)
       
   172             if m:
       
   173                 if m.group(0) == pattern:
       
   174                     text = skip(self.skipper, text[len(pattern):], skipWS, skipComments)
       
   175                     return R(None, text)
       
   176                 else:
       
   177                     syntaxError()
       
   178             else:
       
   179                 syntaxError()
       
   180 
       
   181         elif pattern_type is _not:
       
   182             try:
       
   183                 r, t = self.parseLine(text, pattern.obj, [], skipWS, skipComments)
       
   184             except:
       
   185                 return resultSoFar, textline
       
   186             syntaxError()
       
   187 
       
   188         elif pattern_type is _and:
       
   189             r, t = self.parseLine(text, pattern.obj, [], skipWS, skipComments)
       
   190             return resultSoFar, textline
       
   191 
       
   192         elif pattern_type is type(word_regex) or pattern_type is ignore:
       
   193             if pattern_type is ignore:
       
   194                 pattern = pattern.regex
       
   195             m = pattern.match(text)
       
   196             if m:
       
   197                 text = skip(self.skipper, text[len(m.group(0)):], skipWS, skipComments)
       
   198                 if pattern_type is ignore:
       
   199                     return R(None, text)
       
   200                 else:
       
   201                     return R(m.group(0), text)
       
   202             else:
       
   203                 syntaxError()
       
   204 
       
   205         elif pattern_type is tuple:
       
   206             result = []
       
   207             n = 1
       
   208             for p in pattern:
       
   209                 if type(p) is type(0):
       
   210                     n = p
       
   211                 else:
       
   212                     if n>0:
       
   213                         for i in range(n):
       
   214                             result, text = self.parseLine(text, p, result, skipWS, skipComments)
       
   215                     elif n==0:
       
   216                         if text == "":
       
   217                             pass
       
   218                         else:
       
   219                             try:
       
   220                                 newResult, newText = self.parseLine(text, p, result, skipWS, skipComments)
       
   221                                 result, text = newResult, newText
       
   222                             except SyntaxError:
       
   223                                 pass
       
   224                     elif n<0:
       
   225                         found = False
       
   226                         while True:
       
   227                             try:
       
   228                                 newResult, newText = self.parseLine(text, p, result, skipWS, skipComments)
       
   229                                 result, text, found = newResult, newText, True
       
   230                             except SyntaxError:
       
   231                                 break
       
   232                         if n == -2 and not(found):
       
   233                             syntaxError()
       
   234                     n = 1
       
   235             return R(result, text)
       
   236 
       
   237         elif pattern_type is list:
       
   238             result = []
       
   239             found = False
       
   240             for p in pattern:
       
   241                 try:
       
   242                     result, text = self.parseLine(text, p, result, skipWS, skipComments)
       
   243                     found = True
       
   244                 except SyntaxError:
       
   245                     pass
       
   246                 if found:
       
   247                     break
       
   248             if found:
       
   249                 return R(result, text)
       
   250             else:
       
   251                 syntaxError()
       
   252 
       
   253         else:
       
   254             raise SyntaxError(u"illegal type in grammar: " + u(pattern_type))
       
   255 
       
   256     def lineNo(self):
       
   257         if not(self.lines): return u""
       
   258         if self.restlen == -1: return u""
       
   259         parsed = self.textlen - self.restlen
       
   260 
       
   261         left, right = 0, len(self.lines)
       
   262 
       
   263         while True:
       
   264             mid = int((right + left) / 2)
       
   265             if self.lines[mid][0] <= parsed:
       
   266                 try:
       
   267                     if self.lines[mid + 1][0] >= parsed:
       
   268                         try:
       
   269                             return u(self.lines[mid + 1][1]) + u":" + u(self.lines[mid + 1][2])
       
   270                         except:
       
   271                             return u""
       
   272                     else:
       
   273                         left = mid + 1
       
   274                 except:
       
   275                     try:
       
   276                         return u(self.lines[mid + 1][1]) + u":" + u(self.lines[mid + 1][2])
       
   277                     except:
       
   278                         return u""
       
   279             else:
       
   280                 right = mid - 1
       
   281             if left > right:
       
   282                 return u""
       
   283 
       
   284 # plain module API
       
   285 
       
   286 def parseLine(textline, pattern, resultSoFar = [], skipWS = True, skipComments = None, packrat = False):
       
   287     p = parser(p=packrat)
       
   288     text = skip(p.skipper, textline, skipWS, skipComments)
       
   289     ast, text = p.parseLine(text, pattern, resultSoFar, skipWS, skipComments)
       
   290     return ast, text
       
   291 
       
   292 # parse():
       
   293 #   language:       pyPEG language description
       
   294 #   lineSource:     a fileinput.FileInput object
       
   295 #   skipWS:         Flag if whitespace should be skipped (default: True)
       
   296 #   skipComments:   Python function which returns pyPEG for matching comments
       
   297 #   packrat:        use memoization
       
   298 #   lineCount:      add line number information to AST
       
   299 #   
       
   300 #   returns:        pyAST
       
   301 #
       
   302 #   raises:         SyntaxError(reason), if a parsed line is not in language
       
   303 #                   SyntaxError(reason), if the language description is illegal
       
   304 
       
   305 def parse(language, lineSource, skipWS = True, skipComments = None, packrat = False, lineCount = True):
       
   306     lines, lineNo = [], 0
       
   307 
       
   308     while callable(language):
       
   309         language = language()
       
   310 
       
   311     orig, ld = u"", 0
       
   312     for line in lineSource:
       
   313         if lineSource.isfirstline():
       
   314             ld = 1
       
   315         else:
       
   316             ld += 1
       
   317         lines.append((len(orig), lineSource.filename(), lineSource.lineno() - 1))
       
   318         orig += u(line)
       
   319 
       
   320     textlen = len(orig)
       
   321 
       
   322     try:
       
   323         p = parser(p=packrat)
       
   324         p.textlen = len(orig)
       
   325         if lineCount:
       
   326             p.lines = lines
       
   327         else:
       
   328             p.line = None
       
   329         text = skip(p.skipper, orig, skipWS, skipComments)
       
   330         result, text = p.parseLine(text, language, [], skipWS, skipComments)
       
   331         if text:
       
   332             raise SyntaxError()
       
   333 
       
   334     except SyntaxError, msg:
       
   335         parsed = textlen - p.restlen
       
   336         textlen = 0
       
   337         nn, lineNo, file = 0, 0, u""
       
   338         for n, ld, l in lines:
       
   339             if n >= parsed:
       
   340                 break
       
   341             else:
       
   342                 lineNo = l
       
   343                 nn += 1
       
   344                 file = ld
       
   345 
       
   346         lineNo += 1
       
   347         nn -= 1
       
   348         lineCont = orig.splitlines()[nn]
       
   349         raise SyntaxError(u"syntax error in " + u(file) + u":" + u(lineNo) + u": " + lineCont)
       
   350 
       
   351     return result