 
 import re
 
-'''Minimalistic lexer engine inspired by the Pygments RegexLexer'''
+"""Minimalistic lexer engine inspired by the Pygments RegexLexer"""
 
 __version__ = '1.0.7'
 
-class MiniLexer(object):
-  '''Simple lexer state machine with regex matching rules'''
-
-  def __init__(self, tokens, flags=re.MULTILINE):
-    '''Create a new lexer
-
-    Args:
-      tokens (dict(match rules)): Hierarchical dict of states with a list of regex patterns and transitions
-      flags (int): Optional regex flags
-    '''
-    self.tokens = {}
-
-    # Pre-process the state definitions
-    for state, patterns in tokens.items():
-      full_patterns = []
-      for p in patterns:
-        pat = re.compile(p[0], flags)
-        action = p[1]
-        new_state = p[2] if len(p) >= 3 else None
-
-        # Convert pops into an integer
-        if new_state and new_state.startswith('#pop'):
-          try:
-            new_state = -int(new_state.split(':')[1])
-          except ValueError:
-            new_state = -1
-          except IndexError:
-            new_state = -1
-
-        full_patterns.append((pat, action, new_state))
-      self.tokens[state] = full_patterns
-
-
-  def run(self, text):
-    '''Run lexer rules against a source text
-
-    Args:
-      text (str): Text to apply lexer to
-
-    Yields:
-      A sequence of lexer matches.
-    '''
-
-    stack = ['root']
-    pos = 0
-
-    patterns = self.tokens[stack[-1]]
-
-    while True:
-      for pat, action, new_state in patterns:
-        m = pat.match(text, pos)
-        if m:
-          if action:
-            #print('## MATCH: {} -> {}'.format(m.group(), action))
-            #print(m.string[m.pos:m.endpos])
-            yield (pos, m.end()-1), action, m.groups()
-
-          pos = m.end()
-
-          if new_state:
-            if isinstance(new_state, int): # Pop states
-              del stack[new_state:]
-            else:
-              stack.append(new_state)
-
-            #print('## CHANGE STATE:', pos, new_state, stack)
-            patterns = self.tokens[stack[-1]]
-
-          break
-
-      else:
-        try:
-          if text[pos] == '\n':
-            pos += 1
-            continue
-          pos += 1
-        except IndexError:
-          break
+
+class MiniLexer(object):
+    """Simple lexer state machine with regex matching rules"""
+
+    def __init__(self, tokens, flags=re.MULTILINE):
+        """Create a new lexer
+
+        Args:
+            tokens (dict(match rules)): Hierarchical dict of states with a list of regex patterns and transitions
+            flags (int): Optional regex flags
+        """
+        self.tokens = {}
+
+        # Pre-process the state definitions
+        for state, patterns in tokens.items():
+            full_patterns = []
+            for p in patterns:
+                pat = re.compile(p[0], flags)
+                action = p[1]
+                new_state = p[2] if len(p) >= 3 else None
+
+                # Convert pops into an integer
+                if new_state and new_state.startswith('#pop'):
+                    try:
+                        new_state = -int(new_state.split(':')[1])
+                    except ValueError:
+                        new_state = -1
+                    except IndexError:
+                        new_state = -1
+
+                full_patterns.append((pat, action, new_state))
+            self.tokens[state] = full_patterns
+
+    def run(self, text):
+        """Run lexer rules against a source text
+
+        Args:
+            text (str): Text to apply lexer to
+
+        Yields:
+            A sequence of lexer matches.
+        """
+
+        stack = ['root']
+        pos = 0
+
+        patterns = self.tokens[stack[-1]]
+
+        while True:
+            for pat, action, new_state in patterns:
+                m = pat.match(text, pos)
+                if m:
+                    if action:
+                        # print('## MATCH: {} -> {}'.format(m.group(), action))
+                        # print(m.string[m.pos:m.endpos])
+                        yield (pos, m.end() - 1), action, m.groups()
+
+                    pos = m.end()
+
+                    if new_state:
+                        if isinstance(new_state, int):  # Pop states
+                            del stack[new_state:]
+                        else:
+                            stack.append(new_state)
+
+                        # print('## CHANGE STATE:', pos, new_state, stack)
+                        patterns = self.tokens[stack[-1]]
+
+                    break
+
+            else:
+                try:
+                    if text[pos] == '\n':
+                        pos += 1
+                        continue
+                    pos += 1
+                except IndexError:
+                    break
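
For orientation, here is a minimal usage sketch of the class as reformatted above. It is illustrative only: the token table, state names, and input text are hypothetical and not part of this commit. Each state maps to a list of (regex, action) or (regex, action, new_state) rules; naming a state pushes it onto the stack, '#pop' (or '#pop:N') returns to an enclosing state, and run() yields a ((start, end), action, groups) tuple for every rule match that carries an action, where (start, end) is an inclusive character span.

# Hypothetical token table -- for illustration, not part of the commit
tokens = {
    'root': [
        (r'"', 'string_start', 'string'),  # push the 'string' state
        (r'\d+', 'number'),
        (r'\w+', 'identifier'),
        (r'\s+', None),                    # None action: consume, emit nothing
    ],
    'string': [
        (r'[^"]+', 'string_chars'),
        (r'"', 'string_end', '#pop'),      # pop back to 'root'
    ],
}

lexer = MiniLexer(tokens)
for (start, end), action, groups in lexer.run('say "hi" 42'):
    print(start, end, action)

Rules are tried in order within the active state, so more specific patterns should be listed first; when no rule matches, the for/else clause in run() simply advances one character and tries again until the end of the text.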