-
Notifications
You must be signed in to change notification settings - Fork 64
/
Copy pathLexer.php
199 lines (169 loc) · 6.65 KB
/
Lexer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
<?php declare(strict_types = 1);
namespace PHPStan\PhpDocParser\Lexer;
use PHPStan\PhpDocParser\ParserConfig;
use function implode;
use function preg_match_all;
use const PREG_SET_ORDER;
/**
* Implementation based on Nette Tokenizer (New BSD License; https://github.com/nette/tokenizer)
*/
class Lexer
{
public const TOKEN_REFERENCE = 0;
public const TOKEN_UNION = 1;
public const TOKEN_INTERSECTION = 2;
public const TOKEN_NULLABLE = 3;
public const TOKEN_OPEN_PARENTHESES = 4;
public const TOKEN_CLOSE_PARENTHESES = 5;
public const TOKEN_OPEN_ANGLE_BRACKET = 6;
public const TOKEN_CLOSE_ANGLE_BRACKET = 7;
public const TOKEN_OPEN_SQUARE_BRACKET = 8;
public const TOKEN_CLOSE_SQUARE_BRACKET = 9;
public const TOKEN_COMMA = 10;
public const TOKEN_VARIADIC = 11;
public const TOKEN_DOUBLE_COLON = 12;
public const TOKEN_DOUBLE_ARROW = 13;
public const TOKEN_EQUAL = 14;
public const TOKEN_OPEN_PHPDOC = 15;
public const TOKEN_CLOSE_PHPDOC = 16;
public const TOKEN_PHPDOC_TAG = 17;
public const TOKEN_DOCTRINE_TAG = 18;
public const TOKEN_FLOAT = 19;
public const TOKEN_INTEGER = 20;
public const TOKEN_SINGLE_QUOTED_STRING = 21;
public const TOKEN_DOUBLE_QUOTED_STRING = 22;
public const TOKEN_DOCTRINE_ANNOTATION_STRING = 23;
public const TOKEN_IDENTIFIER = 24;
public const TOKEN_THIS_VARIABLE = 25;
public const TOKEN_VARIABLE = 26;
public const TOKEN_HORIZONTAL_WS = 27;
public const TOKEN_PHPDOC_EOL = 28;
public const TOKEN_OTHER = 29;
public const TOKEN_END = 30;
public const TOKEN_COLON = 31;
public const TOKEN_WILDCARD = 32;
public const TOKEN_OPEN_CURLY_BRACKET = 33;
public const TOKEN_CLOSE_CURLY_BRACKET = 34;
public const TOKEN_NEGATED = 35;
public const TOKEN_ARROW = 36;
public const TOKEN_COMMENT = 37;
public const TOKEN_LABELS = [
self::TOKEN_REFERENCE => '\'&\'',
self::TOKEN_UNION => '\'|\'',
self::TOKEN_INTERSECTION => '\'&\'',
self::TOKEN_NULLABLE => '\'?\'',
self::TOKEN_NEGATED => '\'!\'',
self::TOKEN_OPEN_PARENTHESES => '\'(\'',
self::TOKEN_CLOSE_PARENTHESES => '\')\'',
self::TOKEN_OPEN_ANGLE_BRACKET => '\'<\'',
self::TOKEN_CLOSE_ANGLE_BRACKET => '\'>\'',
self::TOKEN_OPEN_SQUARE_BRACKET => '\'[\'',
self::TOKEN_CLOSE_SQUARE_BRACKET => '\']\'',
self::TOKEN_OPEN_CURLY_BRACKET => '\'{\'',
self::TOKEN_CLOSE_CURLY_BRACKET => '\'}\'',
self::TOKEN_COMMA => '\',\'',
self::TOKEN_COMMENT => '\'//\'',
self::TOKEN_COLON => '\':\'',
self::TOKEN_VARIADIC => '\'...\'',
self::TOKEN_DOUBLE_COLON => '\'::\'',
self::TOKEN_DOUBLE_ARROW => '\'=>\'',
self::TOKEN_ARROW => '\'->\'',
self::TOKEN_EQUAL => '\'=\'',
self::TOKEN_OPEN_PHPDOC => '\'/**\'',
self::TOKEN_CLOSE_PHPDOC => '\'*/\'',
self::TOKEN_PHPDOC_TAG => 'TOKEN_PHPDOC_TAG',
self::TOKEN_DOCTRINE_TAG => 'TOKEN_DOCTRINE_TAG',
self::TOKEN_PHPDOC_EOL => 'TOKEN_PHPDOC_EOL',
self::TOKEN_FLOAT => 'TOKEN_FLOAT',
self::TOKEN_INTEGER => 'TOKEN_INTEGER',
self::TOKEN_SINGLE_QUOTED_STRING => 'TOKEN_SINGLE_QUOTED_STRING',
self::TOKEN_DOUBLE_QUOTED_STRING => 'TOKEN_DOUBLE_QUOTED_STRING',
self::TOKEN_DOCTRINE_ANNOTATION_STRING => 'TOKEN_DOCTRINE_ANNOTATION_STRING',
self::TOKEN_IDENTIFIER => 'type',
self::TOKEN_THIS_VARIABLE => '\'$this\'',
self::TOKEN_VARIABLE => 'variable',
self::TOKEN_HORIZONTAL_WS => 'TOKEN_HORIZONTAL_WS',
self::TOKEN_OTHER => 'TOKEN_OTHER',
self::TOKEN_END => 'TOKEN_END',
self::TOKEN_WILDCARD => '*',
];
public const VALUE_OFFSET = 0;
public const TYPE_OFFSET = 1;
public const LINE_OFFSET = 2;
private ParserConfig $config; // @phpstan-ignore property.onlyWritten
private ?string $regexp = null;
public function __construct(ParserConfig $config)
{
$this->config = $config;
}
/**
* @return list<array{string, int, int}>
*/
public function tokenize(string $s): array
{
if ($this->regexp === null) {
$this->regexp = $this->generateRegexp();
}
preg_match_all($this->regexp, $s, $matches, PREG_SET_ORDER);
$tokens = [];
$line = 1;
foreach ($matches as $match) {
$type = (int) $match['MARK'];
$tokens[] = [$match[0], $type, $line];
if ($type !== self::TOKEN_PHPDOC_EOL) {
continue;
}
$line++;
}
$tokens[] = ['', self::TOKEN_END, $line];
return $tokens;
}
private function generateRegexp(): string
{
$patterns = [
self::TOKEN_HORIZONTAL_WS => '[\\x09\\x20]++',
self::TOKEN_IDENTIFIER => '(?:[\\\\]?+[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF-]*+)++',
self::TOKEN_THIS_VARIABLE => '\\$this(?![0-9a-z_\\x80-\\xFF])',
self::TOKEN_VARIABLE => '\\$[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF]*+',
// '&' followed by TOKEN_VARIADIC, TOKEN_VARIABLE, TOKEN_EQUAL, TOKEN_EQUAL or TOKEN_CLOSE_PARENTHESES
self::TOKEN_REFERENCE => '&(?=\\s*+(?:[.,=)]|(?:\\$(?!this(?![0-9a-z_\\x80-\\xFF])))))',
self::TOKEN_UNION => '\\|',
self::TOKEN_INTERSECTION => '&',
self::TOKEN_NULLABLE => '\\?',
self::TOKEN_NEGATED => '!',
self::TOKEN_OPEN_PARENTHESES => '\\(',
self::TOKEN_CLOSE_PARENTHESES => '\\)',
self::TOKEN_OPEN_ANGLE_BRACKET => '<',
self::TOKEN_CLOSE_ANGLE_BRACKET => '>',
self::TOKEN_OPEN_SQUARE_BRACKET => '\\[',
self::TOKEN_CLOSE_SQUARE_BRACKET => '\\]',
self::TOKEN_OPEN_CURLY_BRACKET => '\\{',
self::TOKEN_CLOSE_CURLY_BRACKET => '\\}',
self::TOKEN_COMMA => ',',
self::TOKEN_COMMENT => '\/\/[^\\r\\n]*(?=\n|\r|\*/)',
self::TOKEN_VARIADIC => '\\.\\.\\.',
self::TOKEN_DOUBLE_COLON => '::',
self::TOKEN_DOUBLE_ARROW => '=>',
self::TOKEN_ARROW => '->',
self::TOKEN_EQUAL => '=',
self::TOKEN_COLON => ':',
self::TOKEN_OPEN_PHPDOC => '/\\*\\*(?=\\s)\\x20?+',
self::TOKEN_CLOSE_PHPDOC => '\\*/',
self::TOKEN_PHPDOC_TAG => '@(?:[a-z][a-z0-9-\\\\]+:)?[a-z][a-z0-9-\\\\]*+',
self::TOKEN_DOCTRINE_TAG => '@[a-z_\\\\][a-z0-9_\:\\\\]*[a-z_][a-z0-9_]*',
self::TOKEN_PHPDOC_EOL => '\\r?+\\n[\\x09\\x20]*+(?:\\*(?!/)\\x20?+)?',
self::TOKEN_FLOAT => '[+\-]?(?:(?:[0-9]++(_[0-9]++)*\\.[0-9]*+(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]*+(_[0-9]++)*\\.[0-9]++(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]++(_[0-9]++)*e[+\-]?[0-9]++(_[0-9]++)*))',
self::TOKEN_INTEGER => '[+\-]?(?:(?:0b[0-1]++(_[0-1]++)*)|(?:0o[0-7]++(_[0-7]++)*)|(?:0x[0-9a-f]++(_[0-9a-f]++)*)|(?:[0-9]++(_[0-9]++)*))',
self::TOKEN_SINGLE_QUOTED_STRING => '\'(?:\\\\[^\\r\\n]|[^\'\\r\\n\\\\])*+\'',
self::TOKEN_DOUBLE_QUOTED_STRING => '"(?:\\\\[^\\r\\n]|[^"\\r\\n\\\\])*+"',
self::TOKEN_DOCTRINE_ANNOTATION_STRING => '"(?:""|[^"])*+"',
self::TOKEN_WILDCARD => '\\*',
// anything but TOKEN_CLOSE_PHPDOC or TOKEN_HORIZONTAL_WS or TOKEN_EOL
self::TOKEN_OTHER => '(?:(?!\\*/)[^\\s])++',
];
foreach ($patterns as $type => &$pattern) {
$pattern = '(?:' . $pattern . ')(*MARK:' . $type . ')';
}
return '~' . implode('|', $patterns) . '~Asi';
}
}