Skip to content

Commit e0ffaa3

Browse files
authored
Merge pull request #1854 from idodeclare/feature/improve_here_docs
Perl improvements
2 parents d087783 + 8cc4a83 commit e0ffaa3

File tree

8 files changed

+274
-87
lines changed

8 files changed

+274
-87
lines changed

src/org/opensolaris/opengrok/analysis/perl/PerlLexHelper.java

+107-26
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
*/
2323
package org.opensolaris.opengrok.analysis.perl;
2424
import java.io.IOException;
25+
import java.util.LinkedList;
26+
import java.util.Queue;
2527
import java.util.regex.Matcher;
2628
import java.util.regex.Pattern;
2729

@@ -31,6 +33,7 @@
3133
interface PerlLexListener {
3234
void pushState(int state);
3335
void popState() throws IOException;
36+
void switchState(int state);
3437
void maybeIntraState();
3538
void take(String value) throws IOException;
3639
void takeNonword(String value) throws IOException;
@@ -335,27 +338,89 @@ private void takeWhitespace(String whsp) throws IOException {
335338
}
336339

337340
/**
338-
* Begins a Here-document state, and writes the {@code capture} to output.
341+
* Parses a Here-document declaration, and writes the {@code capture} to
342+
* output. If the declaration is valid, a new entry will have been
343+
* appended to {@code hereSettings}.
339344
*/
340-
public void hop(String capture, boolean nointerp, boolean indented)
341-
throws IOException {
345+
public void hop(String capture) throws IOException {
346+
if (!capture.startsWith("<<")) {
347+
throw new IllegalArgumentException("bad HERE: " + capture);
348+
}
342349

343350
listener.takeNonword(capture);
351+
if (hereSettings == null) hereSettings = new LinkedList<>();
352+
353+
String remaining = capture;
354+
int i = 0;
355+
HereDocSettings settings;
356+
boolean indented = false;
357+
boolean nointerp;
358+
String terminator;
359+
360+
String opener = remaining.substring(0, i + 2);
361+
remaining = remaining.substring(opener.length());
362+
if (remaining.startsWith("~")) {
363+
indented = true;
364+
remaining = remaining.substring(1);
365+
}
366+
remaining = remaining.replaceFirst("^\\s+", "");
367+
char c = remaining.charAt(0);
368+
switch (c) {
369+
case '\'':
370+
nointerp = true;
371+
remaining = remaining.substring(1);
372+
break;
373+
case '`':
374+
case '\"':
375+
nointerp = false;
376+
remaining = remaining.substring(1);
377+
break;
378+
case '\\':
379+
c = '\0';
380+
nointerp = true;
381+
remaining = remaining.substring(1);
382+
break;
383+
default:
384+
c = '\0';
385+
nointerp = false;
386+
break;
387+
}
344388

345-
hereTerminator = null;
346-
Matcher m = HERE_TERMINATOR_MATCH.matcher(capture);
347-
if (!m.find()) return;
348-
hereTerminator = m.group(0);
389+
if (c != '\0') {
390+
if ((i = remaining.indexOf(c)) < 1) {
391+
terminator = remaining;
392+
} else {
393+
terminator = remaining.substring(0, i);
394+
}
395+
} else {
396+
Matcher m = HERE_TERMINATOR_MATCH.matcher(remaining);
397+
if (!m.find()) return;
398+
terminator = m.group(0);
399+
}
349400

350401
int state;
351402
if (nointerp) {
352403
state = indented ? HEREinxN : HERExN;
353404
} else {
354405
state = indented ? HEREin : HERE;
355406
}
356-
listener.maybeIntraState();
357-
listener.pushState(state);
358-
listener.take(Consts.SS);
407+
settings = new HereDocSettings(terminator, state);
408+
hereSettings.add(settings);
409+
}
410+
411+
/**
412+
* Pushes the first Here-document state if any declarations were parsed, or
413+
* else does nothing.
414+
* @return true if a Here state was pushed
415+
*/
416+
public boolean maybeStartHere() throws IOException {
417+
if (hereSettings != null && hereSettings.size() > 0) {
418+
HereDocSettings settings = hereSettings.peek();
419+
listener.pushState(settings.state);
420+
listener.take(Consts.SS);
421+
return true;
422+
}
423+
return false;
359424
}
360425

361426
/**
@@ -364,26 +429,29 @@ public void hop(String capture, boolean nointerp, boolean indented)
364429
* @return true if the quote state ended
365430
*/
366431
public boolean maybeEndHere(String capture) throws IOException {
367-
if (!isHereEnding(capture)) {
368-
listener.takeNonword(capture);
432+
String trimmed = capture.replaceFirst("^\\s+", "");
433+
HereDocSettings settings = hereSettings.peek();
434+
435+
boolean didZspan = false;
436+
if (trimmed.equals(settings.terminator)) {
437+
listener.take(Consts.ZS);
438+
didZspan = true;
439+
hereSettings.remove();
440+
}
441+
442+
listener.takeNonword(capture);
443+
444+
if (hereSettings.size() > 0) {
445+
settings = hereSettings.peek();
446+
listener.switchState(settings.state);
447+
if (didZspan) listener.take(Consts.SS);
369448
return false;
370449
} else {
371450
listener.popState();
372-
listener.take(Consts.ZS);
373-
listener.takeNonword(capture);
374451
return true;
375452
}
376453
}
377454

378-
/**
379-
* Gets a value indicating if the Here-document should be ended.
380-
* @return true if the quote state should end
381-
*/
382-
public boolean isHereEnding(String capture) {
383-
String trimmed = capture.replaceFirst("^\\s+", "");
384-
return trimmed.equals(hereTerminator);
385-
}
386-
387455
/**
388456
* Splits a sigil identifier -- where the {@code capture} starts with
389457
* a sigil and ends in an identifier and where Perl allows whitespace after
@@ -504,7 +572,7 @@ public void specialID(String capture) throws IOException {
504572
}
505573

506574
private final static Pattern HERE_TERMINATOR_MATCH = Pattern.compile(
507-
"[a-zA-Z0-9_]+$");
575+
"^[a-zA-Z0-9_]+");
508576

509577
/**
510578
* When matching a quoting construct like qq[], q(), m//, s```, etc., the
@@ -549,6 +617,19 @@ public void specialID(String capture) throws IOException {
549617
*/
550618
private boolean waitq;
551619

552-
/** Stores the terminating identifier for For Here-documents */
553-
private String hereTerminator;
620+
private Queue<HereDocSettings> hereSettings;
621+
622+
class HereDocSettings {
623+
private final String terminator;
624+
private final int state;
625+
626+
public HereDocSettings(String terminator, int state) {
627+
this.terminator = terminator;
628+
this.state = state;
629+
}
630+
631+
public String getTerminator() { return terminator; }
632+
633+
public int getState() { return state; }
634+
}
554635
}

src/org/opensolaris/opengrok/analysis/perl/PerlProductions.lexh

+43-38
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ MaybeWhsp = {WhspChar}*
3232
EOL = \r|\n|\r\n
3333
Identifier = [a-zA-Z_] [a-zA-Z0-9_]*
3434
Sigils = ("$" | "@" | "%" | "&" | "*")
35-
WxSigils = [[\W]--[\$\@\%\&\*\"\'\`\#]]
35+
WxSigils = [[\W]--[\$\@\%\&\*\"\'\`\#\r\n]]
3636

3737
// Perl special identifiers (four of six from
3838
// https://perldoc.perl.org/perldata.html#Identifier-parsing):
@@ -100,35 +100,38 @@ TRhash = "tr"\#
100100
TRpunc = "tr" {MaybeWhsp} {Quo0xHash}
101101
TRword = "tr" {WhiteSpace} \w
102102

103-
HereContinuation = \,{MaybeWhsp} "<<"\~? {MaybeWhsp}
104-
MaybeHereMarkers = ([\"\'\`\\]?{Identifier} [^\n\r]* {HereContinuation})?
103+
HereEOF1 = [\"][^\r\n\"]*[\"]
104+
HereEOF2 = [\`][^\r\n\`]*[\`]
105+
HereEOF3 = [\'][^\r\n\']*[\']
106+
HereEOF4 = [\\]?{Identifier}
105107

106108
//
107109
// Track some keywords that can be used to identify heuristically a possible
108110
// beginning of the shortcut syntax, //, for m//. Also include any perlfunc
109-
// that takes /PATTERN/ -- which is just "split". Heuristics using punctuation
110-
// are defined inline later in some rules.
111+
// that takes /PATTERN/. Heuristics using punctuation are defined inline later
112+
// in some rules.
111113
//
112114
Mwords_1 = ("eq" | "ne" | "le" | "ge" | "lt" | "gt" | "cmp")
113115
Mwords_2 = ("if" | "unless" | "or" | "and" | "not")
114-
Mwords_3 = ("split")
116+
Mwords_3 = ("split" | "grep")
115117
Mwords = ({Mwords_1} | {Mwords_2} | {Mwords_3})
116118

117119
Mpunc1YYIN = [\(\!]
118-
Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>")
120+
Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>"|"=>")
119121

120122
//
121123
// There are two dimensions to quoting: "link"-or-not and "interpolate"-or-not.
122124
// Unfortunately, we cannot control the %state values, so we have to declare
123125
// a cross-product of states. (Technically, state values are not guaranteed to
124126
// be unique by jflex, but states that do not have identical rules will have
125-
// different values. The following four "QUO" states satisfy this difference
126-
// criterion. Likewise with the four "HERE" states.)
127+
// different values. The four "QUO" states below satisfy this difference
128+
// criterion, as do the four "HERE" states.)
127129
//
128130
// YYINITIAL : nothing yet parsed or just after a non-quoted [;{}]
129131
// INTRA : saw content from YYINITIAL but not yet other state or [;{}]
130132
// SCOMMENT : single-line comment
131133
// POD : Perl Plain-Old-Documentation
134+
// FMT : an output record format
132135
// QUO : quote-like that is OK to match paths|files|URLs|e-mails
133136
// QUOxN : "" but with no interpolation
134137
// QUOxL : quote-like that is not OK to match paths|files|URLs|e-mails
@@ -139,20 +142,21 @@ Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>")
139142
// HERExN : Here-docs with no interpolation
140143
// HEREin : Indented Here-docs
141144
// HEREinxN : Indented Here-docs with no interpolation
142-
// FMT : an output record format
143145
//
144-
%state INTRA SCOMMENT POD FMT QUO QUOxN QUOxL QUOxLxN QM HERE HERExN HEREin HEREinxN
146+
%state INTRA SCOMMENT POD FMT
147+
%state QUO QUOxN QUOxL QUOxLxN QM
148+
%state HERE HERExN HEREin HEREinxN
145149

146150
%%
147151
<HERE, HERExN> {
148152
^ {Identifier} / {MaybeWhsp}{EOL} {
149-
if (h.maybeEndHere(yytext())) yyjump(YYINITIAL);
153+
h.maybeEndHere(yytext());
150154
}
151155
}
152156

153157
<HEREin, HEREinxN> {
154158
^ {MaybeWhsp} {Identifier} / {MaybeWhsp}{EOL} {
155-
if (h.maybeEndHere(yytext())) yyjump(YYINITIAL);
159+
h.maybeEndHere(yytext());
156160
}
157161
}
158162

@@ -173,23 +177,8 @@ Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>")
173177
takeNonword(yytext());
174178
}
175179

176-
// Following are rules for Here-documents. Stacked multiple here-docs are
177-
// recognized, but not fully supported, as only the interpolation setting
178-
// of the first marker will apply to all sections. (The final, second HERE
179-
// quoting character is not demanded, as it is superfluous for the needs of
180-
// xref lexing; and leaving it off simplifies parsing.)
181-
182-
"<<" {MaybeWhsp} {MaybeHereMarkers} [\"\`]?{Identifier} {
183-
h.hop(yytext(), false/*nointerp*/, false/*indented*/);
184-
}
185-
"<<~" {MaybeWhsp} {MaybeHereMarkers} [\"\`]?{Identifier} {
186-
h.hop(yytext(), false/*nointerp*/, true/*indented*/);
187-
}
188-
"<<" {MaybeWhsp} {MaybeHereMarkers} [\'\\]{Identifier} {
189-
h.hop(yytext(), true/*nointerp*/, false/*indented*/);
190-
}
191-
"<<~" {MaybeWhsp} {MaybeHereMarkers} [\'\\]{Identifier} {
192-
h.hop(yytext(), true/*nointerp*/, true/*indented*/);
180+
"<<"[~]? {MaybeWhsp} ({HereEOF1}|{HereEOF2}|{HereEOF3}|{HereEOF4}) {
181+
h.hop(yytext());
193182
}
194183

195184
{Identifier} {
@@ -293,7 +282,8 @@ Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>")
293282
}
294283

295284
// FORMAT start
296-
^ {MaybeWhsp} "format" ({WhiteSpace} {Identifier})? {MaybeWhsp} "=" {
285+
^ {MaybeWhsp} "format" ({WhiteSpace} {Identifier})? {MaybeWhsp} "=" /
286+
{MaybeWhsp}{EOL} {
297287
pushState(FMT);
298288
if (takeAllContent()) {
299289
// split off the " format" as `initial' for keyword processing
@@ -399,11 +389,13 @@ Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>")
399389
}
400390
}
401391

402-
<QUO, QUOxN, QUOxL, QUOxLxN> {
392+
<FMT, QUO, QUOxN, QUOxL, QUOxLxN, HERE, HEREin> {
403393
\\ \S {
404394
takeNonword(yytext());
405395
}
396+
}
406397

398+
<QUO, QUOxN, QUOxL, QUOxLxN> {
407399
{Quo0} |
408400
\w {
409401
String capture = yytext();
@@ -449,6 +441,11 @@ Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>")
449441
takeNonword(yytext());
450442
take(Consts.ZS);
451443
}
444+
445+
{WhiteSpace}{EOL} |
446+
{EOL} {
447+
doStartNewLine();
448+
}
452449
}
453450

454451
<FMT> {
@@ -475,24 +472,32 @@ Mpunc2IN = ([!=]"~" | [\:\?\=\+\-\<\>] | "=="|"!="|"<="|">="|"<=>")
475472
<SCOMMENT> {
476473
{WhiteSpace}{EOL} |
477474
{EOL} {
475+
String capture = yytext();
476+
yypushback(capture.length());
478477
yypop();
479478
take(Consts.ZS);
480-
doStartNewLine();
481479
}
482480
}
483481

482+
<YYINITIAL, INTRA> {
483+
{WhiteSpace}{EOL} |
484+
{EOL} {
485+
String capture = yytext();
486+
if (h.maybeStartHere()) {
487+
yypushback(capture.length());
488+
} else {
489+
doStartNewLine();
490+
}
491+
}
492+
}
493+
484494
<YYINITIAL, INTRA, SCOMMENT, POD, FMT, QUO, QUOxN, QUOxL, QUOxLxN,
485495
HERE, HERExN, HEREin, HEREinxN> {
486496
[&<>\"\'] {
487497
maybeIntraState();
488498
takeNonword(yytext());
489499
}
490500

491-
{WhiteSpace}{EOL} |
492-
{EOL} {
493-
doStartNewLine();
494-
}
495-
496501
// Only one whitespace char at a time or else {WxSigils} can be broken
497502
{WhspChar} {
498503
take(yytext());

src/org/opensolaris/opengrok/analysis/perl/PerlSymbolTokenizer.lex

+2
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ super(in);
5555

5656
public void popState() throws IOException { yypop(); }
5757

58+
public void switchState(int state) { yybegin(state); }
59+
5860
public void take(String value) throws IOException {
5961
// noop
6062
}

src/org/opensolaris/opengrok/analysis/perl/PerlXref.lex

+2
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ import org.opensolaris.opengrok.web.Util;
5757

5858
public void popState() throws IOException { yypop(); }
5959

60+
public void switchState(int state) { yybegin(state); }
61+
6062
public void take(String value) throws IOException {
6163
out.write(value);
6264
}

0 commit comments

Comments
 (0)