Skip to content

Commit 133a606

Browse files
authored
Merge pull request #1851 from idodeclare/feature/run_mandoc
Add MandocAnalyzer and OPENGROK_MANDOC setting
2 parents 408b52b + ecf7c72 commit 133a606

23 files changed

+1409
-11
lines changed

LICENSE-mandoc.txt

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
mandoc.css is included in OpenGrok under the following license:
2+
3+
$Id: LICENSE,v 1.17 2017/06/23 15:58:14 schwarze Exp $
4+
5+
With the exceptions noted below, all code and documentation
6+
contained in the mandoc toolkit is protected by the Copyright
7+
of the following developers:
8+
9+
Copyright (c) 2008-2012, 2014 Kristaps Dzonsons <[email protected]>
10+
Copyright (c) 2010-2017 Ingo Schwarze <[email protected]>
11+
Copyright (c) 2009, 2010, 2011, 2012 Joerg Sonnenberger <[email protected]>
12+
Copyright (c) 2013 Franco Fichtner <[email protected]>
13+
Copyright (c) 2014 Baptiste Daroussin <[email protected]>
14+
Copyright (c) 2016 Ed Maste <[email protected]>
15+
Copyright (c) 2017 Michael Stapelberg <[email protected]>
16+
Copyright (c) 1999, 2004 Marc Espie <[email protected]>
17+
Copyright (c) 1998, 2004, 2010 Todd C. Miller <[email protected]>
18+
Copyright (c) 2008, 2017 Otto Moerbeek <[email protected]>
19+
Copyright (c) 2004 Ted Unangst <[email protected]>
20+
Copyright (c) 1994 Christos Zoulas <[email protected]>
21+
Copyright (c) 2003, 2007, 2008, 2014 Jason McIntyre <[email protected]>
22+
23+
See the individual source files for information about who contributed
24+
to which file during which years.
25+
26+
27+
The mandoc distribution as a whole is distributed by its developers
28+
under the following license:
29+
30+
Permission to use, copy, modify, and distribute this software for any
31+
purpose with or without fee is hereby granted, provided that the above
32+
copyright notice and this permission notice appear in all copies.
33+
34+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
35+
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
36+
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
37+
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
38+
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
39+
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
40+
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
41+
42+
43+
The following files included from outside sources are protected by
44+
other people's Copyright and are distributed under various 2-clause
45+
and 3-clause BSD licenses; see these individual files for details.
46+
47+
soelim.c, soelim.1:
48+
Copyright (c) 2014 Baptiste Daroussin <[email protected]>
49+
50+
compat_err.c, compat_fts.c, compat_fts.h,
51+
compat_getsubopt.c, compat_strcasestr.c, compat_strsep.c,
52+
man.1:
53+
Copyright (c) 1989,1990,1993,1994 The Regents of the University of California
54+
55+
compat_stringlist.c, compat_stringlist.h:
56+
Copyright (c) 1994 Christos Zoulas <[email protected]>

OpenGrok

+3
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
# - OPENGROK_CTAGS_OPTIONS_FILE Full path to file with extra command line
4242
# options for CTags program (for its --options
4343
# switch), default is DATA_ROOT/etc/ctags.config
44+
# - OPENGROK_MANDOC Full path to mandoc(1) binary
4445
# - JAVA_HOME Full Path to Java Installation Root
4546
# - JAVA Full Path to java binary (to enable 64bit JDK)
4647
# - JAVA_OPTS Java options (e.g. for JVM memory increase
@@ -118,6 +119,7 @@
118119

119120
#
120121
# Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
122+
# Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
121123
#
122124

123125
#
@@ -853,6 +855,7 @@ CommonInvocation()
853855
${SCAN_DEPTH} \
854856
${PROGRESS} \
855857
${OPENGROK_CTAGS:+-c} ${OPENGROK_CTAGS} \
858+
${OPENGROK_MANDOC:+--mandoc} ${OPENGROK_MANDOC} \
856859
${CTAGS_OPTIONS_FILE:+-o} ${CTAGS_OPTIONS_FILE} \
857860
${OPENGROK_FLUSH_RAM_BUFFER_SIZE} ${SKIN} ${LEADING_WILDCARD} \
858861
${READ_XML_CONF} \

build.xml

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ information: Portions Copyright [yyyy] [name of copyright owner]
1919
CDDL HEADER END
2020
2121
Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved.
22+
Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
2223
2324
-->
2425
<project name="OpenGrok" default="jar" basedir="." xmlns:jacoco="antlib:org.jacoco.ant"
@@ -322,6 +323,7 @@ Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved.
322323

323324
<run-jflex dir="${gen.analysis.dir}/sql" name="SQLXref"/>
324325
<run-jflex dir="${gen.analysis.dir}/sql" name="PLSQLXref"/>
326+
<run-jflex dir="${gen.analysis.dir}/document" name="MandocXref"/>
325327
<run-jflex dir="${gen.analysis.dir}/document" name="TroffXref"/>
326328
<run-jflex dir="${gen.analysis.dir}/document" name="TroffFullTokenizer"/>
327329
<run-jflex dir="${gen.analysis.dir}/sh" name="ShSymbolTokenizer"/>

opengrok-indexer/pom.xml

+7
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
4949
<include>*.adb</include>
5050
</includes>
5151
</resource>
52+
<resource>
53+
<targetPath>org/opensolaris/opengrok/analysis/document/</targetPath>
54+
<directory>../src/org/opensolaris/opengrok/analysis/document/</directory>
55+
<includes>
56+
<include>*.1m</include>
57+
</includes>
58+
</resource>
5259
<resource>
5360
<targetPath>org/opensolaris/opengrok/analysis/sql/</targetPath>
5461
<directory>../src/org/opensolaris/opengrok/analysis/sql/</directory>

src/org/opensolaris/opengrok/analysis/AnalyzerGuru.java

+7-2
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
import org.opensolaris.opengrok.analysis.csharp.CSharpAnalyzerFactory;
6565
import org.opensolaris.opengrok.analysis.data.IgnorantAnalyzerFactory;
6666
import org.opensolaris.opengrok.analysis.data.ImageAnalyzerFactory;
67+
import org.opensolaris.opengrok.analysis.document.MandocAnalyzerFactory;
6768
import org.opensolaris.opengrok.analysis.document.TroffAnalyzerFactory;
6869
import org.opensolaris.opengrok.analysis.erlang.ErlangAnalyzerFactory;
6970
import org.opensolaris.opengrok.analysis.executables.ELFAnalyzerFactory;
@@ -199,15 +200,19 @@ public class AnalyzerGuru {
199200
private static final Map<String, String> fileTypeDescriptions = new TreeMap<>();
200201

201202
/*
202-
* If you write your own analyzer please register it here
203+
* If you write your own analyzer please register it here. The order is
204+
* important for any factory that uses a FileAnalyzerFactory.Matcher
205+
* implementation, as those are run in the same order as defined below --
206+
* though precise Matchers are run before imprecise ones.
203207
*/
204208
static {
205209
FileAnalyzerFactory[] analyzers = {
206210
DEFAULT_ANALYZER_FACTORY,
207211
new IgnorantAnalyzerFactory(),
208212
new BZip2AnalyzerFactory(),
209213
new XMLAnalyzerFactory(),
210-
new TroffAnalyzerFactory(),
214+
MandocAnalyzerFactory.DEFAULT_INSTANCE,
215+
TroffAnalyzerFactory.DEFAULT_INSTANCE,
211216
new ELFAnalyzerFactory(),
212217
new JavaClassAnalyzerFactory(),
213218
new ImageAnalyzerFactory(),

src/org/opensolaris/opengrok/analysis/FileAnalyzerFactory.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -216,10 +216,11 @@ protected FileAnalyzer newAnalyzer() {
216216
/**
217217
* Interface for matchers which map file contents to analyzer factories.
218218
*/
219-
protected interface Matcher {
219+
public interface Matcher {
220220

221221
/**
222222
* Get a value indicating if the magic is byte-precise.
223+
* @return true if precise
223224
*/
224225
default boolean getIsPreciseMagic() { return false; }
225226

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2017, Chris Fraire <[email protected]>.
22+
*/
23+
package org.opensolaris.opengrok.analysis.document;
24+
25+
import java.io.BufferedReader;
26+
import java.io.IOException;
27+
import java.io.InputStream;
28+
import java.io.InputStreamReader;
29+
import java.util.Arrays;
30+
import org.opensolaris.opengrok.analysis.FileAnalyzerFactory;
31+
import org.opensolaris.opengrok.analysis.FileAnalyzerFactory.Matcher;
32+
import org.opensolaris.opengrok.util.IOUtils;
33+
34+
/**
35+
* Represents an implementation of {@link Matcher} that detects a troff-
36+
* or mandoc-like document
37+
*/
38+
public class DocumentMatcher implements Matcher {
39+
40+
/**
41+
* Set to 512K {@code int}, but {@code NUMCHARS_FIRST_LOOK} and
42+
* {@code LINE_LIMIT} should apply beforehand. This value is "effectively
43+
* unbounded" without being literally 2_147_483_647 -- as the other limits
44+
* will apply first, and the {@link java.io.BufferedInputStream} will
45+
* manage a reasonably-sized buffer.
46+
*/
47+
private static final int MARK_READ_LIMIT = 1024 * 512;
48+
49+
private static final int LINE_LIMIT = 100;
50+
51+
private static final int FIRST_LOOK_WIDTH = 300;
52+
53+
private final FileAnalyzerFactory factory;
54+
55+
private final String[] lineStarters;
56+
57+
/**
58+
* Initializes an instance for the required parameters.
59+
* @param factory required factory to return when matched
60+
* @param lineStarters required list of line starters that indicate a match
61+
* @throws IllegalArgumentException thrown if any parameter is null
62+
*/
63+
public DocumentMatcher(FileAnalyzerFactory factory, String[] lineStarters) {
64+
if (factory == null) {
65+
throw new IllegalArgumentException("`factory' is null");
66+
}
67+
if (lineStarters == null) {
68+
throw new IllegalArgumentException("`lineStarters' is null");
69+
}
70+
if (lineStarters.length < 1) {
71+
throw new IllegalArgumentException("`lineStarters' is empty");
72+
}
73+
74+
String[] copyOf = Arrays.copyOf(lineStarters, lineStarters.length);
75+
for (String elem : copyOf) {
76+
if (elem == null) {
77+
throw new IllegalArgumentException(
78+
"`lineStarters' has null element");
79+
}
80+
}
81+
82+
this.factory = factory;
83+
this.lineStarters = copyOf;
84+
}
85+
86+
/**
87+
* Try to match the file contents by looking for {@code lineStarters} in
88+
* the first 100 lines while also affirming that the document starts
89+
* with "." or "'" after a limited amount of whitespace.
90+
* <p>
91+
* The stream is reset before returning.
92+
*
93+
* @param contents the first few bytes of a file
94+
* @param in the input stream from which the full file can be read
95+
* @return an analyzer factory if the contents match, or {@code null}
96+
* otherwise
97+
* @throws IOException in case of any read error
98+
*/
99+
@Override
100+
public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
101+
throws IOException {
102+
103+
if (!in.markSupported()) return null;
104+
in.mark(MARK_READ_LIMIT);
105+
106+
// read encoding, and skip past any BOM
107+
int bomLength = 0;
108+
String encoding = IOUtils.findBOMEncoding(contents);
109+
if (encoding == null) {
110+
encoding = "UTF-8";
111+
} else {
112+
bomLength = IOUtils.skipForBOM(contents);
113+
if (in.skip(bomLength) != bomLength) {
114+
in.reset();
115+
return null;
116+
}
117+
}
118+
119+
// affirm that a LF exists in a first block
120+
boolean foundLF = hasLineFeed(in, encoding);
121+
in.reset();
122+
if (!foundLF) return null;
123+
if (bomLength > 0) in.skip(bomLength);
124+
125+
// read line-by-line for a first few lines
126+
BufferedReader rdr = new BufferedReader(new InputStreamReader(
127+
in, encoding));
128+
boolean foundContent = false;
129+
int numFirstChars = 0;
130+
int numLines = 0;
131+
String line;
132+
while ((line = rdr.readLine()) != null) {
133+
for (int i = 0; i < lineStarters.length; ++i) {
134+
if (line.startsWith(lineStarters[i])) {
135+
in.reset();
136+
return factory;
137+
}
138+
}
139+
if (++numLines >= LINE_LIMIT) {
140+
in.reset();
141+
return null;
142+
}
143+
144+
// If not yet `foundContent', then only a limited allowance is
145+
// given until a sentinel '.' or '\'' must be seen after nothing
146+
// else but whitespace.
147+
if (!foundContent) {
148+
for (int i = 0; i < line.length() && numFirstChars <
149+
FIRST_LOOK_WIDTH; ++i, ++numFirstChars) {
150+
char c = line.charAt(i);
151+
if (c == '.' || c == '\'') {
152+
foundContent = true;
153+
break;
154+
} else if (!Character.isWhitespace(c)) {
155+
in.reset();
156+
return null;
157+
}
158+
}
159+
if (!foundContent && numFirstChars >= FIRST_LOOK_WIDTH) {
160+
in.reset();
161+
return null;
162+
}
163+
}
164+
}
165+
166+
in.reset();
167+
return null;
168+
}
169+
170+
/**
171+
* Determines if the {@code in} stream has a line feed character within the
172+
* first {@code FIRST_LOOK_WIDTH} characters.
173+
* @param in the input stream has any BOM (not {@code reset} after use)
174+
* @param encoding the input stream charset
175+
* @return true if a line feed '\n' was found
176+
* @throws IOException thrown on any error in reading
177+
*/
178+
private boolean hasLineFeed(InputStream in, String encoding)
179+
throws IOException {
180+
byte[] buf;
181+
int nextra;
182+
int noff;
183+
switch (encoding) {
184+
case "UTF-16LE":
185+
buf = new byte[FIRST_LOOK_WIDTH * 2];
186+
nextra = 1;
187+
noff = 0;
188+
break;
189+
case "UTF-16BE":
190+
buf = new byte[FIRST_LOOK_WIDTH * 2];
191+
nextra = 1;
192+
noff = 1;
193+
break;
194+
default:
195+
buf = new byte[FIRST_LOOK_WIDTH];
196+
nextra = 0;
197+
noff = 0;
198+
break;
199+
}
200+
201+
int nread = in.read(buf);
202+
for (int i = 0; i + nextra < nread; i += 1 + nextra) {
203+
if (nextra > 0) {
204+
if (buf[i + noff] == '\n' && buf[i + 1 - noff] == '\0') {
205+
return true;
206+
}
207+
} else {
208+
if (buf[i] == '\n') return true;
209+
}
210+
}
211+
return false;
212+
}
213+
}

0 commit comments

Comments
 (0)