Skip to content

Commit fa18568

Browse files
committed
Add MandocAnalyzer and OPENGROK_MANDOC setting
Also: - add DocumentMatcher to be used by TroffAnalyzerFactory and MandocAnalyzerFactory.
1 parent 9801824 commit fa18568

23 files changed

+1378
-10
lines changed

LICENSE-mandoc.txt

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
mandoc.css is included in OpenGrok under the following license:
2+
3+
$Id: LICENSE,v 1.17 2017/06/23 15:58:14 schwarze Exp $
4+
5+
With the exceptions noted below, all code and documentation
6+
contained in the mandoc toolkit is protected by the Copyright
7+
of the following developers:
8+
9+
Copyright (c) 2008-2012, 2014 Kristaps Dzonsons <[email protected]>
10+
Copyright (c) 2010-2017 Ingo Schwarze <[email protected]>
11+
Copyright (c) 2009, 2010, 2011, 2012 Joerg Sonnenberger <[email protected]>
12+
Copyright (c) 2013 Franco Fichtner <[email protected]>
13+
Copyright (c) 2014 Baptiste Daroussin <[email protected]>
14+
Copyright (c) 2016 Ed Maste <[email protected]>
15+
Copyright (c) 2017 Michael Stapelberg <[email protected]>
16+
Copyright (c) 1999, 2004 Marc Espie <[email protected]>
17+
Copyright (c) 1998, 2004, 2010 Todd C. Miller <[email protected]>
18+
Copyright (c) 2008, 2017 Otto Moerbeek <[email protected]>
19+
Copyright (c) 2004 Ted Unangst <[email protected]>
20+
Copyright (c) 1994 Christos Zoulas <[email protected]>
21+
Copyright (c) 2003, 2007, 2008, 2014 Jason McIntyre <[email protected]>
22+
23+
See the individual source files for information about who contributed
24+
to which file during which years.
25+
26+
27+
The mandoc distribution as a whole is distributed by its developers
28+
under the following license:
29+
30+
Permission to use, copy, modify, and distribute this software for any
31+
purpose with or without fee is hereby granted, provided that the above
32+
copyright notice and this permission notice appear in all copies.
33+
34+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
35+
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
36+
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
37+
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
38+
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
39+
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
40+
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
41+
42+
43+
The following files included from outside sources are protected by
44+
other people's Copyright and are distributed under various 2-clause
45+
and 3-clause BSD licenses; see these individual files for details.
46+
47+
soelim.c, soelim.1:
48+
Copyright (c) 2014 Baptiste Daroussin <[email protected]>
49+
50+
compat_err.c, compat_fts.c, compat_fts.h,
51+
compat_getsubopt.c, compat_strcasestr.c, compat_strsep.c,
52+
man.1:
53+
Copyright (c) 1989,1990,1993,1994 The Regents of the University of California
54+
55+
compat_stringlist.c, compat_stringlist.h:
56+
Copyright (c) 1994 Christos Zoulas <[email protected]>

OpenGrok

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
# - OPENGROK_CTAGS_OPTIONS_FILE Full path to file with extra command line
4242
# options for CTags program (for its --options
4343
# switch), default is DATA_ROOT/etc/ctags.config
44+
# - OPENGROK_MANDOC Full path to mandoc(1) binary
4445
# - JAVA_HOME Full Path to Java Installation Root
4546
# - JAVA Full Path to java binary (to enable 64bit JDK)
4647
# - JAVA_OPTS Java options (e.g. for JVM memory increase
@@ -853,6 +854,7 @@ CommonInvocation()
853854
${SCAN_DEPTH} \
854855
${PROGRESS} \
855856
${OPENGROK_CTAGS:+-c} ${OPENGROK_CTAGS} \
857+
${OPENGROK_MANDOC:+--mandoc} ${OPENGROK_MANDOC} \
856858
${CTAGS_OPTIONS_FILE:+-o} ${CTAGS_OPTIONS_FILE} \
857859
${OPENGROK_FLUSH_RAM_BUFFER_SIZE} ${SKIN} ${LEADING_WILDCARD} \
858860
${READ_XML_CONF} \

build.xml

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ information: Portions Copyright [yyyy] [name of copyright owner]
1919
CDDL HEADER END
2020
2121
Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved.
22+
Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
2223
2324
-->
2425
<project name="OpenGrok" default="jar" basedir="." xmlns:jacoco="antlib:org.jacoco.ant"
@@ -320,6 +321,7 @@ Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved.
320321

321322
<run-jflex dir="${gen.analysis.dir}/sql" name="SQLXref"/>
322323
<run-jflex dir="${gen.analysis.dir}/sql" name="PLSQLXref"/>
324+
<run-jflex dir="${gen.analysis.dir}/document" name="MandocXref"/>
323325
<run-jflex dir="${gen.analysis.dir}/document" name="TroffXref"/>
324326
<run-jflex dir="${gen.analysis.dir}/document" name="TroffFullTokenizer"/>
325327
<run-jflex dir="${gen.analysis.dir}/sh" name="ShSymbolTokenizer"/>

opengrok-indexer/pom.xml

+7
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ Portions Copyright (c) 2017, Chris Fraire <[email protected]>.
4242
<testSourceDirectory>../test</testSourceDirectory>
4343

4444
<resources>
45+
<resource>
46+
<targetPath>org/opensolaris/opengrok/analysis/document/</targetPath>
47+
<directory>../src/org/opensolaris/opengrok/analysis/document/</directory>
48+
<includes>
49+
<include>*.1m</include>
50+
</includes>
51+
</resource>
4552
<resource>
4653
<targetPath>org/opensolaris/opengrok/analysis/sql/</targetPath>
4754
<directory>../src/org/opensolaris/opengrok/analysis/sql/</directory>

src/org/opensolaris/opengrok/analysis/AnalyzerGuru.java

+7-2
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
import org.opensolaris.opengrok.analysis.csharp.CSharpAnalyzerFactory;
6464
import org.opensolaris.opengrok.analysis.data.IgnorantAnalyzerFactory;
6565
import org.opensolaris.opengrok.analysis.data.ImageAnalyzerFactory;
66+
import org.opensolaris.opengrok.analysis.document.MandocAnalyzerFactory;
6667
import org.opensolaris.opengrok.analysis.document.TroffAnalyzerFactory;
6768
import org.opensolaris.opengrok.analysis.erlang.ErlangAnalyzerFactory;
6869
import org.opensolaris.opengrok.analysis.executables.ELFAnalyzerFactory;
@@ -198,15 +199,19 @@ public class AnalyzerGuru {
198199
private static final Map<String, String> fileTypeDescriptions = new TreeMap<>();
199200

200201
/*
201-
* If you write your own analyzer please register it here
202+
* If you write your own analyzer please register it here. The order is
203+
* important for any factory that uses a FileAnalyzerFactory.Matcher
204+
* implementation, as those are run in the same order as defined below --
205+
* though precise Matchers are run before imprecise ones.
202206
*/
203207
static {
204208
FileAnalyzerFactory[] analyzers = {
205209
DEFAULT_ANALYZER_FACTORY,
206210
new IgnorantAnalyzerFactory(),
207211
new BZip2AnalyzerFactory(),
208212
new XMLAnalyzerFactory(),
209-
new TroffAnalyzerFactory(),
213+
MandocAnalyzerFactory.DEFAULT_INSTANCE,
214+
TroffAnalyzerFactory.DEFAULT_INSTANCE,
210215
new ELFAnalyzerFactory(),
211216
new JavaClassAnalyzerFactory(),
212217
new ImageAnalyzerFactory(),

src/org/opensolaris/opengrok/analysis/FileAnalyzerFactory.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -216,10 +216,11 @@ protected FileAnalyzer newAnalyzer() {
216216
/**
217217
* Interface for matchers which map file contents to analyzer factories.
218218
*/
219-
protected interface Matcher {
219+
public interface Matcher {
220220

221221
/**
222222
* Get a value indicating if the magic is byte-precise.
223+
* @return true if precise
223224
*/
224225
default boolean getIsPreciseMagic() { return false; }
225226

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2017, Chris Fraire <[email protected]>.
22+
*/
23+
package org.opensolaris.opengrok.analysis.document;
24+
25+
import java.io.BufferedReader;
26+
import java.io.IOException;
27+
import java.io.InputStream;
28+
import java.io.InputStreamReader;
29+
import java.util.Arrays;
30+
import org.opensolaris.opengrok.analysis.FileAnalyzerFactory;
31+
import org.opensolaris.opengrok.analysis.FileAnalyzerFactory.Matcher;
32+
import org.opensolaris.opengrok.util.IOUtils;
33+
34+
/**
35+
* Represents an implementation of {@link Matcher} that detects a troff-
36+
* or mandoc-like document
37+
*/
38+
public class DocumentMatcher implements Matcher {
39+
40+
/**
41+
* Set to 512K {@code int}, but {@code NUMCHARS_FIRST_LOOK} and
42+
* {@code LINE_LIMIT} should apply beforehand
43+
*/
44+
private static final int MARK_READ_LIMIT = 1024 * 512;
45+
46+
private static final int LINE_LIMIT = 100;
47+
48+
private static final int FIRST_LOOK_WIDTH = 300;
49+
50+
private static final int FIRST_CONTENT_WIDTH = 8;
51+
52+
private final FileAnalyzerFactory factory;
53+
54+
private final String[] lineStarters;
55+
56+
/**
57+
* Initializes an instance for the required parameters
58+
* @param factory required factory to return when matched
59+
* @param lineStarters required list of line starters that indicate a match
60+
* @throws IllegalArgumentException if any parameter is null
61+
*/
62+
public DocumentMatcher(FileAnalyzerFactory factory, String[] lineStarters) {
63+
if (factory == null) {
64+
throw new IllegalArgumentException("`factory' is null");
65+
}
66+
if (lineStarters == null) {
67+
throw new IllegalArgumentException("`lineStarters' is null");
68+
}
69+
if (lineStarters.length < 1) {
70+
throw new IllegalArgumentException("`lineStarters' is empty");
71+
}
72+
73+
String[] copyOf = Arrays.copyOf(lineStarters, lineStarters.length);
74+
for (String elem : copyOf) {
75+
if (elem == null) {
76+
throw new IllegalArgumentException(
77+
"`lineStarters' has null element");
78+
}
79+
}
80+
81+
this.factory = factory;
82+
this.lineStarters = copyOf;
83+
}
84+
85+
/**
86+
* Try to match the file contents by first affirming the document starts
87+
* with "." or "'" and then looks for {@code lineStarters} in the first
88+
* 100 lines.
89+
* <p>
90+
* The stream is reset before returning.
91+
*
92+
* @param contents the first few bytes of a file
93+
* @param in the input stream from which the full file can be read
94+
* @return an analyzer factory if the contents match, or {@code null}
95+
* otherwise
96+
* @throws IOException in case of any read error
97+
*/
98+
@Override
99+
public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
100+
throws IOException {
101+
102+
if (!in.markSupported()) return null;
103+
in.mark(MARK_READ_LIMIT);
104+
105+
int bomLength = 0;
106+
String encoding = IOUtils.findBOMEncoding(contents);
107+
if (encoding == null) {
108+
encoding = "UTF-8";
109+
} else {
110+
bomLength = IOUtils.skipForBOM(contents);
111+
if (in.skip(bomLength) != bomLength) {
112+
in.reset();
113+
return null;
114+
}
115+
}
116+
117+
BufferedReader rdr = new BufferedReader(new InputStreamReader(
118+
in, encoding));
119+
120+
// Before reading a line, read some characters for a first look
121+
char[] buf = new char[FIRST_LOOK_WIDTH];
122+
int lenFirstLook;
123+
if ((lenFirstLook = rdr.read(buf)) < 1) {
124+
in.reset();
125+
return null;
126+
}
127+
128+
// Require a "." or "'" as the first non-whitespace character after
129+
// only a limited number of whitespaces or else infer it is not troff
130+
// or mandoc.
131+
int actualFirstContentWidth = lenFirstLook < FIRST_CONTENT_WIDTH ?
132+
lenFirstLook : FIRST_CONTENT_WIDTH;
133+
boolean foundContent = false;
134+
for (int i = 0; i < actualFirstContentWidth; ++i) {
135+
if (buf[i] == '.' || buf[i] == '\'') {
136+
foundContent = true;
137+
break;
138+
} else if (!Character.isWhitespace(buf[i])) {
139+
in.reset();
140+
return null;
141+
}
142+
}
143+
if (!foundContent) {
144+
in.reset();
145+
return null;
146+
}
147+
148+
// affirm that a LF is seen in the first look or else quickly
149+
// infer it is not troff
150+
boolean foundLF = false;
151+
for (int i = 0; i < lenFirstLook; ++i) {
152+
if (buf[i] == '\n') {
153+
foundLF = true;
154+
break;
155+
}
156+
}
157+
if (!foundLF) {
158+
in.reset();
159+
return null;
160+
}
161+
162+
// reset for line-by-line reading below
163+
in.reset();
164+
if (bomLength > 0) in.skip(bomLength);
165+
rdr = new BufferedReader(new InputStreamReader(in, encoding));
166+
167+
int numLines = 0;
168+
String line;
169+
while ((line = rdr.readLine()) != null) {
170+
for (int i = 0; i < lineStarters.length; ++i) {
171+
if (line.startsWith(lineStarters[i])) {
172+
in.reset();
173+
return factory;
174+
}
175+
}
176+
if (++numLines >= LINE_LIMIT) {
177+
in.reset();
178+
return null;
179+
}
180+
}
181+
182+
in.reset();
183+
return null;
184+
}
185+
}

0 commit comments

Comments
 (0)