Skip to content

Commit d36e247

Browse files
Merge upstream branch 'master' from https://github.com/iipc/webarchive-commons
- updates version number to 1.1.11-SNAPSHOT - keep commons-io version at 2.11.0 to stay in sync with version Hadoop depends on
2 parents 5cfb65d + 76d95cc commit d36e247

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+98
-163
lines changed

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
<groupId>org.commoncrawl</groupId>
1111
<artifactId>ia-web-commons</artifactId>
12-
<version>1.1.10-SNAPSHOT</version>
12+
<version>1.1.11-SNAPSHOT</version>
1313
<packaging>jar</packaging>
1414

1515
<name>ia-web-commons</name>
@@ -137,7 +137,7 @@
137137
<dependency>
138138
<groupId>commons-io</groupId>
139139
<artifactId>commons-io</artifactId>
140-
<version>2.8.0</version>
140+
<version>2.11.0</version>
141141
</dependency>
142142

143143
<dependency>

src/main/java/org/archive/format/gzip/GZIPStaticHeader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* @author Brad
1414
*
1515
* +---+---+---+---+---+---+---+---+---+---+
16-
* |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
16+
* |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more--&gt;)
1717
* +---+---+---+---+---+---+---+---+---+---+
1818
*/
1919
public class GZIPStaticHeader implements GZIPConstants {

src/main/java/org/archive/format/gzip/zipnum/MultiBlockIterator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ public class MultiBlockIterator extends AbstractPeekableIterator<String> {
1818
private CloseableIterator<CloseableIterator<String>> blockItr = null;
1919

2020
/**
21-
* @param blocks which should be fetched and unzipped, one after another
21+
* @param blockItr blocks which should be fetched and unzipped, one after another
2222
*/
2323
public MultiBlockIterator(CloseableIterator<CloseableIterator<String>> blockItr) {
2424
this.blockItr = blockItr;

src/main/java/org/archive/format/text/charset/CharsetDetector.java

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,6 @@ protected String contentTypeToCharset(final String contentType) {
139139
* Attempt to divine the character encoding of the document from the
140140
* Content-Type HTTP header (with a "charset=")
141141
*
142-
* @param resource
143142
* @return String character set found or null if the header was not present
144143
* @throws IOException
145144
*/
@@ -161,7 +160,6 @@ protected String getCharsetFromHeaders(HttpHeaders headers)
161160
* Attempt to find a META tag in the HTML that hints at the character set
162161
* used to write the document.
163162
*
164-
* @param resource
165163
* @return String character set found from META tags in the HTML
166164
* @throws IOException
167165
*/
@@ -224,9 +222,7 @@ public static String findMetaContentType(String pageSample) {
224222
* Attempts to figure out the character set of the document using
225223
* the excellent juniversalchardet library.
226224
*
227-
* @param resource
228225
* @return String character encoding found, or null if nothing looked good.
229-
* @throws IOException
230226
*/
231227
protected String getCharsetFromBytes(byte buffer[], int len)
232228
throws IOException {
@@ -242,9 +238,6 @@ protected String getCharsetFromBytes(byte buffer[], int len)
242238
return null;
243239
}
244240
/**
245-
* @param resource (presumably text) Resource to determine the charset
246-
* @param request WaybackRequest which may contain additional hints to
247-
* processing
248241
* @return String charset name for the Resource
249242
* @throws IOException if there are problems reading the Resource
250243
*/

src/main/java/org/archive/format/warc/WARCConstants.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
/**
2525
* WARC Constants used by WARC readers and writers.
2626
*
27-
* @contributor stack
27+
* @author stack
2828
*/
2929
public interface WARCConstants extends ArchiveFileConstants {
3030
/**
@@ -175,8 +175,8 @@ enum WARCRecordType {
175175
/**
176176
* These fields help a consumer of the warc to locate the warc record that
177177
* {@value #HEADER_KEY_REFERS_TO} refers to.
178-
*
179-
* @see WARCWriterProcessor
178+
* <p>
179+
* See WARCWriterProcessor
180180
*/
181181
public static final String HEADER_KEY_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI";
182182
public static final String HEADER_KEY_REFERS_TO_DATE = "WARC-Refers-To-Date";

src/main/java/org/archive/httpclient/package.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,16 @@
77
apache <a href="http://jakarta.apache.org/commons/httpclient/">jakarta
88
commons httpclient</a>.
99

10-
<h3>HttpRecorderGetMethod</h3>
10+
<h2>HttpRecorderGetMethod</h2>
1111
<p>Class that marks the passed HttpRecorder w/ boundary between
1212
HTTP header and content. Also forces a close on the response on
1313
call to releaseConnection.</p>
1414

15-
<h3>ConfigurableTrustManagerProtocolSocketFactory</h3>
15+
<h2>ConfigurableTrustManagerProtocolSocketFactory</h2>
1616
<p>A protocol socket factory that allows setting of trust level on
1717
construction.</p>
1818

19-
<h3>References</h3>
19+
<h2>References</h2>
2020
<p><a
2121
href="http://java.sun.com/j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html">JavaTM Secure Socket Extension (JSSE): Reference Guide</a></p>
2222

src/main/java/org/archive/io/ArchiveReader.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -711,9 +711,7 @@ public boolean outputRecord(final String format)
711711

712712
/**
713713
* Dump this file on STDOUT
714-
* @throws compress True if dumped output is compressed.
715-
* @throws IOException
716-
* @throws java.text.ParseException
714+
* @param compress True if dumped output is compressed.
717715
*/
718716
public abstract void dump(final boolean compress)
719717
throws IOException, java.text.ParseException;

src/main/java/org/archive/io/GenericReplayCharSequence.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,11 @@
4545

4646
/**
4747
* (Replay)CharSequence view on recorded streams.
48+
*
49+
* <p>Call {@link #close()} on this class when done to clean up resources.
4850
*
49-
* For small streams, use {@link InMemoryReplayCharSequence}.
50-
*
51-
* <p>Call {@link close()} on this class when done to clean up resources.
52-
*
53-
* @contributor stack
54-
* @contributor nlevitt
51+
* @author stack
52+
* @author nlevitt
5553
* @version $Revision$, $Date$
5654
*/
5755
public class GenericReplayCharSequence implements ReplayCharSequence {
@@ -67,7 +65,7 @@ public class GenericReplayCharSequence implements ReplayCharSequence {
6765
* decodings. The name of the file that holds the decoding is the name
6866
* of the backing file w/ this encoding for a suffix.
6967
*
70-
* <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
68+
* <p>See <a href="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
7169
*/
7270
public static final Charset WRITE_ENCODING = Charsets.UTF_16BE;
7371

src/main/java/org/archive/io/MiserOutputStream.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
* A filter stream that both counts bytes written, and optionally swallows
2828
* flush() requests.
2929
*
30-
* @contributor gojomo
30+
* @author gojomo
3131
*/
3232
public class MiserOutputStream extends FilterOutputStream {
3333
protected long count;

src/main/java/org/archive/io/Preformatter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* Interface indicating a logging Formatter can preformat a record (outside
2525
* the standard-implementation synchronized block) and cache it, returning it
2626
* for the next request for formatting from the same thread.
27-
* @contributor gojomo
27+
* @author gojomo
2828
*/
2929
public interface Preformatter {
3030
public void preformat(LogRecord record);

src/main/java/org/archive/io/RecordingInputStream.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ public void readToEndOfContent(long contentLength)
225225

226226
/**
227227
* Read all of a stream (Or read until we timeout or have read to the max).
228-
* @param softMaxLength Maximum length to read; if zero or < 0, then no
228+
* @param softMaxLength Maximum length to read; if zero or &lt; 0, then no
229229
* limit. If met, return normally.
230230
* @throws IOException failed read.
231231
* @throws RecorderLengthExceededException

src/main/java/org/archive/io/ReplayCharSequence.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
5959
public long getDecodeExceptionCount();
6060

6161
/**
62-
* Return the first coding-exception encountered, if the count > 0.
62+
* Return the first coding-exception encountered, if the count &gt; 0.
6363
* @return CharacterCodingException
6464
*/
6565
public CharacterCodingException getCodingException();

src/main/java/org/archive/io/ReplayInputStream.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public class ReplayInputStream extends SeekInputStream
6464
* @param size Size of data to replay.
6565
* @param responseBodyStart Start of the response body.
6666
* @param backingFilename Backing file that sits behind the buffer. If
67-
* <code>size<code> > than buffer then we go to backing file to read
67+
* <code>size</code> is greater than the buffer length then we go to backing file to read
6868
* data that is beyond buffer.length.
6969
*
7070
* @throws IOException If we fail to open an input stream on
@@ -84,7 +84,7 @@ public ReplayInputStream(byte[] buffer, long size, long responseBodyStart,
8484
* @param buffer Buffer to read from.
8585
* @param size Size of data to replay.
8686
* @param backingFilename Backing file that sits behind the buffer. If
87-
* <code>size<code> > than buffer then we go to backing file to read
87+
* <code>size</code> is greater than the buffer length then we go to backing file to read
8888
* data that is beyond buffer.length.
8989
* @throws IOException If we fail to open an input stream on
9090
* backing file.
@@ -130,7 +130,7 @@ public ReplayInputStream(InputStream fillStream) throws IOException {
130130
}
131131

132132
/**
133-
* Close & destroy any internally-generated temporary files.
133+
* Close &amp; destroy any internally-generated temporary files.
134134
*/
135135
public void destroy() {
136136
IOUtils.closeQuietly(this);

src/main/java/org/archive/io/RepositionableInputStream.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
* stream. Uses a {@link BufferedInputStream}. Calls mark on every read so
3030
* we'll remember at least the last thing read (You can only backup on the
3131
* last thing read -- not last 2 or 3 things read). Used by
32-
* {@link GzippedInputStream} when reading streams over a network. Wraps a
32+
* GzippedInputStream when reading streams over a network. Wraps a
3333
* HTTP, etc., stream so we can back it up if needs be after the
3434
* GZIP inflater has done a fill of its full buffer though it only needed
3535
* the first few bytes to finish decompressing the current GZIP member.

src/main/java/org/archive/io/WriterPool.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,7 @@ public abstract class WriterPool {
8888
/**
8989
* Constructor
9090
* @param serial Used to generate unique filename sequences
91-
* @param factory Factory that knows how to make a {@link WriterPoolMember}.
9291
* @param settings Settings for this pool.
93-
* @param poolMaximumActive
94-
* @param poolMaximumWait
9592
*/
9693
public WriterPool(final AtomicInteger serial,
9794
final WriterPoolSettings settings,

src/main/java/org/archive/io/WriterPoolMember.java

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,6 @@ public abstract class WriterPoolMember implements ArchiveFileConstants {
125125
* @param serialNo used to create unique filename sequences
126126
* @param out Where to write.
127127
* @param file File the <code>out</code> is connected to.
128-
* @param cmprs Compress the content written.
129-
* @param a14DigitDate If null, we'll write current time.
130-
* @throws IOException
131128
*/
132129
protected WriterPoolMember(AtomicInteger serialNo,
133130
final OutputStream out, final File file,
@@ -145,11 +142,6 @@ protected WriterPoolMember(AtomicInteger serialNo,
145142
* Constructor.
146143
*
147144
* @param serialNo used to create unique filename sequences
148-
* @param dirs Where to drop files.
149-
* @param prefix File prefix to use.
150-
* @param cmprs Compress the records written.
151-
* @param maxSize Maximum size for ARC files written.
152-
* @param template filenaming template to use
153145
* @param extension Extension to give file.
154146
*/
155147
public WriterPoolMember(AtomicInteger serialNo,
@@ -361,7 +353,6 @@ protected void postWriteRecordTasks()
361353
* Position in raw output (typically, physical file).
362354
* Used making accounting of bytes written.
363355
* @return Position in final media (assuming all flushing completes)
364-
* @throws IOException
365356
*/
366357
public long getPosition() {
367358
return (countOut==null)? 0L : this.countOut.getCount();

src/main/java/org/archive/io/arc/ARCRecord.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,12 +190,10 @@ public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
190190
* formatted).
191191
* @param parseHttpHeaders True if we are to parse HTTP headers. Costs
192192
* about ~20% of CPU during an ARC parse.
193-
* @param isAllignedOnFirstRecord True if this is the first record to be
193+
* @param isAlignedOnFirstRecord True if this is the first record to be
194194
* read from an archive
195-
* @param String version Version information to be returned to the
195+
* @param version Version information to be returned to the
196196
* ARCReader constructing this record
197-
*
198-
* @throws IOException
199197
*/
200198
public ARCRecord(InputStream in, final String identifier,
201199
final long offset, boolean digest, boolean strict,

src/main/java/org/archive/io/arc/ARCWriter.java

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@
8686
* write our own GZIP*Streams, ones that are resettable and conscious of gzip
8787
* members.
8888
*
89-
* <p>This class will write until we hit >= maxSize. The check is done at
89+
* <p>This class will write until we hit &gt;= maxSize. The check is done at
9090
* record boundary. Records do not span ARC files. We will then close current
9191
* file and open another and then continue writing.
9292
*
@@ -95,9 +95,9 @@
9595
* <a href="http://www.archive.org/web/researcher/tool_documentation.php">alexa
9696
* ARC c-tools</a>:
9797
* <pre>
98-
* % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
98+
* % av_procarc hx20040109230030-0.arc.gz | av_ziparc &gt; \
9999
* /tmp/hx20040109230030-0.dat.gz
100-
* % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
100+
* % av_ripdat /tmp/hx20040109230030-0.dat.gz &gt; /tmp/hx20040109230030-0.cdx
101101
* </pre>
102102
* Examine the produced cdx file to make sure it makes sense. Search
103103
* for 'no-type 0'. If found, then we're opening a gzip record w/o data to
@@ -129,12 +129,7 @@ public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeab
129129
* @param serialNo used to generate unique file name sequences
130130
* @param out Where to write.
131131
* @param arc File the <code>out</code> is connected to.
132-
* @param cmprs Compress the content written.
133-
* @param metadata File meta data. Can be null. Is list of File and/or
134-
* String objects.
135-
* @param a14DigitDate If null, we'll write current time.
136-
* @throws IOException
137-
*/
132+
*/
138133
public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
139134
final File arc, final WriterPoolSettings settings)
140135
throws IOException {

src/main/java/org/archive/io/warc/WARCWriter.java

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
*
5454
* <p>While being written, WARCs have a '.open' suffix appended.
5555
*
56-
* @contributor stack
56+
* @author stack
5757
* @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
5858
*/
5959
public class WARCWriter extends WriterPoolMember
@@ -81,7 +81,7 @@ public class WARCWriter extends WriterPoolMember
8181

8282
/**
8383
* Temporarily accumulates stats managed externally by
84-
* {@link WARCWriterProcessor}. WARCWriterProcessor will call
84+
* WARCWriterProcessor. WARCWriterProcessor will call
8585
* {@link #resetTmpStats()}, write some records, then add
8686
* {@link #getTmpStats()} into its long-term running totals.
8787
*/
@@ -97,9 +97,6 @@ public class WARCWriter extends WriterPoolMember
9797
* @param serialNo used to generate unique file name sequences
9898
* @param out Where to write.
9999
* @param f File the <code>out</code> is connected to.
100-
* @param cmprs Compress the content written.
101-
* @param a14DigitDate If null, we'll write current time.
102-
* @throws IOException
103100
*/
104101
public WARCWriter(final AtomicInteger serialNo,
105102
final OutputStream out, final File f,
@@ -110,13 +107,6 @@ public WARCWriter(final AtomicInteger serialNo,
110107

111108
/**
112109
* Constructor.
113-
*
114-
* @param dirs Where to drop files.
115-
* @param prefix File prefix to use.
116-
* @param cmprs Compress the records written.
117-
* @param maxSize Maximum size for ARC files written.
118-
* @param suffix File tail to use. If null, unused.
119-
* @param warcinfoData File metadata for warcinfo record.
120110
*/
121111
public WARCWriter(final AtomicInteger serialNo,
122112
final WARCWriterPoolSettings settings) {

src/main/java/org/archive/io/warc/WARCWriterPool.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626

2727
/**
2828
* A pool of WARCWriters.
29-
* @contributor stack
30-
* @contributor gojomo
29+
* @author stack
30+
* @author gojomo
3131
* @version $Revision: 4566 $ $Date: 2006-08-31 09:51:41 -0700 (Thu, 31 Aug 2006) $
3232
*/
3333
public class WARCWriterPool extends WriterPool {

src/main/java/org/archive/io/warc/package.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
<h2>Implementation Notes</h2>
1717
<h3>Tools</h3>
1818
<p>Initial implementations of <code>Arc2Warc</code> and <code>Warc2Arc</code>
19-
tools can be found in the package above this one, at
20-
{@link org.archive.io.Arc2Warc} and {@link org.archive.io.Warc2Arc}
19+
tools can be found in Heritrix, at
20+
org.archive.io.Arc2Warc and org.archive.io.Warc2Arc
2121
respectively. Pass <code>--help</code> to learn how to use each tool.
2222
</p>
2323

src/main/java/org/archive/net/PublicSuffixes.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ public class PublicSuffixes {
7474
* prefix tree node. each Node represents sequence of letters (prefix)
7575
* and alternative sequences following it (list of Node's). Nodes in
7676
* {@code branches} are sorted for skip list like lookup and for generating
77-
* effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char).)
77+
* effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char)}).
7878
*
7979
* as it is intended for internal use only, there are no access methods. procedures for updating
8080
* prefix tree with new input are defined within this class ({@link #addBranch(CharSequence)}).

src/main/java/org/archive/resource/ResourceFactory.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,6 @@ public interface ResourceFactory {
1212

1313
/**
1414
* Attempts to create a Resource from the InputStream
15-
* @param is
16-
* @param metaData
17-
* @param container
18-
* @return
19-
* @throws ResourceParseException
20-
* @throws IOException
2115
*/
2216
public Resource getResource(InputStream is, MetaData parentMetaData,
2317
ResourceContainer container)

0 commit comments

Comments
 (0)