Skip to content

Commit

Permalink
Merge upstream branch 'master' from https://github.com/iipc/webarchiv…
Browse files Browse the repository at this point in the history
…e-commons

- updates version number to 1.1.11-SNAPSHOT
- keep commons-io version at 2.11.0 to stay in sync with version
  Hadoop depends on
  • Loading branch information
sebastian-nagel committed Nov 25, 2024
2 parents 5cfb65d + 76d95cc commit d36e247
Show file tree
Hide file tree
Showing 50 changed files with 98 additions and 163 deletions.
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

<groupId>org.commoncrawl</groupId>
<artifactId>ia-web-commons</artifactId>
<version>1.1.10-SNAPSHOT</version>
<version>1.1.11-SNAPSHOT</version>
<packaging>jar</packaging>

<name>ia-web-commons</name>
Expand Down Expand Up @@ -137,7 +137,7 @@
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.8.0</version>
<version>2.11.0</version>
</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* @author Brad
*
* +---+---+---+---+---+---+---+---+---+---+
* |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
* |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more--&gt;)
* +---+---+---+---+---+---+---+---+---+---+
*/
public class GZIPStaticHeader implements GZIPConstants {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class MultiBlockIterator extends AbstractPeekableIterator<String> {
private CloseableIterator<CloseableIterator<String>> blockItr = null;

/**
* @param blocks which should be fetched and unzipped, one after another
* @param blockItr blocks which should be fetched and unzipped, one after another
*/
public MultiBlockIterator(CloseableIterator<CloseableIterator<String>> blockItr) {
this.blockItr = blockItr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@ protected String contentTypeToCharset(final String contentType) {
* Attempt to divine the character encoding of the document from the
* Content-Type HTTP header (with a "charset=")
*
* @param resource
* @return String character set found or null if the header was not present
* @throws IOException
*/
Expand All @@ -161,7 +160,6 @@ protected String getCharsetFromHeaders(HttpHeaders headers)
* Attempt to find a META tag in the HTML that hints at the character set
* used to write the document.
*
* @param resource
* @return String character set found from META tags in the HTML
* @throws IOException
*/
Expand Down Expand Up @@ -224,9 +222,7 @@ public static String findMetaContentType(String pageSample) {
* Attempts to figure out the character set of the document using
* the excellent juniversalchardet library.
*
* @param resource
* @return String character encoding found, or null if nothing looked good.
* @throws IOException
*/
protected String getCharsetFromBytes(byte buffer[], int len)
throws IOException {
Expand All @@ -242,9 +238,6 @@ protected String getCharsetFromBytes(byte buffer[], int len)
return null;
}
/**
* @param resource (presumably text) Resource to determine the charset
* @param request WaybackRequest which may contain additional hints to
* processing
* @return String charset name for the Resource
* @throws IOException if there are problems reading the Resource
*/
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/archive/format/warc/WARCConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
/**
* WARC Constants used by WARC readers and writers.
*
* @contributor stack
* @author stack
*/
public interface WARCConstants extends ArchiveFileConstants {
/**
Expand Down Expand Up @@ -175,8 +175,8 @@ enum WARCRecordType {
/**
* These fields help a consumer of the warc to locate the warc record that
* {@value #HEADER_KEY_REFERS_TO} refers to.
*
* @see WARCWriterProcessor
* <p>
* See WARCWriterProcessor
*/
public static final String HEADER_KEY_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI";
public static final String HEADER_KEY_REFERS_TO_DATE = "WARC-Refers-To-Date";
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/archive/httpclient/package.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@
apache <a href="http://jakarta.apache.org/commons/httpclient/">jakarta
commons httpclient</a>.

<h3>HttpRecorderGetMethod</h3>
<h2>HttpRecorderGetMethod</h2>
<p>Class that marks the passed HttpRecorder w/ boundary between
HTTP header and content. Also forces a close on the response on
call to releaseConnection.</p>

<h3>ConfigurableTrustManagerProtocolSocketFactory</h3>
<h2>ConfigurableTrustManagerProtocolSocketFactory</h2>
<p>A protocol socket factory that allows setting of trust level on
construction.</p>

<h3>References</h3>
<h2>References</h2>
<p><a
href="http://java.sun.com/j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html">JavaTM Secure Socket Extension (JSSE): Reference Guide</a></p>

Expand Down
4 changes: 1 addition & 3 deletions src/main/java/org/archive/io/ArchiveReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -711,9 +711,7 @@ public boolean outputRecord(final String format)

/**
* Dump this file on STDOUT
* @throws compress True if dumped output is compressed.
* @throws IOException
* @throws java.text.ParseException
* @param compress True if dumped output is compressed.
*/
public abstract void dump(final boolean compress)
throws IOException, java.text.ParseException;
Expand Down
12 changes: 5 additions & 7 deletions src/main/java/org/archive/io/GenericReplayCharSequence.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,11 @@

/**
* (Replay)CharSequence view on recorded streams.
**
* <p>Call {@link #close()} on this class when done to clean up resources.
*
* For small streams, use {@link InMemoryReplayCharSequence}.
*
* <p>Call {@link close()} on this class when done to clean up resources.
*
* @contributor stack
* @contributor nlevitt
* @author stack
* @author nlevitt
* @version $Revision$, $Date$
*/
public class GenericReplayCharSequence implements ReplayCharSequence {
Expand All @@ -67,7 +65,7 @@ public class GenericReplayCharSequence implements ReplayCharSequence {
* decodings. The name of the file that holds the decoding is the name
* of the backing file w/ this encoding for a suffix.
*
* <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
* <p>See <a href="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
*/
public static final Charset WRITE_ENCODING = Charsets.UTF_16BE;

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/archive/io/MiserOutputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
* A filter stream that both counts bytes written, and optionally swallows
* flush() requests.
*
* @contributor gojomo
* @author gojomo
*/
public class MiserOutputStream extends FilterOutputStream {
protected long count;
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/archive/io/Preformatter.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
* Interface indicating a logging Formatter can preformat a record (outside
* the standard-implementation synchronized block) and cache it, returning it
* for the next request for formatting from the same thread.
* @contributor gojomo
* @author gojomo
*/
public interface Preformatter {
public void preformat(LogRecord record);
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/archive/io/RecordingInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ public void readToEndOfContent(long contentLength)

/**
* Read all of a stream (Or read until we timeout or have read to the max).
* @param softMaxLength Maximum length to read; if zero or < 0, then no
* @param softMaxLength Maximum length to read; if zero or &lt; 0, then no
* limit. If met, return normally.
* @throws IOException failed read.
* @throws RecorderLengthExceededException
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/archive/io/ReplayCharSequence.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public interface ReplayCharSequence extends CharSequence, Closeable {
public long getDecodeExceptionCount();

/**
* Return the first coding-exception encountered, if the count > 0.
* Return the first coding-exception encountered, if the count &gt; 0.
* @return CharacterCodingException
*/
public CharacterCodingException getCodingException();
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/archive/io/ReplayInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public class ReplayInputStream extends SeekInputStream
* @param size Size of data to replay.
* @param responseBodyStart Start of the response body.
* @param backingFilename Backing file that sits behind the buffer. If
* <code>size<code> > than buffer then we go to backing file to read
* <code>size</code> &gt; than buffer then we go to backing file to read
* data that is beyond buffer.length.
*
* @throws IOException If we fail to open an input stream on
Expand All @@ -84,7 +84,7 @@ public ReplayInputStream(byte[] buffer, long size, long responseBodyStart,
* @param buffer Buffer to read from.
* @param size Size of data to replay.
* @param backingFilename Backing file that sits behind the buffer. If
* <code>size<code> > than buffer then we go to backing file to read
* <code>size</code> &gt; than buffer then we go to backing file to read
* data that is beyond buffer.length.
* @throws IOException If we fail to open an input stream on
* backing file.
Expand Down Expand Up @@ -130,7 +130,7 @@ public ReplayInputStream(InputStream fillStream) throws IOException {
}

/**
* Close & destroy any internally-generated temporary files.
* Close &amp; destroy any internally-generated temporary files.
*/
public void destroy() {
IOUtils.closeQuietly(this);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
* stream. Uses a {@link BufferedInputStream}. Calls mark on every read so
* we'll remember at least the last thing read (You can only backup on the
* last thing read -- not last 2 or 3 things read). Used by
* {@link GzippedInputStream} when reading streams over a network. Wraps a
* GzippedInputStream when reading streams over a network. Wraps a
* HTTP, etc., stream so we can back it up if needs be after the
* GZIP inflater has done a fill of its full buffer though it only needed
* the first few bytes to finish decompressing the current GZIP member.
Expand Down
3 changes: 0 additions & 3 deletions src/main/java/org/archive/io/WriterPool.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,7 @@ public abstract class WriterPool {
/**
* Constructor
* @param serial Used to generate unique filename sequences
* @param factory Factory that knows how to make a {@link WriterPoolMember}.
* @param settings Settings for this pool.
* @param poolMaximumActive
* @param poolMaximumWait
*/
public WriterPool(final AtomicInteger serial,
final WriterPoolSettings settings,
Expand Down
9 changes: 0 additions & 9 deletions src/main/java/org/archive/io/WriterPoolMember.java
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ public abstract class WriterPoolMember implements ArchiveFileConstants {
* @param serialNo used to create unique filename sequences
* @param out Where to write.
* @param file File the <code>out</code> is connected to.
* @param cmprs Compress the content written.
* @param a14DigitDate If null, we'll write current time.
* @throws IOException
*/
protected WriterPoolMember(AtomicInteger serialNo,
final OutputStream out, final File file,
Expand All @@ -145,11 +142,6 @@ protected WriterPoolMember(AtomicInteger serialNo,
* Constructor.
*
* @param serialNo used to create unique filename sequences
* @param dirs Where to drop files.
* @param prefix File prefix to use.
* @param cmprs Compress the records written.
* @param maxSize Maximum size for ARC files written.
* @param template filenaming template to use
* @param extension Extension to give file.
*/
public WriterPoolMember(AtomicInteger serialNo,
Expand Down Expand Up @@ -361,7 +353,6 @@ protected void postWriteRecordTasks()
* Position in raw output (typically, physical file).
* Used making accounting of bytes written.
* @return Position in final media (assuming all flushing completes)
* @throws IOException
*/
public long getPosition() {
return (countOut==null)? 0L : this.countOut.getCount();
Expand Down
6 changes: 2 additions & 4 deletions src/main/java/org/archive/io/arc/ARCRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -190,12 +190,10 @@ public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
* formatted).
* @param parseHttpHeaders True if we are to parse HTTP headers. Costs
* about ~20% of CPU during an ARC parse.
* @param isAllignedOnFirstRecord True if this is the first record to be
* @param isAlignedOnFirstRecord True if this is the first record to be
* read from an archive
* @param String version Version information to be returned to the
* @param version Version information to be returned to the
* ARCReader constructing this record
*
* @throws IOException
*/
public ARCRecord(InputStream in, final String identifier,
final long offset, boolean digest, boolean strict,
Expand Down
13 changes: 4 additions & 9 deletions src/main/java/org/archive/io/arc/ARCWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
* write our own GZIP*Streams, ones that are resettable and conscious of gzip
* members.
*
* <p>This class will write until we hit >= maxSize. The check is done at
* <p>This class will write until we hit &gt;= maxSize. The check is done at
* record boundary. Records do not span ARC files. We will then close current
* file and open another and then continue writing.
*
Expand All @@ -95,9 +95,9 @@
* <a href="http://www.archive.org/web/researcher/tool_documentation.php">alexa
* ARC c-tools</a>:
* <pre>
* % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
* % av_procarc hx20040109230030-0.arc.gz | av_ziparc &gt; \
* /tmp/hx20040109230030-0.dat.gz
* % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
* % av_ripdat /tmp/hx20040109230030-0.dat.gz &gt; /tmp/hx20040109230030-0.cdx
* </pre>
* Examine the produced cdx file to make sure it makes sense. Search
* for 'no-type 0'. If found, then we're opening a gzip record w/o data to
Expand Down Expand Up @@ -129,12 +129,7 @@ public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeab
* @param serialNo used to generate unique file name sequences
* @param out Where to write.
* @param arc File the <code>out</code> is connected to.
* @param cmprs Compress the content written.
* @param metadata File meta data. Can be null. Is list of File and/or
* String objects.
* @param a14DigitDate If null, we'll write current time.
* @throws IOException
*/
2 */
public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
final File arc, final WriterPoolSettings settings)
throws IOException {
Expand Down
14 changes: 2 additions & 12 deletions src/main/java/org/archive/io/warc/WARCWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
*
* <p>While being written, WARCs have a '.open' suffix appended.
*
* @contributor stack
* @author stack
* @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
*/
public class WARCWriter extends WriterPoolMember
Expand Down Expand Up @@ -81,7 +81,7 @@ public class WARCWriter extends WriterPoolMember

/**
* Temporarily accumulates stats managed externally by
* {@link WARCWriterProcessor}. WARCWriterProcessor will call
* WARCWriterProcessor. WARCWriterProcessor will call
* {@link #resetTmpStats()}, write some records, then add
* {@link #getTmpStats()} into its long-term running totals.
*/
Expand All @@ -97,9 +97,6 @@ public class WARCWriter extends WriterPoolMember
* @param serialNo used to generate unique file name sequences
* @param out Where to write.
* @param f File the <code>out</code> is connected to.
* @param cmprs Compress the content written.
* @param a14DigitDate If null, we'll write current time.
* @throws IOException
*/
public WARCWriter(final AtomicInteger serialNo,
final OutputStream out, final File f,
Expand All @@ -110,13 +107,6 @@ public WARCWriter(final AtomicInteger serialNo,

/**
* Constructor.
*
* @param dirs Where to drop files.
* @param prefix File prefix to use.
* @param cmprs Compress the records written.
* @param maxSize Maximum size for ARC files written.
* @param suffix File tail to use. If null, unused.
* @param warcinfoData File metadata for warcinfo record.
*/
public WARCWriter(final AtomicInteger serialNo,
final WARCWriterPoolSettings settings) {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/archive/io/warc/WARCWriterPool.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@

/**
* A pool of WARCWriters.
* @contributor stack
* @contributor gojomo
* @author stack
* @author gojomo
* @version $Revision: 4566 $ $Date: 2006-08-31 09:51:41 -0700 (Thu, 31 Aug 2006) $
*/
public class WARCWriterPool extends WriterPool {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/archive/io/warc/package.html
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
<h2>Implementation Notes</h2>
<h3>Tools</h3>
<p>Initial implementations of <code>Arc2Warc</code> and <code>Warc2Arc</code>
tools can be found in the package above this one, at
{@link org.archive.io.Arc2Warc} and {@link org.archive.io.Warc2Arc}
tools can be found in Heritrix, at
org.archive.io.Arc2Warc and org.archive.io.Warc2Arc
respectively. Pass <code>--help</code> to learn how to use each tool.
</p>

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/archive/net/PublicSuffixes.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public class PublicSuffixes {
* prefix tree node. each Node represents sequence of letters (prefix)
* and alternative sequences following it (list of Node's). Nodes in
* {@code branches} are sorted for skip list like lookup and for generating
* effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char).)
* effective regular expression (see {@link #compareTo(Node)} and {@link #compareTo(char)}.
*
* as this is intended for internal use only, there are no access methods. procedures for updating
* prefix tree with new input are defined within this class ({@link #addBranch(CharSequence)}).
Expand Down
6 changes: 0 additions & 6 deletions src/main/java/org/archive/resource/ResourceFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,6 @@ public interface ResourceFactory {

/**
* Attempts to create a Resource from the InputStream
* @param is
* @param metaData
* @param container
* @return
* @throws ResourceParseException
* @throws IOException
*/
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container)
Expand Down
Loading

0 comments on commit d36e247

Please sign in to comment.