Skip to content

Commit

Permalink
Merge pull request #44 from clarin-eric/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
wowasa authored May 22, 2022
2 parents a3fb621 + 0d86e7b commit bf076b9
Show file tree
Hide file tree
Showing 16 changed files with 654 additions and 455 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# version 2.4.0
- upgrade to storm 2.4.0 and storm crawler 2.4

# version 2.3.0
- improved algorithm for next links to check
- trimming URLs (done by RASA) and escaping white spaces in URLs used for request
Expand Down
29 changes: 17 additions & 12 deletions crawler-test-conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ config:
topology.max.spout.pending: 100
topology.debug: false

fetcher.threads.number: 10
fetcher.threads.number: 1

# override the JVM parameters for the workers
topology.worker.childopts: "-Xmx2g -Djava.net.preferIPv4Stack=true"
Expand All @@ -28,19 +28,23 @@ config:
# lists the metadata to persist to storage
# these are not transferred to the outlinks
metadata.persist:
- _redirTo
- error.cause
- error.source
- isSitemap
- isFeed
- urlId
- originalUrl
- fetch.statusCode
- fetch.category
- fetch.message
- fetch.contentType
- fetch.byteLength
- fetch.duration
- fetch.startTime
- fetch.redirectCount
- http.method.head

http.agent.name: "CLARIN Linkchecker: https://www.clarin.eu/linkchecker"
http.agent.version: "2.1"
http.agent.description: "built with StormCrawler Archetype 2.1"
http.agent.name: "CLARIN Linkchecker: https://www.clarin.eu/linkchecker (!!! TEST !!!)"
http.agent.version: "2.4"
http.agent.description: "built with StormCrawler Archetype 2.4"
http.agent.url: "https://www.clarin.eu/linkchecker"
http.agent.email: "linkchecker@clarin.eu"
http.agent.email: "clarin@wowasa.com"
http.redirectLimit: 5
http.timeout: 5000

Expand All @@ -62,7 +66,7 @@ config:
# The maximum number of bytes for returned HTTP response bodies.
# Set -1 to disable the limit.
# this is 0 so that we don't download any payload when doing GET requests; the content gets trimmed.
http.content.limit: 100000
http.content.limit: 0

# FetcherBolt queue dump => comment out to activate
# if a file exists on the worker machine with the corresponding port number
Expand Down Expand Up @@ -151,4 +155,5 @@ config:
login.list.url: "https://raw.githubusercontent.com/clarin-eric/login-pages/master/list.txt"
ok.status.codes: [200, 304]
redirect.status.codes: [301, 302, 303, 307, 308]
undeterminded.status.codes: [401, 405, 429]
undeterminded.status.codes: [405, 429]
restricted.access.status.codes: [401, 403]
10 changes: 8 additions & 2 deletions crawler-test.flux
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "linkchecker"
name: "linkchecker-test"

includes:
- resource: true
Expand All @@ -11,7 +11,13 @@ includes:

spouts:
- id: "spout"
className: "eu.clarin.linkchecker.spout.RASASpout"
className: "eu.clarin.linkchecker.spout.RASAQuerySpout"
constructorArgs:
- >
SELECT u.* FROM url u
WHERE u.valid=true
ORDER BY RAND()
LIMIT 1000
parallelism: 1

bolts:
Expand Down
80 changes: 44 additions & 36 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
<modelVersion>4.0.0</modelVersion>
<groupId>eu.clarin.cmdi</groupId>
<artifactId>linkchecker</artifactId>
<version>2.3.0</version>
<version>2.4.0</version>
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<storm.version>2.1.1</storm.version>
<stormcrawler.version>2.1</stormcrawler.version>
<rasa.version>5.0.2</rasa.version>
<storm.version>2.4.0</storm.version>
<stormcrawler.version>2.4</stormcrawler.version>
<rasa.version>5.1.0</rasa.version>
<mariadb.version>2.7.5</mariadb.version>
<hikari.version>5.0.1</hikari.version>
<lombok.version>1.18.24</lombok.version>
</properties>
<distributionManagement>
<snapshotRepository>
Expand Down Expand Up @@ -52,39 +55,29 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>1.3.3</version>
<version>1.4</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.apache.storm.flux.Flux</mainClass>
<manifestEntries>
<Change></Change>
<Build-Date></Build-Date>
</manifestEntries>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.apache.storm.flux.Flux</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugin>
</plugins>
</build>
<dependencies>
Expand All @@ -97,7 +90,6 @@
<groupId>javax.activation</groupId>
<artifactId>activation</artifactId>
</exclusion>

<exclusion>
<groupId>io.netty</groupId>
<artifactId>
Expand Down Expand Up @@ -201,6 +193,22 @@
<classifier>shaded</classifier>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.mariadb.jdbc</groupId>
<artifactId>mariadb-java-client</artifactId>
<version>${mariadb.version}</version>
</dependency>
<dependency>
<groupId>com.zaxxer</groupId>
<artifactId>HikariCP</artifactId>
<version>${hikari.version}</version>
</dependency>
</dependencies>
<repositories>
<repository>
Expand Down
71 changes: 71 additions & 0 deletions src/main/java/eu/clarin/linkchecker/bolt/MetadataPrinterBolt.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* NOTICE: This code was modified in ACDH - Austrian Academy of Sciences, based on Stormcrawler source code.
*/

package eu.clarin.linkchecker.bolt;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.persistence.AbstractStatusUpdaterBolt;
import com.digitalpebble.stormcrawler.persistence.Status;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;

import java.util.Date;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Pattern;

/**
 * Status updater that does not persist anything: for each status update it
 * prints the tuple's metadata to standard output and then acknowledges the
 * tuple.
 **/

@SuppressWarnings("serial")
public class MetadataPrinterBolt extends AbstractStatusUpdaterBolt {

    /**
     * Creates the bolt. The class declares no fields, so there is nothing to
     * initialise beyond the parent constructor.
     **/
    public MetadataPrinterBolt() {
        super();
    }

    /**
     * Passes the configuration, topology context and collector straight through
     * to the parent class; no bolt-specific setup is performed.
     *
     * @param stormConf the configuration map handed to the bolt
     * @param context   the topology context handed to the bolt
     * @param collector the output collector handed to the bolt
     **/
    @Override
    @SuppressWarnings({ "rawtypes" })
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        super.prepare(stormConf, context, collector);
    }

    /**
     * Writes the string form of {@code metadata} to standard output and acks the
     * tuple.
     *
     * @param url       not used by this implementation
     * @param status    not used by this implementation
     * @param metadata  the metadata whose {@code toString()} result is printed;
     *                  a {@code null} value results in a
     *                  {@link NullPointerException}
     * @param nextFetch not used by this implementation
     * @param t         the tuple that is acknowledged after printing
     * @throws Exception declared because the overridden signature permits it
     **/
    @Override
    public synchronized void store(String url, Status status, Metadata metadata, Optional<Date> nextFetch, Tuple t)
            throws Exception {
        // Render first, then print: toString() is invoked explicitly on the metadata.
        final String printable = metadata.toString();
        System.out.println(printable);

        // The tuple is acked directly on the inherited collector.
        _collector.ack(t);
    }

    /**
     * Intentionally a no-op: nothing is released here and the parent's
     * {@code cleanup()} is not invoked.
     **/
    @Override
    public void cleanup() {
        // nothing to do
    }
}
Loading

0 comments on commit bf076b9

Please sign in to comment.