Skip to content

Commit 97d9e8c

Browse files
committed
1 parent 04acdae commit 97d9e8c

File tree

3 files changed

+129
-0
lines changed

3 files changed

+129
-0
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import java.io.IOException;
19+
import java.io.Reader;
20+
21+
import org.apache.commons.io.IOUtils;
22+
import org.jsoup.Jsoup;
23+
import org.jsoup.nodes.Document;
24+
import org.jsoup.nodes.Element;
25+
import org.metafacture.framework.FluxCommand;
26+
import org.metafacture.framework.ObjectReceiver;
27+
import org.metafacture.framework.annotations.Description;
28+
import org.metafacture.framework.annotations.In;
29+
import org.metafacture.framework.annotations.Out;
30+
import org.metafacture.framework.helpers.DefaultObjectPipe;
31+
32+
/**
33+
* Extracts the the specified element from an HTML document
34+
*
35+
* @author Fabian Steeg
36+
*/
37+
@Description("Extracts the specified element from an HTML document")
38+
@In(Reader.class)
39+
@Out(String.class)
40+
@FluxCommand("extract-element")
41+
public class ElementExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {
42+
private String selector;
43+
44+
/**
45+
* @param selector The CSS-style jsoup selector, see https://jsoup.org/cookbook/extracting-data/selector-syntax
46+
*/
47+
public ElementExtractor(final String selector) {
48+
this.selector = selector;
49+
}
50+
51+
@Override
52+
public void process(final Reader reader) {
53+
try {
54+
Document document = Jsoup.parse(IOUtils.toString(reader));
55+
Element firstElement = document.select(selector).first();
56+
getReceiver().process(firstElement.data());
57+
} catch (IOException e) {
58+
e.printStackTrace();
59+
}
60+
}
61+
}

metafacture-html/src/main/resources/flux-commands.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@
1515
#
1616
decode-html org.metafacture.html.HtmlDecoder
1717
extract-script org.metafacture.html.ScriptExtractor
18+
extract-element org.metafacture.html.ElementExtractor
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright 2020 Fabian Steeg, hbz
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.metafacture.html;
17+
18+
import static org.mockito.Mockito.verify;
19+
import static org.mockito.Mockito.verifyNoMoreInteractions;
20+
21+
import java.io.StringReader;
22+
23+
import org.junit.After;
24+
import org.junit.Before;
25+
import org.junit.Test;
26+
import org.metafacture.framework.ObjectReceiver;
27+
import org.mockito.Mock;
28+
import org.mockito.MockitoAnnotations;
29+
30+
/**
31+
* Tests for {@link ElementExtractor}.
32+
*
33+
* @author Fabian Steeg
34+
*
35+
*/
36+
public final class ElementExtractorTest {
37+
38+
private static final StringReader IN = new StringReader("<html>"
39+
+ "<script data-test='site-head-data'>{\"code\":\"hey\"}</script>"
40+
+ "<script data-test='model-linked-data'>{\"code\":\"yo\"}");
41+
42+
private static final String OUT = "{\"code\":\"yo\"}";
43+
44+
private ElementExtractor elementExtractor;
45+
46+
@Mock
47+
private ObjectReceiver<String> receiver;
48+
49+
@Before
50+
public void setup() {
51+
MockitoAnnotations.initMocks(this);
52+
elementExtractor = new ElementExtractor("script[data-test=model-linked-data]");
53+
elementExtractor.setReceiver(receiver);
54+
}
55+
56+
@Test
57+
public void testShouldProcessRecordsFollowedbySeparator() {
58+
elementExtractor.process(IN);
59+
verify(receiver).process(OUT);
60+
verifyNoMoreInteractions(receiver);
61+
}
62+
63+
@After
64+
public void cleanup() {
65+
elementExtractor.closeStream();
66+
}
67+
}

0 commit comments

Comments
 (0)