Skip to content

Commit b7e78ea

Browse files
authored
Merge pull request #349 from metafacture/extract-element
Replace unreleased ScriptExtractor with generic ElementExtractor
2 parents 4c2eb8d + 99bc941 commit b7e78ea

File tree

3 files changed

+27
-15
lines changed

3 files changed

+27
-15
lines changed

Diff for: metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java renamed to metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java

+15-6
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,30 @@
3030
import org.metafacture.framework.helpers.DefaultObjectPipe;
3131

3232
/**
33-
* Extracts the first script from an HTML document
33+
* Extracts the specified element from an HTML document
3434
*
3535
* @author Fabian Steeg
3636
*/
37-
@Description("Extracts the first script from an HTML document")
37+
@Description("Extracts the specified element from an HTML document")
3838
@In(Reader.class)
3939
@Out(String.class)
40-
@FluxCommand("extract-script")
41-
public class ScriptExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {
40+
@FluxCommand("extract-element")
41+
public class ElementExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {
42+
private String selector;
43+
44+
/**
45+
* @param selector The CSS-style jsoup selector, see https://jsoup.org/cookbook/extracting-data/selector-syntax
46+
*/
47+
public ElementExtractor(final String selector) {
48+
this.selector = selector;
49+
}
50+
4251
@Override
4352
public void process(final Reader reader) {
4453
try {
4554
Document document = Jsoup.parse(IOUtils.toString(reader));
46-
Element firstScript = document.select("script").first();
47-
getReceiver().process(firstScript.data());
55+
Element firstElement = document.select(selector).first();
56+
getReceiver().process(firstElement.data());
4857
} catch (IOException e) {
4958
e.printStackTrace();
5059
}

Diff for: metafacture-html/src/main/resources/flux-commands.properties

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@
1414
# limitations under the License.
1515
#
1616
decode-html org.metafacture.html.HtmlDecoder
17-
extract-script org.metafacture.html.ScriptExtractor
17+
extract-element org.metafacture.html.ElementExtractor

Diff for: metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java renamed to metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java

+11-8
Original file line numberDiff line numberDiff line change
@@ -28,37 +28,40 @@
2828
import org.mockito.MockitoAnnotations;
2929

3030
/**
31-
* Tests for {@link ScriptExtractor}.
31+
* Tests for {@link ElementExtractor}.
3232
*
3333
* @author Fabian Steeg
3434
*
3535
*/
36-
public final class ScriptExtractorTest {
36+
public final class ElementExtractorTest {
3737

38-
private static final StringReader IN = new StringReader("<html><script>{\"code\":\"yo\"}");
38+
private static final StringReader IN = new StringReader("<html>"
39+
+ "<script data-test='site-head-data'>{\"code\":\"hey\"}</script>"
40+
+ "<script data-test='model-linked-data'>{\"code\":\"yo\"}");
41+
3942
private static final String OUT = "{\"code\":\"yo\"}";
4043

41-
private ScriptExtractor scriptExtractor;
44+
private ElementExtractor elementExtractor;
4245

4346
@Mock
4447
private ObjectReceiver<String> receiver;
4548

4649
@Before
4750
public void setup() {
4851
MockitoAnnotations.initMocks(this);
49-
scriptExtractor = new ScriptExtractor();
50-
scriptExtractor.setReceiver(receiver);
52+
elementExtractor = new ElementExtractor("script[data-test=model-linked-data]");
53+
elementExtractor.setReceiver(receiver);
5154
}
5255

5356
@Test
5457
public void testShouldProcessRecordsFollowedbySeparator() {
55-
scriptExtractor.process(IN);
58+
elementExtractor.process(IN);
5659
verify(receiver).process(OUT);
5760
verifyNoMoreInteractions(receiver);
5861
}
5962

6063
@After
6164
public void cleanup() {
62-
scriptExtractor.closeStream();
65+
elementExtractor.closeStream();
6366
}
6467
}

0 commit comments

Comments
 (0)