Skip to content

Commit 4e757d0

Browse files
authored
Merge pull request #1666 from marklogic/feature/encoding-test
Verifies that XMLSplitter can use custom encoding
2 parents 0791568 + c7053b2 commit 4e757d0

File tree

2 files changed

+219
-0
lines changed

2 files changed

+219
-0
lines changed

marklogic-client-api/src/test/java/com/marklogic/client/test/datamovement/XMLSplitterTest.java

+23
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,45 @@
2424
import com.marklogic.client.io.marker.XMLWriteHandle;
2525
import org.junit.jupiter.api.Test;
2626

27+
import javax.xml.stream.XMLInputFactory;
2728
import javax.xml.stream.XMLStreamReader;
2829
import java.io.File;
2930
import java.io.FileInputStream;
31+
import java.io.IOException;
32+
import java.io.InputStream;
3033
import java.util.Iterator;
3134
import java.util.stream.Stream;
3235

3336
import static org.junit.jupiter.api.Assertions.*;
3437

3538
public class XMLSplitterTest {
39+
3640
// Relative path to the people.xml fixture under the pathSplitter test-data directory.
static final private String xmlFile = "src/test/resources/data" + File.separator + "pathSplitter/people.xml";
3741
// Three serialized <person> documents in the http://www.marklogic.com/people/ namespace, each
// prefixed with an XML declaration. NOTE(review): presumably the fragments the splitter is
// expected to emit for xmlFile; the code that consumes this array is outside the visible region.
static final private String[] expected = new String[]{
3842
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><person xmlns=\"http://www.marklogic.com/people/\" president=\"yes\"><first>George</first><last>Washington</last></person>",
3943
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><person xmlns=\"http://www.marklogic.com/people/\" president=\"no\"><first>Betsy</first><last>Ross</last></person>",
4044
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><person xmlns=\"http://www.marklogic.com/people/\" president=\"yes\"><first>John</first><last>Kennedy</last></person>"
4145
};
4246

47+
// Relative path to the ISO-8859-1 encoded Medline fixture read by the customEncoding and
// wrongEncoding tests.
private static final String ENCODED_FILE = "src/test/resources/encoding/medline04.small.iso-8859-1.xml";
48+
49+
@Test
50+
void customEncoding() throws Exception {
51+
InputStream inputStream = new FileInputStream(ENCODED_FILE);
52+
XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(inputStream, "iso-8859-1");
53+
assertEquals(2, XMLSplitter.makeSplitter(null, "MedlineCitation").split(reader).count(),
54+
"By constructing a reader with a custom encoding, the file can be read and split successfully " +
55+
"into 2 XML fragments.");
56+
}
57+
58+
@Test
59+
void wrongEncoding() throws Exception {
60+
InputStream inputStream = new FileInputStream(ENCODED_FILE);
61+
final Stream<StringHandle> stream = XMLSplitter.makeSplitter(null, "MedlineCitation").split(inputStream);
62+
assertThrows(RuntimeException.class, () -> stream.count(), "An error should occur since the input file uses " +
63+
"'iso-8859-1' as the encoding, but the splitter defaults to assuming UTF-8.");
64+
}
65+
4366
@Test
4467
public void testXMLSplitter() throws Exception {
4568

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
<MedlineCitationSet>
2+
<MedlineCitation Owner="NLM" Status="Completed">
3+
<PMID>10605436</PMID>
4+
<DateCreated>
5+
<Year>2000</Year>
6+
<Month>01</Month>
7+
<Day>07</Day>
8+
</DateCreated>
9+
<DateCompleted>
10+
<Year>2000</Year>
11+
<Month>01</Month>
12+
<Day>07</Day>
13+
</DateCompleted>
14+
<DateRevised>
15+
<Year>2003</Year>
16+
<Month>11</Month>
17+
<Day>14</Day>
18+
</DateRevised>
19+
<Article>
20+
<Journal>
21+
<ISSN>0021-9525</ISSN>
22+
<JournalIssue PrintYN="Y">
23+
<Volume>76</Volume>
24+
<Issue>2</Issue>
25+
<PubDate>
26+
<Year>1978</Year>
27+
<Month>Feb</Month>
28+
</PubDate>
29+
</JournalIssue>
30+
</Journal>
31+
<ArticleTitle>Concerning the localization of steroids in centrioles and basal bodies by immunofluorescence.</ArticleTitle>
32+
<Pagination>
33+
<MedlinePgn>255-60</MedlinePgn>
34+
</Pagination>
35+
<Abstract>
36+
<AbstractText>Specific steroid antibodies, by the immunofluorescence technique, regularly reveal fluorescent centrioles and cilia-bearing basal bodies in target and nontarget cells. Although the precise identity of the immunoreactive steroid substance has not yet been established, it seems noteworthy that exogenous steroids can be vitally concentrated by centrioles, perhaps by exchange with steroids already present at this level. This unexpected localization suggests that steroids may affect cell growth and differentiation in some way different from the two-step receptor mechanism.</AbstractText>
37+
</Abstract>
38+
<Affiliation>Istituto di Anatomia e Istologia Patologica, Università di Ferrara, Italy.</Affiliation>
39+
<AuthorList CompleteYN="Y">
40+
<Author>
41+
<LastName>Nenci</LastName>
42+
<ForeName>I</ForeName>
43+
<Initials>I</Initials>
44+
</Author>
45+
<Author>
46+
<LastName>Marchetti</LastName>
47+
<ForeName>E</ForeName>
48+
<Initials>E</Initials>
49+
</Author>
50+
</AuthorList>
51+
<Language>eng</Language>
52+
<PublicationTypeList>
53+
<PublicationType>Journal Article</PublicationType>
54+
</PublicationTypeList>
55+
</Article>
56+
<MedlineJournalInfo>
57+
<Country>UNITED STATES</Country>
58+
<MedlineTA>J Cell Biol</MedlineTA>
59+
<NlmUniqueID>0375356</NlmUniqueID>
60+
</MedlineJournalInfo>
61+
<ChemicalList>
62+
<Chemical>
63+
<RegistryNumber>0</RegistryNumber>
64+
<NameOfSubstance>Steroids</NameOfSubstance>
65+
</Chemical>
66+
</ChemicalList>
67+
<CitationSubset>IM</CitationSubset>
68+
<MeshHeadingList>
69+
<MeshHeading>
70+
<DescriptorName MajorTopicYN="N">Animals</DescriptorName>
71+
</MeshHeading>
72+
<MeshHeading>
73+
<DescriptorName MajorTopicYN="N">Centrioles</DescriptorName>
74+
<QualifierName MajorTopicYN="Y">ultrastructure</QualifierName>
75+
</MeshHeading>
76+
<MeshHeading>
77+
<DescriptorName MajorTopicYN="N">Cilia</DescriptorName>
78+
<QualifierName MajorTopicYN="N">ultrastructure</QualifierName>
79+
</MeshHeading>
80+
<MeshHeading>
81+
<DescriptorName MajorTopicYN="N">Female</DescriptorName>
82+
</MeshHeading>
83+
<MeshHeading>
84+
<DescriptorName MajorTopicYN="N">Fluorescent Antibody Technique</DescriptorName>
85+
</MeshHeading>
86+
<MeshHeading>
87+
<DescriptorName MajorTopicYN="N">Human</DescriptorName>
88+
</MeshHeading>
89+
<MeshHeading>
90+
<DescriptorName MajorTopicYN="N">Lymphocytes</DescriptorName>
91+
<QualifierName MajorTopicYN="Y">cytology</QualifierName>
92+
</MeshHeading>
93+
<MeshHeading>
94+
<DescriptorName MajorTopicYN="N">Male</DescriptorName>
95+
</MeshHeading>
96+
<MeshHeading>
97+
<DescriptorName MajorTopicYN="N">Organelles</DescriptorName>
98+
<QualifierName MajorTopicYN="Y">ultrastructure</QualifierName>
99+
</MeshHeading>
100+
<MeshHeading>
101+
<DescriptorName MajorTopicYN="N">Rats</DescriptorName>
102+
</MeshHeading>
103+
<MeshHeading>
104+
<DescriptorName MajorTopicYN="N">Rats, Sprague-Dawley</DescriptorName>
105+
</MeshHeading>
106+
<MeshHeading>
107+
<DescriptorName MajorTopicYN="N">Respiratory Mucosa</DescriptorName>
108+
<QualifierName MajorTopicYN="N">cytology</QualifierName>
109+
</MeshHeading>
110+
<MeshHeading>
111+
<DescriptorName MajorTopicYN="N">Steroids</DescriptorName>
112+
<QualifierName MajorTopicYN="Y">analysis</QualifierName>
113+
</MeshHeading>
114+
<MeshHeading>
115+
<DescriptorName MajorTopicYN="N">Trachea</DescriptorName>
116+
</MeshHeading>
117+
</MeshHeadingList>
118+
</MedlineCitation>
119+
<MedlineCitation Owner="PIP" Status="Completed">
120+
<PMID>12261559</PMID>
121+
<DateCreated>
122+
<Year>1982</Year>
123+
<Month>03</Month>
124+
<Day>11</Day>
125+
</DateCreated>
126+
<DateCompleted>
127+
<Year>1982</Year>
128+
<Month>03</Month>
129+
<Day>11</Day>
130+
</DateCompleted>
131+
<DateRevised>
132+
<Year>2002</Year>
133+
<Month>10</Month>
134+
<Day>30</Day>
135+
</DateRevised>
136+
<Article>
137+
<Journal>
138+
<ISSN>0016-6987</ISSN>
139+
<JournalIssue PrintYN="Y">
140+
<Volume>34</Volume>
141+
<Issue>3-4</Issue>
142+
<PubDate>
143+
<Year>1978</Year>
144+
</PubDate>
145+
</JournalIssue>
146+
</Journal>
147+
<ArticleTitle>[An attempt to study, through genealogies, family structures in the case of a non-noble family]</ArticleTitle>
148+
<Pagination>
149+
<MedlinePgn>127-32</MedlinePgn>
150+
</Pagination>
151+
<AuthorList CompleteYN="Y">
152+
<Author>
153+
<LastName>Mira</LastName>
154+
<ForeName>G</ForeName>
155+
<Initials>G</Initials>
156+
</Author>
157+
</AuthorList>
158+
<Language>ita</Language>
159+
<PublicationTypeList>
160+
<PublicationType>Journal Article</PublicationType>
161+
</PublicationTypeList>
162+
<VernacularTitle>Un tentativo di studio, tramite, genealogie, di strutture familiari nel caso di una famiglia non nobile</VernacularTitle>
163+
</Article>
164+
<MedlineJournalInfo>
165+
<Country>United States</Country>
166+
<MedlineTA>Genus</MedlineTA>
167+
<NlmUniqueID>17120050R</NlmUniqueID>
168+
</MedlineJournalInfo>
169+
<CitationSubset>J</CitationSubset>
170+
<MeshHeadingList>
171+
<MeshHeading>
172+
<DescriptorName MajorTopicYN="Y">Demography</DescriptorName>
173+
</MeshHeading>
174+
<MeshHeading>
175+
<DescriptorName MajorTopicYN="N">Developed Countries</DescriptorName>
176+
</MeshHeading>
177+
<MeshHeading>
178+
<DescriptorName MajorTopicYN="N">English Abstract</DescriptorName>
179+
</MeshHeading>
180+
<MeshHeading>
181+
<DescriptorName MajorTopicYN="N">Europe</DescriptorName>
182+
</MeshHeading>
183+
<MeshHeading>
184+
<DescriptorName MajorTopicYN="Y">Family Characteristics</DescriptorName>
185+
</MeshHeading>
186+
<MeshHeading>
187+
<DescriptorName MajorTopicYN="N">Italy</DescriptorName>
188+
</MeshHeading>
189+
<MeshHeading>
190+
<DescriptorName MajorTopicYN="N">Social Sciences</DescriptorName>
191+
</MeshHeading>
192+
</MeshHeadingList>
193+
<OtherID Source="IND">013477</OtherID>
194+
<OtherID Source="POP">00102468</OtherID>
195+
</MedlineCitation>
196+
</MedlineCitationSet>

0 commit comments

Comments
 (0)