Skip to content

Commit bcee4b7

Browse files
committed
Respect xml:space="preserve" (#43)
1 parent f259aff commit bcee4b7

File tree

2 files changed

+239
-77
lines changed

2 files changed

+239
-77
lines changed

src/raw.jl

Lines changed: 130 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
x === RawDocument ? Document :
2929
nothing
3030

31+
# Shared, mutable record of the `xml:space` state seen while walking the raw XML.
# NOTE(review): `next`/`prev` hand the same `ctx` object to the `Raw` they return, so
# pushes and pops made through one node are visible from every node that shares it.
struct XMLSpaceContext
32+
# Stack of flags; the last entry is the state in effect (`true` = preserve whitespace).
preserve_space::Vector{Bool} # Stack to track xml:space state
33+
end
34+
# Zero-argument constructor: the stack starts with a single `false` entry.
XMLSpaceContext() = XMLSpaceContext([false]) # Default is not preserving
35+
3136
#-----------------------------------------------------------------------------# Raw
3237
"""
3338
Raw(filename::String)
@@ -64,8 +69,10 @@ struct Raw
6469
pos::Int
6570
len::Int
6671
data::Vector{UInt8}
72+
ctx::XMLSpaceContext
6773
end
68-
Raw(data::Vector{UInt8}) = Raw(RawDocument, 0, 0, 0, data)
74+
# Build the document-level node over `data`: type `RawDocument`, with depth, position
# and length all 0. `ctx` is optional; when omitted, a fresh `XMLSpaceContext()` is used.
Raw(data::Vector{UInt8}, ctx=XMLSpaceContext()) = Raw(RawDocument, 0, 0, 0, data, ctx)
75+
6976

7077
Base.read(filename::String, ::Type{Raw}) = isfile(filename) ?
7178
Raw(Mmap.mmap(filename)) :
@@ -117,7 +124,7 @@ end
117124
# starting at position i, return attributes up until the next '>' or '?' (DTD)
118125
function get_attributes(data, i, j)
119126
i = name_start(data, i)
120-
i > j && return nothing
127+
(isnothing(j) || isnothing(i) || i > j) && return nothing
121128
out = OrderedDict{String, String}()
122129
while !isnothing(i) && i < j
123130
key, i = get_name(data, i)
@@ -161,7 +168,26 @@ function attributes(o::Raw)
161168
i = o.pos
162169
i = name_start(o.data, i)
163170
i = name_stop(o.data, i)
164-
get_attributes(o.data, i + 1, o.pos + o.len)
171+
out=get_attributes(o.data, i + 1, o.pos + o.len)
172+
if !isnothing(out) && haskey(out, "xml:space")
173+
# If xml:space attribute is present, we need to preserve whitespace
174+
if out["xml:space"] == "preserve"
175+
push!(o.ctx.preserve_space, true)
176+
elseif out["xml:space"] == "default"
177+
push!(o.ctx.preserve_space, false)
178+
else
179+
error("Invalid value for xml:space attribute: $(out["xml:space"]). Must be 'preserve' or 'default'.")
180+
end
181+
end
182+
out
183+
184+
elseif o.type === RawText
185+
if length(o.ctx.preserve_space)>0
186+
push!(o.ctx.preserve_space, o.ctx.preserve_space[end])
187+
else
188+
push!(o.ctx.preserve_space, false)
189+
end
190+
nothing
165191
elseif o.type === RawDeclaration
166192
get_attributes(o.data, o.pos + 6, o.pos + o.len)
167193
else
@@ -198,7 +224,15 @@ function children(o::Raw)
198224
depth = o.depth
199225
out = Raw[]
200226
for item in xml_nodes(o)
201-
item.depth == depth + 1 && push!(out, item)
227+
if item.depth == depth + 1
228+
if length(item.ctx.preserve_space) > 0
229+
item.ctx.preserve_space[1] = o.ctx.preserve_space[end] # inherit the context
230+
else
231+
push!(item.ctx.preserve_space, false)
232+
end
233+
o.type==RawElementOpen && attributes(item)
234+
push!(out, item)
235+
end
202236
item.depth == depth && break
203237
o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root
204238
end
@@ -247,55 +281,65 @@ function next(o::Raw)
247281
depth = o.depth
248282
data = o.data
249283
type = o.type
250-
i = findnext(!isspace, data, i) # skip insignificant whitespace
251-
isnothing(i) && return nothing
284+
ctx = o.ctx
285+
k = findnext(!isspace, data, i)
286+
if (isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0)
287+
length(ctx.preserve_space)>0 && pop!(ctx.preserve_space) # pop the previous context
288+
return nothing
289+
end
290+
i = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? k : i
291+
j = i + 1
292+
c = Char(o.data[k])
293+
d = Char(o.data[k+1])
252294
if type === RawElementOpen || type === RawDocument
253295
depth += 1
254296
end
255-
c = Char(o.data[i])
256-
j = i + 1
257-
if c !== '<'
297+
if c !== '<' || type === RawElementOpen && d === '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end])
258298
type = RawText
259299
j = findnext(==(UInt8('<')), data, i) - 1
260-
j = findprev(!isspace, data, j) # "rstrip"
261-
elseif c === '<'
262-
c2 = Char(o.data[i + 1])
263-
if c2 === '!'
264-
c3 = Char(o.data[i + 2])
265-
if c3 === '-'
266-
type = RawComment
267-
j = findnext(Vector{UInt8}("-->"), data, i)[end]
268-
elseif c3 === '['
269-
type = RawCData
270-
j = findnext(Vector{UInt8}("]]>"), data, i)[end]
271-
elseif c3 === 'D' || c3 == 'd'
272-
type = RawDTD
273-
j = findnext(==(UInt8('>')), data, i)
274-
while sum(==(UInt8('>')), data[i:j]) != sum(==(UInt8('<')), data[i:j])
275-
j = findnext(==(UInt8('>')), data, j + 1)
300+
j = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? findprev(!isspace, data, j) : j # preserving whitespace if needed
301+
else
302+
i=k
303+
j=k+1
304+
if c === '<'
305+
c2 = Char(o.data[i + 1])
306+
if c2 === '!'
307+
c3 = Char(o.data[i + 2])
308+
if c3 === '-'
309+
type = RawComment
310+
j = findnext(Vector{UInt8}("-->"), data, i)[end]
311+
elseif c3 === '['
312+
type = RawCData
313+
j = findnext(Vector{UInt8}("]]>"), data, i)[end]
314+
elseif c3 === 'D' || c3 == 'd'
315+
type = RawDTD
316+
j = findnext(==(UInt8('>')), data, i)
317+
while sum(==(UInt8('>')), data[k:j]) != sum(==(UInt8('<')), data[i:j])
318+
j = findnext(==(UInt8('>')), data, j + 1)
319+
end
276320
end
277-
end
278-
elseif c2 === '?'
279-
if get_name(data, i + 2)[1] == "xml"
280-
type = RawDeclaration
281-
else
282-
type = RawProcessingInstruction
283-
end
284-
j = findnext(Vector{UInt8}("?>"), data, i)[end]
285-
elseif c2 === '/'
286-
type = RawElementClose
287-
depth -= 1
288-
j = findnext(==(UInt8('>')), data, i)
289-
else
290-
j = findnext(==(UInt8('>')), data, i)
291-
if data[j-1] === UInt8('/')
292-
type = RawElementSelfClosed
321+
elseif c2 === '?'
322+
if get_name(data, i + 2)[1] == "xml"
323+
type = RawDeclaration
324+
else
325+
type = RawProcessingInstruction
326+
end
327+
j = findnext(Vector{UInt8}("?>"), data, i)[end]
328+
elseif c2 === '/'
329+
type = RawElementClose
330+
depth -= 1
331+
j = findnext(==(UInt8('>')), data, i)
293332
else
294-
type = RawElementOpen
333+
j = findnext(==(UInt8('>')), data, i)
334+
if data[j-1] === UInt8('/')
335+
type = RawElementSelfClosed
336+
else
337+
type = RawElementOpen
338+
end
295339
end
296340
end
297341
end
298-
return Raw(type, depth, i, j - i, data)
342+
return Raw(type, depth, i, j - i, data, ctx)
299343
end
300344

301345
#-----------------------------------------------------------------------------# prev Raw
@@ -308,52 +352,62 @@ function prev(o::Raw)
308352
depth = o.depth
309353
data = o.data
310354
type = o.type
355+
ctx = o.ctx
311356
type === RawDocument && return nothing
312357
j = o.pos - 1
313-
j = findprev(!isspace, data, j) # skip insignificant whitespace
314-
isnothing(j) && return Raw(data) # RawDocument
358+
k = findprev(!isspace, data, j)
359+
if isnothing(k) || length(String(o.data[o.pos + o.len + 1:end]))==0
360+
length(ctx.preserve_space)>0 && pop!(ctx.preserve_space) # pop the previous context
361+
return Raw(data) # RawDocument
362+
end
363+
j = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? k : j
315364
c = Char(o.data[j])
365+
d = Char(data[findprev(==(UInt8('<')), data, j)+1])
316366
i = j - 1
317367
next_type = type
318-
if c !== '>' # text
368+
if c !== '>' || type === RawElementClose && d === '/' && length(ctx.preserve_space) > 0 && (ctx.preserve_space[end]) # text or empty whitespace
319369
type = RawText
320-
i = findprev(==(UInt8('>')), data, j) + 1
321-
i = findnext(!isspace, data, i) # "lstrip"
322-
elseif c === '>'
323-
c2 = Char(o.data[j - 1])
324-
if c2 === '-'
325-
type = RawComment
326-
i = findprev(Vector{UInt8}("<--"), data, j)[1]
327-
elseif c2 === ']'
328-
type = RawCData
329-
i = findprev(Vector{UInt8}("<![CData["), data, j)[1]
330-
elseif c2 === '?'
331-
i = findprev(Vector{UInt8}("<?"), data, j)[1]
332-
if get_name(data, i + 2)[1] == "xml"
333-
type = RawDeclaration
370+
i=findprev(==(UInt8('>')), data, j) + 1
371+
i = length(ctx.preserve_space) == 0 || !(ctx.preserve_space[end]) ? findprev(!isspace, data, i) : i # If preserving whitespace, retain leading and trailing whitespace
372+
else
373+
j=k
374+
i=k-1
375+
if c === '>'
376+
c2 = Char(o.data[j - 1])
377+
if c2 === '-'
378+
type = RawComment
379+
i = findprev(Vector{UInt8}("<--"), data, j)[1]
380+
elseif c2 === ']'
381+
type = RawCData
382+
i = findprev(Vector{UInt8}("<![CData["), data, j)[1]
383+
elseif c2 === '?'
384+
i = findprev(Vector{UInt8}("<?"), data, j)[1]
385+
if get_name(data, i + 2)[1] == "xml"
386+
type = RawDeclaration
387+
else
388+
type = RawProcessingInstruction
389+
end
334390
else
335-
type = RawProcessingInstruction
336-
end
391+
i = findprev(==(UInt8('<')), data, j)
392+
char = Char(data[i+1])
393+
if char === '/'
394+
type = RawElementClose
395+
elseif char === '!'
396+
type = DTD
397+
elseif isletter(char) || char === '_'
398+
type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen
399+
else
400+
error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.")
401+
end
402+
end
337403
else
338-
i = findprev(==(UInt8('<')), data, j)
339-
char = Char(data[i+1])
340-
if char === '/'
341-
type = RawElementClose
342-
elseif char === '!'
343-
type = DTD
344-
elseif isletter(char) || char === '_'
345-
type = Char(o.data[j - 2]) === '/' ? RawElementSelfClosed : RawElementOpen
346-
else
347-
error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.")
348-
end
404+
error("Unreachable reached in XML.prev")
349405
end
350-
else
351-
error("Unreachable reached in XML.prev")
352406
end
353407
if type !== RawElementOpen && next_type === RawElementClose
354408
depth += 1
355409
elseif type == RawElementOpen && next_type !== RawElementClose
356410
depth -= 1
357411
end
358-
return Raw(type, depth, i, j - i, data)
412+
return Raw(type, depth, i, j - i, data, ctx)
359413
end

0 commit comments

Comments
 (0)