Skip to content

Commit fd85e51

Browse files
committed
new: Html Parsing example
1 parent 5313421 commit fd85e51

File tree

2 files changed

+103
-0
lines changed

2 files changed

+103
-0
lines changed
Binary file not shown.
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Using Html Parsing
//
// Parses a small inline HTML fragment with Html.Table and returns one row per
// <a> element. The element's tag name, text content and href attribute are
// expanded into the text columns "Url.TagName", "Url.TextContent" and
// "Url.Attributes.href".
//
// A hand-rolled, delimiter-based alternative to this approach lives in the
// "SingleString UsingSplits by long Delimiters" query in this file.
let
    Summary = [
        // Sample markup. "" is an escaped double quote inside an M text
        // literal; the literal intentionally spans several lines.
        RawHtml = "<html><body>
<a href=""/003Dn00000b69ED"" target=""\_blank"">Richard Radnay</a>
<a href=""/12314543"" target=""\_blank"">Jane Doe</a>
",

        // One output column ("Url") selected by the CSS selector "a".
        // `each _` returns the matched element itself instead of only its
        // text, so every cell holds a record that is expanded below.
        Parsed = Html.Table( RawHtml, {
            { "Url", "a", each _ }
        })
    ],
    Parsed = Summary[Parsed],

    // Split the element record into its three fields.
    #"Expanded Url" = Table.ExpandRecordColumn(
        Parsed,
        "Url",
        {"TagName", "TextContent", "Attributes"},
        {"Url.TagName", "Url.TextContent", "Url.Attributes"}
    ),

    // Of the attributes, only href is kept.
    #"Expanded Url.Attributes" = Table.ExpandRecordColumn(
        #"Expanded Url",
        "Url.Attributes",
        {"href"},
        {"Url.Attributes.href"}
    ),

    #"Changed Type1" = Table.TransformColumnTypes(
        #"Expanded Url.Attributes",
        {
            {"Url.Attributes.href", type text},
            {"Url.TextContent", type text},
            {"Url.TagName", type text}
        }
    )
in
    #"Changed Type1"
42+
43+
// SingleString UsingSplits by long Delimiters
//
// Extracts the href and the link text from a single <a> tag using plain text
// functions instead of an HTML parser. The query returns the record produced
// by parseUrl: [ FullRawText, Href, Name ].
//
// The other fields of Summary are intermediate experiments; because record
// fields are evaluated lazily, only the ones `final` depends on are computed.
let
    Summary = [
        // Embedded sample table with a single text column named RawHtml.
        Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText("i45Wiik1MDBOTlTIKEpNs41R0gfyXPIMQCDJzNLVJUZJoSSxKD21BCgXExOflJOYlx0D0ZQalJmckViUohCUmJKXWAkxSD8RIqcUGwsA", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RawHtml = _t]),
        #"Changed Type" = Table.TransformColumnTypes(Source,{{"RawHtml", type text}}),

        // First row of the embedded table; alternative input to RawSrc.
        RawSrc_FromTable = #"Changed Type"{0}[RawHtml],

        RawSrc = "<a href=""/003Dn00000b69ED"" target=""\_blank"">Richard Radnay</a>",

        // Approach 1: split on the literal text around the two values.
        // Each delimiter has to match RawSrc character for character, so the
        // middle one includes the backslash in \_blank and exactly one quote
        // before the closing '>'.
        delimsList = { "<a href=""", """ target=""\_blank"">", "</a>" },
        rendList = Text.Combine( delimsList, "#(cr,lf)" ),
        rendSplit = Text.Combine( splitUrl, "#(cr,lf)" ),
        SplitSaleTag = Splitter.SplitTextByEachDelimiter( delimsList, QuoteStyle.None ),
        splitUrl = SplitSaleTag( RawSrc ),

        // Approach 2: peel the href out step by step.
        s0 = Text.AfterDelimiter( RawSrc, "<a href=""" ),
        s1 = Text.BeforeDelimiter( s0, """" ),

        // Approach 3 (the one returned): the same idea wrapped in a function.
        // Parses html without a parser, but tries to be strict.
        parseUrl = (rawTag as text) as record => [
            // href: text between `<a href="` and the next double quote
            strip_href = Text.AfterDelimiter( rawTag, "<a href=""" ),
            only_href = Text.BeforeDelimiter( strip_href, """" ),

            // name: text between the first '>' and `</a>`
            strip_close_a = Text.BeforeDelimiter( rawTag, "</a>" ),
            a_text_only = Text.AfterDelimiter( strip_close_a, ">" ),

            ret = [ FullRawText = rawTag, Href = only_href, Name = a_text_only ]
        ][ret],
        // Replace the `][ret],` line above with `],` to return every
        // intermediate field and see each step in the calculation.

        final = parseUrl( RawSrc )
    ],
    final = Summary[final]
in
    final
77+
78+
// PBIBlog Html Expand Extra Attributes
//
// Downloads the Power BI blog landing page, selects every <a> element with
// Html.Table, and expands each element into its tag name, text content and a
// fixed set of attributes (class, href, id, aria-label, data-bi-name, role,
// title, target, rel), one column per attribute.
let
    Summary = [
        // The listed HTTP status codes are passed to ManualStatusHandling.
        // The status is not inspected afterwards, so whatever body comes back
        // is parsed as if it were the page.
        // Covers 400-402, 404-490 and 500-599.
        Response = Web.Contents(
            "https://powerbi.microsoft.com/en-us/blog/",
            [ ManualStatusHandling = {400..402} & {404..490} & {500..599} ]
        ),
        RawText = Text.FromBinary( Response ),

        // `each _` keeps the whole element record per match so that it can be
        // expanded field by field below.
        ParseHtml = Html.Table( RawText, {{"Link", "a", each _ }})
    ],
    ParseHtml = Summary[ParseHtml],

    #"Expanded Link" = Table.ExpandRecordColumn(
        ParseHtml,
        "Link",
        {"TagName", "TextContent", "Attributes"},
        {"Link.TagName", "Link.TextContent", "Link.Attributes"}
    ),

    #"Expanded Link.Attributes" = Table.ExpandRecordColumn(
        #"Expanded Link",
        "Link.Attributes",
        {"class", "href", "id", "aria-label", "data-bi-name", "role", "title", "target", "rel"},
        {"Link.Attributes.class", "Link.Attributes.href", "Link.Attributes.id", "Link.Attributes.aria-label", "Link.Attributes.data-bi-name", "Link.Attributes.role", "Link.Attributes.title", "Link.Attributes.target", "Link.Attributes.rel"}
    )
in
    #"Expanded Link.Attributes"
94+
95+
// HtmlParse a Single String
//
// Runs Html.Table over one hard-coded <a> tag and returns a table with the
// text columns "Tag", "Content" and "href" describing that link.
let
    // "" is an escaped double quote inside an M text literal.
    AnchorMarkup = "<a href=""/003Dn00000b69ED"" target=""\_blank"">Richard Radnay</a>",

    // Single output column "Link", CSS selector "a"; `each _` yields the
    // matched element itself rather than only its text.
    LinkSelectors = { {"Link", "a", each _ } },
    LinkRows = Html.Table( AnchorMarkup, LinkSelectors ),

    // Element record -> Tag / Content / Attributes columns.
    WithElementFields = Table.ExpandRecordColumn(
        LinkRows,
        "Link",
        {"TagName", "TextContent", "Attributes"},
        {"Tag", "Content", "Attributes"}
    ),

    // Attributes record -> href column (other attributes are dropped).
    WithHref = Table.ExpandRecordColumn(
        WithElementFields,
        "Attributes",
        {"href"},
        {"href"}
    ),

    Typed = Table.TransformColumnTypes(
        WithHref,
        {
            {"href", type text},
            {"Content", type text},
            {"Tag", type text}
        }
    )
in
    Typed

0 commit comments

Comments
 (0)