1
- // Using Html Parsing
2
- let a = null, Summary = [
1
+ // HtmlParse for True Parsing
2
+ let
3
3
RawHtml = "<html><body>
4
4
<a href=""/003Dn00000b69ED"" target=""\_blank"">Richard Radnay</a>
5
5
<a href=""/12314543"" target=""\_blank"">Jane Doe</a>
6
- ",
7
-
8
- Parsed = Html.Table( RawHtml, {
9
- { "Url", "a", each _ }
10
- })
11
-
12
- // delimsList = { "<a href=""", """ target=""_blank"""">", "</a>"},
13
- // rendList = Text.Combine( delimsList, "#(cr,lf)" ),
14
- // rendSplit = Text.Combine( splitUrl, "#(cr,lf)" ),
15
- // SplitSaleTag = Splitter.SplitTextByEachDelimiter( delimsList, QuoteStyle.None ),
16
- // splitUrl = SplitSaleTag( RawSrc ),
17
- // s0 = Text.AfterDelimiter(RawSrc, "<a href="""),
18
- // s1 = Text.BeforeDelimiter(s0, """"),
19
-
6
+ </body></html>",
7
+ // SingleHtml = "<a href=""/003Dn00000b69ED"" target=""\_blank"">Richard Radnay</a>",
20
8
21
- // parseUrl = (rawTag as text) as record => [
22
- // // parse html without a parser, but try to be strict
23
- // strip_href = Text.AfterDelimiter( rawTag, "<a href=""" ),
24
- // only_href = Text.BeforeDelimiter(strip_href, """" ),
9
+ ParsedAll = Html.Table( RawHtml, { {"Link", "a", each _ }}),
25
10
26
- // strip_close_a = Text.BeforeDelimiter( rawTag, "</a>" ),
27
- // a_text_only = Text.AfterDelimiter(strip_close_a, ">" ),
28
- // ret = [ FullRawText = rawTag, Href = only_href, Name = a_text_only ]
29
- // ][ret],
30
- // // ], // toggle lines to see each step in the calculation
11
+ #"Expanded Link" = Table.ExpandRecordColumn(ParsedAll, "Link",
12
+ { "TagName", "TextContent", "Attributes"},
13
+ { "Tag", "Content", "Attributes" } ),
31
14
32
- // final = parseUrl( RawSrc )
33
-
34
- ],
35
- Parsed = Summary[Parsed],
36
- #"Changed Type" = Table.TransformColumnTypes(Parsed,{{"Url", type any}}),
37
- #"Expanded Url" = Table.ExpandRecordColumn(#"Changed Type", "Url", {"TagName", "TextContent", "Attributes"}, {"Url.TagName", "Url.TextContent", "Url.Attributes"}),
38
- #"Expanded Url.Attributes" = Table.ExpandRecordColumn(#"Expanded Url", "Url.Attributes", {"href"}, {"Url.Attributes.href"}),
39
- #"Changed Type1" = Table.TransformColumnTypes(#"Expanded Url.Attributes",{{"Url.Attributes.href", type text}, {"Url.TextContent", type text}, {"Url.TagName", type text}})
15
+ #"Expanded Attributes" = Table.ExpandRecordColumn( #"Expanded Link", "Attributes", {"href"}, {"href"} ),
16
+ #"Changed Type" = Table.TransformColumnTypes(#"Expanded Attributes", {
17
+ {"href", type text}, {"Content", type text}, {"Tag", type text} })
40
18
in
41
- #"Changed Type1"
42
-
43
- // SingleString UsingSplits by long Delimiters
44
- let Summary = [
45
- Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText("i45Wiik1MDBOTlTIKEpNs41R0gfyXPIMQCDJzNLVJUZJoSSxKD21BCgXExOflJOYlx0D0ZQalJmckViUohCUmJKXWAkxSD8RIqcUGwsA", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RawHtml = _t]),
46
- #"Changed Type" = Table.TransformColumnTypes(Source,{{"RawHtml", type text}}),
47
-
48
- RawSrc_FromTable = #"Changed Type"{0}[RawHtml],
19
+ #"Changed Type"
49
20
21
+ // Raw Split by Long Delimiters
22
+ let
50
23
RawSrc = "<a href=""/003Dn00000b69ED"" target=""\_blank"">Richard Radnay</a>",
51
24
52
-
53
- delimsList = { "<a href=""", """ target=""_blank"""">", "</a>"},
54
- rendList = Text.Combine( delimsList, "#(cr,lf)" ),
55
- rendSplit = Text.Combine( splitUrl, "#(cr,lf)" ),
56
- SplitSaleTag = Splitter.SplitTextByEachDelimiter( delimsList, QuoteStyle.None ),
57
- splitUrl = SplitSaleTag( RawSrc ),
58
- s0 = Text.AfterDelimiter(RawSrc, "<a href="""),
59
- s1 = Text.BeforeDelimiter(s0, """"),
60
-
61
-
62
25
// parseUrl: pulls the href and the link text out of one raw `<a href="...">...</a>` tag
// using text delimiters only (no HTML parser involved).
//   rawTag  : the raw tag text to split
//   returns : record [ FullRawText = rawTag, Href = ..., Name = ... ]
//     Href  = the text after `<a href="` up to the next double quote
//     Name  = the text after `>` within the portion that precedes `</a>`
// The record literal computes each step as a named field; the trailing `[ret]`
// selects only the `ret` field, so the intermediate fields are not returned.
// NOTE(review): the opening delimiter is the exact text `<a href="`, so this
// presumably expects href to be the first attribute of the tag -- confirm for real inputs.
parseUrl = (rawTag as text) as record => [
63
- // parse html without a parser, but try to be strict
64
- strip_href = Text.AfterDelimiter( rawTag, "<a href=""" ),
65
- only_href = Text.BeforeDelimiter(strip_href, """" ),
66
-
67
- strip_close_a = Text.BeforeDelimiter( rawTag, "</a>" ),
68
- a_text_only = Text.AfterDelimiter(strip_close_a, ">" ),
69
- ret = [ FullRawText = rawTag, Href = only_href, Name = a_text_only ]
70
- ][ret],
26
+ // Parse the tag without an HTML parser: split on the literal delimiters below, kept as strict as possible
27
+ strip_href = Text.AfterDelimiter( rawTag, "<a href=""" ),
28
+ only_href = Text.BeforeDelimiter(strip_href, """" ),
29
+ strip_close_a = Text.BeforeDelimiter( rawTag, "</a>" ),
30
+ a_text_only = Text.AfterDelimiter(strip_close_a, ">" ),
31
+ ret = [ FullRawText = rawTag, Href = only_href, Name = a_text_only ]
32
+ ][ret],
71
33
// ], // debugging aid: use this line in place of the "][ret]," line above to return the whole record and inspect each intermediate step
72
34
73
35
final = parseUrl( RawSrc )
74
-
75
- ],
76
- final = Summary[final] in final
36
+ in
37
+ final
77
38
78
39
// PBIBlog Html Expand Extra Attributes
79
40
let
90
51
#"Expanded Link" = Table.ExpandRecordColumn(ParseHtml, "Link", {"TagName", "TextContent", "Attributes"}, {"Link.TagName", "Link.TextContent", "Link.Attributes"}),
91
52
#"Expanded Link.Attributes" = Table.ExpandRecordColumn(#"Expanded Link", "Link.Attributes", {"class", "href", "id", "aria-label", "data-bi-name", "role", "title", "target", "rel"}, {"Link.Attributes.class", "Link.Attributes.href", "Link.Attributes.id", "Link.Attributes.aria-label", "Link.Attributes.data-bi-name", "Link.Attributes.role", "Link.Attributes.title", "Link.Attributes.target", "Link.Attributes.rel"})
92
53
in
93
- #"Expanded Link.Attributes"
94
-
95
- // HtmlParse a Single String
96
- let
97
- SingleHtml = "<a href=""/003Dn00000b69ED"" target=""\_blank"">Richard Radnay</a>",
98
- ParsedAll = Html.Table( SingleHtml, { {"Link", "a", each _ }}),
99
- #"Expanded Link" = Table.ExpandRecordColumn(ParsedAll, "Link", {"TagName", "TextContent", "Attributes"}, {"Tag", "Content", "Attributes"}),
100
- #"Expanded Attributes" = Table.ExpandRecordColumn(#"Expanded Link", "Attributes", {"href"}, {"href"}),
101
- #"Changed Type" = Table.TransformColumnTypes(#"Expanded Attributes",{{"href", type text}, {"Content", type text}, {"Tag", type text}})
102
- in
103
- #"Changed Type"
54
+ #"Expanded Link.Attributes"
0 commit comments