Skip to content

Commit b87c8d1

Browse files
lemire authored and anonrig committed
Various scheme-related optimizations. (#300)
* Various scheme-related optimizations. * Various fixes
1 parent 964fdad commit b87c8d1

File tree

7 files changed

+140
-43
lines changed

7 files changed

+140
-43
lines changed

include/ada/common_defs.h

+8
Original file line numberDiff line numberDiff line change
@@ -275,4 +275,12 @@ namespace ada {
275275
#define ADA_ASSERT_EQUAL(LHS, RHS, MESSAGE)
276276
#define ADA_ASSERT_TRUE(COND)
277277
#endif
278+
279+
280+
281+
#ifdef ADA_VISUAL_STUDIO
282+
// Visual Studio build: hand COND to the __assume intrinsic as an optimizer hint.
#define ADA_ASSUME(COND) __assume(COND)
283+
#else
284+
// Non-Visual-Studio build: if COND is false we reach __builtin_unreachable(),
// so the compiler is free to treat COND as always true. The do/while(0)
// wrapper makes the macro expand to exactly one statement.
#define ADA_ASSUME(COND)       \
  do {                         \
    if (!(COND)) {             \
      __builtin_unreachable(); \
    }                          \
  } while (0)
285+
#endif
278286
#endif // ADA_COMMON_DEFS_H

include/ada/url-inl.h

+2
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ inline bool url::base_fragment_has_value() const { return fragment.has_value();
154154

155155
inline bool url::base_search_has_value() const { return query.has_value(); }
156156

157+
/** Sets this URL's scheme type to FILE; no other member is written here. */
inline void url::set_protocol_as_file() {
  type = ada::scheme::type::FILE;
}
158+
157159
inline void url::set_scheme(std::string &&new_scheme) noexcept {
158160
type = ada::scheme::get_scheme_type(new_scheme);
159161
// We only move the 'scheme' if it is non-special.

include/ada/url.h

+2
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ struct url : url_base {
120120
inline bool base_fragment_has_value() const override;
121121
/** @private */
122122
inline bool base_search_has_value() const override;
123+
/** @private set this URL's type to file */
124+
inline void set_protocol_as_file();
123125
/** @return true if the URL has host */
124126
[[nodiscard]] inline bool has_hostname() const noexcept;
125127
/** @return true if it has a host but it is the empty string */

include/ada/url_aggregator-inl.h

+27
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ inline void url_aggregator::update_base_hostname(const std::string_view input) {
9797

9898
bool has_credential = components.protocol_end + 2 < components.host_start;
9999
uint32_t current_length = components.host_end - components.host_start;
100+
// next line could overflow but unsigned arithmetic has well-defined overflows.
100101
uint32_t new_difference = uint32_t(input.size()) - current_length;
101102
// The common case is current_length == 0.
102103
buffer.erase(components.host_start, current_length);
@@ -676,6 +677,32 @@ ada_really_inline size_t url_aggregator::parse_port(std::string_view view, bool
676677
return consumed;
677678
}
678679

680+
inline void url_aggregator::set_protocol_as_file() {
681+
ada_log("url_aggregator::set_protocol_as_file ");
682+
ADA_ASSERT_TRUE(validate());
683+
type = ada::scheme::type::FILE;
684+
// next line could overflow but unsigned arithmetic has well-defined overflows.
685+
uint32_t new_difference = 5 - components.protocol_end;
686+
687+
if(buffer.empty()) {
688+
buffer.append("file:");
689+
} else {
690+
buffer.erase(0, components.protocol_end);
691+
buffer.insert(0, "file:");
692+
}
693+
components.protocol_end = 5;
694+
695+
// Update the rest of the components.
696+
components.username_end += new_difference;
697+
components.host_start += new_difference;
698+
components.host_end += new_difference;
699+
components.pathname_start += new_difference;
700+
if (components.search_start != url_components::omitted) { components.search_start += new_difference; }
701+
if (components.hash_start != url_components::omitted) { components.hash_start += new_difference; }
702+
ADA_ASSERT_TRUE(validate());
703+
}
704+
705+
679706
}
680707

681708
#endif // ADA_URL_AGGREGATOR_INL_H

include/ada/url_aggregator.h

+8-3
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,17 @@ namespace ada {
4242
void set_search(const std::string_view input);
4343
void set_hash(const std::string_view input);
4444
inline void set_scheme(std::string_view new_scheme) noexcept;
45+
/** @private fast function to set the scheme from a view with a colon in the buffer, does not change type */
46+
inline void set_scheme_from_view_with_colon(std::string_view new_scheme_with_colon) noexcept;
47+
4548
inline void copy_scheme(const url_aggregator& u) noexcept;
4649

4750
[[nodiscard]] bool has_valid_domain() const noexcept override;
4851

52+
/** @private */
53+
inline bool has_authority() const noexcept;
54+
/** @private set this URL's type to file */
55+
inline void set_protocol_as_file();
4956
/**
5057
* The origin getter steps are to return the serialization of this’s URL’s
5158
* origin. [HTML]
@@ -233,9 +240,7 @@ namespace ada {
233240
inline void consume_prepared_path(std::string_view input);
234241
/** @private */
235242
template <bool has_state_override = false>
236-
[[nodiscard]] ada_really_inline bool parse_scheme(const std::string_view input);
237-
/** @private */
238-
inline bool has_authority() const noexcept;
243+
[[nodiscard]] ada_really_inline bool parse_scheme_with_colon(const std::string_view input);
239244

240245
/**
241246
* Useful for implementing efficient serialization for the URL.

src/parser.cpp

+23-3
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,22 @@ namespace ada::parser {
3737
// we must return.
3838
if(base_url != nullptr) { url.is_valid &= base_url->is_valid; }
3939
if(!url.is_valid) { return url; }
40+
if constexpr (result_type_is_ada_url_aggregator) {
41+
// Most of the time, we just need user_input.size().
42+
// In some instances, we may need a bit more.
43+
///////////////////////////
44+
// This is *very* important. This line should *not* be removed
45+
// hastily. There are principled reasons why reserve is important
46+
// for performance. If you have a benchmark with small inputs,
47+
// it may not matter, but in other instances, it could.
48+
////
49+
// This rounds up to the next power of two.
50+
// The `1 |` guarantees a non-zero argument: with an empty input the size is 0,
// and counting the leading zeroes of 0 is typically undefined (e.g.
// __builtin_clz(0)); a count of 32 would also make the 32-bit shift undefined.
// For any non-empty input the highest set bit, and so the result, is unchanged.
uint32_t reserve_capacity = (0xFFFFFFFF >> helpers::leading_zeroes(uint32_t(1 | user_input.size()))) + 1;
51+
url.reserve(reserve_capacity);
52+
//
53+
//
54+
//
55+
}
4056
std::string tmp_buffer;
4157
std::string_view internal_input;
4258
if(unicode::has_tabs_or_newline(user_input)) {
@@ -93,7 +109,12 @@ namespace ada::parser {
93109
// Otherwise, if c is U+003A (:), then:
94110
if ((input_position != input_size) && (url_data[input_position] == ':')) {
95111
ada_log("SCHEME the scheme should be ", url_data.substr(0,input_position));
96-
if(!url.parse_scheme(url_data.substr(0,input_position))) { return url; }
112+
if constexpr (result_type_is_ada_url) {
113+
if(!url.parse_scheme(url_data.substr(0,input_position))) { return url; }
114+
} else {
115+
// we pass the colon along instead of painfully adding it back.
116+
if(!url.parse_scheme_with_colon(url_data.substr(0,input_position+1))) { return url; }
117+
}
97118
ada_log("SCHEME the scheme is ", url.get_protocol());
98119

99120
// If url’s scheme is "file", then:
@@ -653,8 +674,7 @@ namespace ada::parser {
653674
ada_log("FILE ", helpers::substring(url_data, input_position));
654675
std::string_view file_view = helpers::substring(url_data, input_position);
655676

656-
// Set url’s scheme to "file".
657-
url.set_scheme("file");
677+
url.set_protocol_as_file();
658678
if constexpr (result_type_is_ada_url) {
659679
// Set url’s host to the empty string.
660680
url.host = "";

src/url_aggregator.cpp

+70-37
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414

1515
namespace ada {
1616
template <bool has_state_override>
17-
[[nodiscard]] ada_really_inline bool url_aggregator::parse_scheme(const std::string_view input) {
18-
ada_log("url_aggregator::parse_scheme ", input);
17+
[[nodiscard]] ada_really_inline bool url_aggregator::parse_scheme_with_colon(const std::string_view input_with_colon) {
18+
ada_log("url_aggregator::parse_scheme_with_colon ", input_with_colon);
1919
ADA_ASSERT_TRUE(validate());
20-
ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
20+
ADA_ASSERT_TRUE(!helpers::overlaps(input_with_colon, buffer));
21+
std::string_view input{input_with_colon};
22+
input.remove_suffix(1);
2123
auto parsed_type = ada::scheme::get_scheme_type(input);
2224
bool is_input_special = (parsed_type != ada::scheme::NOT_SPECIAL);
2325
/**
@@ -36,16 +38,15 @@ template <bool has_state_override>
3638
if (type == ada::scheme::type::FILE && components.host_start == components.host_end) { return true; }
3739
}
3840

39-
set_scheme(input);
41+
type = parsed_type;
42+
set_scheme_from_view_with_colon(input_with_colon);
4043

4144
if (has_state_override) {
4245
// This is uncommon.
4346
uint16_t urls_scheme_port = get_special_port();
4447

45-
if (urls_scheme_port) {
46-
// If url’s port is url’s scheme’s default port, then set url’s port to null.
47-
if (components.port == urls_scheme_port) { components.port = url_components::omitted; }
48-
}
48+
// If url’s port is url’s scheme’s default port, then set url’s port to null.
49+
if (components.port == urls_scheme_port) { components.port = url_components::omitted; }
4950
}
5051
} else { // slow path
5152
std::string _buffer = std::string(input);
@@ -69,16 +70,14 @@ template <bool has_state_override>
6970
if (type == ada::scheme::type::FILE && components.host_start == components.host_end) { return true; }
7071
}
7172

72-
set_scheme(std::move(_buffer));
73+
set_scheme(_buffer);
7374

7475
if (has_state_override) {
7576
// This is uncommon.
7677
uint16_t urls_scheme_port = get_special_port();
7778

78-
if (urls_scheme_port) {
79-
// If url’s port is url’s scheme’s default port, then set url’s port to null.
80-
if (components.port == urls_scheme_port) { components.port = url_components::omitted; }
81-
}
79+
// If url’s port is url’s scheme’s default port, then set url’s port to null.
80+
if (components.port == urls_scheme_port) { components.port = url_components::omitted; }
8281
}
8382
}
8483
ADA_ASSERT_TRUE(validate());
@@ -88,6 +87,7 @@ template <bool has_state_override>
8887
inline void url_aggregator::copy_scheme(const url_aggregator& u) noexcept {
8988
ada_log("url_aggregator::copy_scheme ", u.buffer);
9089
ADA_ASSERT_TRUE(validate());
90+
// next line could overflow but unsigned arithmetic has well-defined overflows.
9191
uint32_t new_difference = u.components.protocol_end - components.protocol_end;
9292
type = u.type;
9393
buffer.erase(0, components.protocol_end);
@@ -107,22 +107,48 @@ inline void url_aggregator::copy_scheme(const url_aggregator& u) noexcept {
107107
ADA_ASSERT_TRUE(validate());
108108
}
109109

110+
inline void url_aggregator::set_scheme_from_view_with_colon(std::string_view new_scheme_with_colon) noexcept {
111+
ada_log("url_aggregator::set_scheme_from_view_with_colon ", new_scheme_with_colon);
112+
ADA_ASSERT_TRUE(validate());
113+
ADA_ASSERT_TRUE(!new_scheme_with_colon.empty() && new_scheme_with_colon.back() == ':');
114+
// next line could overflow but unsigned arithmetic has well-defined overflows.
115+
uint32_t new_difference = uint32_t(new_scheme_with_colon.size()) - components.protocol_end;
116+
117+
if(buffer.empty()) {
118+
buffer.append(new_scheme_with_colon);
119+
} else {
120+
buffer.erase(0, components.protocol_end);
121+
buffer.insert(0, new_scheme_with_colon);
122+
}
123+
components.protocol_end += new_difference;
124+
125+
// Update the rest of the components.
126+
components.username_end += new_difference;
127+
components.host_start += new_difference;
128+
components.host_end += new_difference;
129+
components.pathname_start += new_difference;
130+
if (components.search_start != url_components::omitted) { components.search_start += new_difference; }
131+
if (components.hash_start != url_components::omitted) { components.hash_start += new_difference; }
132+
ADA_ASSERT_TRUE(validate());
133+
}
134+
135+
110136
inline void url_aggregator::set_scheme(std::string_view new_scheme) noexcept {
111137
ada_log("url_aggregator::set_scheme ", new_scheme);
112138
ADA_ASSERT_TRUE(validate());
113-
uint32_t new_difference = uint32_t(new_scheme.size()) - components.protocol_end;
114-
115-
// Optimization opportunity: Get rid of this branch
116-
if (new_scheme.back() != ':') { new_difference += 1; }
139+
ADA_ASSERT_TRUE(new_scheme.empty() || new_scheme.back() != ':');
140+
// next line could overflow but unsigned arithmetic has well-defined overflows.
141+
uint32_t new_difference = uint32_t(new_scheme.size()) - components.protocol_end + 1;
117142

118143
type = ada::scheme::get_scheme_type(new_scheme);
119-
buffer.erase(0, components.protocol_end);
120-
buffer.insert(0, helpers::concat(new_scheme, ":"));
144+
if(buffer.empty()) {
145+
buffer.append(helpers::concat(new_scheme, ":"));
146+
} else {
147+
buffer.erase(0, components.protocol_end);
148+
buffer.insert(0, helpers::concat(new_scheme, ":"));
149+
}
121150
components.protocol_end = uint32_t(new_scheme.size() + 1);
122151

123-
// No need to update the components
124-
if (new_difference == 0) { return; }
125-
126152
// Update the rest of the components.
127153
components.username_end += new_difference;
128154
components.host_start += new_difference;
@@ -149,7 +175,7 @@ bool url_aggregator::set_protocol(const std::string_view input) {
149175
std::string::iterator pointer = std::find_if_not(view.begin(), view.end(), unicode::is_alnum_plus);
150176

151177
if (pointer != view.end() && *pointer == ':') {
152-
return parse_scheme<true>(std::string_view(view.data(), pointer - view.begin()));
178+
return parse_scheme_with_colon<true>(std::string_view(view.data(), pointer - view.begin() + 1));
153179
}
154180
return false;
155181
}
@@ -1001,9 +1027,10 @@ std::string url_aggregator::to_diagram() const {
10011027
line2[i] = '-';
10021028
}
10031029
line2.append(" hash_start");
1030+
answer.append(line2);
1031+
answer.append("\n");
10041032
}
1005-
answer.append(line2);
1006-
answer.append("\n");
1033+
10071034

10081035
std::string line3 = line1;
10091036
if(components.search_start != url_components::omitted) {
@@ -1015,9 +1042,10 @@ std::string url_aggregator::to_diagram() const {
10151042
}
10161043
line3.append(" search_start ");
10171044
line3.append(std::to_string(components.search_start));
1045+
answer.append(line3);
1046+
answer.append("\n");
10181047
}
1019-
answer.append(line3);
1020-
answer.append("\n");
1048+
10211049

10221050
std::string line4 = line1;
10231051
if(components.pathname_start != buffer.size()) {
@@ -1028,9 +1056,10 @@ std::string url_aggregator::to_diagram() const {
10281056
}
10291057
line4.append(" pathname_start ");
10301058
line4.append(std::to_string(components.pathname_start));
1059+
answer.append(line4);
1060+
answer.append("\n");
10311061
}
1032-
answer.append(line4);
1033-
answer.append("\n");
1062+
10341063

10351064
std::string line5 = line1;
10361065
if(components.host_end != buffer.size()) {
@@ -1042,9 +1071,10 @@ std::string url_aggregator::to_diagram() const {
10421071
}
10431072
line5.append(" host_end ");
10441073
line5.append(std::to_string(components.host_end));
1074+
answer.append(line5);
1075+
answer.append("\n");
10451076
}
1046-
answer.append(line5);
1047-
answer.append("\n");
1077+
10481078

10491079
std::string line6 = line1;
10501080
if(components.host_start != buffer.size()) {
@@ -1056,9 +1086,10 @@ std::string url_aggregator::to_diagram() const {
10561086
}
10571087
line6.append(" host_start ");
10581088
line6.append(std::to_string(components.host_start));
1089+
answer.append(line6);
1090+
answer.append("\n");
10591091
}
1060-
answer.append(line6);
1061-
answer.append("\n");
1092+
10621093

10631094
std::string line7 = line1;
10641095
if(components.username_end != buffer.size()) {
@@ -1070,9 +1101,10 @@ std::string url_aggregator::to_diagram() const {
10701101
}
10711102
line7.append(" username_end ");
10721103
line7.append(std::to_string(components.username_end));
1104+
answer.append(line7);
1105+
answer.append("\n");
10731106
}
1074-
answer.append(line7);
1075-
answer.append("\n");
1107+
10761108

10771109
std::string line8 = line1;
10781110
if(components.protocol_end != buffer.size()) {
@@ -1084,9 +1116,10 @@ std::string url_aggregator::to_diagram() const {
10841116
}
10851117
line8.append(" protocol_end ");
10861118
line8.append(std::to_string(components.protocol_end));
1119+
answer.append(line8);
1120+
answer.append("\n");
10871121
}
1088-
answer.append(line8);
1089-
answer.append("\n");
1122+
10901123

10911124
if(components.hash_start == url_components::omitted) {
10921125
answer.append("note: hash omitted\n");

0 commit comments

Comments
 (0)