Skip to content

Commit 11d4ece

Browse files
committed
add rest of checks
1 parent 8161b20 commit 11d4ece

24 files changed

+343
-8
lines changed

Diff for: R/tokenfilter.R

+11-7
Original file line number | Diff line number | Diff line change
@@ -100,13 +100,6 @@ step_tokenfilter <-
100100
res = NULL,
101101
skip = FALSE,
102102
id = rand_id("tokenfilter")) {
103-
if (percentage && (max_times > 1 | max_times < 0 |
104-
min_times > 1 | min_times < 0)) {
105-
cli::cli_abort(
106-
"{.arg max_times} and {.arg min_times} should be in the interval [0, 1]."
107-
)
108-
}
109-
110103
add_step(
111104
recipe,
112105
step_tokenfilter_new(
@@ -150,6 +143,17 @@ step_tokenfilter_new <-
150143
prep.step_tokenfilter <- function(x, training, info = NULL, ...) {
151144
col_names <- recipes_eval_select(x$terms, training, info)
152145

146+
check_bool(x$percentage, arg = "percentage")
147+
if (x$percentage) {
148+
check_number_decimal(x$max_times, min = 0, max = 1, arg = "max_times")
149+
check_number_decimal(x$min_times, min = 0, max = 1, arg = "min_times")
150+
} else {
151+
check_number_whole(x$max_times, min = 0, allow_infinite = TRUE, arg = "max_times")
152+
check_number_whole(x$min_times, min = 0, arg = "min_times")
153+
}
154+
check_number_whole(x$max_tokens, min = 0, arg = "max_tokens")
155+
check_function(x$filter_fun, allow_null = TRUE, arg = "filter_fun")
156+
153157
check_type(training[, col_names], types = "tokenlist")
154158

155159
retain_words <- list()

Diff for: R/tokenize.R

+4
Original file line number | Diff line number | Diff line change
@@ -285,6 +285,10 @@ step_tokenize_new <-
285285
prep.step_tokenize <- function(x, training, info = NULL, ...) {
286286
col_names <- recipes_eval_select(x$terms, training, info)
287287

288+
check_string(x$token, arg = "token")
289+
check_string(x$engine, arg = "engine")
290+
check_function(x$custom_token, allow_null = TRUE, arg = "custom_token")
291+
288292
training <- factor_to_text(training, col_names)
289293

290294
check_type(training[, col_names], types = c("string", "factor", "ordered"))

Diff for: R/tokenize_bpe.R

+2
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,8 @@ step_tokenize_bpe_new <-
113113
prep.step_tokenize_bpe <- function(x, training, info = NULL, ...) {
114114
col_names <- recipes_eval_select(x$terms, training, info)
115115

116+
check_number_whole(x$vocabulary_size, min = 0, arg = "vocabulary_size")
117+
116118
training <- factor_to_text(training, col_names)
117119

118120
check_type(training[, col_names], types = c("string", "factor", "ordered"))

Diff for: R/tokenize_sentencepiece.R

+2
Original file line number | Diff line number | Diff line change
@@ -112,6 +112,8 @@ step_tokenize_sentencepiece_new <-
112112
prep.step_tokenize_sentencepiece <- function(x, training, info = NULL, ...) {
113113
col_names <- recipes_eval_select(x$terms, training, info)
114114

115+
check_number_whole(x$vocabulary_size, min = 0, arg = "vocabulary_size")
116+
115117
training <- factor_to_text(training, col_names)
116118

117119
check_type(training[, col_names], types = c("string", "factor", "ordered"))

Diff for: R/tokenize_wordpiece.R

+3
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,9 @@ step_tokenize_wordpiece_new <-
106106
prep.step_tokenize_wordpiece <- function(x, training, info = NULL, ...) {
107107
col_names <- recipes_eval_select(x$terms, training, info)
108108

109+
check_string(x$unk_token, arg = "unk_token")
110+
check_number_whole(x$max_chars, min = 0, arg = "max_chars")
111+
109112
training <- factor_to_text(training, col_names)
110113

111114
check_type(training[, col_names], types = c("string", "factor", "ordered"))

Diff for: R/tokenmerge.R

+2
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,8 @@ step_tokenmerge_new <-
9595
prep.step_tokenmerge <- function(x, training, info = NULL, ...) {
9696
col_names <- recipes_eval_select(x$terms, training, info)
9797

98+
check_string(x$prefix, arg = "prefix")
99+
98100
check_type(training[, col_names], types = "tokenlist")
99101

100102
step_tokenmerge_new(

Diff for: R/untokenize.R

+2
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,8 @@ step_untokenize_new <-
100100
prep.step_untokenize <- function(x, training, info = NULL, ...) {
101101
col_names <- recipes_eval_select(x$terms, training, info)
102102

103+
check_string(x$sep, arg = "sep")
104+
103105
check_type(training[, col_names], types = "tokenlist")
104106

105107
step_untokenize_new(

Diff for: R/word_embeddings.R

+4-1
Original file line number | Diff line number | Diff line change
@@ -117,7 +117,7 @@ step_word_embeddings <- function(recipe,
117117
)
118118
}
119119

120-
aggregation <- match.arg(aggregation)
120+
aggregation <- rlang::arg_match(aggregation)
121121

122122
add_step(
123123
recipe,
@@ -160,6 +160,9 @@ step_word_embeddings_new <- function(terms, role, trained, columns, embeddings,
160160
prep.step_word_embeddings <- function(x, training, info = NULL, ...) {
161161
col_names <- recipes_eval_select(x$terms, training, info)
162162

163+
check_number_decimal(x$aggregation_default, arg = "aggregation_default")
164+
check_string(x$prefix, arg = "prefix")
165+
163166
check_type(training[, col_names], types = "tokenlist")
164167

165168
step_word_embeddings_new(

Diff for: tests/testthat/_snaps/tokenfilter.md

+67
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,73 @@
3636
* Tokenization for: text | Trained
3737
* Text filtering for: text | Trained
3838

39+
# bad args
40+
41+
Code
42+
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = "yes") %>% prep()
43+
Condition
44+
Error in `step_tokenfilter()`:
45+
Caused by error in `prep()`:
46+
! `percentage` must be `TRUE` or `FALSE`, not the string "yes".
47+
48+
---
49+
50+
Code
51+
recipe(~., data = mtcars) %>% step_tokenfilter(max_tokens = -4) %>% prep()
52+
Condition
53+
Error in `step_tokenfilter()`:
54+
Caused by error in `prep()`:
55+
! `max_tokens` must be a whole number larger than or equal to 0, not the number -4.
56+
57+
---
58+
59+
Code
60+
recipe(~., data = mtcars) %>% step_tokenfilter(filter_fun = -4) %>% prep()
61+
Condition
62+
Error in `step_tokenfilter()`:
63+
Caused by error in `prep()`:
64+
! `filter_fun` must be a function or `NULL`, not the number -4.
65+
66+
---
67+
68+
Code
69+
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = TRUE, max_times = 2) %>%
70+
prep()
71+
Condition
72+
Error in `step_tokenfilter()`:
73+
Caused by error in `prep()`:
74+
! `max_times` must be a number between 0 and 1, not the number 2.
75+
76+
---
77+
78+
Code
79+
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = TRUE, min_times = 2) %>%
80+
prep()
81+
Condition
82+
Error in `step_tokenfilter()`:
83+
Caused by error in `prep()`:
84+
! `min_times` must be a number between 0 and 1, not the number 2.
85+
86+
---
87+
88+
Code
89+
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = FALSE, max_times = -
90+
1) %>% prep()
91+
Condition
92+
Error in `step_tokenfilter()`:
93+
Caused by error in `prep()`:
94+
! `max_times` must be a whole number larger than or equal to 0, not the number -1.
95+
96+
---
97+
98+
Code
99+
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = FALSE, min_times = -
100+
1) %>% prep()
101+
Condition
102+
Error in `step_tokenfilter()`:
103+
Caused by error in `prep()`:
104+
! `min_times` must be a whole number larger than or equal to 0, not the number -1.
105+
39106
# bake method errors when needed non-standard role columns are missing
40107

41108
Code

Diff for: tests/testthat/_snaps/tokenize.md

+27
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,33 @@
1616
Caused by error in `prep()`:
1717
! The `engine` argument is not valid.
1818

19+
# bad args
20+
21+
Code
22+
recipe(~., data = mtcars) %>% step_tokenize(token = letters) %>% prep()
23+
Condition
24+
Error in `step_tokenize()`:
25+
Caused by error in `prep()`:
26+
! `token` must be a single string, not a character vector.
27+
28+
---
29+
30+
Code
31+
recipe(~., data = mtcars) %>% step_tokenize(engine = letters) %>% prep()
32+
Condition
33+
Error in `step_tokenize()`:
34+
Caused by error in `prep()`:
35+
! `engine` must be a single string, not a character vector.
36+
37+
---
38+
39+
Code
40+
recipe(~., data = mtcars) %>% step_tokenize(custom_token = "yes") %>% prep()
41+
Condition
42+
Error in `step_tokenize()`:
43+
Caused by error in `prep()`:
44+
! `custom_token` must be a function or `NULL`, not the string "yes".
45+
1946
# bake method errors when needed non-standard role columns are missing
2047

2148
Code

Diff for: tests/testthat/_snaps/tokenize_bpe.md

+9
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,12 @@
1+
# bad args
2+
3+
Code
4+
recipe(~., data = mtcars) %>% step_tokenize_bpe(vocabulary_size = -4) %>% prep()
5+
Condition
6+
Error in `step_tokenize_bpe()`:
7+
Caused by error in `prep()`:
8+
! `vocabulary_size` must be a whole number larger than or equal to 0, not the number -4.
9+
110
# bake method errors when needed non-standard role columns are missing
211

312
Code

Diff for: tests/testthat/_snaps/tokenize_sentencepiece.md

+10
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,16 @@
88
Caused by error in `prep()`:
99
! The `vocabulary_size` of 10 is too small for column `text1` which has a unique character count of 23.
1010

11+
# bad args
12+
13+
Code
14+
recipe(~., data = mtcars) %>% step_tokenize_sentencepiece(vocabulary_size = -4) %>%
15+
prep()
16+
Condition
17+
Error in `step_tokenize_sentencepiece()`:
18+
Caused by error in `prep()`:
19+
! `vocabulary_size` must be a whole number larger than or equal to 0, not the number -4.
20+
1121
# bake method errors when needed non-standard role columns are missing
1222

1323
Code

Diff for: tests/testthat/_snaps/tokenize_wordpiece.md

+18
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,21 @@
1+
# bad args
2+
3+
Code
4+
recipe(~., data = mtcars) %>% step_tokenize_wordpiece(unk_token = 0) %>% prep()
5+
Condition
6+
Error in `step_tokenize_wordpiece()`:
7+
Caused by error in `prep()`:
8+
! `unk_token` must be a single string, not the number 0.
9+
10+
---
11+
12+
Code
13+
recipe(~., data = mtcars) %>% step_tokenize_wordpiece(max_chars = -4) %>% prep()
14+
Condition
15+
Error in `step_tokenize_wordpiece()`:
16+
Caused by error in `prep()`:
17+
! `max_chars` must be a whole number larger than or equal to 0, not the number -4.
18+
119
# bake method errors when needed non-standard role columns are missing
220

321
Code

Diff for: tests/testthat/_snaps/tokenmerge.md

+9
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,15 @@
1818
! Name collision occurred. The following variable names already exist:
1919
* `tokenmerge`
2020

21+
# bad args
22+
23+
Code
24+
recipe(~., data = mtcars) %>% step_tokenmerge(prefix = NULL) %>% prep()
25+
Condition
26+
Error in `step_tokenmerge()`:
27+
Caused by error in `prep()`:
28+
! `prefix` must be a single string, not `NULL`.
29+
2130
# bake method errors when needed non-standard role columns are missing
2231

2332
Code

Diff for: tests/testthat/_snaps/untokenize.md

+9
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,12 @@
1+
# bad args
2+
3+
Code
4+
recipe(~., data = mtcars) %>% step_untokenize(sep = 0) %>% prep()
5+
Condition
6+
Error in `step_untokenize()`:
7+
Caused by error in `prep()`:
8+
! `sep` must be a single string, not the number 0.
9+
110
# bake method errors when needed non-standard role columns are missing
211

312
Code

Diff for: tests/testthat/_snaps/word_embeddings.md

+26
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,32 @@
88
! Name collision occurred. The following variable names already exist:
99
* `wordembed_text_d1`
1010

11+
# bad args
12+
13+
Code
14+
recipe(~., data = mtcars) %>% step_word_embeddings(aggregation = "wrong") %>%
15+
prep()
16+
Condition
17+
Error in `step_word_embeddings()`:
18+
! argument "embeddings" is missing, with no default
19+
20+
---
21+
22+
Code
23+
recipe(~., data = mtcars) %>% step_word_embeddings(aggregation_default = "yes") %>%
24+
prep()
25+
Condition
26+
Error in `step_word_embeddings()`:
27+
! argument "embeddings" is missing, with no default
28+
29+
---
30+
31+
Code
32+
recipe(~., data = mtcars) %>% step_word_embeddings(prefix = NULL) %>% prep()
33+
Condition
34+
Error in `step_word_embeddings()`:
35+
! argument "embeddings" is missing, with no default
36+
1137
# bake method errors when needed non-standard role columns are missing
1238

1339
Code

Diff for: tests/testthat/test-tokenfilter.R

+46
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,52 @@ test_that("tunable", {
129129
)
130130
})
131131

132+
test_that("bad args", {
133+
expect_snapshot(
134+
error = TRUE,
135+
recipe(~., data = mtcars) %>%
136+
step_tokenfilter(percentage = "yes") %>%
137+
prep()
138+
)
139+
expect_snapshot(
140+
error = TRUE,
141+
recipe(~., data = mtcars) %>%
142+
step_tokenfilter(max_tokens = -4) %>%
143+
prep()
144+
)
145+
expect_snapshot(
146+
error = TRUE,
147+
recipe(~., data = mtcars) %>%
148+
step_tokenfilter(filter_fun = -4) %>%
149+
prep()
150+
)
151+
expect_snapshot(
152+
error = TRUE,
153+
recipe(~., data = mtcars) %>%
154+
step_tokenfilter(percentage = TRUE, max_times = 2) %>%
155+
prep()
156+
)
157+
expect_snapshot(
158+
error = TRUE,
159+
recipe(~., data = mtcars) %>%
160+
step_tokenfilter(percentage = TRUE, min_times = 2) %>%
161+
prep()
162+
)
163+
expect_snapshot(
164+
error = TRUE,
165+
recipe(~., data = mtcars) %>%
166+
step_tokenfilter(percentage = FALSE, max_times = -1) %>%
167+
prep()
168+
)
169+
expect_snapshot(
170+
error = TRUE,
171+
recipe(~., data = mtcars) %>%
172+
step_tokenfilter(percentage = FALSE, min_times = -1) %>%
173+
prep()
174+
)
175+
})
176+
177+
132178
# Infrastructure ---------------------------------------------------------------
133179

134180
test_that("bake method errors when needed non-standard role columns are missing", {

0 commit comments

Comments
 (0)