Add remaining argument checks to step prep() methods

EmilHvitfeldt · EmilHvitfeldt · commit 11d4ece8e116 · 2024-11-10T15:53:56.000-08:00
diff --git a/R/tokenfilter.R b/R/tokenfilter.R
@@ -100,13 +100,6 @@ step_tokenfilter <-
            res = NULL,
            skip = FALSE,
            id = rand_id("tokenfilter")) {
-    if (percentage && (max_times > 1 | max_times < 0 |
-      min_times > 1 | min_times < 0)) {
-      cli::cli_abort(
-        "{.arg max_times} and {.arg min_times} should be in the interval [0, 1]."
-      )
-    }
-    
     add_step(
       recipe,
       step_tokenfilter_new(
@@ -150,6 +143,17 @@ step_tokenfilter_new <-
 prep.step_tokenfilter <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_bool(x$percentage, arg = "percentage")
+  if (x$percentage) {
+    check_number_decimal(x$max_times, min = 0, max = 1, arg = "max_times")
+    check_number_decimal(x$min_times, min = 0, max = 1, arg = "min_times")
+  } else {
+    check_number_whole(x$max_times, min = 0, allow_infinite = TRUE, arg = "max_times")
+    check_number_whole(x$min_times, min = 0, arg = "min_times")
+  }
+  check_number_whole(x$max_tokens, min = 0, arg = "max_tokens")
+  check_function(x$filter_fun, allow_null = TRUE, arg = "filter_fun")
+  
   check_type(training[, col_names], types = "tokenlist")
 
   retain_words <- list()
diff --git a/R/tokenize.R b/R/tokenize.R
@@ -285,6 +285,10 @@ step_tokenize_new <-
 prep.step_tokenize <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_string(x$token, arg = "token")
+  check_string(x$engine, arg = "engine")
+  check_function(x$custom_token, allow_null = TRUE, arg = "custom_token")
+
   training <- factor_to_text(training, col_names)
 
   check_type(training[, col_names], types = c("string", "factor", "ordered"))
diff --git a/R/tokenize_bpe.R b/R/tokenize_bpe.R
@@ -113,6 +113,8 @@ step_tokenize_bpe_new <-
 prep.step_tokenize_bpe <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_number_whole(x$vocabulary_size, min = 0, arg = "vocabulary_size")
+
   training <- factor_to_text(training, col_names)
 
   check_type(training[, col_names], types = c("string", "factor", "ordered"))
diff --git a/R/tokenize_sentencepiece.R b/R/tokenize_sentencepiece.R
@@ -112,6 +112,8 @@ step_tokenize_sentencepiece_new <-
 prep.step_tokenize_sentencepiece <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_number_whole(x$vocabulary_size, min = 0, arg = "vocabulary_size")
+
   training <- factor_to_text(training, col_names)
 
   check_type(training[, col_names], types = c("string", "factor", "ordered"))
diff --git a/R/tokenize_wordpiece.R b/R/tokenize_wordpiece.R
@@ -106,6 +106,9 @@ step_tokenize_wordpiece_new <-
 prep.step_tokenize_wordpiece <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_string(x$unk_token, arg = "unk_token")
+  check_number_whole(x$max_chars, min = 0, arg = "max_chars")
+
   training <- factor_to_text(training, col_names)
 
   check_type(training[, col_names], types = c("string", "factor", "ordered"))
diff --git a/R/tokenmerge.R b/R/tokenmerge.R
@@ -95,6 +95,8 @@ step_tokenmerge_new <-
 prep.step_tokenmerge <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_string(x$prefix, arg = "prefix")
+  
   check_type(training[, col_names], types = "tokenlist")
 
   step_tokenmerge_new(
diff --git a/R/untokenize.R b/R/untokenize.R
@@ -100,6 +100,8 @@ step_untokenize_new <-
 prep.step_untokenize <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_string(x$sep, arg = "sep")
+
   check_type(training[, col_names], types = "tokenlist")
 
   step_untokenize_new(
diff --git a/R/word_embeddings.R b/R/word_embeddings.R
@@ -117,7 +117,7 @@ step_word_embeddings <- function(recipe,
     )
   }
   
-  aggregation <- match.arg(aggregation)
+  aggregation <- rlang::arg_match(aggregation)
 
   add_step(
     recipe,
@@ -160,6 +160,9 @@ step_word_embeddings_new <- function(terms, role, trained, columns, embeddings,
 prep.step_word_embeddings <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
 
+  check_number_decimal(x$aggregation_default, arg = "aggregation_default")
+  check_string(x$prefix, arg = "prefix")
+
   check_type(training[, col_names], types = "tokenlist")
 
   step_word_embeddings_new(
diff --git a/tests/testthat/_snaps/tokenfilter.md b/tests/testthat/_snaps/tokenfilter.md
@@ -36,6 +36,73 @@
       * Tokenization for: text | Trained
       * Text filtering for: text | Trained
 
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenfilter(percentage = "yes") %>% prep()
+    Condition
+      Error in `step_tokenfilter()`:
+      Caused by error in `prep()`:
+      ! `percentage` must be `TRUE` or `FALSE`, not the string "yes".
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenfilter(max_tokens = -4) %>% prep()
+    Condition
+      Error in `step_tokenfilter()`:
+      Caused by error in `prep()`:
+      ! `max_tokens` must be a whole number larger than or equal to 0, not the number -4.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenfilter(filter_fun = -4) %>% prep()
+    Condition
+      Error in `step_tokenfilter()`:
+      Caused by error in `prep()`:
+      ! `filter_fun` must be a function or `NULL`, not the number -4.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenfilter(percentage = TRUE, max_times = 2) %>%
+        prep()
+    Condition
+      Error in `step_tokenfilter()`:
+      Caused by error in `prep()`:
+      ! `max_times` must be a number between 0 and 1, not the number 2.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenfilter(percentage = TRUE, min_times = 2) %>%
+        prep()
+    Condition
+      Error in `step_tokenfilter()`:
+      Caused by error in `prep()`:
+      ! `min_times` must be a number between 0 and 1, not the number 2.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenfilter(percentage = FALSE, max_times = -
+        1) %>% prep()
+    Condition
+      Error in `step_tokenfilter()`:
+      Caused by error in `prep()`:
+      ! `max_times` must be a whole number larger than or equal to 0, not the number -1.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenfilter(percentage = FALSE, min_times = -
+        1) %>% prep()
+    Condition
+      Error in `step_tokenfilter()`:
+      Caused by error in `prep()`:
+      ! `min_times` must be a whole number larger than or equal to 0, not the number -1.
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/_snaps/tokenize.md b/tests/testthat/_snaps/tokenize.md
@@ -16,6 +16,33 @@
       Caused by error in `prep()`:
       ! The `engine` argument is not valid.
 
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenize(token = letters) %>% prep()
+    Condition
+      Error in `step_tokenize()`:
+      Caused by error in `prep()`:
+      ! `token` must be a single string, not a character vector.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenize(engine = letters) %>% prep()
+    Condition
+      Error in `step_tokenize()`:
+      Caused by error in `prep()`:
+      ! `engine` must be a single string, not a character vector.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenize(custom_token = "yes") %>% prep()
+    Condition
+      Error in `step_tokenize()`:
+      Caused by error in `prep()`:
+      ! `custom_token` must be a function or `NULL`, not the string "yes".
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/_snaps/tokenize_bpe.md b/tests/testthat/_snaps/tokenize_bpe.md
@@ -1,3 +1,12 @@
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenize_bpe(vocabulary_size = -4) %>% prep()
+    Condition
+      Error in `step_tokenize_bpe()`:
+      Caused by error in `prep()`:
+      ! `vocabulary_size` must be a whole number larger than or equal to 0, not the number -4.
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/_snaps/tokenize_sentencepiece.md b/tests/testthat/_snaps/tokenize_sentencepiece.md
@@ -8,6 +8,16 @@
       Caused by error in `prep()`:
       ! The `vocabulary_size` of 10 is too small for column `text1` which has a unique character count of 23.
 
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenize_sentencepiece(vocabulary_size = -4) %>%
+        prep()
+    Condition
+      Error in `step_tokenize_sentencepiece()`:
+      Caused by error in `prep()`:
+      ! `vocabulary_size` must be a whole number larger than or equal to 0, not the number -4.
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/_snaps/tokenize_wordpiece.md b/tests/testthat/_snaps/tokenize_wordpiece.md
@@ -1,3 +1,21 @@
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenize_wordpiece(unk_token = 0) %>% prep()
+    Condition
+      Error in `step_tokenize_wordpiece()`:
+      Caused by error in `prep()`:
+      ! `unk_token` must be a single string, not the number 0.
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenize_wordpiece(max_chars = -4) %>% prep()
+    Condition
+      Error in `step_tokenize_wordpiece()`:
+      Caused by error in `prep()`:
+      ! `max_chars` must be a whole number larger than or equal to 0, not the number -4.
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/_snaps/tokenmerge.md b/tests/testthat/_snaps/tokenmerge.md
@@ -18,6 +18,15 @@
       ! Name collision occurred. The following variable names already exist:
       * `tokenmerge`
 
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_tokenmerge(prefix = NULL) %>% prep()
+    Condition
+      Error in `step_tokenmerge()`:
+      Caused by error in `prep()`:
+      ! `prefix` must be a single string, not `NULL`.
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/_snaps/untokenize.md b/tests/testthat/_snaps/untokenize.md
@@ -1,3 +1,12 @@
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_untokenize(sep = 0) %>% prep()
+    Condition
+      Error in `step_untokenize()`:
+      Caused by error in `prep()`:
+      ! `sep` must be a single string, not the number 0.
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/_snaps/word_embeddings.md b/tests/testthat/_snaps/word_embeddings.md
@@ -8,6 +8,32 @@
       ! Name collision occurred. The following variable names already exist:
       * `wordembed_text_d1`
 
+# bad args
+
+    Code
+      recipe(~., data = mtcars) %>% step_word_embeddings(aggregation = "wrong") %>%
+        prep()
+    Condition
+      Error in `step_word_embeddings()`:
+      ! argument "embeddings" is missing, with no default
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_word_embeddings(aggregation_default = "yes") %>%
+        prep()
+    Condition
+      Error in `step_word_embeddings()`:
+      ! argument "embeddings" is missing, with no default
+
+---
+
+    Code
+      recipe(~., data = mtcars) %>% step_word_embeddings(prefix = NULL) %>% prep()
+    Condition
+      Error in `step_word_embeddings()`:
+      ! argument "embeddings" is missing, with no default
+
 # bake method errors when needed non-standard role columns are missing
 
     Code
diff --git a/tests/testthat/test-tokenfilter.R b/tests/testthat/test-tokenfilter.R
@@ -129,6 +129,52 @@ test_that("tunable", {
   )
 })
 
+test_that("bad args", {
+  expect_snapshot(
+    error = TRUE,
+    recipe(~., data = mtcars) %>%
+      step_tokenfilter(percentage = "yes") %>%
+      prep()
+  )
+  expect_snapshot(
+    error = TRUE,
+    recipe(~., data = mtcars) %>%
+      step_tokenfilter(max_tokens = -4) %>%
+      prep()
+  )
+  expect_snapshot(
+    error = TRUE,
+    recipe(~., data = mtcars) %>%
+      step_tokenfilter(filter_fun = -4) %>%
+      prep()
+  )
+  expect_snapshot(
+    error = TRUE,
+    recipe(~., data = mtcars) %>%
+      step_tokenfilter(percentage = TRUE, max_times = 2) %>%
+      prep()
+  )
+  expect_snapshot(
+    error = TRUE,
+    recipe(~., data = mtcars) %>%
+      step_tokenfilter(percentage = TRUE, min_times = 2) %>%
+      prep()
+  )
+  expect_snapshot(
+    error = TRUE,
+    recipe(~., data = mtcars) %>%
+      step_tokenfilter(percentage = FALSE, max_times = -1) %>%
+      prep()
+  )
+  expect_snapshot(
+    error = TRUE,
+    recipe(~., data = mtcars) %>%
+      step_tokenfilter(percentage = FALSE, min_times = -1) %>%
+      prep()
+  )
+})
+
+
 # Infrastructure ---------------------------------------------------------------
 
 test_that("bake method errors when needed non-standard role columns are missing", {
diff --git a/tests/testthat/test-tokenize.R b/tests/testthat/test-tokenize.R
diff --git a/tests/testthat/test-tokenize_bpe.R b/tests/testthat/test-tokenize_bpe.R
diff --git a/tests/testthat/test-tokenize_sentencepiece.R b/tests/testthat/test-tokenize_sentencepiece.R
diff --git a/tests/testthat/test-tokenize_wordpiece.R b/tests/testthat/test-tokenize_wordpiece.R
diff --git a/tests/testthat/test-tokenmerge.R b/tests/testthat/test-tokenmerge.R
diff --git a/tests/testthat/test-untokenize.R b/tests/testthat/test-untokenize.R
diff --git a/tests/testthat/test-word_embeddings.R b/tests/testthat/test-word_embeddings.R

Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,7 @@ step_word_embeddings <- function(recipe,`
`117`	`117`	`)`
`118`	`118`	`}`
`119`	`119`
`120`		`- aggregation <- match.arg(aggregation)`
	`120`	`+ aggregation <- rlang::arg_match(aggregation)`
`121`	`121`
`122`	`122`	`add_step(`
`123`	`123`	`recipe,`
`@@ -160,6 +160,9 @@ step_word_embeddings_new <- function(terms, role, trained, columns, embeddings,`
`160`	`160`	`prep.step_word_embeddings <- function(x, training, info = NULL, ...) {`
`161`	`161`	`col_names <- recipes_eval_select(x$terms, training, info)`
`162`	`162`
	`163`	`+ check_number_decimal(x$aggregation_default, arg = "aggregation_default")`
	`164`	`+ check_string(x$prefix, arg = "prefix")`
	`165`	`+`
`163`	`166`	`check_type(training[, col_names], types = "tokenlist")`
`164`	`167`
`165`	`168`	`step_word_embeddings_new(`