tidymodels
diff --git a/tests/testthat/_snaps/R4.4/tokenize_bpe.new.md
+16 b/tests/testthat/_snaps/R4.4/tokenize_bpe.new.md
+16
diff --git a/tests/testthat/_snaps/dummy_hash.md
+5-5 b/tests/testthat/_snaps/dummy_hash.md
+5-5
diff --git a/tests/testthat/_snaps/lda.md
+10-7 b/tests/testthat/_snaps/lda.md
+10-7
diff --git a/tests/testthat/test-clean_levels.R
+16-10 b/tests/testthat/test-clean_levels.R
+16-10
diff --git a/tests/testthat/test-clean_names.R
+10-9 b/tests/testthat/test-clean_names.R
+10-9
diff --git a/tests/testthat/test-dummy_hash.R
+65-11 b/tests/testthat/test-dummy_hash.R
+65-11
@@ -0,0 +1,16 @@
+# Errors if vocabulary size is set to low.
+
+    Code
+      recipe(~text1, data = test_data) %>% step_tokenize_bpe(text1, vocabulary_size = 10) %>%
+        prep()
+    Condition
+      Warning in `read.dcf()`:
+      cannot open compressed file '/Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library/tokenizers.bpe/DESCRIPTION', probable reason 'No such file or directory'
+    Message
+      1 package (tokenizers.bpe) is needed for this step but is not installed.
+      To install run: `install.packages("tokenizers.bpe")`
+    Condition
+      Error in `step_tokenize_bpe()`:
+      Caused by error in `prep()`:
+      ! `vocabulary_size` of 10 is too small for column `text1` which has a unique character count of 23
+
@@ -70,10 +70,10 @@
       
       -- Inputs 
       Number of variables by role
-      predictor: 2
+      predictor: 5
       
       -- Operations 
-      * Feature hashing with: sponsor_code
+      * Feature hashing with: Species
 
 ---
 
@@ -85,11 +85,11 @@
       
       -- Inputs 
       Number of variables by role
-      predictor: 2
+      predictor: 5
       
       -- Training information 
-      Training data contained 20 data points and no incomplete rows.
+      Training data contained 150 data points and no incomplete rows.
       
       -- Operations 
-      * Feature hashing with: sponsor_code | Trained
+      * Feature hashing with: Species | Trained
 
@@ -70,28 +70,31 @@
       
       -- Inputs 
       Number of variables by role
-      predictor: 2
+      predictor: 5
       
       -- Operations 
-      * Tokenization for: medium
-      * Text feature extraction for: medium
+      * Tokenization for: Species
+      * Text feature extraction for: Species
 
 ---
 
     Code
       prep(rec)
+    Condition
+      Warning in `get_dtm()`:
+      dtm has 0 rows. Empty iterator?
     Message
       
       -- Recipe ----------------------------------------------------------------------
       
       -- Inputs 
       Number of variables by role
-      predictor: 2
+      predictor: 5
       
       -- Training information 
-      Training data contained 100 data points and no incomplete rows.
+      Training data contained 150 data points and no incomplete rows.
       
       -- Operations 
-      * Tokenization for: medium | Trained
-      * Text feature extraction for: medium | Trained
+      * Tokenization for: Species | Trained
+      * Text feature extraction for: Species | Trained
 
@@ -1,15 +1,13 @@
-library(testthat)
-library(textrecipes)
-library(modeldata)
-data("Smithsonian")
-smith_tr <- Smithsonian[1:15, ]
-smith_te <- Smithsonian[16:20, ]
-
-rec <- recipe(~., data = smith_tr)
-
 test_that("character input", {
   skip_if_not_installed("janitor")
-  cleaned <- rec %>% step_clean_levels(name, id = "")
+  skip_if_not_installed("modeldata")
+
+  data("Smithsonian", package = "modeldata")
+  smith_tr <- Smithsonian[1:15, ]
+  smith_te <- Smithsonian[16:20, ]
+
+  cleaned <- recipe(~., data = smith_tr) %>% 
+    step_clean_levels(name, id = "")
 
   tidy_exp_un <- tibble(
     terms = c("name"),
@@ -50,6 +48,9 @@ test_that("character input", {
 
 test_that("factor input", {
   skip_if_not_installed("janitor")
+  skip_if_not_installed("modeldata")
+
+  data("Smithsonian", package = "modeldata")
   smith_tr <- Smithsonian[1:15, ]
   smith_tr$name <- as.factor(smith_tr$name)
   smith_te <- Smithsonian[16:20, ]
@@ -71,6 +72,11 @@ test_that("factor input", {
 
 test_that("bake method errors when needed non-standard role columns are missing", {
   skip_if_not_installed("janitor")
+  skip_if_not_installed("modeldata")
+
+  data("Smithsonian", package = "modeldata")
+  smith_tr <- Smithsonian[1:15, ]
+  
   rec <- recipe(~name, data = smith_tr) %>%
     step_clean_levels(name) %>%
     update_role(name, new_role = "potato") %>%
 
@@ -1,15 +1,14 @@
-library(testthat)
-library(textrecipes)
-data(airquality)
+test_that("can clean names", {
+  skip_if_not_installed("janitor")
+  skip_if_not_installed("modeldata")
 
-air_tr <- airquality[1:20, ]
-air_te <- airquality[101:110, ]
+  data("airquality", package = "modeldata")
 
-rec <- recipe(~., data = air_tr)
+  air_tr <- airquality[1:20, ]
+  air_te <- airquality[101:110, ]
 
-test_that("can clean names", {
-  skip_if_not_installed("janitor")
-  cleaned <- rec %>% step_clean_names(all_predictors(), id = "")
+  cleaned <- recipe(~., data = air_tr) %>% 
+  step_clean_names(all_predictors(), id = "")
 
   tidy_exp_un <- tibble(
     terms = c("all_predictors()"),
@@ -35,6 +34,8 @@ test_that("can clean names", {
 # Infrastructure ---------------------------------------------------------------
 
 test_that("bake method errors when needed non-standard role columns are missing", {
+  skip_if_not_installed("janitor")
+  
   rec <- recipe(mtcars) %>%
     step_clean_names(disp) %>%
     update_role(disp, new_role = "potato") %>%
 
@@ -1,18 +1,19 @@
 library(textrecipes)
 library(recipes)
-data(grants, package = "modeldata")
 
-test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
-test_data <- tibble::as_tibble(test_data)
-
-rec <- recipe(~., data = test_data)
 
 test_that("hashing gives double outputs", {
   skip_if_not_installed("text2vec")
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
-  rec <- rec %>%
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
+  rec <- recipe(~., data = test_data) %>%
     step_dummy_hash(sponsor_code)
 
   obj <- rec %>%
@@ -32,9 +33,16 @@ test_that("hashing gives double outputs", {
 
 test_that("hashing multiple factors", {
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
+  skip_if_not_installed("text2vec")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
-  res <- rec %>%
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
+  res <- recipe(~., data = test_data) %>%
     step_dummy_hash(all_nominal_predictors(), num_terms = 12) %>%
     prep() %>%
     bake(new_data = NULL)
@@ -46,9 +54,16 @@ test_that("hashing multiple factors", {
 
 test_that("hashing collapsed multiple factors", {
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
+  skip_if_not_installed("text2vec")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
-  res <- rec %>%
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
+  res <- recipe(~., data = test_data) %>%
     step_dummy_hash(all_nominal_predictors(), num_terms = 4, collapse = TRUE) %>%
     prep() %>%
     bake(new_data = NULL)
@@ -60,9 +75,15 @@ test_that("hashing collapsed multiple factors", {
 test_that("hashing output width changes accordingly with num_terms", {
   skip_if_not_installed("text2vec")
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
-  rec <- rec %>%
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
+  rec <- recipe(~., data = test_data) %>%
     step_dummy_hash(sponsor_code, num_terms = 256) %>%
     prep()
 
@@ -77,7 +98,13 @@ test_that("hashing output width changes accordingly with num_terms", {
 test_that("hashing output width changes accordingly with num_terms", {
   skip_if_not_installed("text2vec")
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
+  
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
 
   signed <- recipe(~., data = test_data) %>%
     step_dummy_hash(all_predictors(), num_terms = 2) %>%
@@ -98,8 +125,14 @@ test_that("hashing output width changes accordingly with num_terms", {
 test_that("check_name() is used", {
   skip_if_not_installed("text2vec")
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
   dat <- test_data
   dat$text <- dat$sponsor_code
   dat$dummyhash_text_01 <- dat$sponsor_code
@@ -131,6 +164,15 @@ test_that("tunable", {
 # Infrastructure ---------------------------------------------------------------
 
 test_that("bake method errors when needed non-standard role columns are missing", {
+  skip_if_not_installed("modeldata")
+  skip_if_not_installed("text2vec")
+  data.table::setDTthreads(2) # because data.table uses all cores by default 
+  
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
   rec <- recipe(~sponsor_code, data = test_data) %>%
     step_dummy_hash(sponsor_code) %>%
     update_role(sponsor_code, new_role = "potato") %>%
@@ -190,8 +232,14 @@ test_that("empty selection tidy method works", {
 test_that("keep_original_cols works", {
   skip_if_not_installed("text2vec")
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
   new_names <- paste0("dummyhash_sponsor_code_", 1:5)
 
   rec <- recipe(~ sponsor_code, data = test_data) %>%
@@ -220,8 +268,14 @@ test_that("keep_original_cols works", {
 test_that("keep_original_cols - can prep recipes with it missing", {
   skip_if_not_installed("text2vec")
   skip_if_not_installed("data.table")
+  skip_if_not_installed("modeldata")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
+  data("grants", package = "modeldata")
+
+  test_data <- grants_test[1:20, c("contract_value_band", "sponsor_code")]
+  test_data <- tibble::as_tibble(test_data)
+
   rec <- recipe(~ sponsor_code, data = test_data) %>%
     step_dummy_hash(sponsor_code)
 
@@ -242,8 +296,8 @@ test_that("printing", {
   skip_if_not_installed("data.table")
   data.table::setDTthreads(2) # because data.table uses all cores by default 
 
-  rec <- rec %>%
-    step_dummy_hash(sponsor_code)
+  rec <- recipe(~., data = iris) %>%
+    step_dummy_hash(Species)
 
   expect_snapshot(print(rec))
   expect_snapshot(prep(rec))