alteryx · thehomebrewnerd · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -6,13 +6,14 @@ Future Release
 ==============
     * Enhancements
     * Fixes
+        * Fix bug in ``CountString`` when the input contains null values (:pr:`154`)
     * Changes
     * Documentation Changes
         * Update release branch naming convention in documentation (:pr:`155`)
     * Testing Changes
 
     Thanks to the following people for contributing to this release:
-    :user:`rwedge`
+    :user:`rwedge`, :user:`thehomebrewnerd`
 
 v2.6.0 Jun 16, 2022
 ===================

diff --git a/nlp_primitives/count_string.py b/nlp_primitives/count_string.py
@@ -3,7 +3,7 @@
 import numpy as np
 from featuretools.primitives import TransformPrimitive
 from woodwork.column_schema import ColumnSchema
-from woodwork.logical_types import Integer, NaturalLanguage
+from woodwork.logical_types import IntegerNullable, NaturalLanguage
 
 
 class CountString(TransformPrimitive):
@@ -27,38 +27,38 @@ class CountString(TransformPrimitive):
         >>> count_string(["The problem was difficult.",
         ...               "He was there.",
         ...               "The girl went to the store."]).tolist()
-        [1, 1, 2]
+        [1.0, 1.0, 2.0]
         >>> # Match case of string
         >>> count_string_ignore_case = CountString(string="the", ignore_case=False)
         >>> count_string_ignore_case(["The problem was difficult.",
         ...                           "He was there.",
         ...                           "The girl went to the store."]).tolist()
-        [0, 1, 1]
+        [0.0, 1.0, 1.0]
         >>> # Ignore non-alphanumeric characters in the search
         >>> count_string_ignore_non_alphanumeric = CountString(string="the",
         ...                                                    ignore_non_alphanumeric=True)
         >>> count_string_ignore_non_alphanumeric(["Th*/e problem was difficult.",
         ...                                       "He was there.",
         ...                                       "The girl went to the store."]).tolist()
-        [1, 1, 2]
+        [1.0, 1.0, 2.0]
         >>> # Specify the string as a regex
         >>> count_string_is_regex = CountString(string="t.e", is_regex=True)
         >>> count_string_is_regex(["The problem was difficult.",
         ...                        "He was there.",
         ...                        "The girl went to the store."]).tolist()
-        [1, 1, 2]
+        [1.0, 1.0, 2.0]
         >>> # Match whole words only
         >>> count_string_match_whole_words_only = CountString(string="the",
         ...                                                   match_whole_words_only=True)
         >>> count_string_match_whole_words_only(["The problem was difficult.",
         ...                                      "He was there.",
         ...                                      "The girl went to the store."]).tolist()
-        [1, 0, 2]
+        [1.0, 0.0, 2.0]
     """
 
     name = "count_string"
     input_types = [ColumnSchema(logical_type=NaturalLanguage)]
-    return_type = ColumnSchema(logical_type=Integer, semantic_tags={"numeric"})
+    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
 
     def __init__(
         self,
@@ -103,4 +103,4 @@ def count_string(words):
             words = self.process_text(words)
             return len(re.findall(self.pattern, words))
 
-        return np.vectorize(count_string)
+        return np.vectorize(count_string, otypes=[float])
diff --git a/nlp_primitives/tests/test_count_string.py b/nlp_primitives/tests/test_count_string.py
@@ -199,12 +199,29 @@ def test_nan(self):
 
     def test_with_featuretools(self, es):
         transform, aggregation = find_applicable_primitives(self.primitive)
-        primitive_instantiate = self.primitive(
+        primitive_instance = self.primitive(
             "the",
             ignore_case=True,
             ignore_non_alphanumeric=False,
             is_regex=False,
             match_whole_words_only=False,
         )
-        transform.append(primitive_instantiate)
+        transform.append(primitive_instance)
+        valid_dfs(es, aggregation, transform, self.primitive.name.upper())
+
+    def test_with_featuretools_nan(self, es):
+        comments = es["log"]["comments"]
+        comments[1] = pd.NA
+        comments[2] = np.nan
+        comments[3] = None
+        es["log"].ww["comments"] = comments
+        transform, aggregation = find_applicable_primitives(self.primitive)
+        primitive_instance = self.primitive(
+            "the",
+            ignore_case=True,
+            ignore_non_alphanumeric=False,
+            is_regex=False,
+            match_whole_words_only=False,
+        )
+        transform.append(primitive_instance)
         valid_dfs(es, aggregation, transform, self.primitive.name.upper())
diff --git a/nlp_primitives/whitespace_count.py b/nlp_primitives/whitespace_count.py
@@ -14,7 +14,7 @@ class WhitespaceCount(CountString):
         >>> x = ['', 'hi im ethan', 'multiple    spaces']
         >>> upper_case_count = WhitespaceCount()
         >>> upper_case_count(x).tolist()
-        [0, 2, 4]
+        [0.0, 2.0, 4.0]
     """
 
     name = "whitespace_count"