We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 8eb6e16 · commit 06f5db2 · Copy full SHA for 06f5db2
src/datatrove/utils/word_tokenizers.py
@@ -32,7 +32,7 @@ def chunk_text_on_bytes(text: str, max_chunk_size: int = 1_000_000):
32
def __utf8len(s: str):
33
return len(s.encode("utf-8"))
34
35
- factor = len(text) / __utf8len(text)
+ factor = len(text) / __utf8len(text) if __utf8len(text) > 0 else 1
36
increase_by = int(max(min(max_chunk_size * 0.1, 10), 1))
37
initial_size_guess = int(max(max_chunk_size * factor - 10, 1))
38
final_list = []
0 commit comments