Skip to content

Commit

Permalink
Merge pull request #325 from ROBERT-MCDOWELL/v25
Browse files Browse the repository at this point in the history
fix splitting sentences blanks. various fixes
  • Loading branch information
ROBERT-MCDOWELL authored Feb 19, 2025
2 parents 3852596 + 860138a commit cd48073
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 7 deletions.
9 changes: 6 additions & 3 deletions lib/classes/tts_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(self, session):
self._build()

def _build(self):
self.params['current_voice_path'] = None
if self.session['tts_engine'] == XTTSv2:
if self.session['custom_model'] is not None:
self.model_name = os.path.basename(self.session['custom_model'])
Expand Down Expand Up @@ -168,15 +169,17 @@ def convert_sentence_to_audio(self):
'''
if self.session['tts_engine'] == XTTSv2:
if self.session['custom_model'] is not None or self.session['fine_tuned'] != 'internal':
msg = 'Computing speaker latents...'
print(msg)
self.params['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],'ref.wav') if self.session['custom_model']
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice'] if self.session['fine_tuned']
else models[self.session['tts_engine']]['internal']['voice']
)
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.params['tts'].get_conditioning_latents(audio_path=[self.params['voice_path']])
if self.params['current_voice_path'] != self.params['voice_path']:
msg = 'Computing speaker latents...'
print(msg)
self.params['current_voice_path'] = self.params['voice_path']
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.params['tts'].get_conditioning_latents(audio_path=[self.params['voice_path']])
with torch.no_grad():
result = self.params['tts'].inference(
text=self.params['sentence'],
Expand Down
45 changes: 41 additions & 4 deletions lib/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,8 @@ def filter_chapter(doc, lang, lang_iso1, tts_engine):
text = normalize_text(text, lang, lang_iso1, tts_engine)
# Create regex pattern from punctuation list to split the phoneme_list
escaped_punctuation = re.escape(''.join(punctuation_list))
punctuation_pattern_split = rf'([^{"".join(escaped_punctuation)}]+|[{escaped_punctuation}])'
#punctuation_pattern_split = rf'([^{"".join(escaped_punctuation)}]+|[{escaped_punctuation}])'
punctuation_pattern_split = rf'(\S.*?[{"".join(escaped_punctuation)}])|\S+'
# Split by punctuation marks while keeping the punctuation at the end of each word
tmp_list = re.findall(punctuation_pattern_split, text)
phoneme_list = [phoneme.strip() for phoneme in tmp_list if phoneme.strip()]
Expand All @@ -575,7 +576,7 @@ def filter_pattern(doc_identifier):
elif re.match(r'^\d+$', segment):
return 'numbers'
return None

'''
def get_sentences(phoneme_list, max_tokens):
sentences = []
current_sentence = ""
Expand All @@ -602,7 +603,42 @@ def get_sentences(phoneme_list, max_tokens):
if current_sentence:
sentences.append(current_sentence.strip())
return sentences

'''

def get_sentences(phoneme_list, max_tokens):
    """Group text fragments into sentences of roughly ``max_tokens`` words.

    Fragments are appended to the sentence being built until adding the next
    one would push the whitespace-separated word count past ``max_tokens``.
    The sentence is then cut on punctuation rather than at an arbitrary space:

    * if it already ends with a mark from the module-level
      ``punctuation_list``, it is emitted as is;
    * otherwise it is cut right after the last punctuation mark it contains
      and the remainder is carried over in front of the incoming fragment;
    * if it contains no punctuation at all, it is emitted whole.

    The limit is best-effort: a single fragment longer than ``max_tokens`` is
    never split, and a carried-over remainder plus the incoming fragment may
    exceed the limit.

    Args:
        phoneme_list: Iterable of text fragments (strings).
        max_tokens: Target maximum number of whitespace-separated words per
            sentence.

    Returns:
        A list of stripped, non-empty sentences in their original order.
    """
    # str.endswith accepts a tuple, so one call tests every mark.
    marks = tuple(punctuation_list)
    sentences = []
    current_sentence = ""
    current_phoneme_count = 0

    def emit(text):
        # Skip blank entries: an oversized first fragment reaches the flush
        # path while the buffer is still empty.
        text = text.strip()
        if text:
            sentences.append(text)

    for phoneme in phoneme_list:
        part_phoneme_count = len(phoneme.split())

        if current_phoneme_count + part_phoneme_count <= max_tokens:
            current_sentence += (" " if current_sentence else "") + phoneme
            current_phoneme_count += part_phoneme_count
            continue

        if current_sentence.endswith(marks):
            cut = len(current_sentence)
        else:
            # End offset of the right-most punctuation mark. Adding len(mark)
            # keeps multi-character marks intact. With no mark present, the
            # whole buffer is flushed.
            cut = max(
                (
                    current_sentence.rfind(mark) + len(mark)
                    for mark in marks
                    if mark in current_sentence
                ),
                default=len(current_sentence),
            )

        emit(current_sentence[:cut])
        carried = current_sentence[cut:].strip()
        current_sentence = f"{carried} {phoneme}" if carried else phoneme
        current_phoneme_count = len(current_sentence.split())

    emit(current_sentence)
    return sentences

def get_batch_size(list, session):
total_size = 0
print(list)
Expand Down Expand Up @@ -1596,7 +1632,6 @@ def process_cleanup(state):
label='Enable Text Splitting',
value=default_xtts_settings['enable_text_splitting'],
info='Coqui-tts builtin text splitting. Can help against hallucinations bu can also be worse.',
visible=False
)

gr_state = gr.State(value={"hash": None})
Expand Down Expand Up @@ -2076,6 +2111,8 @@ def change_gr_fine_tuned_list(selected, id):
visible = False
if selected == 'internal' and session['tts_engine'] == XTTSv2:
visible = visible_gr_group_custom_model
else:
visible = False
session['fine_tuned'] = selected
return gr.update(visible=visible)

Expand Down

0 comments on commit cd48073

Please sign in to comment.