diff --git a/README.md b/README.md index cee661d..b899780 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ One can use `pip` to install `pdf2video` directly from GitHub: ``` python3 -m pip install git+https://github.com/tjunttila/pdf2video.git ``` -See the [PyPA Installing Packages tutorial](https://packaging.python.org/tutorials/installing-packages/) if you are not familiar with installing Python packages. +See the [PyPA Installing Packages tutorial](https://packaging.python.org/tutorials/installing-packages/) for information on installing Python packages and on Python virtual environments. # Usage diff --git a/pdf2video/parser.py b/pdf2video/parser.py index 9cd8a09..76f4fee 100644 --- a/pdf2video/parser.py +++ b/pdf2video/parser.py @@ -4,19 +4,29 @@ License: The MIT License """ +from abc import ABC, abstractmethod import re +import sys -class AST: - def __init__(self): - pass +class AST(ABC): + """Base class for abstract syntax tree nodes.""" + + @abstractmethod def to_ssml(self, neural): - assert False + """Get the SSML representation of the sub-tree.""" + + @abstractmethod def to_words(self): - assert False + """Get the plain words representation of the sub-tree.""" + + @abstractmethod def to_sub(self): - assert False + """Get the sub-titles representation of the sub-tree.""" + class ASTWord(AST): + """An AST node for a word.""" def __init__(self, text): + super().__init__() self.text = text def to_ssml(self, neural): return self.text @@ -24,7 +34,9 @@ def to_words(self): return [self.text] def to_sub(self): return self.text + class ASTBreak(AST): + """An AST node for a break.""" def __init__(self, time): self.time = time def to_ssml(self, neural): @@ -33,7 +45,9 @@ def to_words(self): return [] def to_sub(self): return '' + class ASTDelim(AST): + """An AST node for a delimiter.""" def __init__(self, text): self.text = text def to_ssml(self, neural): @@ -42,7 +56,9 @@ def to_words(self): return [] def to_sub(self): return self.text + class ASTSpace(AST): + """An AST node for a white space.""" def __init__(self): pass def to_ssml(self, neural): @@ -51,32 +67,38 @@ def to_words(self): return [] def to_sub(self): return ' ' + class ASTEmph(AST): + """An AST node for emphasized text.""" def __init__(self, children): self.children = children def to_ssml(self, neural): children_ssml = "".join([child.to_ssml(neural) for child in self.children]) if neural: return ''+children_ssml+'' - else: - return ''+children_ssml+'' + return ''+children_ssml+'' def to_words(self): result = [] - for child in self.children: result += child.to_words() + for child in self.children: + result += child.to_words() return result def to_sub(self): return "".join([child.to_sub() for child in self.children]) + class ASTPhoneme(AST): + """An AST node for text read with phonemes.""" def __init__(self, text, xsampa): self.text = text self.xsampa = xsampa def to_ssml(self, neural): return f'{self.text}' def to_words(self): - return re.split('\s+', self.text.strip()) + return re.split(r'\s+', self.text.strip()) def to_sub(self): return self.text + class ASTSub(AST): + """An AST node for text with different sub-title representation.""" def __init__(self, children, subtitles): self.children = children self.subtitles = subtitles @@ -85,11 +107,14 @@ def to_ssml(self, neural): return "".join(children_ssml) def to_words(self): result = [] - for child in self.children: result += child.to_words() + for child in self.children: + result += child.to_words() return result def to_sub(self): return self.subtitles + class ASTSlow(AST): + """An AST node for text read slowly.""" def __init__(self, children): self.children = children def to_ssml(self, neural): @@ -97,11 +122,14 @@ def to_ssml(self, neural): return ''+children_ssml+'' def to_words(self): result = [] - for child in self.children: result += child.to_words() + for child in self.children: + result += child.to_words() return result def to_sub(self): return "".join([child.to_sub() for child in self.children]) + class ASTLow(AST): + """An AST node for text read in low pitch.""" def __init__(self, children): self.children = children def to_ssml(self, neural): @@ -109,15 +137,17 @@ def to_ssml(self, neural): if neural: # prosody pitch not yet in neural TTS, make it slightly slower return ''+children_ssml+'' - else: - return ''+children_ssml+'' + return ''+children_ssml+'' def to_words(self): result = [] - for child in self.children: result += child.to_words() + for child in self.children: + result += child.to_words() return result def to_sub(self): return "".join([child.to_sub() for child in self.children]) + class ASTHigh(AST): + """An AST node for text read in high pitch.""" def __init__(self, children): self.children = children def to_ssml(self, neural): @@ -125,21 +155,23 @@ def to_ssml(self, neural): if neural: # prosody pitch not yet in neural TTS, make it slightly faster return ''+children_ssml+'' - else: - return ''+children_ssml+'' + return ''+children_ssml+'' def to_words(self): result = [] - for child in self.children: result += child.to_words() + for child in self.children: + result += child.to_words() return result def to_sub(self): return "".join([child.to_sub() for child in self.children]) + class ASTSayAs(AST): + """An AST node for text read as letters.""" def __init__(self, letters): self.letters = letters def to_ssml(self, neural): return ''+self.letters+'' def to_words(self): - return re.split('\s+', self.letters.strip()) + return re.split(r'\s+', self.letters.strip()) def to_sub(self): return self.letters @@ -147,97 +179,96 @@ def to_sub(self): def parse_to_ast(string, err_linenum = None): """Parse the script text string into a sequence of AST nodes.""" i = 0 - n = len(string) + string_length = len(string) def read_until(chars): nonlocal i tmp = i - while i < n and string[i] not in chars: + while i < string_length and string[i] not in chars: i += 1 return string[tmp:i] def err(msg): - ln = f'On line {err_linenum}: ' if err_linenum != None else '' - print(ln+msg) - exit(1) + linenum_text = '' if err_linenum is None else f'On line {err_linenum}: ' + print(linenum_text+msg) + sys.exit(1) #assert False, msg result = [] - while i < n: + while i < string_length: if string[i] == '#': if string[i:i+4] == '#sub': - m = re.match('^#sub(.)(?P((?!\1).)*?)\\1(?P((?!\1).)+?)\\1', string[i:]) - if m == None: + match = re.match( + '^#sub(.)(?P((?!\1).)*?)\\1(?P((?!\1).)+?)\\1', + string[i:]) + if match is None: err(f'Malformed #sub "{string[i:]}"') - t = parse_to_ast(m['text']) - result.append(ASTSub(t, m['sub'])) - i += len(m.group(0)) + result.append(ASTSub(parse_to_ast(match['text']), match['sub'])) + i += len(match.group(0)) continue if string[i:i+5] == '#slow': - m = re.match('^#slow(.)(?P((?!\1).)+?)\\1', string[i:]) - if m == None: + match = re.match('^#slow(.)(?P((?!\1).)+?)\\1', string[i:]) + if match is None: err(f'Malformed #slow "{string[i:]}"') - t = parse_to_ast(m['text']) - result.append(ASTSlow(t)) - i += len(m.group(0)) + result.append(ASTSlow(parse_to_ast(match['text']))) + i += len(match.group(0)) continue if string[i:i+4] == '#low': - m = re.match('^#low(.)(?P((?!\1).)+?)\\1', string[i:]) - if m == None: + match = re.match('^#low(.)(?P((?!\1).)+?)\\1', string[i:]) + if match is None: err(f'Malformed #low "{string[i:]}"') - t = parse_to_ast(m['text']) - result.append(ASTLow(t)) - i += len(m.group(0)) + result.append(ASTLow(parse_to_ast(match['text']))) + i += len(match.group(0)) continue if string[i:i+5] == '#high': - m = re.match('^#high(.)(?P((?!\1).)+?)\\1', string[i:]) - if m == None: + match = re.match('^#high(.)(?P((?!\1).)+?)\\1', string[i:]) + if match is None: err(f'Malformed #high "{string[i:]}"') - t = parse_to_ast(m['text']) - result.append(ASTHigh(t)) - i += len(m.group(0)) + result.append(ASTHigh(parse_to_ast(match['text']))) + i += len(match.group(0)) continue if string[i:i+3] == '#ph': - m = re.match('^#ph(.)(?P((?!\1).)+?)\\1(?P((?!\1).)+?)\\1', string[i:]) - if m == None: + match = re.match( + '^#ph(.)(?P((?!\1).)+?)\\1(?P((?!\1).)+?)\\1', + string[i:]) + if match is None: err(f'Malformed #ph "{string[i:]}"') - result.append(ASTPhoneme(m['text'], m['ph'])) - i += len(m.group(0)) + result.append(ASTPhoneme(match['text'], match['ph'])) + i += len(match.group(0)) continue # Break #10 - m = re.match('^#(?P