raw_text_preprocessor.py
from typing import List
import pandas as pd
from transformers import BertTokenizer
from data.corpus_handler import CorpusName
from data.corpus_preprocessor import CorpusPreprocessor, STD_SENSE
from data.toy_preprocessor import flatten_list
from grimm_bert import DEFAULT_CORPUS_CACHE_DIR


def read_and_strip_lines(file_name: str) -> List[str]:
    """ Reads non-empty lines and strips leading and trailing whitespace. """
    with open(file_name) as file:
        return [line for line in (raw_line.strip() for raw_line in file)
                if line]
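
# Illustrative example (hypothetical input): for a file containing
# "  To be  \n\n  or not  \n", read_and_strip_lines returns
# ['To be', 'or not'].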


def tokenize_lines(lines: List[str], tokenizer: BertTokenizer) \
        -> List[List[str]]:
    """ Tokenizes each line. """
    return [tokenizer.basic_tokenizer.tokenize(line) for line in lines]
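
# Illustrative sketch (assuming an uncased model with do_lower_case=True):
# tokenizer.basic_tokenizer splits on whitespace and punctuation without
# WordPiece, so "Friends, Romans!" yields ['friends', ',', 'romans', '!'].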


class RawTextPreprocessor(CorpusPreprocessor):
    def __init__(self, lines: List[str], tokenizer: BertTokenizer,
                 corpus_name: CorpusName = CorpusName.SHAKESPEARE,
                 corpus_cache_path: str = DEFAULT_CORPUS_CACHE_DIR):
        """ Preprocessor for corpora in raw text format. Treats each line as
        a sentence. Lowercases each sentence and applies 'tokenizer'.
        Generates generic semantic tags. """
        super().__init__(corpus_name, corpus_cache_path)
        self.lines = tokenize_lines(lines, tokenizer)

    def get_sentences(self) -> pd.DataFrame:
        # One row per sentence; each sentence is a list of lowercased tokens.
        sentences = [[token.lower() for token in sentence]
                     for sentence in self.lines]
        return pd.DataFrame({'sentence': sentences})

    def get_tagged_tokens(self) -> pd.DataFrame:
        # Assign each token a generic sense label (token + STD_SENSE suffix);
        # 'tagged_sense' is False because no manual annotation exists.
        tokens = [token.lower() for token in flatten_list(self.lines)]
        senses = [token + STD_SENSE for token in tokens]
        return pd.DataFrame({'token': tokens, 'sense': senses,
                             'tagged_sense': False})
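
# Illustrative output (hypothetical one-line corpus "To be or not"):
# get_sentences() returns a DataFrame whose single 'sentence' cell holds
# ['to', 'be', 'or', 'not']; get_tagged_tokens() returns one row per token
# with 'sense' set to the token plus the STD_SENSE suffix and 'tagged_sense'
# False, since these tags are auto-generated rather than hand-annotated.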


if __name__ == '__main__':
    raw_text_preprocessor = RawTextPreprocessor(
        read_and_strip_lines('data/raw_text_corpora/shakespeare.txt'),
        BertTokenizer.from_pretrained('./model_cache/bert-base-uncased/'),
        CorpusName.SHAKESPEARE)
    raw_text_preprocessor.cache_dataset()