
Commit c03bf85

add tokenization, stemming & lemmatization tutorial
1 parent a867caf commit c03bf85

19 files changed, +185 -0 lines changed

README.md (+1)
@@ -57,6 +57,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepythoncode.com/)
 - [How to Paraphrase Text using Transformers in Python](https://www.thepythoncode.com/article/paraphrase-text-using-transformers-in-python). ([code](machine-learning/nlp/text-paraphrasing))
 - [Fake News Detection using Transformers in Python](https://www.thepythoncode.com/article/fake-news-classification-in-python). ([code](machine-learning/nlp/fake-news-classification))
 - [Named Entity Recognition using Transformers and Spacy in Python](https://www.thepythoncode.com/article/named-entity-recognition-using-transformers-and-spacy). ([code](machine-learning/nlp/named-entity-recognition))
+- [Tokenization, Stemming, and Lemmatization in Python](https://www.thepythoncode.com/article/tokenization-stemming-and-lemmatization-in-python). ([code](machine-learning/nlp/tokenization-stemming-lemmatization))
 - ### [Computer Vision](https://www.thepythoncode.com/topic/computer-vision)
 - [How to Detect Human Faces in Python using OpenCV](https://www.thepythoncode.com/article/detect-faces-opencv-python). ([code](machine-learning/face_detection))
 - [How to Make an Image Classifier in Python using TensorFlow and Keras](https://www.thepythoncode.com/article/image-classification-keras-python). ([code](machine-learning/image-classifier))
@@ -0,0 +1 @@
# [Tokenization, Stemming, and Lemmatization in Python](https://www.thepythoncode.com/article/tokenization-stemming-and-lemmatization-in-python)
@@ -0,0 +1,3 @@
s = "Hello I am a programmer"
lst = s.split()   # split on any run of whitespace
print(lst)
@@ -0,0 +1,3 @@
s = "Hello, I am a programmer"
lst = s.split(',')   # split on commas instead of whitespace
print(lst)
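
For reference, the two snippets above split the same sentence very differently: whitespace splitting yields one piece per word, while comma splitting yields just two pieces (note the leading space kept on the second piece).

print("Hello I am a programmer".split())
# ['Hello', 'I', 'am', 'a', 'programmer']
print("Hello, I am a programmer".split(','))
# ['Hello', ' I am a programmer']
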
@@ -0,0 +1,11 @@
def tokenize(file):
    tok = []
    # collect the whitespace-split tokens of every line in the file
    with open(file, 'r') as f:
        for l in f:
            tok.append(l.split())
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,28 @@
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

word_lst = []

def generate_tag(w):
    # map the first letter of the NLTK POS tag to a WordNet POS constant,
    # defaulting to VERB when the tag has no WordNet counterpart
    t = nltk.pos_tag([w])[0][1][0].upper()
    dic = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'A': wordnet.ADJ,
        'R': wordnet.ADV
    }
    return dic.get(t, wordnet.VERB)

def lemmatizer(file):
    lem_lst = []
    lem = WordNetLemmatizer()
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()
            word_lst.append(word)
            # pass the inferred POS tag so the lemmatizer picks the right form
            lem_lst.append(lem.lemmatize(word, pos=generate_tag(word)))
    return lem_lst

lem_lst = lemmatizer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + lem_lst[i])
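
The NLTK examples in this commit assume the required corpora and models have already been downloaded. If they haven't, a one-time setup along these lines is needed (resource names as used by recent NLTK releases):

import nltk
nltk.download('punkt')                        # models for sent_tokenize / word_tokenize
nltk.download('wordnet')                      # WordNet data for WordNetLemmatizer
nltk.download('omw-1.4')                      # extra WordNet data required by newer NLTK
nltk.download('averaged_perceptron_tagger')   # tagger behind nltk.pos_tag / generate_tag
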
@@ -0,0 +1,16 @@
from nltk.stem import PorterStemmer

word_lst = []

def stemmer(file):
    stm_lst = []
    stm = PorterStemmer()
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()   # drop the trailing newline before storing
            word_lst.append(word)
            stm_lst.append(stm.stem(word))
    return stm_lst

stm_lst = stemmer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + stm_lst[i])
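
To see concretely how stemming differs from lemmatization, compare both on a single word ("studies" is chosen here purely for illustration): the stemmer chops suffixes and can produce non-words, while the lemmatizer returns a dictionary form.

from nltk.stem import PorterStemmer, WordNetLemmatizer

print(PorterStemmer().stem("studies"))           # 'studi' -- a truncated non-word
print(WordNetLemmatizer().lemmatize("studies"))  # 'study' -- a valid dictionary form
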
@@ -0,0 +1,5 @@
textblob
nltk
huggingface
tokenizers
transformers
@@ -0,0 +1,4 @@
The restaurant has a good staff, good food, and a good environment.
It is a good place for family outings. Hospitable staff.
The staff is better than other places, but the food is okay.
People are great here. I loved this place.
@@ -0,0 +1,13 @@
from nltk import sent_tokenize

def tokenize(file):
    tok = []
    with open(file, 'r') as f:
        for l in f:
            tok.append(sent_tokenize(l))   # split each line into sentences
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,16 @@
from nltk.stem.snowball import SnowballStemmer

word_lst = []

def stemmer(file):
    stm_lst = []
    stm = SnowballStemmer(language='english')
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()   # drop the trailing newline before storing
            word_lst.append(word)
            stm_lst.append(stm.stem(word))
    return stm_lst

stm_lst = stemmer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + stm_lst[i])
@@ -0,0 +1,7 @@
from transformers import BertTokenizer

tk = BertTokenizer.from_pretrained('bert-base-uncased')
with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.tokenize(l.strip())   # WordPiece sub-word tokens
        print(res)
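
Beyond the sub-word strings, the same tokenizer can also map tokens to vocabulary ids, which is what a BERT model actually consumes. A minimal sketch, reusing one of the sentences from reviews.txt:

from transformers import BertTokenizer

tk = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tk.tokenize("The staff is better than other places")
ids = tk.convert_tokens_to_ids(tokens)   # vocabulary index of each token
print(tokens)
print(ids)
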
@@ -0,0 +1,13 @@
from textblob import TextBlob

def tokenize(file):
    tok = []
    with open(file, 'r') as f:
        for l in f:
            tok.append(TextBlob(l).words)   # TextBlob's word tokenizer
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,8 @@
from tokenizers import Tokenizer

# load the BPE tokenizer trained and saved by the training script below
tk = Tokenizer.from_file("tokenizer-wiki.json")

with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.encode(l.strip())
        print(res.tokens)
@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tk = Tokenizer(BPE(unk_token="[UNK]"))
tr = BpeTrainer()
tk.pre_tokenizer = Whitespace()

# forward slashes work on every OS; "\w" in a non-raw string is an invalid escape
files = [f"wikitext-103-raw/wiki.{s}.raw" for s in ["test", "train", "valid"]]
tk.train(files, tr)

tk.save("tokenizer-wiki.json")
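
Once trained, the tokenizer can also be used right away, without the save/load round trip shown in the loading snippet above (the sentence below is just a hypothetical example):

output = tk.encode("Hello, how are you?")
print(output.tokens)   # sub-word tokens produced by the freshly trained BPE model
print(output.ids)      # their integer vocabulary ids
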
@@ -0,0 +1,13 @@
from nltk import word_tokenize

def tokenize(file):
    tok = []
    with open(file, 'r') as f:
        for l in f:
            tok.append(word_tokenize(l))   # NLTK's word-level tokenizer
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,16 @@
from nltk.stem import WordNetLemmatizer

word_lst = []

def lemmatizer(file):
    lem_lst = []
    lem = WordNetLemmatizer()
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()
            word_lst.append(word)
            # without a pos argument, lemmatize() treats the input as a noun
            lem_lst.append(lem.lemmatize(word))
    return lem_lst

lem_lst = lemmatizer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + lem_lst[i])
@@ -0,0 +1,7 @@
from tokenizers import BertWordPieceTokenizer

tk = BertWordPieceTokenizer("bert-word-piece-vocab.txt", lowercase=True)
with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.encode(l.strip())
        print(res.tokens)
@@ -0,0 +1,7 @@
from transformers import XLNetTokenizer

tk = XLNetTokenizer.from_pretrained('xlnet-base-cased')
with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.tokenize(l.strip())   # SentencePiece sub-word tokens
        print(res)
