
Commit c03bf85

add tokenization, stemming & lemmatization tutorial
1 parent a867caf commit c03bf85

19 files changed, +185 -0 lines changed

README.md (+1)
@@ -57,6 +57,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepythoncode.com/)
 - [How to Paraphrase Text using Transformers in Python](https://www.thepythoncode.com/article/paraphrase-text-using-transformers-in-python). ([code](machine-learning/nlp/text-paraphrasing))
 - [Fake News Detection using Transformers in Python](https://www.thepythoncode.com/article/fake-news-classification-in-python). ([code](machine-learning/nlp/fake-news-classification))
 - [Named Entity Recognition using Transformers and Spacy in Python](https://www.thepythoncode.com/article/named-entity-recognition-using-transformers-and-spacy). ([code](machine-learning/nlp/named-entity-recognition))
+- [Tokenization, Stemming, and Lemmatization in Python](https://www.thepythoncode.com/article/tokenization-stemming-and-lemmatization-in-python). ([code](machine-learning/nlp/tokenization-stemming-lemmatization))
 - ### [Computer Vision](https://www.thepythoncode.com/topic/computer-vision)
 - [How to Detect Human Faces in Python using OpenCV](https://www.thepythoncode.com/article/detect-faces-opencv-python). ([code](machine-learning/face_detection))
 - [How to Make an Image Classifier in Python using TensorFlow and Keras](https://www.thepythoncode.com/article/image-classification-keras-python). ([code](machine-learning/image-classifier))
@@ -0,0 +1 @@
# [Tokenization, Stemming, and Lemmatization in Python](https://www.thepythoncode.com/article/tokenization-stemming-and-lemmatization-in-python)
@@ -0,0 +1,3 @@
s = "Hello I am a programmer"
lst = s.split()   # split on any run of whitespace
print(lst)
@@ -0,0 +1,3 @@
s = "Hello, I am a programmer"
lst = s.split(',')   # split on commas instead of whitespace
print(lst)
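
For reference, the two snippets above split the same sentence very differently: whitespace splitting yields one piece per word, while comma splitting yields just two pieces (note the leading space kept on the second piece).

print("Hello I am a programmer".split())
# ['Hello', 'I', 'am', 'a', 'programmer']
print("Hello, I am a programmer".split(','))
# ['Hello', ' I am a programmer']
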
@@ -0,0 +1,11 @@
def tokenize(file):
    tok = []
    # collect the whitespace-split tokens of every line in the file
    with open(file, 'r') as f:
        for l in f:
            tok.append(l.split())
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,28 @@
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

word_lst = []

def generate_tag(w):
    # map the first letter of the NLTK POS tag to a WordNet POS constant,
    # defaulting to VERB when the tag has no WordNet counterpart
    t = nltk.pos_tag([w])[0][1][0].upper()
    dic = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'A': wordnet.ADJ,
        'R': wordnet.ADV
    }
    return dic.get(t, wordnet.VERB)

def lemmatizer(file):
    lem_lst = []
    lem = WordNetLemmatizer()
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()
            word_lst.append(word)
            # pass the inferred POS tag so the lemmatizer picks the right form
            lem_lst.append(lem.lemmatize(word, pos=generate_tag(word)))
    return lem_lst

lem_lst = lemmatizer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + lem_lst[i])
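
The NLTK examples in this commit assume the required corpora and models have already been downloaded. If they haven't, a one-time setup along these lines is needed (resource names as used by recent NLTK releases):

import nltk
nltk.download('punkt')                        # models for sent_tokenize / word_tokenize
nltk.download('wordnet')                      # WordNet data for WordNetLemmatizer
nltk.download('omw-1.4')                      # extra WordNet data required by newer NLTK
nltk.download('averaged_perceptron_tagger')   # tagger behind nltk.pos_tag / generate_tag
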
@@ -0,0 +1,16 @@
from nltk.stem import PorterStemmer

word_lst = []

def stemmer(file):
    stm_lst = []
    stm = PorterStemmer()
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()   # drop the trailing newline before storing
            word_lst.append(word)
            stm_lst.append(stm.stem(word))
    return stm_lst

stm_lst = stemmer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + stm_lst[i])
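
To see concretely how stemming differs from lemmatization, compare both on a single word ("studies" is chosen here purely for illustration): the stemmer chops suffixes and can produce non-words, while the lemmatizer returns a dictionary form.

from nltk.stem import PorterStemmer, WordNetLemmatizer

print(PorterStemmer().stem("studies"))           # 'studi' -- a truncated non-word
print(WordNetLemmatizer().lemmatize("studies"))  # 'study' -- a valid dictionary form
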
@@ -0,0 +1,5 @@
textblob
nltk
huggingface
tokenizers
transformers
@@ -0,0 +1,4 @@
The restaurant has a good staff, good food, and a good environment.
It is a good place for family outings. Hospitable staff.
The staff is better than other places, but the food is okay.
People are great here. I loved this place.
@@ -0,0 +1,13 @@
from nltk import sent_tokenize

def tokenize(file):
    tok = []
    with open(file, 'r') as f:
        for l in f:
            tok.append(sent_tokenize(l))   # split each line into sentences
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,16 @@
from nltk.stem.snowball import SnowballStemmer

word_lst = []

def stemmer(file):
    stm_lst = []
    stm = SnowballStemmer(language='english')
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()   # drop the trailing newline before storing
            word_lst.append(word)
            stm_lst.append(stm.stem(word))
    return stm_lst

stm_lst = stemmer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + stm_lst[i])
@@ -0,0 +1,7 @@
from transformers import BertTokenizer

tk = BertTokenizer.from_pretrained('bert-base-uncased')
with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.tokenize(l.strip())   # WordPiece sub-word tokens
        print(res)
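
Beyond the sub-word strings, the same tokenizer can also map tokens to vocabulary ids, which is what a BERT model actually consumes. A minimal sketch, reusing one of the sentences from reviews.txt:

from transformers import BertTokenizer

tk = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tk.tokenize("The staff is better than other places")
ids = tk.convert_tokens_to_ids(tokens)   # vocabulary index of each token
print(tokens)
print(ids)
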
@@ -0,0 +1,13 @@
from textblob import TextBlob

def tokenize(file):
    tok = []
    with open(file, 'r') as f:
        for l in f:
            tok.append(TextBlob(l).words)   # TextBlob's word tokenizer
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,8 @@
from tokenizers import Tokenizer

# load the BPE tokenizer trained and saved by the training script below
tk = Tokenizer.from_file("tokenizer-wiki.json")

with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.encode(l.strip())
        print(res.tokens)
@@ -0,0 +1,13 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tk = Tokenizer(BPE(unk_token="[UNK]"))
tr = BpeTrainer()
tk.pre_tokenizer = Whitespace()

# forward slashes work on every OS; "\w" in a non-raw string is an invalid escape
files = [f"wikitext-103-raw/wiki.{s}.raw" for s in ["test", "train", "valid"]]
tk.train(files, tr)

tk.save("tokenizer-wiki.json")
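
Once trained, the tokenizer can also be used right away, without the save/load round trip shown in the loading snippet above (the sentence below is just a hypothetical example):

output = tk.encode("Hello, how are you?")
print(output.tokens)   # sub-word tokens produced by the freshly trained BPE model
print(output.ids)      # their integer vocabulary ids
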
@@ -0,0 +1,13 @@
from nltk import word_tokenize

def tokenize(file):
    tok = []
    with open(file, 'r') as f:
        for l in f:
            tok.append(word_tokenize(l))   # NLTK's word-level tokenizer
    return tok

tokens = tokenize('reviews.txt')
for e in tokens:
    print(e)
@@ -0,0 +1,16 @@
from nltk.stem import WordNetLemmatizer

word_lst = []

def lemmatizer(file):
    lem_lst = []
    lem = WordNetLemmatizer()
    with open(file, 'r') as f:
        for l in f:
            word = l.strip()
            word_lst.append(word)
            # without a pos argument, lemmatize() treats the input as a noun
            lem_lst.append(lem.lemmatize(word))
    return lem_lst

lem_lst = lemmatizer('reviews.txt')
for i in range(len(word_lst)):
    print(word_lst[i] + "-->" + lem_lst[i])
@@ -0,0 +1,7 @@
from tokenizers import BertWordPieceTokenizer

tk = BertWordPieceTokenizer("bert-word-piece-vocab.txt", lowercase=True)
with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.encode(l.strip())
        print(res.tokens)
@@ -0,0 +1,7 @@
from transformers import XLNetTokenizer

tk = XLNetTokenizer.from_pretrained('xlnet-base-cased')
with open('reviews.txt', 'r') as f:
    for l in f:
        res = tk.tokenize(l.strip())   # SentencePiece sub-word tokens
        print(res)
