rasbt · fisa712 · Feb 24, 2023 · Feb 24, 2023 · Feb 24, 2023 · Feb 24, 2023
diff --git a/ch08/additional_features/.github/workflows/i191855_update.yml b/ch08/additional_features/.github/workflows/i191855_update.yml
@@ -0,0 +1,36 @@
+# Runs the preprocessing unit tests on every push.
+# NOTE(review): GitHub only picks up workflows stored in .github/workflows/ at the
+# repository root, so this file has to live there; the steps below therefore run
+# from the directory that holds the tests.
+name: push events workflow
+
+on: push
+
+jobs:
+  unit-testing:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: ch08/additional_features
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      # The module under test imports nltk, so it must be installed alongside pytest.
+      - name: Install Package
+        run: pip install pytest nltk numpy pandas
+
+      # Data needed by stopwords.words, word_tokenize and WordNetLemmatizer.
+      - name: Download NLTK data
+        run: python -m nltk.downloader stopwords punkt punkt_tab wordnet omw-1.4
+
+      # NOTE(review): `from preprocessing import *` needs an importable
+      # preprocessing.py next to the test file (a .ipynb cannot be imported).
+      - name: Run Test
+        run: pytest test_preprocessing.py
diff --git a/ch08/additional_features/README.md b/ch08/additional_features/README.md
@@ -0,0 +1,22 @@
+# Task: Add more data preprocessing steps
+In this task, we will explore the impact of adding more data preprocessing steps on the accuracy and generalization of our sentiment analysis model. Specifically, we will add stemming, lemmatization, and/or stop-word removal to the existing data preprocessing steps.
+
+## Files and Folders
+- `sentiment_analysis.py`: This is the main script that performs sentiment analysis on a given input text.
+
+- `preprocessing.py`: This script contains the existing data preprocessing steps. You will modify this script to add more preprocessing steps.
+
+- `data/`: This folder contains the training and test data.
+
+## Instructions
+1. Clone the repository and create a new branch for this task.
+
+2. Open the `preprocessing.py` script and add more data preprocessing steps such as stemming, lemmatization, or stop-word removal. You can use any NLP library, such as NLTK or spaCy, to implement these preprocessing steps.
+
+3. Train the model using the modified data preprocessing steps and evaluate its accuracy and generalization on the test data.
+
+4. Update the README file with the results of the evaluation and a description of the added preprocessing steps.
+
+5. Push the changes to the branch and create a pull request.
+
+6. Wait for the reviewer to approve the pull request and merge it into the main branch.
diff --git a/ch08/additional_features/preprocessing.ipynb b/ch08/additional_features/preprocessing.ipynb
@@ -0,0 +1,28 @@
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+
+def remove_stopwords(text):
+    stop_words = set(stopwords.words('english'))
+    word_tokens = word_tokenize(text)
+    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
+    return ' '.join(filtered_text)
+
+def perform_lemmatization(text):
+    lemmatizer = WordNetLemmatizer()
+    word_tokens = word_tokenize(text)
+    lemmatized_text = [lemmatizer.lemmatize(word) for word in word_tokens]
+    return ' '.join(lemmatized_text)
+
+def perform_stemming(text):
+    stemmer = PorterStemmer()
+    word_tokens = word_tokenize(text)
+    stemmed_text = [stemmer.stem(word) for word in word_tokens]
+    return ' '.join(stemmed_text)
+
+def preprocess_text(text):
+    text = remove_stopwords(text)
+    text = perform_lemmatization(text)
+    text = perform_stemming(text)
+    return text
diff --git a/ch08/additional_features/test_preprocessing.py b/ch08/additional_features/test_preprocessing.py
@@ -0,0 +1,27 @@
+import unittest
+from preprocessing import *
+
+class TestPreprocessing(unittest.TestCase):
+
+    def test_remove_stopwords(self):
+        text = "this is a sample text that includes some stop words such as the, and, etc."
+        expected_output = "sample text includes stop words like , , etc ."
+        self.assertEqual(remove_stopwords(text), expected_output)
+
+    def test_perform_lemmatization(self):
+        text = "running played plays"
+        expected_output = "running played play"
+        self.assertEqual(perform_lemmatization(text), expected_output)
+
+    def test_perform_stemming(self):
+        text = "running played plays"
+        expected_output = "run play play"
+        self.assertEqual(perform_stemming(text), expected_output)
+
+    def test_preprocess_text(self):
+        text = "This is a sample text. It includes some stop words, and it has words in different tenses (e.g. playing, played)."
+        expected_output = "thi sampl text . includ stop word , word differ tens ( e.g. play , play ) ."
+        self.assertEqual(preprocess_text(text), expected_output)
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when the file is executed as a script