-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSentiment_Analysis_of_Mental_Health.py
More file actions
94 lines (84 loc) · 2.92 KB
/
Sentiment_Analysis_of_Mental_Health.py
File metadata and controls
94 lines (84 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
import praw
import time
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
# Download necessary NLTK resources
# (stopwords/punkt/wordnet for cleaning, vader_lexicon for sentiment scoring)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')
# Initialize NLP tools
lemmatizer = WordNetLemmatizer()  # reduces words to their dictionary form
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
sia = SentimentIntensityAnalyzer()  # VADER rule-based sentiment scorer
# Reddit API credentials
# SECURITY: client_id and client_secret are hard-coded here. These are live
# secrets committed to source control — they should be revoked and loaded
# from environment variables or a config file instead (e.g. praw.ini).
reddit = praw.Reddit(
client_id='LkAQXMjg-m3DCwARZAzS7A',
user_agent='MentalHealthSentiment',
client_secret='NCG0yzF8J45kDfbxlzUscW5rb3Ci1w'
)
# Target communities and per-subreddit post budget for the scrape.
subreddits = ['depression', 'anxiety', 'mentalhealth', 'stress']
num_posts = 500
post_data = []
# Scrape data from Reddit: for each subreddit, pull up to num_posts "hot"
# posts and record [subreddit, title, score, body, comment count, created ts].
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    for post in subreddit.hot(limit=num_posts):
        # Field order here must match the DataFrame column list below.
        post_data.append([
            subreddit_name,
            post.title,
            post.score,
            post.selftext,
            post.num_comments,
            post.created_utc
        ])
    # NOTE: hot(limit=...) may yield fewer than num_posts, so this count
    # is an upper bound, not an exact tally.
    print(f'Scraped {num_posts} posts from {subreddit_name}')
    time.sleep(2)  # brief pause between subreddits to be polite to the API
# BUG FIX: the original column list was
# ['subreddit', 'title', 'text', 'score', 'comments', 'timestamp'],
# which did not match the append order above — the 'text' column held the
# numeric score and the 'score' column held the post body, corrupting every
# downstream step that reads 'text'. Columns now match the append order.
df = pd.DataFrame(post_data, columns=['subreddit', 'title', 'score', 'text', 'comments', 'timestamp'])
df.to_csv('reddit_mental_health.csv', index=False)
print("Data Saved")
# Text Cleaning Function
def clean_text(text):
    """Normalize a raw post string for sentiment analysis.

    Strips URLs, removes every non-letter character, lower-cases,
    tokenizes, drops English stopwords and lemmatizes each remaining
    token. Non-string input (e.g. NaN from pandas) yields "".
    """
    if not isinstance(text, str):
        return ""
    # Remove URLs first so their fragments don't survive the letter filter.
    no_urls = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", no_urls).lower()
    tokens = word_tokenize(letters_only)
    kept = (lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)
    return " ".join(kept)
# Sentiment Analysis Function
def get_sentiment(text):
    """Classify text as 'Positive', 'Negative' or 'Neutral'.

    Uses VADER's compound score with the conventional +/-0.05 cutoffs:
    >= 0.05 is Positive, <= -0.05 is Negative, anything between is Neutral.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'
# Load and preprocess data
data = pd.read_csv('reddit_mental_health.csv')
# Keep only rows that have both a title and a body; the timestamp column
# is not used anywhere in the analysis, so drop it.
data = data.dropna(subset=['title', 'text']).drop(columns=['timestamp'])
# Treat each post as one document: title followed by body.
data["full_text"] = data["title"].astype(str) + " " + data["text"].astype(str)
data['clean_text'] = data['full_text'].apply(clean_text)
data['sentiment'] = data['clean_text'].apply(get_sentiment)
data.to_csv("sentimental_reddit_mental_health.csv", index=False)
print("Cleaned data saved successfully!")
# Sentiment Analysis Visualization: bar chart of posts per sentiment label.
sentiment_counts = data["sentiment"].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values,
            palette="coolwarm", ax=ax)
ax.set_xlabel("Sentiment Category")
ax.set_ylabel("Number of Posts")
ax.set_title("Sentiment Distribution")
plt.show()