-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSentiment_Analysis_of_Mental_Health.py
More file actions
94 lines (84 loc) · 2.92 KB
/
Sentiment_Analysis_of_Mental_Health.py
File metadata and controls
94 lines (84 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
import praw
import time
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
# Download necessary NLTK resources
# (stopwords/punkt/wordnet for cleaning, vader_lexicon for sentiment scoring)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')
# Initialize NLP tools
lemmatizer = WordNetLemmatizer()  # reduces words to their dictionary form
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
sia = SentimentIntensityAnalyzer()  # VADER rule-based sentiment scorer
# Reddit API credentials
# SECURITY: client_id and client_secret are hard-coded here. These are live
# secrets committed to source control — they should be revoked and loaded
# from environment variables or a config file instead (e.g. praw.ini).
reddit = praw.Reddit(
client_id='LkAQXMjg-m3DCwARZAzS7A',
user_agent='MentalHealthSentiment',
client_secret='NCG0yzF8J45kDfbxlzUscW5rb3Ci1w'
)
# Target communities and per-subreddit post budget for the scrape.
subreddits = ['depression', 'anxiety', 'mentalhealth', 'stress']
num_posts = 500
post_data = []
# Scrape data from Reddit: for each subreddit, pull up to num_posts "hot"
# posts and record [subreddit, title, score, body, comment count, created ts].
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    for post in subreddit.hot(limit=num_posts):
        # Field order here must match the DataFrame column list below.
        post_data.append([
            subreddit_name,
            post.title,
            post.score,
            post.selftext,
            post.num_comments,
            post.created_utc
        ])
    # NOTE: hot(limit=...) may yield fewer than num_posts, so this count
    # is an upper bound, not an exact tally.
    print(f'Scraped {num_posts} posts from {subreddit_name}')
    time.sleep(2)  # brief pause between subreddits to be polite to the API
# BUG FIX: the original column list was
# ['subreddit', 'title', 'text', 'score', 'comments', 'timestamp'],
# which did not match the append order above — the 'text' column held the
# numeric score and the 'score' column held the post body, corrupting every
# downstream step that reads 'text'. Columns now match the append order.
df = pd.DataFrame(post_data, columns=['subreddit', 'title', 'score', 'text', 'comments', 'timestamp'])
df.to_csv('reddit_mental_health.csv', index=False)
print("Data Saved")
# Text Cleaning Function
def clean_text(text):
    """Normalize a raw post string for sentiment analysis.

    Strips URLs, removes every non-letter character, lower-cases,
    tokenizes, drops English stopwords and lemmatizes each remaining
    token. Non-string input (e.g. NaN from pandas) yields "".
    """
    if not isinstance(text, str):
        return ""
    # Remove URLs first so their fragments don't survive the letter filter.
    no_urls = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", no_urls).lower()
    tokens = word_tokenize(letters_only)
    kept = (lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)
    return " ".join(kept)
# Sentiment Analysis Function
def get_sentiment(text):
    """Classify text as 'Positive', 'Negative' or 'Neutral'.

    Uses VADER's compound score with the conventional +/-0.05 cutoffs:
    >= 0.05 is Positive, <= -0.05 is Negative, anything between is Neutral.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'
# Load and preprocess data
data = pd.read_csv('reddit_mental_health.csv')
# Keep only rows that have both a title and a body; the timestamp column
# is not used anywhere in the analysis, so drop it.
data = data.dropna(subset=['title', 'text']).drop(columns=['timestamp'])
# Treat each post as one document: title followed by body.
data["full_text"] = data["title"].astype(str) + " " + data["text"].astype(str)
data['clean_text'] = data['full_text'].apply(clean_text)
data['sentiment'] = data['clean_text'].apply(get_sentiment)
data.to_csv("sentimental_reddit_mental_health.csv", index=False)
print("Cleaned data saved successfully!")
# Sentiment Analysis Visualization: bar chart of posts per sentiment label.
sentiment_counts = data["sentiment"].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values,
            palette="coolwarm", ax=ax)
ax.set_xlabel("Sentiment Category")
ax.set_ylabel("Number of Posts")
ax.set_title("Sentiment Distribution")
plt.show()