-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
110 lines (87 loc) · 3.17 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import functools
import os
import re

import pandas as pd
from flask import Flask, request, jsonify
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Read the positive and negative sentiment lexicons (tab-separated files)
positive_df = pd.read_csv('positive.tsv', sep='\t')
negative_df = pd.read_csv('negative.tsv', sep='\t')

# Print the column names so a schema mismatch is visible at startup
print(positive_df.columns)
print(negative_df.columns)

# Build a single word -> weight lookup from both datasets. Positive entries
# are inserted first, so a word that appears in both files ends up with the
# weight from the negative file.
indonesian_lexicon = {}
for lexicon_df in (positive_df, negative_df):
    indonesian_lexicon.update(zip(lexicon_df['word'], lexicon_df['weight']))

# Initialize the app
app = Flask(__name__)
# Text cleaning
@functools.lru_cache(maxsize=None)
def _load_replacements(path):
    """Parse a 'short:long' mapping file into a dict, once per path.

    Lines that do not split into exactly two ':'-separated parts are ignored.
    The returned dict is shared between calls and must not be mutated.
    """
    replacements = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(':')
            if len(parts) == 2:
                short_form, long_form = parts
                replacements[short_form] = long_form
    return replacements


@functools.lru_cache(maxsize=None)
def _load_stop_words(path):
    """Parse a one-stopword-per-line file into a frozenset, once per path."""
    with open(path, encoding='utf-8') as f:
        return frozenset(line.strip() for line in f)


@functools.lru_cache(maxsize=None)
def _get_stemmer():
    """Create the Sastrawi stemmer once and reuse it for every call."""
    return StemmerFactory().create_stemmer()


def clean_text(text):
    """Normalize raw text into a list of stemmed tokens.

    Steps, in order: strip symbols, lowercase, split on whitespace, replace
    tokens found in 'slangwords.txt', replace tokens found in 'kata.txt',
    drop tokens listed in 'combined_stop_words.txt', then stem each
    remaining token with Sastrawi.

    The dictionary files are opened relative to the current working
    directory, decoded as UTF-8, and parsed only on first use; later calls
    reuse the cached result, so edits to the files need a process restart.

    Args:
        text: The input string.

    Returns:
        A list of stemmed token strings.

    Raises:
        FileNotFoundError: If one of the dictionary files is missing.
    """
    # Drop punctuation, emoticons and other symbols. Letters, digits,
    # underscores and whitespace are kept because \w and \s match them.
    text = re.sub(r'[^\w\s]', '', text)
    # Transform text to lowercase
    text = text.lower()
    # Tokenize the text
    words = text.split()

    # Normalize slang first, then apply the second replacement dictionary
    slang_words = _load_replacements('slangwords.txt')
    words = [slang_words.get(word, word) for word in words]
    formal_words = _load_replacements('kata.txt')
    words = [formal_words.get(word, word) for word in words]

    # Remove stopwords
    stop_words = _load_stop_words('combined_stop_words.txt')
    words = [word for word in words if word not in stop_words]

    # Stem the remaining words with Sastrawi
    stemmer = _get_stemmer()
    return [stemmer.stem(word) for word in words]
def _score_words(words):
    """Sum the lexicon weights of the given tokens; unknown tokens add nothing."""
    return sum(
        indonesian_lexicon[word] for word in words if word in indonesian_lexicon
    )


def analyze_sentiment(text):
    """Return the lexicon-based sentiment score of raw ``text``.

    The text is cleaned with ``clean_text`` and the weights of all resulting
    tokens that appear in ``indonesian_lexicon`` are added up. The score is
    0 when no token is in the lexicon.
    """
    return _score_words(clean_text(text))


@app.route('/analyze', methods=['POST'])
def analyze():
    """Handle POST /analyze: score the sentiment of the submitted text.

    Expects a JSON object with an optional string field 'text' (defaults to
    an empty string). Responds with the cleaned text, the numeric score and
    a label: 'positive' for a score above 0, 'negative' below 0, otherwise
    'neutral'. A body that is not a JSON object, or a non-string 'text',
    yields a 400 response with an 'error' message instead of a server error.
    """
    # silent=True returns None for a missing or malformed JSON body
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    text = data.get('text', '')
    if not isinstance(text, str):
        return jsonify({'error': "'text' must be a string."}), 400

    # Clean once and reuse the tokens for both the score and the response
    words = clean_text(text)
    sentiment_score = _score_words(words)

    if sentiment_score > 0:
        sentiment = 'positive'
    elif sentiment_score < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    return jsonify({
        'cleaned_text': ' '.join(words),
        'sentiment_score': sentiment_score,
        'sentiment': sentiment
    })
if __name__ == '__main__':
    # Debug mode stays on by default for local development, as before; set
    # FLASK_DEBUG to 0, false or no to switch it off. The debugger it enables
    # can execute arbitrary code, so a debug-enabled server must never be
    # reachable from an untrusted network.
    debug_flag = os.environ.get('FLASK_DEBUG', '1').lower()
    app.run(debug=debug_flag not in ('0', 'false', 'no'))