-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathituNlpPipeline.py
128 lines (95 loc) · 3.91 KB
/
ituNlpPipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# To run this code, first edit config.py with your configuration
#
import re
import time

try:  # Python 3
    from urllib.parse import urlparse, urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urlparse import urlparse
    from urllib import urlencode
    from urllib2 import urlopen

import config
import sqliteOperations
# Accumulator mapping each matched search string (lower-cased) to a
# (hit_count, last_matching_row) tuple, built up across findInRow calls.
rowList = {}

def findInRow(row):
    """Tally occurrences of config.STRING_VECTOR entries in a task row.

    Performs a case-insensitive substring search of every configured
    string against the row's text column (row[4]).  For each hit, the
    module-level ``rowList`` is updated: the count is incremented and
    the stored row is replaced with the most recent match.
    """
    haystack = row[4].lower()
    for needle in config.STRING_VECTOR:
        key = needle.lower()
        if key not in haystack:
            continue
        if key in rowList:
            hits = rowList[key][0]
            rowList[key] = (hits + 1, row)
        else:
            rowList[key] = (1, row)
class PipelineCaller(object):
    """Client for the ITU NLP web API.

    Splits the input text according to ``processing_type`` ('whole',
    'sentence', or 'word') and submits each piece to the API endpoint
    configured in ``config.API_URL``, joining the responses with
    newlines.

    BUG FIX: the original body referenced ``urllib.parse.urlencode`` and
    ``urllib.request.urlopen``, but the file only performs
    ``from``-imports (``urlparse``/``urlopen``), so the bare ``urllib``
    module name was never bound and every request raised ``NameError``.
    The code now uses the imported ``urlencode``/``urlopen`` names.
    """

    # Characters treated as end-of-sentence markers when splitting.
    DEFAULT_SENTENCE_SPLIT_DELIMITER_CLASS = r'[\.\?:;!]'
    # Compile once at class-definition time; parse_sentences reuses it.
    _SENTENCE_SPLIT_RE = re.compile(
        r'(?<=(?:{}))\s+'.format(DEFAULT_SENTENCE_SPLIT_DELIMITER_CLASS))
    _BLANK_RE = re.compile(r'^\s*$')

    def __init__(self, tool='normalize', text='example',
                 token=config.ITU_NLP_API_TOKEN, processing_type='sentence'):
        """
        :param tool: name of the remote NLP tool to invoke.
        :param text: input text to process.
        :param token: API access token (defaults to the configured one).
        :param processing_type: 'whole', 'sentence', or 'word' granularity.
        """
        self.tool = tool
        self.text = text
        self.token = token
        self.processing_type = processing_type
        self.sentences = []
        self.words = []

    def call(self):
        """Run the configured tool over the text; return joined responses.

        Returns None for an unrecognized ``processing_type`` (unchanged
        from the original behavior).
        """
        if self.processing_type == 'whole':
            params = self.encode_parameters(self.text)
            return self.request(params)
        if self.processing_type == 'sentence':
            results = []
            self.parse_sentences()
            for sentence in self.sentences:
                params = self.encode_parameters(sentence)
                results.append(self.request(params))
            return "\n".join(results)
        if self.processing_type == 'word':
            results = []
            self.parse_words()
            for word in self.words:
                params = self.encode_parameters(word)
                results.append(self.request(params))
            return "\n".join(results)

    def parse_sentences(self):
        """Split ``self.text`` into ``self.sentences`` on delimiter+space."""
        self.sentences = self._SENTENCE_SPLIT_RE.split(self.text)
        # Drop a trailing empty/whitespace-only fragment left by the split.
        if self.sentences and self._BLANK_RE.match(self.sentences[-1]):
            self.sentences.pop(-1)

    def parse_words(self):
        """Populate ``self.words`` with whitespace-separated tokens.

        Rebuilds the list from scratch so repeated calls do not
        accumulate duplicates.
        """
        self.parse_sentences()
        self.words = [word
                      for sentence in self.sentences
                      for word in sentence.split()]

    def encode_parameters(self, text):
        """URL-encode the API form parameters as bytes for POSTing."""
        return urlencode({'tool': self.tool,
                          'input': text,
                          'token': self.token}).encode(config.PIPELINE_ENCODING)

    def request(self, params):
        """POST ``params`` to the API and return the decoded response body."""
        response = urlopen(config.API_URL, params)
        return response.read().decode(config.PIPELINE_ENCODING)
"""
def startItuNlpApi():
text = "beniim adiim ozgüür"
config.logger.info(text)
config.logger.info(config.ITU_NLP_API_TOKEN)
config.logger.info(config.API_URL)
config.logger.info(config.PIPELINE_ENCODING)
REQUEST_URL = config.API_URL + "?" + "tool=" + config.DEFAULT_TOOL + "&input=" + "MERHABAAAAAA" + "&token=" + config.ITU_NLP_API_TOKEN
config.logger.info(REQUEST_URL)
config.logger.info(config.DEFAULT_TOOL)
start_time = time.time()
caller = PipelineCaller(config.DEFAULT_TOOL, text, config.ITU_NLP_API_TOKEN, 'sentence')
config.logger.info(caller.call())
process_time = time.time() - start_time
config.logger.info("[DONE] It took {0:.0f} seconds to process whole text.".format(process_time))
"""
def startItuNlpApi():
    """Poll the task database forever, processing rows with status "0".

    Every 60 seconds (plus a 10-second pause per row) this selects
    pending tasks and writes each row's text column (row[4]) back with
    status "1".  NOTE(review): the PipelineCaller invocation is
    commented out in the original, so the text is currently stored
    unchanged rather than being run through the NLP API.  Never returns.
    """
    # Open the SQLite database configured in sqliteOperations.
    connection = sqliteOperations.createConnection(sqliteOperations.database)
    with connection:
        while True:
            pending = sqliteOperations.selectTaskByStatus(connection, "0")
            for task in pending:
                # NLP call intentionally bypassed (see docstring); the raw
                # text is echoed back as the "processed" value.
                sqliteOperations.UpdateTextByStatusWithItuNlpApi(
                    connection, "1", task[4], task[4])
                time.sleep(10)
            time.sleep(60)
#startItuNlpApi()