forked from funsim/cpp_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmlab_maxmind_processed.py
executable file
·191 lines (167 loc) · 7.03 KB
/
mlab_maxmind_processed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/python
#
# mlab_maxmind_processed.py
#
# Initials:
# RB Ruben Bloemgarten
# AX Axel Roest
#
# Version history
# 20120629 AX first version
# 20120701 AX cleanup of unused methods, Ruben fixed the locID bug, iterate over all maxmind tables if no argument given, fixed sql update query
#
# test:
# cd /DATA
# python scripts/mlab/mlab_mysql_import2.py mlab/clean/glasnost/20090128T000000Z-batch-batch-glasnost-0002.tgz.csv
#
# ToDO: v loop over all arguments in sys.argv[0]
# v add deduplication (put in hash, test on hash, clear hash for each file, but keep last entry)
# v move files to archive directory
# v move error files to error directory
# v log process and errors
# todo: loop through all maxmind tables and update full mlab set
import calendar
import os
import re
import sys
from datetime import datetime
from optparse import OptionParser

import dateutil.parser as dparser
import MySQLdb
from dateutil.relativedelta import relativedelta
#################################################################
# #
# settings #
# #
#################################################################
# PLEASE UPDATE THESE SETTINGS
# NOTE(review): database credentials are hard-coded in the source; consider
# moving them to a config file or environment variables.
db_host = "localhost" # your host, usually localhost
db_user = "root" # your username
db_passwd = "rootpassword" # your password
db_name = "mlab" # name of the database
db_tables = {"glasnost": "glasnost", "ndt": "ndt"} # a mapping from testname to tablename
db_filetable = 'files' # not referenced in this file — TODO(review): confirm still needed
# directories
baseDir = '/DATA/mlab/'
logDir = baseDir + 'logs/' # the process log below is appended here
#files
errorLog = "error.log" # not referenced in this file — TODO(review): confirm still needed
processLog = "mlab_maxmind_processed.log"
#################################################################
# #
# functions #
# #
#################################################################
def usage():
    """Print a usage message to stderr and exit with status 1.

    Writes to stderr (not stdout, as the original did) so that the usage
    text is not mixed into normal program output.
    """
    sys.stderr.write("Usage: mlab_maxmind_processed.py maxmind_table\n")
    sys.exit(1)
# Blocks_GeoLiteCity_20090601
def extract_datestring(string):
    """Return the YYYYMMDD date string from a 'Blocks_GeoLiteCity_YYYYMMDD' name.

    Raises Exception when *string* does not match that exact pattern.
    The original raised Exception with tuple arguments, which rendered as an
    unreadable tuple; a single formatted message is used instead.
    """
    date_match = re.match(r'Blocks_GeoLiteCity_(\d{8})$', string)
    if not date_match:
        raise Exception('Error in argument "%s": does not contain a valid date.' % string)
    return date_match.group(1)
def extract_date(string):
    """Return a datetime parsed from the trailing YYYYMMDD digits of *string*.

    Raises Exception when *string* does not end in eight digits.
    The original raised Exception with tuple arguments, which rendered as an
    unreadable tuple; a single formatted message is used instead.
    """
    date_match = re.match(r'.*(\d{4})(\d{2})(\d{2})$', string)
    if not date_match:
        raise Exception('Error in argument "%s": does not contain a valid date.' % string)
    return datetime(int(date_match.group(1)),
                    int(date_match.group(2)),
                    int(date_match.group(3)))
# return True if the table exists in the database
def check_maxmind_exist(cur, table):
    """Return True when maxmind.`table` exists and contains at least one row.

    NOTE(review): *table* is interpolated into the SQL string, so only
    trusted, internally generated table names must be passed in.
    """
    sql = "select * FROM maxmind.`" + table + "` LIMIT 1"
    cur.execute(sql)
    # fetchone() returns None for an empty table; the original indexed into
    # the row ([0] < 1) and crashed with TypeError in that case.
    return cur.fetchone() is not None
# return a list of Blocks_GeoLiteCity_ tables, for looking up the locIds
def get_maxmind_tableset(cur):
    """Return every table name in the `maxmind` schema that matches
    Blocks_GeoLiteCity_<digits>, preserving the order SHOW TABLES yields."""
    cur.execute("SHOW TABLES FROM `maxmind`")
    pattern = re.compile('Blocks_GeoLiteCity_(\d+)$')
    candidates = (pattern.search(row[0]) for row in cur.fetchall())
    return [match.group(0) for match in candidates if match]
def get_maxmind_dates(rows):
    """Map each maxmind table's start date to its end date.

    The end date of each table is the start date of the next table in *rows*;
    the last table's range is closed off six months past its own start date.
    Keys and values are YYYYMMDD strings.

    Fixes: the original crashed with a NameError on an empty *rows* list;
    the six-month jump now uses stdlib month arithmetic (with the same
    end-of-month clamping dateutil's relativedelta applies) instead of the
    third-party dateutil package.
    """
    datehash = {}
    if not rows:
        return datehash
    olddate = None
    for table in rows:
        date = extract_datestring(table)
        # we skip storing the first entry, as we need the date of the second
        # entry first to store the range
        if olddate is not None:
            datehash[olddate] = date
        olddate = date
    # close the final range six months after the last start date
    lastdate = extract_date(date)
    print('date=' + date + " = " + str(lastdate))
    month = lastdate.month + 6
    year = lastdate.year + (month - 1) // 12
    month = (month - 1) % 12 + 1
    # clamp the day to the target month's length, matching relativedelta
    # behaviour (e.g. Aug 31 + 6 months -> Feb 28)
    day = min(lastdate.day, calendar.monthrange(year, month)[1])
    futuredate = datetime(year, month, day)
    datehash[olddate] = futuredate.strftime('%Y%m%d')
    return datehash
def update_mlab_glasnost(cur, table, dates=None):
    """Fill in mlab.glasnost.locId from one maxmind Blocks table.

    Rows whose date falls inside the table's validity window and whose locId
    is still 0 are matched on longip against the table's startIpNum..endIpNum
    ranges.

    dates -- mapping of start date -> end date per table; defaults to the
             module-level maxmind_dates hash (backward compatible), and is a
             parameter so the function can be exercised without module state.

    Fixes: the Python-2-only `except MySQLdb.Error, e` syntax is replaced by
    the `as e` form (valid on Python 2.6+ and 3).

    NOTE(review): *table* is interpolated into the SQL string, so only
    trusted, internally generated table names must be passed in.
    """
    if dates is None:
        dates = maxmind_dates
    start_datum = extract_datestring(table)
    end_datum = dates[start_datum]
    print('updating between' + start_datum + ' AND ' + end_datum)
    try:
        sql = ('UPDATE mlab.glasnost L, maxmind.' + table + ' M SET L.locId = M.locId'
               ' WHERE L.longip BETWEEN M.startIpNum AND M.endIpNum'
               ' AND L.date BETWEEN "' + start_datum + '" AND "' + end_datum + '"'
               ' AND L.locId = 0')
        print(sql)
        cur.execute(sql)
    except MySQLdb.Error as e:
        print("An error has been passed. %s" % e)
#################################################################
# #
# start of initialisation #
# Read command line options #
# #
#################################################################
# Parse command-line options and append a start marker to the process log.
parser = OptionParser()
parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=False, help="don't print status messages to stdout")
(options, args) = parser.parse_args()
# check for -h argument here
# 'with' guarantees the log handle is closed; the original ended with a bare
# `f.close` (no parentheses), which never actually closed the file.
with open(logDir + processLog, 'a') as f:
    f.write("\nNew mlab_maxmind_processed job on " + str(datetime.now()))
#################################################################
# #
# start of main program #
# #
#################################################################
# Main program: connect to MySQL, build the maxmind table/date maps, then run
# the locId update either over every maxmind Blocks table (no arguments) or
# over the tables named on the command line.
global_start_time = datetime.now()
try:
    # Connect to the mysql database
    db = MySQLdb.connect(host = db_host,
                         user = db_user,
                         passwd = db_passwd,
                         db = db_name)
    cur = db.cursor()
except MySQLdb.Error as e:
    # The original bare `except:` swallowed the failure and then crashed on
    # the undefined cursor below; report the error and bail out explicitly.
    # (Also fixes the missing space before the database name in the message.)
    sys.stderr.write('Error, cannot connect to database ' + db_name + ': ' + str(e) + '\n')
    sys.exit(1)
# array with tables to loop over, in case we don't get a table argument
maxmind_all_tables = get_maxmind_tableset(cur)
# hash with key = start_date, value = end_date (= start date of the next
# table, except for the last one, which runs six months past its own start)
maxmind_dates = get_maxmind_dates(maxmind_all_tables)
if len(args) == 0:
    print("Iterating over ALL maxmind tables")
    tables = maxmind_all_tables
else:
    print("Iterating over all arguments")
    tables = args
for table in tables:
    if check_maxmind_exist(cur, table):
        update_mlab_glasnost(cur, table)
cur.close()
# close the connection as well; the original only closed the cursor
db.close()
global_end_time = datetime.now()
print('=====================================\nAll Done. ' + str(len(args)) + ' file(s) in ' + str(global_end_time - global_start_time))