forked from lalinsky/musicbrainz-bot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiscogs_db.py
510 lines (450 loc) · 22.4 KB
/
discogs_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
import config as cfg
import sqlalchemy
from sqlalchemy.sql import text
from editing import MusicBrainzClient
from datetime import date
import httplib
def get_status_code(host, path="/"):
    """Return the HTTP status code that ``host`` answers for ``path``.

    Only a HEAD request is sent, so just the headers are transferred.

    :param host: host name to connect to, e.g. ``"www.discogs.com"``.
    :param path: path to request on that host; defaults to ``"/"``.
    :returns: the integer status code, or ``None`` if the host cannot be
        reached or anything else goes wrong.
    """
    conn = None
    try:
        conn = httplib.HTTPConnection(host)
        conn.request("HEAD", path)
        return conn.getresponse().status
    except StandardError:
        # Deliberate best effort: callers only compare the result with 404.
        return None
    finally:
        # Release the socket on success and on failure alike.
        if conn is not None:
            conn.close()
def read_query(name):
    """Return the full text of the SQL script ``discogs_sql/<name>.sql``.

    The path is relative to the current working directory.

    :param name: script file name without directory and ``.sql`` suffix.
    :returns: the file content as a string.
    :raises IOError: if the script cannot be opened.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('discogs_sql/' + name + '.sql', 'r') as sql_file:
        return sql_file.read()
class DiscogsDbClient(object):
    """Batch jobs that link a Discogs database with MusicBrainz.

    Two SQLAlchemy engines are built from ``cfg.MB_DB`` and ``cfg.DO_DB``.
    Every public job opens the connection(s) it needs with :meth:`open` and
    always releases them with :meth:`close`, even when the job fails.
    Longer SQL statements are loaded from ``discogs_sql/`` via
    ``read_query``.

    ``print`` is always used with one parenthesised expression, which
    behaves exactly like the Python 2 print statement.
    """

    def __init__(self):
        """Create one engine for the MusicBrainz and one for the Discogs DB."""
        self.mbengine = sqlalchemy.create_engine(cfg.MB_DB)
        self.doengine = sqlalchemy.create_engine(cfg.DO_DB)

    # ------------------------------------------------------------------
    # Private helpers shared by several jobs
    # ------------------------------------------------------------------

    def _commit_urls(self, select_sql, delete_sql, limit, entity, link_type,
                     url_prefix=''):
        """Submit queued ``(gid, url, note)`` rows as URL relationships.

        Rows are read from the Discogs DB with ``select_sql`` (bound to
        ``limit``), sent through ``add_url`` and then removed from the queue
        with ``delete_sql`` (bound to the row's ``gid``).
        """
        try:
            mb_client = self.open(do=True, client=True)
            # Fetch everything first so the DELETEs below never run while
            # the SELECT result is still being consumed.
            rows = self.dodb.execute(select_sql, limit).fetchall()
            for gid, url, note in rows:
                full_url = url_prefix + url if url_prefix else url
                mb_client.add_url(entity, gid, link_type, full_url, note)
                self.dodb.execute(delete_sql, gid)
                print(url + " Done!")
        finally:
            self.close()

    def _commit_artist_relationships(self, select_sql, delete_sql, link_type,
                                     limit):
        """Submit queued ``(gid0, gid1, note)`` artist-artist relationships.

        ``delete_sql`` uses the named binds ``:gid0`` and ``:gid1``.
        """
        try:
            mb_client = self.open(do=True, client=True)
            remove_row = text(delete_sql)
            rows = self.dodb.execute(select_sql, limit).fetchall()
            for gid0, gid1, note in rows:
                mb_client.add_relationship("artist", "artist", gid0, gid1,
                                           link_type, {}, note)
                self.dodb.execute(remove_row, gid0=gid0, gid1=gid1)
                print(note + " Done!")
        finally:
            self.close()

    def _remove_dead_links(self, query_name, entity, limit):
        """Remove ``entity``-URL relationships whose Discogs page is a 404.

        The script ``query_name`` is run on the Discogs DB with the binds
        ``dblink`` and ``lim`` and must yield ``(id, gid, url)`` rows.
        """
        try:
            mb_client = self.open(do=True, client=True)
            query = read_query(query_name)
            rows = self.dodb.execute(text(query), dblink=cfg.MB_DB_LINK,
                                     lim=limit)
            for rel_id, gid, url in rows:
                # url[22:] drops the first 22 characters, i.e. the length of
                # "http://www.discogs.com", leaving the path to request.
                if get_status_code("www.discogs.com", url[22:]) == 404:
                    mb_client.remove_relationship(rel_id, entity, 'url', 'Remove dead link that not found on Discogs (404 - Page not found).')
                    print(gid+' removed '+url)
                else:
                    print(url+' found, do nothing!')
        finally:
            self.close()

    def _run_do_script(self, script_name, with_dblink):
        """Run one ``discogs_sql`` script on the Discogs DB, then close.

        With ``with_dblink`` the script is wrapped in ``text()`` and gets
        ``cfg.MB_DB_LINK`` bound as ``:dblink``; otherwise the raw string is
        executed without parameters.
        """
        try:
            self.open(do=True)
            query = read_query(script_name)
            if with_dblink:
                self.dodb.execute(text(query), dblink=cfg.MB_DB_LINK)
            else:
                self.dodb.execute(query)
        finally:
            self.close()

    def _report_row(self, title, mb_table, do_table, link_table,
                    closing='|-', against_discogs=False):
        """Print one wiki-table row of :meth:`report`.

        The percentage is links / MusicBrainz total, or links / Discogs
        total when ``against_discogs`` is set. An empty table yields 0%
        instead of a ZeroDivisionError.
        """
        mb = self.mbdb.execute('SELECT COUNT(id) FROM ' + mb_table).fetchone()[0]
        do = self.dodb.execute('SELECT COUNT(id) FROM ' + do_table).fetchone()[0]
        link = self.mbdb.execute('SELECT COUNT(id) FROM ' + link_table).fetchone()[0]
        total = do if against_discogs else mb
        perc = float(link) / float(total) * 100.0 if total else 0.0
        print('! ' + title + ':\n| ' + str(mb) + '\n| ' + str(do) + '\n| '
              + str(link) + ('\n| %0.f' % perc) + '%\n' + closing)

    # ------------------------------------------------------------------
    # commit_*: push queued rows to MusicBrainz, deleting each when done
    # ------------------------------------------------------------------

    def commit_artist_all(self, limit):
        """Submit up to ``limit`` rows of ``discogs_db_artist_all``."""
        self._commit_urls(
            "SELECT gid, url, note FROM discogs_db_artist_all LIMIT %s",
            "DELETE FROM discogs_db_artist_all WHERE gid = %s",
            limit, "artist", 180, "http://www.discogs.com/artist/")

    def commit_artist_all2(self, limit):
        """Submit up to ``limit`` rows of ``discogs_db_artist_all2``."""
        self._commit_urls(
            "SELECT gid, url, note FROM discogs_db_artist_all2 LIMIT %s",
            "DELETE FROM discogs_db_artist_all2 WHERE gid = %s",
            limit, "artist", 180, "http://www.discogs.com/artist/")

    def commit_member_of_band(self, limit):
        """Submit up to ``limit`` relationships with link type id 103."""
        self._commit_artist_relationships(
            "SELECT gid0, gid1, note FROM discogs_db_member_of_band LIMIT %s",
            "DELETE FROM discogs_db_member_of_band WHERE gid0 = :gid0 AND gid1 = :gid1",
            103, limit)

    def commit_perform_as(self, limit):
        """Submit up to ``limit`` relationships with link type id 108."""
        self._commit_artist_relationships(
            "SELECT gid0, gid1, note FROM discogs_db_perform_as LIMIT %s",
            "DELETE FROM discogs_db_perform_as WHERE gid0 = :gid0 AND gid1 = :gid1",
            108, limit)

    def commit_label_links(self, limit):
        """Submit up to ``limit`` rows of ``discogs_db_label_link``."""
        self._commit_urls(
            "SELECT gid, url, note FROM discogs_db_label_link LIMIT %s",
            "DELETE FROM discogs_db_label_link WHERE gid = %s",
            limit, "label", 217, 'http://www.discogs.com/label/')

    def commit_release_links(self, limit):
        """Submit up to ``limit`` rows of ``discogs_db_release_link``.

        The stored ``url`` is sent unchanged (no prefix is added).
        """
        self._commit_urls(
            "SELECT gid, url, note FROM discogs_db_release_link ORDER BY sum DESC, note LIMIT %s",
            "DELETE FROM discogs_db_release_link WHERE gid = %s",
            limit, "release", 76)

    def commit_release_group_links(self, limit):
        """Submit up to ``limit`` rows of ``discogs_db_release_group_link``.

        ``url`` is passed through ``str()`` before being appended to the
        Discogs master URL prefix.
        """
        select_sql = "SELECT gid, url, note FROM discogs_db_release_group_link LIMIT %s"
        delete_sql = "DELETE FROM discogs_db_release_group_link WHERE gid = %s"
        try:
            mb_client = self.open(do=True, client=True)
            rows = self.dodb.execute(select_sql, limit).fetchall()
            for gid, url, note in rows:
                mb_client.add_url("release_group", gid, 90, 'http://www.discogs.com/master/'+str(url), note)
                self.dodb.execute(delete_sql, gid)
                print(str(url) + " Done!")
        finally:
            self.close()

    def commit_release_format(self, limit):
        """Set medium formats for up to ``limit`` queued releases."""
        select_sql = "SELECT gid, format_id, url FROM discogs_db_release_format LIMIT %s"
        delete_sql = "DELETE FROM discogs_db_release_format WHERE gid = %s"
        try:
            mb_client = self.open(do=True, client=True)
            rows = self.dodb.execute(select_sql, limit).fetchall()
            for gid, format_id, url in rows:
                mb_client.set_release_medium_format(gid, '', format_id, "Format taken from "+url)
                self.dodb.execute(delete_sql, gid)
                print(gid + " Done!")
        finally:
            self.close()

    def commit_release_barcode(self, limit):
        """Set barcodes for up to ``limit`` queued releases."""
        select_sql = "SELECT gid, value, url FROM discogs_db_release_barcode LIMIT %s"
        delete_sql = "DELETE FROM discogs_db_release_barcode WHERE gid = %s"
        try:
            mb_client = self.open(do=True, client=True)
            rows = self.dodb.execute(select_sql, limit).fetchall()
            for gid, value, url in rows:
                mb_client.set_release_barcode(gid, '', value, "Barcode from attached Discogs link "+url)
                self.dodb.execute(delete_sql, gid)
                print(gid + " Done!")
        finally:
            self.close()

    def commit_release_cleanup(self, limit):
        """Remove up to ``limit`` queued release-URL relationships."""
        select_sql = "SELECT DISTINCT id, note FROM discogs_db_release_cleanup LIMIT %s"
        delete_sql = "DELETE FROM discogs_db_release_cleanup WHERE id = %s"
        try:
            mb_client = self.open(do=True, client=True)
            rows = self.dodb.execute(select_sql, limit).fetchall()
            for rel_id, note in rows:
                mb_client.remove_relationship(rel_id, 'release', 'url', note)
                self.dodb.execute(delete_sql, rel_id)
                print(str(rel_id) + " Done!")
        finally:
            self.close()

    def commit_release_artist_relationship(self, limit):
        """Submit up to ``limit`` artist-release relationships (type id 30).

        The queue table ``do_release_artist_credits`` is read through the
        MusicBrainz connection, so that is the connection opened here (the
        original opened only the Discogs one and then used ``self.mbdb``).
        """
        select_sql = "SELECT release_gid, release_url, artist_gid, artist_url FROM do_release_artist_credits LIMIT %s"
        delete_sql = "DELETE FROM do_release_artist_credits WHERE release_gid = %s AND artist_gid = %s"
        try:
            mb_client = self.open(mb=True, client=True)
            rows = self.mbdb.execute(select_sql, limit).fetchall()
            for release_gid, release_url, artist_gid, artist_url in rows:
                note = "Linked release: " + release_url + "\nLinked artist: " + artist_url
                mb_client.add_relationship("artist", "release", artist_gid, release_gid, 30, {}, note)
                self.mbdb.execute(delete_sql, release_gid, artist_gid)
                print(release_gid + " " + artist_gid + " Done!")
        finally:
            self.close()

    def commit_recording_credits(self, limit):
        """Submit up to ``limit`` artist-recording relationships (type id 153).

        Rows come from ``remix_temp`` on the MusicBrainz connection.
        """
        select_sql = "SELECT artist_gid, gid, note FROM remix_temp LIMIT %s"
        delete_sql = "DELETE FROM remix_temp WHERE artist_gid = %s AND gid = %s"
        try:
            mb_client = self.open(mb=True, client=True)
            rows = self.mbdb.execute(select_sql, limit).fetchall()
            for artist_gid, gid, note in rows:
                mb_client.add_relationship("artist", "recording", artist_gid, gid, 153, {}, note)
                self.mbdb.execute(delete_sql, artist_gid, gid)
                print(artist_gid + " " + gid + " Done!")
        finally:
            self.close()

    # ------------------------------------------------------------------
    # run_*: jobs that edit MusicBrainz from query results
    # ------------------------------------------------------------------

    def run_artist_404(self, limit):
        """Remove artist Discogs links that answer with HTTP 404."""
        self._remove_dead_links('run_artist_404', 'artist', limit)

    def run_label_404(self, limit):
        """Remove label Discogs links that answer with HTTP 404."""
        self._remove_dead_links('run_label_404', 'label', limit)

    def run_convert_db_relations(self, limit):
        """Re-type artist-URL relationships to link type id 188."""
        try:
            mb_client = self.open(mb=True, client=True)
            query = read_query('run_convert_db_relations')
            for rel_id, link_type in self.mbdb.execute(query, limit):
                mb_client.edit_relationship(rel_id, 'artist', 'url', link_type, 188, {}, 'Convert whitelisted relation to "has a page in a database at"-form', True)
        finally:
            self.close()

    def run_artist_types(self, limit):
        """Set artist type 1, then type 2, from the two type scripts."""
        try:
            mb_client = self.open(mb=True, client=True)
            self.run_artist_type(read_query('run_artist_types_1'), limit, 1,
                                 mb_client)
            self.run_artist_type(read_query('run_artist_types_2'), limit, 2,
                                 mb_client)
        finally:
            self.close()

    def run_artist_type(self, query, limit, type, mbClient):
        """Set ``type`` on every artist returned by ``query``.

        Expects ``self.mbdb`` to be open already; ``query`` must yield
        ``(gid, note)`` rows and takes ``limit`` as its positional bind.
        """
        for gid, note in self.mbdb.execute(query, limit):
            mbClient.set_artist_type(gid, type, note)
            print(gid + " Done!")

    def run_artist_country(self, limit):
        """Set artist countries using the ``run_artist_country`` script."""
        try:
            mb_client = self.open(mb=True, client=True)
            query = read_query('run_artist_country')
            rows = self.mbdb.execute(query, limit).fetchall()
        finally:
            # The DB is not needed while the edits are being submitted.
            self.close()
        for country_id, gid, name, comment in rows:
            note = "Based on disambiguation comment"
            mb_client.set_artist_country_id(gid, country_id, note)
            print(gid + " Done!")

    def run_artist_country2(self, limit):
        """Set artist countries using the ``run_artist_country2`` script."""
        try:
            mb_client = self.open(do=True, client=True)
            query = read_query('run_artist_country2')
            rows = self.dodb.execute(text(query), dblink=cfg.MB_DB_LINK,
                                     limit=limit).fetchall()
        finally:
            self.close()
        for country_id, gid, name, comment in rows:
            note = "Based on Discogs profile: " + comment
            mb_client.set_artist_country_id(gid, country_id, note)
            print(gid + " Done!")

    # ------------------------------------------------------------------
    # report*: print statistics
    # ------------------------------------------------------------------

    def report(self):
        """Print a wiki table with MusicBrainz, Discogs and link counts."""
        try:
            self.open(mb=True, do=True)
            print('{|\n! ' + date.today().isoformat())
            print('! MusicBrainz Total\n'
                  '! Discogs Total\n'
                  '! Links (all these are not unique)\n'
                  '! Percent done (compared to smaller total)\n'
                  '|-')
            self._report_row('Releases', 'release', 'release',
                             'do_release_link')
            # Only this row is measured against the Discogs total.
            self._report_row('Release Groups', 'release_group', 'master',
                             'do_release_group_link', against_discogs=True)
            self._report_row('Artists', 'artist', 'artist', 'do_artist_link')
            self._report_row('Labels', 'label', 'label', 'do_label_link',
                             closing='|}')
        finally:
            self.close()

    def report_release_artists(self, id):
        """Print the first column of the ``report_release_artists`` result.

        :param id: bound to the script's ``:release_id`` parameter.
        """
        try:
            self.open(do=True)
            query = read_query('report_release_artists')
            print(self.dodb.execute(text(query), release_id=id).fetchone()[0])
        finally:
            self.close()

    def report_release_structure(self, id):
        """Print the ``report_release_structure`` result and log the run.

        The script and the ``update_log`` insert each run in their own
        transaction on the Discogs connection.

        :param id: bound to the script's ``:release_id`` parameter.
        """
        try:
            self.open(do=True)
            query = read_query('report_release_structure')
            transaction = self.dodb.begin()
            result = self.dodb.execute(text(query), release_id=id, dblink=cfg.MB_DB_LINK).fetchone()[0]
            transaction.commit()
            log_sql = "INSERT INTO update_log(updated, event, timest) VALUES ('report_release_structure', 'end', now())"
            transaction = self.dodb.begin()
            self.dodb.execute(log_sql)
            transaction.commit()
            print(result)
        finally:
            self.close()

    def report_images_csv(self, filename):
        """Run ``report_images_csv`` and print its first result column.

        :param filename: bound to the script's ``:filename`` parameter.
        """
        try:
            self.open(do=True)
            query = read_query('report_images_csv')
            transaction = self.dodb.begin()
            print(self.dodb.execute(text(query), filename=filename, dblink=cfg.MB_DB_LINK).fetchone()[0])
            transaction.commit()
        finally:
            self.close()

    # ------------------------------------------------------------------
    # do_* / clean_* / create_*: run SQL scripts
    # ------------------------------------------------------------------

    def do_links(self):
        """Run ``do_links_1_mb`` on MusicBrainz, then ``do_links`` on Discogs.

        Each script runs in its own transaction; every ``(updated, event)``
        row returned by the Discogs script is printed.
        """
        try:
            self.open(mb=True, do=True)
            print('Updating MB database materialised views')
            query = read_query('do_links_1_mb')
            transaction = self.mbdb.begin()
            self.mbdb.execute(text(query))
            transaction.commit()
            print('Updating Discogs database')
            query = read_query('do_links')
            transaction = self.dodb.begin()
            for updated, event in self.dodb.execute(text(query), dblink=cfg.MB_DB_LINK):
                print(updated+' '+event)
            transaction.commit()
        finally:
            self.close()

    def clean_release_identifiers(self):
        """Run the ``clean_release_identifiers`` script on the Discogs DB."""
        self._run_do_script('clean_release_identifiers', False)

    def clean_artist_identifiers(self):
        """Run the ``clean_artist_identifiers`` script on the Discogs DB."""
        self._run_do_script('clean_artist_identifiers', False)

    def create_track_count(self):
        """Run the ``create_track_count`` script on the Discogs DB."""
        self._run_do_script('create_track_count', False)

    def create_functions(self):
        """Run the ``create_functions`` script on the Discogs DB."""
        self._run_do_script('create_functions', False)

    def create_link_views(self):
        """Run the ``create_link_views`` script on the MusicBrainz DB."""
        try:
            self.open(mb=True)
            self.mbdb.execute(read_query('create_link_views'))
        finally:
            self.close()

    def create_links(self):
        """Run the ``create_links`` script on the Discogs DB."""
        self._run_do_script('create_links', True)

    def create_media_type_mapping_table(self):
        """
        Method creates release_format table where format_id is same as MB
        medium_format.id and format_name is same as MB medium_format.name
        and release_id refers to Discogs release.id

        Method handles all Discogs formats except 'Book', which is
        intentionally excluded. Method uses format field and when necessary
        description field (http://www.discogs.com/help/formatslist) to find
        correct MB format.
        """
        self._run_do_script('create_media_type_mapping_table', False)

    def create_release_status_table(self):
        """Run the ``create_release_status_table`` script on the Discogs DB.

        NOTE(review): the previous docstring was a verbatim copy of the one
        on create_media_type_mapping_table and described the format table.
        Presumably this script builds a release status table - confirm
        against discogs_sql/create_release_status_table.sql.
        """
        self._run_do_script('create_release_status_table', False)

    def create_release_mb_mapping_table(self):
        """Run the ``create_release_mb_mapping_table`` script on Discogs."""
        self._run_do_script('create_release_mb_mapping_table', False)

    def create_extra_tables_and_indexes(self):
        """Run the ``create_extra_tables_and_indexes`` script on Discogs."""
        self._run_do_script('create_extra_tables_and_indexes', False)

    def create_country_mapping_table(self):
        """
        Created country mapping works on all (exception list below)
        countries that Discogs currently has in its database.

        Exceptions cannot be directly mapped to MB country codes.
        Discogs has 35.5k releases that use the listed countries.
        2/3 of these releases have "UK & Europe", "Scandinavia",
        or "USA & Canada" as their country.

        TODO: There must be some way to handle areas?

        Benelux                                 area
        Africa                                  area
        Asia                                    area
        Australasia                             area
        Australia & New Zealand                 area
        Central America                         area
        France & Benelux                        area
        Germany & Switzerland                   area
        Germany, Austria, & Switzerland         area
        North America (inc Mexico)              area
        Scandinavia                             area
        South America                           area
        UK & Europe                             area
        UK & Ireland                            area
        UK & US                                 area
        UK, Europe & US                         area
        USA & Canada                            area
        USA, Canada & UK                        area
        Gulf Cooperation Council                area
        Ivory Coast                             area
        Protectorate of Bohemia and Moravia     historic
        Austria-Hungary                         historic
        Virgin Islands                          us or uk
        Korea                                   north or south
        """
        self._run_do_script('create_country_mapping_table', True)

    def create_country_search_table(self):
        """Run ``create_country_search_table_mb`` on the MusicBrainz DB."""
        try:
            self.open(mb=True)
            self.mbdb.execute(text(read_query('create_country_search_table_mb')))
        finally:
            self.close()

    def do_release_link_table(self):
        """Build the release link queue.

        Runs ``do_release_links_1_mb`` on MusicBrainz, then
        ``do_release_links_2`` on Discogs, and finally drops the
        ``do_release_link_catno`` table on MusicBrainz.
        """
        try:
            self.open(do=True, mb=True)
            self.mbdb.execute(read_query('do_release_links_1_mb'))
            print('MB side done!')
            self.dodb.execute(text(read_query('do_release_links_2')),
                              dblink=cfg.MB_DB_LINK)
            self.mbdb.execute("DROP TABLE IF EXISTS do_release_link_catno")
        finally:
            self.close()

    def do_artist_evidence_track(self):
        """Run the ``do_artist_evidence_track`` script on the Discogs DB."""
        self._run_do_script('do_artist_evidence_track', True)

    def do_artist_evidence_release_credits(self):
        """Run ``do_artist_evidence_release_credits`` on the Discogs DB."""
        self._run_do_script('do_artist_evidence_release_credits', True)

    def do_artist_all(self):
        """Run the ``do_artist_all`` script on the Discogs DB."""
        self._run_do_script('do_artist_all', True)

    def do_member_of_band_table(self):
        """Run the ``do_member_of_band`` script on the Discogs DB."""
        self._run_do_script('do_member_of_band', True)

    def do_perform_as_table(self):
        """Run the ``do_perform_as`` script on the Discogs DB."""
        self._run_do_script('do_perform_as', True)

    def do_label_link_table(self):
        """Run the ``do_label_link`` script on the Discogs DB."""
        self._run_do_script('do_label_link', True)

    def do_release_group_link_table(self):
        """Run the ``do_release_group_links`` script on the Discogs DB."""
        self._run_do_script('do_release_group_links', True)

    def do_release_format_table(self):
        """Run the ``do_release_format`` script on the Discogs DB."""
        self._run_do_script('do_release_format', True)

    def do_release_barcode_table(self):
        """Run the ``do_release_barcode`` script on the Discogs DB."""
        self._run_do_script('do_release_barcode', True)

    def do_release_cleanup_table(self):
        """Run ``do_release_cleanup`` and print its first result column."""
        try:
            self.open(do=True)
            query = read_query('do_release_cleanup')
            print(self.dodb.execute(text(query), dblink=cfg.MB_DB_LINK).fetchone()[0])
        finally:
            self.close()

    def do_release_credits_table(self):
        """Run ``do_release_credits_1`` on Discogs, then ``..._2_mb`` on MB.

        The MusicBrainz script gets ``cfg.DO_DB_LINK`` bound as ``:dblink``.
        """
        try:
            self.open(do=True, mb=True)
            self.dodb.execute(read_query('do_release_credits_1'))
            print('Discogs side done!')
            self.mbdb.execute(text(read_query('do_release_credits_2_mb')),
                              dblink=cfg.DO_DB_LINK)
        finally:
            self.close()

    def do_recording_credits_table(self):
        """Run ``do_recording_credits_1`` on Discogs, then ``..._2_mb`` on MB.

        The MusicBrainz script gets ``cfg.DO_DB_LINK`` bound as ``:dblink``.
        """
        try:
            self.open(do=True, mb=True)
            self.dodb.execute(read_query('do_recording_credits_1'))
            print('Discogs side done!')
            self.mbdb.execute(text(read_query('do_recording_credits_2_mb')),
                              dblink=cfg.DO_DB_LINK)
        finally:
            self.close()

    # ------------------------------------------------------------------
    # Connection handling
    # ------------------------------------------------------------------

    def open(self, mb=False, do=False, client=False):
        """Open the requested connections and optionally build an MB client.

        :param mb: connect to the MusicBrainz DB and store it as ``mbdb``.
        :param do: connect to the Discogs DB and store it as ``dodb``.
        :param client: also create a ``MusicBrainzClient`` from ``cfg``.
        :returns: the new ``MusicBrainzClient`` if ``client`` is true,
            otherwise ``None``.
        """
        if mb:
            self.mbdb = self.mbengine.connect()
        if do:
            self.dodb = self.doengine.connect()
        if client:
            return MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE)
        return None

    def close(self):
        """Close whichever of ``mbdb`` / ``dodb`` has ever been opened.

        Safe to call when nothing was opened; the attributes are kept, so a
        connection from an earlier job may get ``close()`` called again.
        """
        for attr in ('mbdb', 'dodb'):
            conn = getattr(self, attr, None)
            if conn is not None:
                conn.close()