|
| 1 | +import os |
| 2 | +import sys |
| 3 | +import sqlite3 |
| 4 | +import subprocess |
| 5 | +from util import get_column_names |
| 6 | + |
# Default value of `max_visits_to_copy` in copy_rows_to_sample_db(): for
# tables that have a `visit_id` column, only rows whose visit_id is less than
# or equal to this value are copied into the sample database.
MAX_VISITS_TO_COPY_TO_SAMPLE_DB = 1000
| 8 | + |
| 9 | + |
def get_table_names_from_db(cursor):
    """Return the names of the tables listed in `sqlite_master`.

    The `sqlite_sequence` table is left out of the result.

    :param cursor: cursor of an open sqlite3 connection.
    :return: list of table names, in the order the query returns them.
    """
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    return [found[0] for found in cursor.fetchall()
            if found[0] != "sqlite_sequence"]
| 18 | + |
| 19 | + |
def create_empty_db_from_existing_db(in_db, out_db):
    """Create `out_db` with the same schema as `in_db` but without any rows.

    The schema is read from the `sqlite_master` table of `in_db` through the
    `sqlite3` module. Each statement is replayed with its stored text intact
    (line breaks and `--` comments included), so statements spanning several
    lines are not mangled, and no external `sqlite3` binary is required.

    Objects whose name starts with `sqlite_` (such as `sqlite_sequence`) are
    skipped: that prefix is reserved by SQLite, which creates those objects
    on its own when they are needed.

    :param in_db: path of the existing database to take the schema from.
    :param out_db: path of the database in which the schema is created.
    :raises sqlite3.Error: if the schema cannot be read or executed, e.g.
        because `out_db` already contains one of the objects.
    """
    source = sqlite3.connect(in_db)
    try:
        # `sql` is NULL for automatically created indexes; `!` escapes the
        # underscore so that LIKE matches the literal prefix `sqlite_`.
        statements = source.execute(
            "SELECT sql FROM sqlite_master "
            "WHERE sql IS NOT NULL "
            "AND name NOT LIKE 'sqlite!_%' ESCAPE '!' "
            "ORDER BY rowid").fetchall()
    finally:
        source.close()

    # The stored statements carry no trailing semicolon; terminate each one
    # and keep them on separate lines so a trailing `--` comment cannot
    # swallow the statement that follows.
    schema = "".join(statement[0] + ";\n" for statement in statements)

    target = sqlite3.connect(out_db)
    try:
        target.executescript(schema)
        target.commit()
    finally:
        target.close()
| 29 | + |
| 30 | + |
def _quote_identifier(name):
    """Return `name` as a double-quoted SQL identifier."""
    return '"%s"' % name.replace('"', '""')


def copy_rows_to_sample_db(in_db, out_db,
                           max_visits_to_copy=MAX_VISITS_TO_COPY_TO_SAMPLE_DB):
    """Copy rows from `in_db` into the already-created tables of `out_db`.

    `out_db` is attached to the `in_db` connection as `db_sample`. Every
    table returned by get_table_names_from_db() is then copied with an
    `INSERT INTO ... SELECT *`. Tables that have a `visit_id` column only
    contribute rows with `visit_id <= max_visits_to_copy`; all other tables
    are copied in full.

    :param in_db: path of the database to copy rows from.
    :param out_db: path of the database to copy rows into; it must already
        contain the same tables as `in_db`.
    :param max_visits_to_copy: largest `visit_id` to copy. It is bound as a
        query parameter, so pass a number.
    :raises sqlite3.Error: if attaching `out_db` or any insert fails.
    """
    conn = sqlite3.connect(in_db)
    try:
        cursor = conn.cursor()
        # Bind the path instead of formatting it into the statement so that
        # paths containing quotes cannot break the SQL.
        cursor.execute("ATTACH DATABASE ? AS db_sample", (out_db,))

        for table_name in get_table_names_from_db(cursor):
            quoted = _quote_identifier(table_name)
            insert = ("INSERT INTO db_sample.%s SELECT * FROM %s"
                      % (quoted, quoted))
            if "visit_id" in get_column_names(table_name, cursor):
                cursor.execute(insert + " WHERE visit_id <= ?",
                               (max_visits_to_copy,))
            else:
                cursor.execute(insert)
        conn.commit()
    finally:
        # Closing the connection also releases the attached database.
        conn.close()
| 48 | + |
| 49 | + |
# USAGE:
#   python <path_to_this_script> crawl_db_path sample_crawl_db_path
#
# Creates sample_crawl_db_path with the schema of crawl_db_path, copies the
# sample rows into it and prints the size in bytes of both files.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.exit("Usage: python %s crawl_db_path sample_crawl_db_path"
                 % sys.argv[0])
    in_db = sys.argv[1]
    out_db = sys.argv[2]
    create_empty_db_from_existing_db(in_db, out_db)
    copy_rows_to_sample_db(in_db, out_db)
    # A single pre-formatted string prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("In, out DB sizes %s %s" % (os.path.getsize(in_db),
                                      os.path.getsize(out_db)))
0 commit comments