forked from romainr/hadoop-tutorials-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_schemas.py
executable file
·82 lines (60 loc) · 1.76 KB
/
create_schemas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
#
# Generates the column list and cell data for an analytics table of 1000+ columns
# cf. url
#
# HBase shell command creating the matching table and its column families:
# create 'analytics', 'hour', 'day', 'total'
import itertools
import random
# Fixed seed so that every run produces the same pseudo-random counts.
random.seed(1)

# Number of data rows written to the output file.
ROWS = 1000

# Bucket indexes: 25 hourly buckets (0-24) and 366 daily buckets (0-365).
HOURS = range(25)
DAYS = range(366)

# Countries each bucket is broken down by, next to its 'total' column.
COUNTRIES = ['US', 'France', 'Italy']

# Column family names (identifier spelling kept as-is for compatibility).
FAMILLIES = ['hour', 'day', 'total']
# Utilities
def columns_hours():
    """Return the qualified column names of the 'hour' family.

    For every hour in HOURS there is one 'total' column followed by one
    column per entry of COUNTRIES, named ``hour:<HH>-<suffix>`` with the
    hour zero-padded to two digits.
    """
    family = 'hour'
    suffixes = ['total'] + COUNTRIES
    # Hour-major order: all suffixes of hour 0, then all of hour 1, ...
    return ['%s:%02d-%s' % (family, hour, suffix)
            for hour in HOURS
            for suffix in suffixes]
def columns_days():
    """Return the qualified column names of the 'day' family.

    For every day in DAYS there is one 'total' column followed by one
    column per entry of COUNTRIES, named ``day:<DDD>-<suffix>`` with the
    day zero-padded to three digits.
    """
    family = 'day'
    suffixes = ['total'] + COUNTRIES
    # Day-major order: all suffixes of day 0, then all of day 1, ...
    return ['%s:%03d-%s' % (family, day, suffix)
            for day in DAYS
            for suffix in suffixes]
def columns_total():
    """Return the qualified column names of the 'total' family.

    The list holds ``total:total`` followed by one ``total:<country>``
    column per entry of COUNTRIES.
    """
    family = 'total'
    names = []
    for qualifier in ['total'] + COUNTRIES:
        names.append('%s:%s' % (family, qualifier))
    return names
def get_domain(n):
    """Return the row key for row *n* as a one-element list.

    The key has the form ``domain.<n>``. It is wrapped in a list so that it
    can be concatenated directly with a list of cell values.

    *n* is passed to the format operator inside a 1-tuple: a bare ``% n``
    would unpack a tuple argument as several format arguments and raise
    TypeError (or format only its single element).
    """
    return ['domain.%s' % (n,)]
def total():
    """Return a one-element list holding the overall counts.

    The single element is the result of ``count_by_country(10000)``.
    """
    overall = count_by_country(10000)
    return [overall]
def days():
    """Return one ``count_by_country(1000)`` result per entry of DAYS."""
    per_day = []
    for _ in DAYS:
        per_day.append(count_by_country(1000))
    return per_day
def hours():
    """Return one ``count_by_country(100)`` result per entry of HOURS."""
    per_hour = []
    for _ in HOURS:
        per_hour.append(count_by_country(100))
    return per_hour
def count_by_country(n):
    """Return random counts for one bucket as ``[sum, c1, c2, ...]``.

    One integer is drawn with ``random.randrange(1, n)`` for each entry of
    COUNTRIES, in order; the returned list starts with their sum, followed
    by the individual draws.
    """
    per_country = []
    for _ in COUNTRIES:
        per_country.append(random.randrange(1, n))
    return [sum(per_country)] + per_country
def print_columns():
    """Print the ``-Dimporttsv.columns=...`` option for all generated columns.

    The option starts with ``HBASE_ROW_KEY`` followed by the hour, day and
    total column names, comma-separated, in the same order in which
    generate_data() writes the cell values.
    """
    all_cols = columns_hours() + columns_days() + columns_total()
    # A single parenthesised argument behaves identically under the Python 2
    # print statement and the Python 3 print function.
    print("-Dimporttsv.columns=HBASE_ROW_KEY," + ','.join(all_cols))
def generate_data(data_file):
    """Write ROWS tab-separated rows of random counts to *data_file*.

    Each line consists of the row key from get_domain(i) followed by the
    flattened values of hours(), days() and total(), in that order, and is
    terminated by a newline. An existing file is overwritten.

    Args:
        data_file: Path of the file to create.
    """
    # The context manager guarantees the file is flushed and closed, even
    # if a write fails, before the completion message is printed.
    with open(data_file, 'w') as f:
        for i in range(ROWS):
            buckets = hours() + days() + total()
            cells = itertools.chain.from_iterable(buckets)
            # Build a real list: on Python 3 map() returns an iterator,
            # which cannot be concatenated to a list.
            fields = get_domain(i) + [str(cell) for cell in cells]
            f.write('\t'.join(fields) + '\n')
    # Message text kept byte-for-byte for output compatibility.
    print(data_file + ' genererated')
# Main
#
# Script entry point: print the importtsv column option to stdout, then
# write the generated rows to a tab-separated file.
# NOTE: these calls run at module level (there is no __main__ guard), so
# they also execute when the file is imported.
#
print_columns()
generate_data('/tmp/hbase-analytics.tsv')