Skip to content

Commit 4884e4b

Browse files
authored
Add data/rand-many-types (#42)
* Add data/rand-many-types * Make random data reproducible
1 parent 8cd6cce commit 4884e4b

File tree

3 files changed

+207
-0
lines changed

3 files changed

+207
-0
lines changed

data/rand-many-types/README.md

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<!---
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# rand-many-types
21+
22+
This directory contains a file `random.arrows` in Arrow IPC stream format with randomly generated values in 20+ columns exercising many different Arrow data types. The Python script `generate.py` that generated the data file is included.

data/rand-many-types/generate.py

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
import pyarrow as pa
19+
import numpy as np
20+
import string
21+
from decimal import Decimal
22+
from datetime import datetime, timedelta
23+
24+
25+
def generate_random_data(data_type, num_rows, random_generator):
26+
rng = random_generator
27+
if pa.types.is_int8(data_type):
28+
return pa.array(rng.integers(-128, 127, num_rows, dtype=np.int8))
29+
elif pa.types.is_int16(data_type):
30+
return pa.array(rng.integers(-32768, 32767, num_rows, dtype=np.int16))
31+
elif pa.types.is_int32(data_type):
32+
return pa.array(
33+
rng.integers(-2147483648, 2147483647, num_rows, dtype=np.int32)
34+
)
35+
elif pa.types.is_int64(data_type):
36+
return pa.array(
37+
rng.integers(
38+
-9223372036854775808,
39+
9223372036854775807,
40+
num_rows,
41+
dtype=np.int64,
42+
)
43+
)
44+
elif pa.types.is_uint8(data_type):
45+
return pa.array(rng.integers(0, 255, num_rows, dtype=np.uint8))
46+
elif pa.types.is_uint16(data_type):
47+
return pa.array(rng.integers(0, 65535, num_rows, dtype=np.uint16))
48+
elif pa.types.is_uint32(data_type):
49+
return pa.array(rng.integers(0, 4294967295, num_rows, dtype=np.uint32))
50+
elif pa.types.is_uint64(data_type):
51+
return pa.array(
52+
rng.integers(0, 18446744073709551615, num_rows, dtype=np.uint64)
53+
)
54+
elif pa.types.is_float32(data_type):
55+
return pa.array(rng.random(num_rows, np.float32))
56+
elif pa.types.is_float64(data_type):
57+
return pa.array(rng.random(num_rows, np.float64))
58+
elif pa.types.is_string(data_type):
59+
charset = list(
60+
string.ascii_lowercase + string.ascii_uppercase + string.digits
61+
)
62+
return pa.array(
63+
["".join(rng.choice(charset, 8)) for _ in range(num_rows)]
64+
)
65+
elif pa.types.is_binary(data_type):
66+
return pa.array([rng.bytes(8) for _ in range(num_rows)])
67+
elif pa.types.is_boolean(data_type):
68+
return pa.array(rng.choice([True, False], num_rows))
69+
elif pa.types.is_date32(data_type):
70+
base_date = datetime(1970, 1, 1)
71+
return pa.array(
72+
[
73+
(base_date + timedelta(days=int(rng.integers(0, 10000)))).date()
74+
for _ in range(num_rows)
75+
],
76+
type=pa.date32(),
77+
)
78+
elif pa.types.is_date64(data_type):
79+
base_date = datetime(1970, 1, 1)
80+
return pa.array(
81+
[
82+
(
83+
base_date
84+
+ timedelta(
85+
milliseconds=int(
86+
rng.integers(0, 10000 * 24 * 60 * 60 * 1000)
87+
)
88+
)
89+
).date()
90+
for _ in range(num_rows)
91+
],
92+
type=pa.date64(),
93+
)
94+
elif pa.types.is_timestamp(data_type):
95+
base_time = datetime(2016, 1, 1, 0, 0, 0, 0)
96+
return pa.array(
97+
[
98+
base_time + timedelta(seconds=int(rng.integers(0, 10000)))
99+
for _ in range(num_rows)
100+
],
101+
type=pa.timestamp("ns"),
102+
)
103+
elif pa.types.is_decimal(data_type):
104+
return pa.array(
105+
[
106+
Decimal(
107+
f"{rng.integers(10**7, 10**8-1)}.{rng.integers(0, 10**2-1)}"
108+
)
109+
for _ in range(num_rows)
110+
],
111+
type=pa.decimal128(10, 2),
112+
)
113+
elif pa.types.is_list(data_type):
114+
return pa.array(
115+
[[rng.integers(0, 100) for _ in range(3)] for _ in range(num_rows)],
116+
type=pa.list_(pa.int32()),
117+
)
118+
elif pa.types.is_struct(data_type):
119+
struct_type = pa.struct(
120+
[("field1", pa.int32()), ("field2", pa.float64())]
121+
)
122+
return pa.array(
123+
[
124+
{"field1": rng.integers(0, 100), "field2": rng.random()}
125+
for _ in range(num_rows)
126+
],
127+
type=struct_type,
128+
)
129+
elif pa.types.is_dictionary(data_type):
130+
return pa.array(
131+
[f"key_{i}" for i in range(num_rows)],
132+
type=pa.dictionary(pa.int32(), pa.string()),
133+
)
134+
else:
135+
return pa.nulls(num_rows, type=data_type)
136+
137+
138+
data_types = [
139+
pa.int8(),
140+
pa.int16(),
141+
pa.int32(),
142+
pa.int64(),
143+
pa.uint8(),
144+
pa.uint16(),
145+
pa.uint32(),
146+
pa.uint64(),
147+
pa.float32(),
148+
pa.float64(),
149+
pa.string(),
150+
pa.binary(),
151+
pa.bool_(),
152+
pa.date32(),
153+
pa.date64(),
154+
pa.timestamp("ns"),
155+
pa.decimal128(10, 2),
156+
pa.list_(pa.int32()),
157+
pa.struct([("field1", pa.int32()), ("field2", pa.float64())]),
158+
pa.dictionary(pa.int32(), pa.string()),
159+
pa.null(),
160+
]
161+
162+
schema = pa.schema(
163+
[(f"col_{j}", data_type) for j, data_type in enumerate(data_types)]
164+
)
165+
166+
num_rows_per_batch = 1000
167+
num_batches = 100
168+
169+
random_seed = 12345
170+
random_generator = np.random.default_rng(random_seed)
171+
172+
path = "random.arrows"
173+
174+
with pa.ipc.new_stream(path, schema) as writer:
175+
for i in range(0, num_batches):
176+
columns = {
177+
f"col_{j}": generate_random_data(
178+
data_type, num_rows_per_batch, random_generator
179+
)
180+
for j, data_type in enumerate(data_types)
181+
}
182+
writer.write_batch(pa.RecordBatch.from_pydict(columns))

data/rand-many-types/random.arrows

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:d5f42338317901eb00343a85394b263ebbcf488ec08d45434088d9323fb26d79
3+
size 13550776

0 commit comments

Comments
 (0)