Skip to content

Commit 787afa1

Browse files
apacheGH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (apache#39652)
Rationale for this change: this is the first step towards apache#39633. It exposes the Array, DataType and Scalar classes for BinaryView and StringView, so that those types can already be represented in pyarrow. (A variant of StringBuilder is exposed as well, for now only to be able to create test data.) Closes: apache#39651. Authored-by: Joris Van den Bossche <[email protected]> Signed-off-by: Joris Van den Bossche <[email protected]>
1 parent c6ab286 commit 787afa1

File tree

16 files changed

+223
-6
lines changed

16 files changed

+223
-6
lines changed

docs/source/python/api/arrays.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ may expose data type-specific methods or properties.
6363
FixedSizeBinaryArray
6464
LargeBinaryArray
6565
LargeStringArray
66+
BinaryViewArray
67+
StringViewArray
6668
Time32Array
6769
Time64Array
6870
Date32Array
@@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties.
119121
FixedSizeBinaryScalar
120122
LargeBinaryScalar
121123
LargeStringScalar
124+
BinaryViewScalar
125+
StringViewScalar
122126
Time32Scalar
123127
Time64Scalar
124128
Date32Scalar

docs/source/python/api/datatypes.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas.
5555
large_binary
5656
large_string
5757
large_utf8
58+
binary_view
59+
string_view
5860
decimal128
5961
list_
6062
large_list
@@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category
168170
is_large_binary
169171
is_large_unicode
170172
is_large_string
173+
is_binary_view
174+
is_string_view
171175
is_fixed_size_binary
172176
is_map
173177
is_dictionary

python/pyarrow/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def print_entry(label, value):
163163
time32, time64, timestamp, date32, date64, duration,
164164
month_day_nano_interval,
165165
float16, float32, float64,
166-
binary, string, utf8,
166+
binary, string, utf8, binary_view, string_view,
167167
large_binary, large_string, large_utf8,
168168
decimal128, decimal256,
169169
list_, large_list, map_, struct,
@@ -205,6 +205,7 @@ def print_entry(label, value):
205205
FixedSizeListArray, UnionArray,
206206
BinaryArray, StringArray,
207207
LargeBinaryArray, LargeStringArray,
208+
BinaryViewArray, StringViewArray,
208209
FixedSizeBinaryArray,
209210
DictionaryArray,
210211
Date32Array, Date64Array, TimestampArray,
@@ -223,8 +224,8 @@ def print_entry(label, value):
223224
Time32Scalar, Time64Scalar,
224225
TimestampScalar, DurationScalar,
225226
MonthDayNanoIntervalScalar,
226-
BinaryScalar, LargeBinaryScalar,
227-
StringScalar, LargeStringScalar,
227+
BinaryScalar, LargeBinaryScalar, BinaryViewScalar,
228+
StringScalar, LargeStringScalar, StringViewScalar,
228229
FixedSizeBinaryScalar, DictionaryScalar,
229230
MapScalar, StructScalar, UnionScalar,
230231
RunEndEncodedScalar, ExtensionScalar)

python/pyarrow/array.pxi

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2942,6 +2942,12 @@ cdef class LargeStringArray(Array):
29422942
null_count, offset)
29432943

29442944

2945+
cdef class StringViewArray(Array):
2946+
"""
2947+
Concrete class for Arrow arrays of string (or utf8) view data type.
2948+
"""
2949+
2950+
29452951
cdef class BinaryArray(Array):
29462952
"""
29472953
Concrete class for Arrow arrays of variable-sized binary data type.
@@ -2968,6 +2974,12 @@ cdef class LargeBinaryArray(Array):
29682974
return (<CLargeBinaryArray*> self.ap).total_values_length()
29692975

29702976

2977+
cdef class BinaryViewArray(Array):
2978+
"""
2979+
Concrete class for Arrow arrays of variable-sized binary view data type.
2980+
"""
2981+
2982+
29712983
cdef class DictionaryArray(Array):
29722984
"""
29732985
Concrete class for dictionary-encoded Arrow arrays.
@@ -3669,6 +3681,8 @@ cdef dict _array_classes = {
36693681
_Type_STRING: StringArray,
36703682
_Type_LARGE_BINARY: LargeBinaryArray,
36713683
_Type_LARGE_STRING: LargeStringArray,
3684+
_Type_BINARY_VIEW: BinaryViewArray,
3685+
_Type_STRING_VIEW: StringViewArray,
36723686
_Type_DICTIONARY: DictionaryArray,
36733687
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
36743688
_Type_DECIMAL128: Decimal128Array,

python/pyarrow/builder.pxi

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable):
8080

8181
def __len__(self):
8282
return self.builder.get().length()
83+
84+
85+
cdef class StringViewBuilder(_Weakrefable):
86+
"""
87+
Builder class for UTF8 string views.
88+
89+
This class exposes facilities for incrementally adding string values and
90+
building the null bitmap for a pyarrow.Array (type='string_view').
91+
"""
92+
cdef:
93+
unique_ptr[CStringViewBuilder] builder
94+
95+
def __cinit__(self, MemoryPool memory_pool=None):
96+
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
97+
self.builder.reset(new CStringViewBuilder(pool))
98+
99+
def append(self, value):
100+
"""
101+
Append a single value to the builder.
102+
103+
The value can either be a string/bytes object or a null value
104+
(np.nan or None).
105+
106+
Parameters
107+
----------
108+
value : string/bytes or np.nan/None
109+
The value to append to the string array builder.
110+
"""
111+
if value is None or value is np.nan:
112+
self.builder.get().AppendNull()
113+
elif isinstance(value, (bytes, str)):
114+
self.builder.get().Append(tobytes(value))
115+
else:
116+
raise TypeError('StringViewBuilder only accepts string objects')
117+
118+
def append_values(self, values):
119+
"""
120+
Append all the values from an iterable.
121+
122+
Parameters
123+
----------
124+
values : iterable of string/bytes or np.nan/None values
125+
The values to append to the string array builder.
126+
"""
127+
for value in values:
128+
self.append(value)
129+
130+
def finish(self):
131+
"""
132+
Return result of builder as an Array object; also resets the builder.
133+
134+
Returns
135+
-------
136+
array : pyarrow.Array
137+
"""
138+
cdef shared_ptr[CArray] out
139+
with nogil:
140+
self.builder.get().Finish(&out)
141+
return pyarrow_wrap_array(out)
142+
143+
@property
144+
def null_count(self):
145+
return self.builder.get().null_count()
146+
147+
def __len__(self):
148+
return self.builder.get().length()

python/pyarrow/includes/libarrow.pxd

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
126126
_Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
127127
_Type_LARGE_STRING" arrow::Type::LARGE_STRING"
128128
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
129+
_Type_BINARY_VIEW" arrow::Type::BINARY_VIEW"
130+
_Type_STRING_VIEW" arrow::Type::STRING_VIEW"
129131

130132
_Type_LIST" arrow::Type::LIST"
131133
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
@@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil:
12951297

12961298
cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder):
12971299
CStringBuilder(CMemoryPool* pool)
1300+
CStatus Append(const c_string& value)
1301+
1302+
cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder):
1303+
CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool)
1304+
CStatus Append(const char* value, int32_t length)
12981305

1306+
cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder):
1307+
CStringViewBuilder(CMemoryPool* pool)
12991308
CStatus Append(const c_string& value)
13001309

13011310
cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder):

python/pyarrow/lib.pxd

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,14 @@ cdef class BinaryArray(Array):
445445
pass
446446

447447

448+
cdef class StringViewArray(Array):
449+
pass
450+
451+
452+
cdef class BinaryViewArray(Array):
453+
pass
454+
455+
448456
cdef class DictionaryArray(Array):
449457
cdef:
450458
object _indices, _dictionary

python/pyarrow/lib.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ Type_STRING = _Type_STRING
106106
Type_LARGE_BINARY = _Type_LARGE_BINARY
107107
Type_LARGE_STRING = _Type_LARGE_STRING
108108
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
109+
Type_BINARY_VIEW = _Type_BINARY_VIEW
110+
Type_STRING_VIEW = _Type_STRING_VIEW
109111
Type_LIST = _Type_LIST
110112
Type_LARGE_LIST = _Type_LARGE_LIST
111113
Type_MAP = _Type_MAP

python/pyarrow/scalar.pxi

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar):
665665
pass
666666

667667

668+
cdef class BinaryViewScalar(BinaryScalar):
669+
pass
670+
671+
672+
cdef class StringViewScalar(StringScalar):
673+
pass
674+
675+
668676
cdef class ListScalar(Scalar):
669677
"""
670678
Concrete class for list-like scalars.
@@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = {
10511059
_Type_BINARY: BinaryScalar,
10521060
_Type_LARGE_BINARY: LargeBinaryScalar,
10531061
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar,
1062+
_Type_BINARY_VIEW: BinaryViewScalar,
10541063
_Type_STRING: StringScalar,
10551064
_Type_LARGE_STRING: LargeStringScalar,
1065+
_Type_STRING_VIEW: StringViewScalar,
10561066
_Type_LIST: ListScalar,
10571067
_Type_LARGE_LIST: LargeListScalar,
10581068
_Type_FIXED_SIZE_LIST: FixedSizeListScalar,

python/pyarrow/src/arrow/python/helpers.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
6363
GET_PRIMITIVE_TYPE(STRING, utf8);
6464
GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
6565
GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
66+
GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
67+
GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
6668
GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
6769
default:
6870
return nullptr;

python/pyarrow/tests/test_builder.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import numpy as np
2121

2222
import pyarrow as pa
23-
from pyarrow.lib import StringBuilder
23+
from pyarrow.lib import StringBuilder, StringViewBuilder
2424

2525

2626
def test_weakref():
@@ -65,3 +65,22 @@ def test_string_builder_append_after_finish():
6565
sbuilder.append("No effect")
6666
expected = [None, None, "text", None, "other text"]
6767
assert arr.to_pylist() == expected
68+
69+
70+
def test_string_view_builder():
71+
builder = StringViewBuilder()
72+
builder.append(b"a byte string")
73+
builder.append("a string")
74+
builder.append("a longer not-inlined string")
75+
builder.append(np.nan)
76+
builder.append_values([None, "text"])
77+
assert len(builder) == 6
78+
assert builder.null_count == 2
79+
arr = builder.finish()
80+
assert isinstance(arr, pa.Array)
81+
assert arr.null_count == 2
82+
assert arr.type == 'string_view'
83+
expected = [
84+
"a byte string", "a string", "a longer not-inlined string", None, None, "text"
85+
]
86+
assert arr.to_pylist() == expected

python/pyarrow/tests/test_misc.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ def test_set_timezone_db_path_non_windows():
185185
pa.UnionArray,
186186
pa.BinaryArray,
187187
pa.StringArray,
188+
pa.BinaryViewArray,
189+
pa.StringViewArray,
188190
pa.FixedSizeBinaryArray,
189191
pa.DictionaryArray,
190192
pa.Date32Array,
@@ -221,6 +223,8 @@ def test_set_timezone_db_path_non_windows():
221223
pa.StringScalar,
222224
pa.BinaryScalar,
223225
pa.FixedSizeBinaryScalar,
226+
pa.BinaryViewScalar,
227+
pa.StringViewScalar,
224228
pa.ListScalar,
225229
pa.LargeListScalar,
226230
pa.MapScalar,

python/pyarrow/tests/test_scalars.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@
5151
(b"bytes", None, pa.BinaryScalar),
5252
("largestring", pa.large_string(), pa.LargeStringScalar),
5353
(b"largebytes", pa.large_binary(), pa.LargeBinaryScalar),
54+
# TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented
55+
# ("string_view", pa.string_view(), pa.StringViewScalar),
56+
# (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
5457
(b"abc", pa.binary(3), pa.FixedSizeBinaryScalar),
5558
([1, 2, 3], None, pa.ListScalar),
5659
([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar),
@@ -488,7 +491,8 @@ def test_month_day_nano_interval():
488491
@pytest.mark.parametrize('value', ['foo', 'mañana'])
489492
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
490493
(pa.string(), pa.StringScalar),
491-
(pa.large_string(), pa.LargeStringScalar)
494+
(pa.large_string(), pa.LargeStringScalar),
495+
# (pa.string_view(), pa.StringViewScalar),
492496
])
493497
def test_string(value, ty, scalar_typ):
494498
s = pa.scalar(value, type=ty)
@@ -503,10 +507,30 @@ def test_string(value, ty, scalar_typ):
503507
assert buf.to_pybytes() == value.encode()
504508

505509

510+
@pytest.mark.parametrize('value', ['foo', 'mañana'])
511+
def test_string_view(value):
512+
# TODO: replace with normal scalar construction
513+
builder = pa.lib.StringViewBuilder()
514+
builder.append(value)
515+
arr = builder.finish()
516+
517+
s = arr[0]
518+
assert isinstance(s, pa.StringViewScalar)
519+
assert s.as_py() == value
520+
assert s.as_py() != 'something'
521+
assert repr(value) in repr(s)
522+
assert str(s) == str(value)
523+
524+
buf = s.as_buffer()
525+
assert isinstance(buf, pa.Buffer)
526+
assert buf.to_pybytes() == value.encode()
527+
528+
506529
@pytest.mark.parametrize('value', [b'foo', b'bar'])
507530
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
508531
(pa.binary(), pa.BinaryScalar),
509-
(pa.large_binary(), pa.LargeBinaryScalar)
532+
(pa.large_binary(), pa.LargeBinaryScalar),
533+
# (pa.binary_view(), pa.BinaryViewScalar),
510534
])
511535
def test_binary(value, ty, scalar_typ):
512536
s = pa.scalar(value, type=ty)

python/pyarrow/tests/test_types.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ def get_many_types():
6161
pa.binary(10),
6262
pa.large_string(),
6363
pa.large_binary(),
64+
pa.string_view(),
65+
pa.binary_view(),
6466
pa.list_(pa.int32()),
6567
pa.list_(pa.int32(), 2),
6668
pa.large_list(pa.uint16()),
@@ -244,6 +246,12 @@ def test_is_binary_string():
244246
assert types.is_fixed_size_binary(pa.binary(5))
245247
assert not types.is_fixed_size_binary(pa.binary())
246248

249+
assert types.is_string_view(pa.string_view())
250+
assert not types.is_string_view(pa.string())
251+
assert types.is_binary_view(pa.binary_view())
252+
assert not types.is_binary_view(pa.binary())
253+
assert not types.is_binary_view(pa.string_view())
254+
247255

248256
def test_is_temporal_date_time_timestamp():
249257
date_types = [pa.date32(), pa.date64()]

0 commit comments

Comments
 (0)