Skip to content

Commit f64097f

Browse files
authored
Benchmark showcasing with_column and with_column_renamed function performance (#14564)
* Add a benchmark showing that the dataframe.with_column and dataframe.with_column_renamed function calls are slow. * Clippy updates.
1 parent 5cfc653 commit f64097f

File tree

2 files changed

+90
-0
lines changed

2 files changed

+90
-0
lines changed

datafusion/core/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,7 @@ name = "topk_aggregate"
216216
harness = false
217217
name = "map_query_sql"
218218
required-features = ["nested_expressions"]
219+
220+
[[bench]]
221+
harness = false
222+
name = "dataframe"

datafusion/core/benches/dataframe.rs

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate arrow;
19+
#[macro_use]
20+
extern crate criterion;
21+
extern crate datafusion;
22+
23+
use arrow_schema::{DataType, Field, Schema};
24+
use criterion::Criterion;
25+
use datafusion::datasource::MemTable;
26+
use datafusion::prelude::SessionContext;
27+
use datafusion_expr::col;
28+
use datafusion_functions::expr_fn::btrim;
29+
use std::sync::Arc;
30+
use tokio::runtime::Runtime;
31+
32+
fn create_context(field_count: u32) -> datafusion_common::Result<Arc<SessionContext>> {
33+
let mut fields = vec![];
34+
for i in 0..field_count {
35+
fields.push(Field::new(format!("str{}", i), DataType::Utf8, true))
36+
}
37+
38+
let schema = Arc::new(Schema::new(fields));
39+
let ctx = SessionContext::new();
40+
let table = MemTable::try_new(Arc::clone(&schema), vec![vec![]])?;
41+
42+
ctx.register_table("t", Arc::new(table))?;
43+
44+
Ok(Arc::new(ctx))
45+
}
46+
47+
fn run(column_count: u32, ctx: Arc<SessionContext>) {
48+
let rt = Runtime::new().unwrap();
49+
50+
criterion::black_box(rt.block_on(async {
51+
let mut data_frame = ctx.table("t").await.unwrap();
52+
53+
for i in 0..column_count {
54+
let field_name = &format!("str{}", i);
55+
let new_field_name = &format!("newstr{}", i);
56+
57+
data_frame = data_frame
58+
.with_column_renamed(field_name, new_field_name)
59+
.unwrap();
60+
data_frame = data_frame
61+
.with_column(new_field_name, btrim(vec![col(new_field_name)]))
62+
.unwrap();
63+
}
64+
65+
Some(true)
66+
}))
67+
.unwrap();
68+
}
69+
70+
fn criterion_benchmark(c: &mut Criterion) {
71+
// 500 takes far too long right now
72+
for column_count in [10, 100, 200 /* 500 */] {
73+
let ctx = create_context(column_count).unwrap();
74+
75+
c.bench_function(&format!("with_column_{column_count}"), |b| {
76+
b.iter(|| run(column_count, ctx.clone()))
77+
});
78+
}
79+
}
80+
81+
criterion_group! {
82+
name = benches;
83+
config = Criterion::default().sample_size(10);
84+
targets = criterion_benchmark
85+
}
86+
criterion_main!(benches);

0 commit comments

Comments
 (0)