docs(python): Add Polars & LLMs page to the user guide #21160

Closed
wants to merge 23 commits into from

Commits (23)
1828fa6
Add Polars LLMS page
Feb 10, 2025
201c69f
Merge branch 'main' into list-expr-docstring-examples
Feb 10, 2025
a892956
Add to index
Feb 10, 2025
b431a67
Updates
Feb 10, 2025
4354002
Edits
Feb 10, 2025
c2215ec
Formatted file
Feb 10, 2025
8cb3d2b
Add custom API docs
braaannigan Feb 11, 2025
8513e0a
Format file
Feb 11, 2025
be4e181
Fix typo
braaannigan Feb 11, 2025
bf3151e
empty commit
Feb 12, 2025
a25f3af
feat: Improve DataFrame fmt in explain (#21158)
ritchie46 Feb 10, 2025
9c7be88
fix: Fix projection count query optimization (#21162)
ritchie46 Feb 10, 2025
e982c8e
fix: Projection of only row index in new streaming IPC (#21167)
coastalwhite Feb 10, 2025
0ac3e6e
chore: Install seaborn when running remote benchmark (#21168)
coastalwhite Feb 10, 2025
0359406
docs: Improve Arrow key feature description (#21171)
edwinvehmaanpera Feb 10, 2025
e586fc1
chore: Add feature gate to old streaming deprecation warning (#21179)
lukemanley Feb 11, 2025
3b9deb2
feat: Add row index to new streaming multiscan (#21169)
coastalwhite Feb 11, 2025
f987f01
fix: Raise error instead of panicking for unsupported SQL operations …
jqnatividad Feb 11, 2025
6c33e7d
fix: Do not panic in `strptime()` if `format` ends with '%' (#21176)
etiennebacher Feb 11, 2025
c3c4edb
feat: Add SQL support for the `DELETE` statement (#21190)
alexander-beedie Feb 12, 2025
44fa71b
feat: Don't take in rewriting visitor (#21212)
ritchie46 Feb 12, 2025
7b14ada
perf: Add sampling to new-streaming equi join to decide between build…
orlp Feb 12, 2025
2be2036
refactor(rust): Use distributor channel in new-streaming CSV reader a…
orlp Feb 12, 2025
1 change: 1 addition & 0 deletions .github/workflows/benchmark-remote.yml
@@ -74,6 +74,7 @@ jobs:
working-directory: py-polars
run: |
"$HOME/py-polars-cache/add-data.py" "$PWD/polars" < ./benchmark-results
pip install seaborn
"$HOME/py-polars-cache/create-plots.py"
touch "$HOME/py-polars-cache/upload-probe"
"$HOME/py-polars-cache/cache-build.sh" "$PWD/polars"
7 changes: 6 additions & 1 deletion crates/polars-core/src/frame/mod.rs
@@ -368,8 +368,13 @@ impl DataFrame {
/// static EMPTY: DataFrame = DataFrame::empty();
/// ```
pub const fn empty() -> Self {
Self::empty_with_height(0)
}

/// Creates an empty `DataFrame` with a specific `height`.
pub const fn empty_with_height(height: usize) -> Self {
DataFrame {
height: 0,
height,
columns: vec![],
cached_schema: OnceLock::new(),
}
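A minimal usage sketch of the new constructor, for context; the `height()` and `width()` accessors are pre-existing polars-core API, and the example itself is not part of this diff:

```rust
use polars_core::prelude::*;

fn main() {
    // A DataFrame with no columns but a fixed row count.
    let df = DataFrame::empty_with_height(5);
    assert_eq!(df.height(), 5);
    assert_eq!(df.width(), 0);
}
```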
8 changes: 8 additions & 0 deletions crates/polars-error/src/lib.rs
@@ -338,6 +338,14 @@ macro_rules! polars_err {
(opq = $op:ident, $lhs:expr, $rhs:expr) => {
$crate::polars_err!(op = stringify!($op), $lhs, $rhs)
};
(bigidx, ctx = $ctx:expr, size = $size:expr) => {
polars_err!(ComputeError: "\
{} produces {} rows which is more than maximum allowed pow(2, 32) rows; \
consider compiling with bigidx feature (polars-u64-idx package on python)",
$ctx,
$size,
)
};
(append) => {
polars_err!(SchemaMismatch: "cannot append series, data types don't match")
};
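A hypothetical call site for the new `bigidx` arm, sketched for illustration only: the `"join"` context string and the threshold check are assumptions, while `polars_bail!`, `polars_err!`, and `PolarsResult` are the crate's existing exports.

```rust
use polars_error::{polars_bail, polars_err, PolarsResult};

// Illustrative only: bail out when a result would exceed the 32-bit
// row-index limit, using the new `bigidx` error arm.
fn check_index_limit(n_rows: u64) -> PolarsResult<()> {
    if n_rows > u32::MAX as u64 {
        polars_bail!(bigidx, ctx = "join", size = n_rows);
    }
    Ok(())
}

fn main() {
    assert!(check_index_limit(5_000_000_000).is_err());
}
```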
25 changes: 25 additions & 0 deletions crates/polars-expr/src/hash_keys.rs
@@ -143,6 +143,13 @@ impl HashKeys {
Self::Single(s) => Self::Single(s.gather(idxs)),
}
}

pub fn sketch_cardinality(&self, sketch: &mut CardinalitySketch) {
match self {
HashKeys::RowEncoded(s) => s.sketch_cardinality(sketch),
HashKeys::Single(s) => s.sketch_cardinality(sketch),
}
}
}

#[derive(Clone, Debug)]
@@ -231,6 +238,20 @@ impl RowEncodedKeys {
keys,
}
}

pub fn sketch_cardinality(&self, sketch: &mut CardinalitySketch) {
if let Some(validity) = self.keys.validity() {
for (h, is_v) in self.hashes.values_iter().zip(validity) {
if is_v {
sketch.insert(*h);
}
}
} else {
for h in self.hashes.values_iter() {
sketch.insert(*h);
}
}
}
}

/// Single keys. Does not pre-hash for boolean & integer types, only for strings
@@ -284,4 +305,8 @@ impl SingleKeys {
keys: self.keys.take_slice_unchecked(idxs),
}
}

pub fn sketch_cardinality(&self, _sketch: &mut CardinalitySketch) {
todo!()
}
}
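The `sketch_cardinality` methods feed key hashes into a cardinality sketch so the new-streaming equi join can estimate the distinct key count per side (see commit 7b14ada). Below is a self-contained, simplified stand-in that shows the intended use; a real sketch approximates the count in constant memory, and none of the names here are the crate's actual API.

```rust
use std::collections::HashSet;

// Toy stand-in for a cardinality sketch: counts distinct hashes exactly
// with a HashSet, where a real sketch would approximate them in O(1) memory.
#[derive(Default)]
struct ToySketch {
    seen: HashSet<u64>,
}

impl ToySketch {
    fn insert(&mut self, hash: u64) {
        self.seen.insert(hash);
    }
    fn estimate(&self) -> usize {
        self.seen.len()
    }
}

// Build the join hash table on the side with fewer (estimated) distinct
// keys; this is the kind of decision the sketched cardinalities support.
fn build_on_left(left_hashes: &[u64], right_hashes: &[u64]) -> bool {
    let mut left = ToySketch::default();
    let mut right = ToySketch::default();
    left_hashes.iter().for_each(|h| left.insert(*h));
    right_hashes.iter().for_each(|h| right.insert(*h));
    left.estimate() <= right.estimate()
}

fn main() {
    assert!(build_on_left(&[1, 1, 2], &[1, 2, 3, 4]));
}
```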
100 changes: 100 additions & 0 deletions crates/polars-io/src/csv/read/parser.rs
@@ -6,6 +6,7 @@ use polars_core::prelude::*;
use polars_core::{config, POOL};
use polars_error::feature_gated;
use polars_utils::index::Bounded;
use polars_utils::select::select_unpredictable;
use rayon::prelude::*;

use super::buffer::Buffer;
@@ -607,6 +608,13 @@ pub struct CountLines {
quoting: bool,
}

#[derive(Copy, Clone, Debug)]
pub struct LineStats {
newline_count: usize,
last_newline_offset: usize,
end_inside_string: bool,
}

impl CountLines {
pub fn new(quote_char: Option<u8>, eol_char: u8) -> Self {
let quoting = quote_char.is_some();
@@ -626,6 +634,98 @@ impl CountLines {
}
}

/// Analyzes a chunk of CSV data.
///
/// Returns (newline_count, last_newline_offset, end_inside_string) twice,
/// the first is assuming the start of the chunk is *not* inside a string,
/// the second assuming the start is inside a string.
pub fn analyze_chunk(&self, bytes: &[u8]) -> [LineStats; 2] {
let mut scan_offset = 0;
let mut states = [
LineStats {
newline_count: 0,
last_newline_offset: 0,
end_inside_string: false,
},
LineStats {
newline_count: 0,
last_newline_offset: 0,
end_inside_string: false,
},
];

// false if even number of quotes seen so far, true otherwise.
#[allow(unused_assignments)]
let mut global_quote_parity = false;

#[cfg(feature = "simd")]
{
// 0 if even number of quotes seen so far, u64::MAX otherwise.
let mut global_quote_parity_mask = 0;
while scan_offset + 64 <= bytes.len() {
let block: [u8; 64] = unsafe {
bytes
.get_unchecked(scan_offset..scan_offset + 64)
.try_into()
.unwrap_unchecked()
};
let simd_bytes = SimdVec::from(block);
let eol_mask = simd_bytes.simd_eq(self.simd_eol_char).to_bitmask();
if self.quoting {
let quote_mask = simd_bytes.simd_eq(self.simd_quote_char).to_bitmask();
let quote_parity =
prefix_xorsum_inclusive(quote_mask) ^ global_quote_parity_mask;
global_quote_parity_mask = ((quote_parity as i64) >> 63) as u64;

let start_outside_string_eol_mask = eol_mask & !quote_parity;
states[0].newline_count += start_outside_string_eol_mask.count_ones() as usize;
states[0].last_newline_offset = select_unpredictable(
start_outside_string_eol_mask != 0,
(scan_offset + 63)
.wrapping_sub(start_outside_string_eol_mask.leading_zeros() as usize),
states[0].last_newline_offset,
);

let start_inside_string_eol_mask = eol_mask & quote_parity;
states[1].newline_count += start_inside_string_eol_mask.count_ones() as usize;
states[1].last_newline_offset = select_unpredictable(
start_inside_string_eol_mask != 0,
(scan_offset + 63)
.wrapping_sub(start_inside_string_eol_mask.leading_zeros() as usize),
states[1].last_newline_offset,
);
} else {
states[0].newline_count += eol_mask.count_ones() as usize;
states[0].last_newline_offset = select_unpredictable(
eol_mask != 0,
(scan_offset + 63).wrapping_sub(eol_mask.leading_zeros() as usize),
states[0].last_newline_offset,
);
}

scan_offset += 64;
}

global_quote_parity = global_quote_parity_mask > 0;
}

while scan_offset < bytes.len() {
let c = unsafe { *bytes.get_unchecked(scan_offset) };
global_quote_parity ^= (c == self.quote_char) & self.quoting;

let state = &mut states[global_quote_parity as usize];
state.newline_count += (c == self.eol_char) as usize;
state.last_newline_offset =
select_unpredictable(c == self.eol_char, scan_offset, state.last_newline_offset);

scan_offset += 1;
}

states[0].end_inside_string = global_quote_parity;
states[1].end_inside_string = !global_quote_parity;
states
}

pub fn find_next(&self, bytes: &[u8], chunk_size: &mut usize) -> (usize, usize) {
loop {
let b = unsafe { bytes.get_unchecked(..(*chunk_size).min(bytes.len())) };
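To make the two-element return value of `analyze_chunk` concrete, here is a self-contained sketch of the same idea: each chunk is summarized under both start-state assumptions (outside or inside a quoted field), and the caller picks the right summary once the previous chunk's end state is known. Types and names are illustrative, not the PR's.

```rust
// Simplified analogue of CountLines::analyze_chunk's two-state result.
#[derive(Copy, Clone, Default)]
struct ChunkStats {
    newline_count: usize,
    end_inside_string: bool,
}

// Summarize one chunk twice: index 0 assumes the chunk starts outside a
// quoted field, index 1 assumes it starts inside one.
fn analyze_chunk(bytes: &[u8], quote: u8, eol: u8) -> [ChunkStats; 2] {
    let mut states = [ChunkStats::default(); 2];
    let mut parity = false; // odd number of quotes seen so far in this chunk
    for &c in bytes {
        parity ^= c == quote;
        states[parity as usize].newline_count += (c == eol) as usize;
    }
    states[0].end_inside_string = parity;
    states[1].end_inside_string = !parity;
    states
}

// Stitch chunk summaries together, carrying the quote state across chunks.
fn count_newlines(chunks: &[&[u8]]) -> usize {
    let mut total = 0;
    let mut inside = false;
    for chunk in chunks {
        let s = analyze_chunk(chunk, b'"', b'\n')[inside as usize];
        total += s.newline_count;
        inside = s.end_inside_string;
    }
    total
}

fn main() {
    // The newline inside the quoted field must not be counted as a row
    // separator, even though the quote opens in one chunk and closes in the next.
    assert_eq!(count_newlines(&[b"x,\"a\n", b"b\",y\n"]), 1);
}
```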
15 changes: 10 additions & 5 deletions crates/polars-plan/src/plans/ir/format.rs
@@ -3,6 +3,7 @@ use std::fmt::{self, Display, Formatter};
use polars_core::datatypes::AnyValue;
use polars_core::schema::Schema;
use polars_io::RowIndex;
use polars_utils::format_list_truncated;
use recursive::recursive;

use self::ir::dot::ScanSourcesDisplay;
@@ -272,16 +273,20 @@ impl<'a> IRDisplay<'a> {
..
} => {
let total_columns = schema.len();
let n_columns = if let Some(columns) = output_schema {
columns.len().to_string()
let (n_columns, projected) = if let Some(schema) = output_schema {
(
format!("{}", schema.len()),
format_list_truncated!(schema.iter_names(), 4, '"'),
)
} else {
"*".to_string()
("*".to_string(), "".to_string())
};
write!(
f,
"{:indent$}DF {:?}; PROJECT {}/{} COLUMNS",
"{:indent$}DF {}; PROJECT{} {}/{} COLUMNS",
"",
schema.iter_names().take(4).collect::<Vec<_>>(),
format_list_truncated!(schema.iter_names(), 4, '"'),
projected,
n_columns,
total_columns,
)
42 changes: 27 additions & 15 deletions crates/polars-plan/src/plans/ir/tree_format.rs
@@ -1,6 +1,7 @@
use std::fmt;

use polars_core::error::*;
use polars_utils::{format_list_container_truncated, format_list_truncated};
#[cfg(feature = "regex")]
use regex::Regex;

@@ -190,22 +191,33 @@ impl<'a> TreeFmtNode<'a> {
schema,
output_schema,
..
} => ND(
wh(
h,
&format!(
"DF {:?}\nPROJECT {}/{} COLUMNS",
schema.iter_names().take(4).collect::<Vec<_>>(),
if let Some(columns) = output_schema {
format!("{}", columns.len())
} else {
"*".to_string()
},
schema.len()
} => {
let (n_columns, projected) = if let Some(schema) = output_schema {
(
format!("{}", schema.len()),
format!(
": {};",
format_list_truncated!(schema.iter_names(), 4, '"')
),
)
} else {
("*".to_string(), "".to_string())
};
ND(
wh(
h,
&format!(
"DF {}\nPROJECT{} {}/{} COLUMNS",
format_list_truncated!(schema.iter_names(), 4, '"'),
projected,
n_columns,
schema.len()
),
),
),
vec![],
),
vec![],
)
},

Union { inputs, .. } => ND(
wh(
h,
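These two hunks surface in the text of query plans. A sketch of how a user would observe the improved DataFrame formatting, assuming the `polars` crate with the `lazy` feature and the existing `LazyFrame::explain` API; the exact plan text is not reproduced here:

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    let df = df!("a" => [1, 2], "b" => ["x", "y"])?;
    // Project a subset of columns so the plan shows a PROJECT node.
    let plan = df.lazy().select([col("a")]).explain(true)?;
    // The in-memory DataFrame node now prints a truncated, quoted list of
    // column names and the projected subset instead of a Debug-formatted list.
    println!("{plan}");
    Ok(())
}
```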
1 change: 1 addition & 0 deletions crates/polars-plan/src/plans/optimizer/mod.rs
@@ -71,6 +71,7 @@ pub fn optimize(
#[allow(dead_code)]
let verbose = verbose();

#[cfg(feature = "python")]
if opt_flags.streaming() {
polars_warn!(
Deprecation,
@@ -42,14 +42,9 @@ pub(super) fn process_functions(
Ok(lp)
},
Explode { columns, .. } => {
columns.iter().for_each(|name| {
add_str_to_accumulated(
name.clone(),
&mut ctx.acc_projections,
&mut ctx.projected_names,
expr_arena,
)
});
columns
.iter()
.for_each(|name| add_str_to_accumulated(name.clone(), &mut ctx, expr_arena));
proj_pd.pushdown_and_assign(input, ctx, lp_arena, expr_arena)?;
Ok(IRBuilder::new(input, expr_arena, lp_arena)
.explode(columns.clone())
@@ -14,36 +14,25 @@ pub(super) fn process_unpivot(
// restart projection pushdown
proj_pd.no_pushdown_restart_opt(lp, ctx, lp_arena, expr_arena)
} else {
let (mut acc_projections, mut local_projections, mut projected_names) =
split_acc_projections(
ctx.acc_projections,
lp_arena.get(input).schema(lp_arena).as_ref(),
expr_arena,
false,
);
let (acc_projections, mut local_projections, projected_names) = split_acc_projections(
ctx.acc_projections,
lp_arena.get(input).schema(lp_arena).as_ref(),
expr_arena,
false,
);

if !local_projections.is_empty() {
local_projections.extend_from_slice(&acc_projections);
}
let mut ctx = ProjectionContext::new(acc_projections, projected_names, ctx.inner);

// make sure that the requested columns are projected
args.index.iter().for_each(|name| {
add_str_to_accumulated(
name.clone(),
&mut acc_projections,
&mut projected_names,
expr_arena,
)
});
args.on.iter().for_each(|name| {
add_str_to_accumulated(
name.clone(),
&mut acc_projections,
&mut projected_names,
expr_arena,
)
});
let ctx = ProjectionContext::new(acc_projections, projected_names, ctx.inner);
args.index
.iter()
.for_each(|name| add_str_to_accumulated(name.clone(), &mut ctx, expr_arena));
args.on
.iter()
.for_each(|name| add_str_to_accumulated(name.clone(), &mut ctx, expr_arena));

proj_pd.pushdown_and_assign(input, ctx, lp_arena, expr_arena)?;

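Both projection-pushdown hunks above follow the same refactor: the two accumulators previously threaded through every helper are now bundled in a `ProjectionContext` passed by `&mut`. A simplified, self-contained analogue of that pattern (not the crate's real types or signatures):

```rust
// Simplified analogue: bundle the accumulators into one context struct and
// pass it mutably, instead of threading each collection separately.
struct Ctx {
    acc_projections: Vec<String>,
    projected_names: Vec<String>,
}

fn add_str_to_accumulated(name: String, ctx: &mut Ctx) {
    ctx.acc_projections.push(name.clone());
    ctx.projected_names.push(name);
}

fn main() {
    let mut ctx = Ctx {
        acc_projections: vec![],
        projected_names: vec![],
    };
    for name in ["index", "on"] {
        add_str_to_accumulated(name.to_string(), &mut ctx);
    }
    assert_eq!(ctx.projected_names.len(), 2);
}
```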