Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 4f84645

Browse files
authored
refactor(df-repr): rm unionfind dependency (#228)
The union-find crate we currently depend on is our own fork, which adds `unsafe impl Send + Sync` over an `Rc` stored inside the union find. That is unsound, so I rewrote the data structure. The new implementation can, in principle, be made serializable by deriving `Serialize`. Signed-off-by: Alex Chi <[email protected]>
1 parent 3d81e65 commit 4f84645

File tree

5 files changed

+130
-12
lines changed

5 files changed

+130
-12
lines changed

Cargo.lock

-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-datafusion-repr/Cargo.toml

-1
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,4 @@ camelpaste = "0.1"
2222
datafusion-expr = "32.0.0"
2323
serde = { version = "1.0", features = ["derive"] }
2424
bincode = "1.3.3"
25-
union-find = { git = "https://github.com/Gun9niR/union-find-rs.git", rev = "794821514f7daefcbb8d5f38ef04e62fc18b5665" }
2625
value-bag = { version = "1", features = ["owned"] }

optd-datafusion-repr/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ mod optimizer_ext;
3131
pub mod plan_nodes;
3232
pub mod properties;
3333
pub mod rules;
34+
mod utils;
3435

3536
#[cfg(test)]
3637
mod testing;

optd-datafusion-repr/src/properties/column_ref.rs

+6-5
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@ use std::sync::Arc;
1010
use anyhow::anyhow;
1111
use itertools::Itertools;
1212
use optd_core::property::PropertyBuilder;
13-
use union_find::disjoint_sets::DisjointSets;
14-
use union_find::union_find::UnionFind;
1513

1614
use super::schema::Catalog;
1715
use super::DEFAULT_NAME;
18-
use crate::plan_nodes::{
19-
decode_empty_relation_schema, ArcDfPredNode, BinOpType, ConstantPred, DfNodeType, DfPredType,
20-
DfReprPredNode, JoinType, LogOpType,
16+
use crate::{
17+
plan_nodes::{
18+
decode_empty_relation_schema, ArcDfPredNode, BinOpType, ConstantPred, DfNodeType,
19+
DfPredType, DfReprPredNode, JoinType, LogOpType,
20+
},
21+
utils::DisjointSets,
2122
};
2223

2324
pub type BaseTableColumnRefs = Vec<ColumnRef>;

optd-datafusion-repr/src/utils.rs

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
// Copyright (c) 2023-2024 CMU Database Group
2+
//
3+
// Use of this source code is governed by an MIT-style license that can be found in the LICENSE file or at
4+
// https://opensource.org/licenses/MIT.
5+
6+
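//! optd's implementation of disjoint sets (union-find). It holds only a `HashMap` and a `Vec`, so it is `Send + Sync` whenever `T` is, and it can be made serializable by deriving `Serialize`.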
7+
8+
use std::{collections::HashMap, hash::Hash};
9+
/// A disjoint-set forest (union-find) keyed by arbitrary hashable values.
///
/// Each element receives a dense index when it is inserted: `data_idx` maps the element to
/// that index and `parents[i]` stores the parent index of element `i`. An element whose parent
/// is itself is the root (representative) of its set.
#[derive(Clone)]
pub struct DisjointSets<T> {
    data_idx: HashMap<T, usize>,
    parents: Vec<usize>,
}

// Implemented by hand so that an empty collection can be built for any `T`; a derived impl
// would additionally require `T: Default`.
impl<T> Default for DisjointSets<T> {
    fn default() -> Self {
        Self {
            data_idx: HashMap::new(),
            parents: Vec::new(),
        }
    }
}

impl<T> std::fmt::Debug for DisjointSets<T> {
    /// Writes only the type name; the contents are intentionally not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("DisjointSets")
    }
}

impl<T: Eq + Hash> DisjointSets<T> {
    /// Creates an empty collection of disjoint sets.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` if `data` has been registered with [`DisjointSets::make_set`].
    pub fn contains(&self, data: &T) -> bool {
        self.data_idx.contains_key(data)
    }

    /// Registers `data` as a new singleton set.
    ///
    /// Returns `Some(())` on success, or `None` (leaving the structure untouched) if `data`
    /// is already present.
    #[must_use]
    pub fn make_set(&mut self, data: T) -> Option<()> {
        use std::collections::hash_map::Entry;

        let idx = self.parents.len();
        // A single hash lookup decides between "already present" and "insert", and the
        // owned value is moved into the map instead of being cloned.
        match self.data_idx.entry(data) {
            Entry::Occupied(_) => None,
            Entry::Vacant(slot) => {
                slot.insert(idx);
                // A fresh element is its own parent, i.e. the root of a singleton set.
                self.parents.push(idx);
                Some(())
            }
        }
    }

    /// Returns the root of the set containing `idx`, shortening the path on the way up by
    /// re-pointing each visited node at its grandparent (path halving).
    fn find(&mut self, mut idx: usize) -> usize {
        while self.parents[idx] != idx {
            let grandparent = self.parents[self.parents[idx]];
            self.parents[idx] = grandparent;
            idx = grandparent;
        }
        idx
    }

    /// Returns the root of the set containing `idx` without modifying the forest, so it can
    /// be used from `&self` methods.
    fn find_const(&self, mut idx: usize) -> usize {
        while self.parents[idx] != idx {
            idx = self.parents[idx];
        }
        idx
    }

    /// Merges the sets containing `data1` and `data2`.
    ///
    /// Returns `None` if either element is unknown, otherwise `Some(())` (also when both
    /// elements were already in the same set).
    #[must_use]
    pub fn union(&mut self, data1: &T, data2: &T) -> Option<()> {
        let idx1 = *self.data_idx.get(data1)?;
        let idx2 = *self.data_idx.get(data2)?;
        let root1 = self.find(idx1);
        let root2 = self.find(idx2);
        if root1 != root2 {
            // Attach the first root beneath the second one.
            self.parents[root1] = root2;
        }
        Some(())
    }

    /// Reports whether `data1` and `data2` belong to the same set.
    ///
    /// Returns `None` if either element is unknown.
    pub fn same_set(&self, data1: &T, data2: &T) -> Option<bool> {
        let idx1 = *self.data_idx.get(data1)?;
        let idx2 = *self.data_idx.get(data2)?;
        Some(self.find_const(idx1) == self.find_const(idx2))
    }

    /// Returns the number of elements in the set containing `data`, or `None` if `data` is
    /// unknown.
    ///
    /// Sizes are not cached: every element is scanned and its root compared with the root
    /// of `data`.
    pub fn set_size(&self, data: &T) -> Option<usize> {
        let idx = *self.data_idx.get(data)?;
        let root = self.find_const(idx);
        Some(
            (0..self.parents.len())
                .filter(|&member| self.find_const(member) == root)
                .count(),
        )
    }
}
88+
89+
#[cfg(test)]
mod tests {
    use super::*;

    /// Starts from five singleton sets and merges them step by step, checking membership
    /// and set sizes after every union.
    #[test]
    fn test_union_find() {
        let mut sets = DisjointSets::new();
        for name in ["a", "b", "c", "d", "e"] {
            sets.make_set(name).unwrap();
        }

        // Initially every element lives alone.
        assert_eq!(sets.same_set(&"a", &"a"), Some(true));
        assert_eq!(sets.same_set(&"a", &"b"), Some(false));
        assert_eq!(sets.set_size(&"a"), Some(1));
        assert_eq!(sets.set_size(&"c"), Some(1));

        // {a, b}
        sets.union(&"a", &"b").unwrap();
        assert_eq!(sets.set_size(&"a"), Some(2));
        assert_eq!(sets.set_size(&"c"), Some(1));
        assert_eq!(sets.same_set(&"a", &"b"), Some(true));
        assert_eq!(sets.same_set(&"a", &"c"), Some(false));

        // {a, b, c}
        sets.union(&"b", &"c").unwrap();
        assert_eq!(sets.same_set(&"a", &"c"), Some(true));
        assert_eq!(sets.same_set(&"a", &"d"), Some(false));
        assert_eq!(sets.set_size(&"a"), Some(3));
        assert_eq!(sets.set_size(&"d"), Some(1));

        // {a, b, c} and {d, e}
        sets.union(&"d", &"e").unwrap();
        assert_eq!(sets.same_set(&"d", &"e"), Some(true));
        assert_eq!(sets.same_set(&"a", &"d"), Some(false));
        assert_eq!(sets.set_size(&"a"), Some(3));
        assert_eq!(sets.set_size(&"d"), Some(2));

        // {a, b, c, d, e}
        sets.union(&"c", &"e").unwrap();
        assert_eq!(sets.same_set(&"a", &"e"), Some(true));
        assert_eq!(sets.set_size(&"d"), Some(5));
    }
}

0 commit comments

Comments
 (0)