Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `execute_query_on_graph` now skips nodes that are not part of the output
(optional negated nodes) and makes sure the resulting iterator only produces
unique results.
- Queries with `@` could have extremely slow execution plans when the query
  planner introduced an inverted `@` operator and miscalculated its cost
  compared to the non-inverted version.
- Frequency queries now execute the (additional) timeout check after a certain
  number of matches have been processed, not when a specific tuple value has
  reached a threshold.

## [4.0.0] - 2025-08-20

Expand Down
3 changes: 2 additions & 1 deletion core/src/annostorage/inmemory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ pub struct AnnoStorageImpl<T: Ord + Hash + Default> {
anno_keys: SymbolTable<AnnoKey>,
anno_values: SymbolTable<String>,

/// additional statistical information
/// Sampled histograms for each annotation key.
/// Each histogram bound defines a range of values where we estimate that they have the same number of occurrences.
histogram_bounds: BTreeMap<usize, Vec<String>>,
largest_item: Option<T>,
total_number_of_annos: usize,
Expand Down
3 changes: 2 additions & 1 deletion core/src/annostorage/ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ where

anno_key_sizes: BTreeMap<AnnoKey, usize>,

/// additional statistical information
/// Sampled histograms for each annotation key.
/// Each histogram bound defines a range of values where we estimate that they have the same number of occurrences.
histogram_bounds: BTreeMap<AnnoKey, Vec<String>>,
largest_item: Option<T>,

Expand Down
9 changes: 0 additions & 9 deletions core/src/util/disk_collections.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,6 @@ const BLOCK_MAX_SIZE: usize = 4 * KB;
/// Uses a cache for each disk table with 8 MB capacity.
pub const DEFAULT_BLOCK_CACHE_CAPACITY: usize = 8 * MB;

#[derive(Serialize, Deserialize)]
struct Entry<K, V>
where
K: Ord,
{
key: K,
value: V,
}

pub enum EvictionStrategy {
MaximumItems(usize),
}
Expand Down
101 changes: 63 additions & 38 deletions graphannis/src/annis/db/aql/operators/edge_op.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct BaseEdgeOpSpec {
pub edge_anno: Option<EdgeAnnoSearchSpec>,
pub is_reflexive: bool,
pub op_str: Option<String>,
pub check_cost_for_inverse_operator: bool,
pub inverse_operator_needs_cost_check: bool,
}

struct BaseEdgeOp {
Expand All @@ -45,34 +45,7 @@ impl BaseEdgeOp {
gs.push(gs_for_component);
}

let all_part_of_components = spec
.components
.iter()
.all(|c| c.get_type() == AnnotationComponentType::PartOf);

let max_nodes_estimate = if all_part_of_components && gs.len() == 1 {
// PartOf components have a very skewed distribution of root nodes
// vs. the actual possible targets, thus do not use all nodes as
// population but only the non-roots.
if let Some(stats) = gs[0].get_statistics() {
stats.nodes - stats.root_nodes
} else {
// Fallback to guessing by using the node type
db.get_node_annos().guess_max_count(
Some(&NODE_TYPE_KEY.ns),
&NODE_TYPE_KEY.name,
"corpus",
"datasource",
)?
}
} else {
db.get_node_annos().guess_max_count(
Some(&NODE_TYPE_KEY.ns),
&NODE_TYPE_KEY.name,
"node",
"node",
)?
};
let max_nodes_estimate = calculate_max_node_estimate(db, &spec, &gs, false)?;
Ok(BaseEdgeOp {
gs,
spec,
Expand All @@ -82,6 +55,48 @@ impl BaseEdgeOp {
}
}

/// Estimate the maximum number of nodes that can appear as the right-hand
/// side of an edge operator.
///
/// When every component is a `PartOf` component and exactly one graph storage
/// is involved, the estimate uses the graph storage statistics (non-root
/// nodes) for the non-inverted operator, or falls back to guessing how many
/// "corpus"/"datasource" nodes exist. Otherwise, all "node"-typed nodes are
/// counted.
fn calculate_max_node_estimate(
    db: &AnnotationGraph,
    spec: &BaseEdgeOpSpec,
    gs: &[Arc<dyn GraphStorage>],
    inverse: bool,
) -> Result<usize> {
    let all_components_are_partof = spec
        .components
        .iter()
        .all(|c| c.get_type() == AnnotationComponentType::PartOf);
    let max_nodes_estimate = if all_components_are_partof && gs.len() == 1 {
        // PartOf components have a very skewed distribution of root nodes vs.
        // the actual possible targets, thus do not use all nodes as population
        // but only the non-roots. We can only use this formula for the actual
        // @* operator, but not the inverted one.
        if !inverse && let Some(stats) = gs[0].get_statistics() {
            // saturating_sub guards against inconsistent statistics where
            // root_nodes > nodes, which would panic in debug builds.
            stats.nodes.saturating_sub(stats.root_nodes)
        } else {
            // Fallback to guessing how many nodes have the node type "corpus"
            // or "datasource" and thus could be reachable as RHS in a worst case
            // scenario. Since a node can't be part of itself, subtract 1 for
            // the node on the LHS.
            db.get_node_annos()
                .guess_max_count(
                    Some(&NODE_TYPE_KEY.ns),
                    &NODE_TYPE_KEY.name,
                    "corpus",
                    "datasource",
                )?
                .saturating_sub(1)
        }
    } else {
        // General case: any node could be reachable.
        db.get_node_annos().guess_max_count(
            Some(&NODE_TYPE_KEY.ns),
            &NODE_TYPE_KEY.name,
            "node",
            "node",
        )?
    };
    Ok(max_nodes_estimate)
}

impl BinaryOperatorSpec for BaseEdgeOpSpec {
fn necessary_components(
&self,
Expand Down Expand Up @@ -286,13 +301,15 @@ impl BinaryOperatorBase for BaseEdgeOp {

fn get_inverse_operator<'a>(
&self,
_graph: &'a AnnotationGraph,
graph: &'a AnnotationGraph,
) -> Result<Option<BinaryOperator<'a>>> {
let inverse = !self.inverse;

// Check if all graph storages have the same inverse cost. If not, we
// don't provide an inverse operator, because the plans would not
// account for the different costs
for g in &self.gs {
if self.spec.check_cost_for_inverse_operator && !g.inverse_has_same_cost() {
if self.spec.inverse_operator_needs_cost_check && !g.inverse_has_same_cost() {
return Ok(None);
}
if let Some(stat) = g.get_statistics() {
Expand All @@ -302,11 +319,12 @@ impl BinaryOperatorBase for BaseEdgeOp {
}
}
}
let max_nodes_estimate = calculate_max_node_estimate(graph, &self.spec, &self.gs, inverse)?;
let edge_op = BaseEdgeOp {
gs: self.gs.clone(),
spec: self.spec.clone(),
max_nodes_estimate: self.max_nodes_estimate,
inverse: !self.inverse,
max_nodes_estimate,
inverse,
};
Ok(Some(BinaryOperator::Index(Box::new(edge_op))))
}
Expand All @@ -317,7 +335,11 @@ impl BinaryOperatorBase for BaseEdgeOp {
return Ok(EstimationType::Selectivity(0.0));
}

let max_nodes: f64 = self.max_nodes_estimate as f64;
let mut max_nodes: f64 = self.max_nodes_estimate as f64;
// Avoid division by 0
if max_nodes == 0.0 {
max_nodes = 1.0;
}

let mut worst_sel: f64 = 0.0;

Expand Down Expand Up @@ -624,7 +646,7 @@ impl BinaryOperatorSpec for DominanceSpec {
dist: self.dist.clone(),
edge_anno: self.edge_anno.clone(),
is_reflexive: true,
check_cost_for_inverse_operator: true,
inverse_operator_needs_cost_check: true,
};
base.create_operator(db, cost_estimate)
}
Expand Down Expand Up @@ -676,7 +698,7 @@ impl BinaryOperatorSpec for PointingSpec {
edge_anno: self.edge_anno.clone(),
is_reflexive: true,
op_str: Some(op_str),
check_cost_for_inverse_operator: true,
inverse_operator_needs_cost_check: true,
};
base.create_operator(db, cost_estimate)
}
Expand Down Expand Up @@ -721,7 +743,7 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
ANNIS_NS.into(),
"".into(),
)];
let check_cost_for_inverse_operator = if let Some((_, rhs)) = cost_estimate {
let inverse_operator_needs_cost_check = if let Some((_, rhs)) = cost_estimate {
// Only ignore different cost and risk a nested loop join if the RHS
// has an estimated output size of 1 and thus a nested loop is not
// as costly.
Expand All @@ -735,7 +757,7 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
dist: self.dist.clone(),
edge_anno: None,
is_reflexive: false,
check_cost_for_inverse_operator,
inverse_operator_needs_cost_check,
};

base.create_operator(db, cost_estimate)
Expand All @@ -751,3 +773,6 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
self
}
}

#[cfg(test)]
mod tests;
88 changes: 88 additions & 0 deletions graphannis/src/annis/db/aql/operators/edge_op/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
use graphannis_core::graph::{
ANNIS_NS,
update::{GraphUpdate, UpdateEvent},
};

use crate::{
AnnotationGraph,
annis::{
db::{
aql::{ast::RangeSpec, operators::PartOfSubCorpusSpec},
exec::CostEstimate,
},
operator::{BinaryOperatorBase, BinaryOperatorSpec},
},
};

/// Tests that if you invert a @* operator, the cost estimate stays the same.
#[test]
fn inverted_partof_has_same_estimate() {
    // Create a simple annotation graph with a chain of PartOf edges, so that
    // the fan-out and inverse fan-out of the PartOf component are both 1.
    // It has the following nodes, connected by a PartOf edge each:
    // - root
    // - root/c
    // - root/c/c
    // - ...
    // - root/c/c/c/c/c/c/c/c/c (nine nested "/c" levels)
    let mut update = GraphUpdate::new();
    update
        .add_event(UpdateEvent::AddNode {
            node_name: "root".to_string(),
            node_type: "corpus".to_string(),
        })
        .unwrap();
    for i in 1..10 {
        let source_path = format!("root{}", "/c".repeat(i));
        let target_path = format!("root{}", "/c".repeat(i - 1));
        update
            .add_event(UpdateEvent::AddNode {
                // Cloned because the path is also used for the edge below.
                node_name: source_path.clone(),
                node_type: "corpus".to_string(),
            })
            .unwrap();

        update
            .add_event(UpdateEvent::AddEdge {
                source_node: source_path,
                target_node: target_path,
                layer: ANNIS_NS.to_string(),
                component_type: "PartOf".to_string(),
                component_name: "".to_string(),
            })
            .unwrap();
    }

    let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap();
    g.apply_update(&mut update, |_| {}).unwrap();

    // Define an operator and a realistic cost estimate for LHS and RHS
    let spec = PartOfSubCorpusSpec {
        dist: RangeSpec::Unbound,
    };
    let cost_estimate_lhs = CostEstimate {
        output: 1,
        intermediate_sum: 0,
        processed_in_step: 0,
    };
    let cost_estimate_rhs = CostEstimate {
        output: 1,
        intermediate_sum: 0,
        processed_in_step: 0,
    };

    let operator = spec
        .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs)))
        .unwrap();

    let orig_estimate = operator.estimation_type().unwrap();

    // The inverse operator must be available for this graph and produce the
    // same estimation as the original one.
    let inverted_operator = operator.get_inverse_operator(&g).unwrap();
    assert!(inverted_operator.is_some());
    let inverted_operator = inverted_operator.unwrap();

    let inverted_estimate = inverted_operator.estimation_type().unwrap();

    assert_eq!(orig_estimate, inverted_estimate);
}
12 changes: 9 additions & 3 deletions graphannis/src/annis/db/corpusstorage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ mod subgraph;
#[cfg(test)]
mod tests;

/// After how many produced tuples the timeout check should be manually triggered (in case the underlying join did not already check the timeout)
const TIMEOUT_CHECK_TUPLE_COUNT: u64 = 1_000;

enum CacheEntry {
Loaded(AnnotationGraph),
NotLoaded,
Expand Down Expand Up @@ -1602,7 +1605,7 @@ impl CorpusStorage {

for _ in plan {
total_count += 1;
if total_count % 1_000 == 0 {
if total_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
timeout.check()?;
}
}
Expand Down Expand Up @@ -1666,7 +1669,7 @@ impl CorpusStorage {
}
match_count += 1;

if match_count % 1_000 == 0 {
if match_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
timeout.check()?;
}
}
Expand Down Expand Up @@ -2307,9 +2310,11 @@ impl CorpusStorage {

let plan =
ExecutionPlan::from_disjunction(&prep.query, db, &self.query_config, timeout)?;
let mut total_count: u64 = 0;

for mgroup in plan {
let mgroup = mgroup?;

// for each match, extract the defined annotation (by its key) from the result node
let mut tuple: Vec<String> = Vec::with_capacity(annokeys.len());
for (node_ref, anno_keys) in &annokeys {
Expand All @@ -2328,7 +2333,8 @@ impl CorpusStorage {
let tuple_count: &mut usize = tuple_frequency.entry(tuple).or_insert(0);
*tuple_count += 1;

if *tuple_count % 1_000 == 0 {
total_count += 1;
if total_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
timeout.check()?;
}
}
Expand Down
2 changes: 1 addition & 1 deletion graphannis/src/annis/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ impl EdgeAnnoSearchSpec {
}

/// Represents the different strategies to estimate the output of size of applying an operator.
#[derive(Clone)]
#[derive(Clone, Debug, PartialEq)]
pub enum EstimationType {
/// Estimate using the given selectivity.
/// This means the cross product of the input sizes is multiplied with this factor to get the output size.
Expand Down
Loading