Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `execute_query_on_graph` now skips nodes that are not part of the output
(optional negated nodes) and makes sure the resulting iterator only produces
unique results.
- Queries with `@` could have extremely slow execution plans when the query
  planner introduced an inverted `@` operator and miscalculated its cost
  compared to the non-inverted version.
- Frequency queries now execute the (additional) timeout check after a certain
  number of matches have been processed, not when a specific tuple value has
  reached a threshold.

## [4.0.0] - 2025-08-20

Expand Down
3 changes: 2 additions & 1 deletion core/src/annostorage/inmemory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ pub struct AnnoStorageImpl<T: Ord + Hash + Default> {
anno_keys: SymbolTable<AnnoKey>,
anno_values: SymbolTable<String>,

/// additional statistical information
/// Sampled histograms for each annotation key.
/// Each histogram bound defines a range of values where we estimate that they have the same number of occurrences.
histogram_bounds: BTreeMap<usize, Vec<String>>,
largest_item: Option<T>,
total_number_of_annos: usize,
Expand Down
3 changes: 2 additions & 1 deletion core/src/annostorage/ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ where

anno_key_sizes: BTreeMap<AnnoKey, usize>,

/// additional statistical information
/// Sampled histograms for each annotation key.
/// Each histogram bound defines a range of values where we estimate that they have the same number of occurrences.
histogram_bounds: BTreeMap<AnnoKey, Vec<String>>,
largest_item: Option<T>,

Expand Down
9 changes: 0 additions & 9 deletions core/src/util/disk_collections.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,6 @@ const BLOCK_MAX_SIZE: usize = 4 * KB;
/// Uses a cache for each disk table with 8 MB capacity.
pub const DEFAULT_BLOCK_CACHE_CAPACITY: usize = 8 * MB;

#[derive(Serialize, Deserialize)]
struct Entry<K, V>
where
K: Ord,
{
key: K,
value: V,
}

pub enum EvictionStrategy {
MaximumItems(usize),
}
Expand Down
101 changes: 63 additions & 38 deletions graphannis/src/annis/db/aql/operators/edge_op.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct BaseEdgeOpSpec {
pub edge_anno: Option<EdgeAnnoSearchSpec>,
pub is_reflexive: bool,
pub op_str: Option<String>,
pub check_cost_for_inverse_operator: bool,
pub inverse_operator_needs_cost_check: bool,
}

struct BaseEdgeOp {
Expand All @@ -45,34 +45,7 @@ impl BaseEdgeOp {
gs.push(gs_for_component);
}

let all_part_of_components = spec
.components
.iter()
.all(|c| c.get_type() == AnnotationComponentType::PartOf);

let max_nodes_estimate = if all_part_of_components && gs.len() == 1 {
// PartOf components have a very skewed distribution of root nodes
// vs. the actual possible targets, thus do not use all nodes as
// population but only the non-roots.
if let Some(stats) = gs[0].get_statistics() {
stats.nodes - stats.root_nodes
} else {
// Fallback to guessing by using the node type
db.get_node_annos().guess_max_count(
Some(&NODE_TYPE_KEY.ns),
&NODE_TYPE_KEY.name,
"corpus",
"datasource",
)?
}
} else {
db.get_node_annos().guess_max_count(
Some(&NODE_TYPE_KEY.ns),
&NODE_TYPE_KEY.name,
"node",
"node",
)?
};
let max_nodes_estimate = calculate_max_node_estimate(db, &spec, &gs, false)?;
Ok(BaseEdgeOp {
gs,
spec,
Expand All @@ -82,6 +55,48 @@ impl BaseEdgeOp {
}
}

/// Estimate the maximum number of nodes that can appear as the right-hand
/// side of an edge operator.
///
/// When every component is a `PartOf` component and exactly one graph storage
/// is involved, the estimate uses the graph storage statistics (non-root
/// nodes) for the non-inverted operator, or falls back to guessing how many
/// "corpus"/"datasource" nodes exist. Otherwise, all "node"-typed nodes are
/// counted.
fn calculate_max_node_estimate(
    db: &AnnotationGraph,
    spec: &BaseEdgeOpSpec,
    gs: &[Arc<dyn GraphStorage>],
    inverse: bool,
) -> Result<usize> {
    let all_components_are_partof = spec
        .components
        .iter()
        .all(|c| c.get_type() == AnnotationComponentType::PartOf);
    let max_nodes_estimate = if all_components_are_partof && gs.len() == 1 {
        // PartOf components have a very skewed distribution of root nodes vs.
        // the actual possible targets, thus do not use all nodes as population
        // but only the non-roots. We can only use this formula for the actual
        // @* operator, but not the inverted one.
        if !inverse && let Some(stats) = gs[0].get_statistics() {
            // saturating_sub guards against inconsistent statistics where
            // root_nodes > nodes, which would panic in debug builds.
            stats.nodes.saturating_sub(stats.root_nodes)
        } else {
            // Fallback to guessing how many nodes have the node type "corpus"
            // or "datasource" and thus could be reachable as RHS in a worst case
            // scenario. Since a node can't be part of itself, subtract 1 for
            // the node on the LHS.
            db.get_node_annos()
                .guess_max_count(
                    Some(&NODE_TYPE_KEY.ns),
                    &NODE_TYPE_KEY.name,
                    "corpus",
                    "datasource",
                )?
                .saturating_sub(1)
        }
    } else {
        // General case: any node could be reachable.
        db.get_node_annos().guess_max_count(
            Some(&NODE_TYPE_KEY.ns),
            &NODE_TYPE_KEY.name,
            "node",
            "node",
        )?
    };
    Ok(max_nodes_estimate)
}

impl BinaryOperatorSpec for BaseEdgeOpSpec {
fn necessary_components(
&self,
Expand Down Expand Up @@ -286,13 +301,15 @@ impl BinaryOperatorBase for BaseEdgeOp {

fn get_inverse_operator<'a>(
&self,
_graph: &'a AnnotationGraph,
graph: &'a AnnotationGraph,
) -> Result<Option<BinaryOperator<'a>>> {
let inverse = !self.inverse;

// Check if all graph storages have the same inverse cost. If not, we
// don't provide an inverse operator, because the plans would not
// account for the different costs
for g in &self.gs {
if self.spec.check_cost_for_inverse_operator && !g.inverse_has_same_cost() {
if self.spec.inverse_operator_needs_cost_check && !g.inverse_has_same_cost() {
return Ok(None);
}
if let Some(stat) = g.get_statistics() {
Expand All @@ -302,11 +319,12 @@ impl BinaryOperatorBase for BaseEdgeOp {
}
}
}
let max_nodes_estimate = calculate_max_node_estimate(graph, &self.spec, &self.gs, inverse)?;
let edge_op = BaseEdgeOp {
gs: self.gs.clone(),
spec: self.spec.clone(),
max_nodes_estimate: self.max_nodes_estimate,
inverse: !self.inverse,
max_nodes_estimate,
inverse,
};
Ok(Some(BinaryOperator::Index(Box::new(edge_op))))
}
Expand All @@ -317,7 +335,11 @@ impl BinaryOperatorBase for BaseEdgeOp {
return Ok(EstimationType::Selectivity(0.0));
}

let max_nodes: f64 = self.max_nodes_estimate as f64;
let mut max_nodes: f64 = self.max_nodes_estimate as f64;
// Avoid division by 0
if max_nodes == 0.0 {
max_nodes = 1.0;
}

let mut worst_sel: f64 = 0.0;

Expand Down Expand Up @@ -624,7 +646,7 @@ impl BinaryOperatorSpec for DominanceSpec {
dist: self.dist.clone(),
edge_anno: self.edge_anno.clone(),
is_reflexive: true,
check_cost_for_inverse_operator: true,
inverse_operator_needs_cost_check: true,
};
base.create_operator(db, cost_estimate)
}
Expand Down Expand Up @@ -676,7 +698,7 @@ impl BinaryOperatorSpec for PointingSpec {
edge_anno: self.edge_anno.clone(),
is_reflexive: true,
op_str: Some(op_str),
check_cost_for_inverse_operator: true,
inverse_operator_needs_cost_check: true,
};
base.create_operator(db, cost_estimate)
}
Expand Down Expand Up @@ -721,7 +743,7 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
ANNIS_NS.into(),
"".into(),
)];
let check_cost_for_inverse_operator = if let Some((_, rhs)) = cost_estimate {
let inverse_operator_needs_cost_check = if let Some((_, rhs)) = cost_estimate {
// Only ignore different cost and risk a nested loop join if the RHS
// has an estimated output size of 1 and thus a nested loop is not
// as costly.
Expand All @@ -735,7 +757,7 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
dist: self.dist.clone(),
edge_anno: None,
is_reflexive: false,
check_cost_for_inverse_operator,
inverse_operator_needs_cost_check,
};

base.create_operator(db, cost_estimate)
Expand All @@ -751,3 +773,6 @@ impl BinaryOperatorSpec for PartOfSubCorpusSpec {
self
}
}

#[cfg(test)]
mod tests;
88 changes: 88 additions & 0 deletions graphannis/src/annis/db/aql/operators/edge_op/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
use graphannis_core::graph::{
ANNIS_NS,
update::{GraphUpdate, UpdateEvent},
};

use crate::{
AnnotationGraph,
annis::{
db::{
aql::{ast::RangeSpec, operators::PartOfSubCorpusSpec},
exec::CostEstimate,
},
operator::{BinaryOperatorBase, BinaryOperatorSpec},
},
};

/// Tests that if you invert a @* operator, the cost estimate stays the same.
#[test]
fn inverted_partof_has_same_estimate() {
    // Create a simple annotation graph with a chain of PartOf edges, so that
    // the fan-out and inverse fan-out of the PartOf component are both 1.
    // It has the following nodes, connected by a PartOf edge each:
    // - root
    // - root/c
    // - root/c/c
    // - ...
    // - root/c/c/c/c/c/c/c/c/c (nine nested "/c" levels)
    let mut update = GraphUpdate::new();
    update
        .add_event(UpdateEvent::AddNode {
            node_name: "root".to_string(),
            node_type: "corpus".to_string(),
        })
        .unwrap();
    for i in 1..10 {
        let source_path = format!("root{}", "/c".repeat(i));
        let target_path = format!("root{}", "/c".repeat(i - 1));
        update
            .add_event(UpdateEvent::AddNode {
                // Cloned because the path is also used for the edge below.
                node_name: source_path.clone(),
                node_type: "corpus".to_string(),
            })
            .unwrap();

        update
            .add_event(UpdateEvent::AddEdge {
                source_node: source_path,
                target_node: target_path,
                layer: ANNIS_NS.to_string(),
                component_type: "PartOf".to_string(),
                component_name: "".to_string(),
            })
            .unwrap();
    }

    let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap();
    g.apply_update(&mut update, |_| {}).unwrap();

    // Define an operator and a realistic cost estimate for LHS and RHS
    let spec = PartOfSubCorpusSpec {
        dist: RangeSpec::Unbound,
    };
    let cost_estimate_lhs = CostEstimate {
        output: 1,
        intermediate_sum: 0,
        processed_in_step: 0,
    };
    let cost_estimate_rhs = CostEstimate {
        output: 1,
        intermediate_sum: 0,
        processed_in_step: 0,
    };

    let operator = spec
        .create_operator(&g, Some((&cost_estimate_lhs, &cost_estimate_rhs)))
        .unwrap();

    let orig_estimate = operator.estimation_type().unwrap();

    // The inverse operator must be available for this graph and produce the
    // same estimation as the original one.
    let inverted_operator = operator.get_inverse_operator(&g).unwrap();
    assert!(inverted_operator.is_some());
    let inverted_operator = inverted_operator.unwrap();

    let inverted_estimate = inverted_operator.estimation_type().unwrap();

    assert_eq!(orig_estimate, inverted_estimate);
}
12 changes: 9 additions & 3 deletions graphannis/src/annis/db/corpusstorage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ mod subgraph;
#[cfg(test)]
mod tests;

/// After how many produced tuples the timeout check should be manually triggered (in case the underlying join did not already check the timeout)
const TIMEOUT_CHECK_TUPLE_COUNT: u64 = 1_000;

enum CacheEntry {
Loaded(AnnotationGraph),
NotLoaded,
Expand Down Expand Up @@ -1602,7 +1605,7 @@ impl CorpusStorage {

for _ in plan {
total_count += 1;
if total_count % 1_000 == 0 {
if total_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
timeout.check()?;
}
}
Expand Down Expand Up @@ -1666,7 +1669,7 @@ impl CorpusStorage {
}
match_count += 1;

if match_count % 1_000 == 0 {
if match_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
timeout.check()?;
}
}
Expand Down Expand Up @@ -2307,9 +2310,11 @@ impl CorpusStorage {

let plan =
ExecutionPlan::from_disjunction(&prep.query, db, &self.query_config, timeout)?;
let mut total_count: u64 = 0;

for mgroup in plan {
let mgroup = mgroup?;

// for each match, extract the defined annotation (by its key) from the result node
let mut tuple: Vec<String> = Vec::with_capacity(annokeys.len());
for (node_ref, anno_keys) in &annokeys {
Expand All @@ -2328,7 +2333,8 @@ impl CorpusStorage {
let tuple_count: &mut usize = tuple_frequency.entry(tuple).or_insert(0);
*tuple_count += 1;

if *tuple_count % 1_000 == 0 {
total_count += 1;
if total_count.is_multiple_of(TIMEOUT_CHECK_TUPLE_COUNT) {
timeout.check()?;
}
}
Expand Down
2 changes: 1 addition & 1 deletion graphannis/src/annis/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ impl EdgeAnnoSearchSpec {
}

/// Represents the different strategies to estimate the output of size of applying an operator.
#[derive(Clone)]
#[derive(Clone, Debug, PartialEq)]
pub enum EstimationType {
/// Estimate using the given selectivity.
/// This means the cross product of the input sizes is multiplied with this factor to get the output size.
Expand Down
Loading