Skip to content

Commit fb32b9d

Browse files
authored
Merge pull request #277 from korpling/feature/optimize-token-search
Optimize speed of loading corpora into memory
2 parents b1f1dca + cf7df84 commit fb32b9d

File tree

13 files changed

+369
-140
lines changed

13 files changed

+369
-140
lines changed

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
55

66
## [Unreleased]
77

8+
### Added
9+
10+
- New `Graph::ensure_loaded_parallel` function to load needed graph storages in
11+
parallel.
12+
13+
### Fixed
14+
15+
- Do not attempt to unload corpora that are not loaded when trying to free
16+
memory.
17+
- Improve performance of loading a main memory corpus by using the standard
18+
`HashMap` for fields that are deserialized.
19+
820
## [3.0.0] - 2023-11-28
921

1022
### Added

core/src/annostorage/inmemory.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@ use crate::annostorage::ValueSearch;
33
use crate::errors::Result;
44
use crate::graph::NODE_NAME_KEY;
55
use crate::types::{AnnoKey, Annotation, Edge, NodeID};
6-
use crate::util;
6+
use crate::util::{self};
77
use crate::{annostorage::symboltable::SymbolTable, errors::GraphAnnisCoreError};
88
use core::ops::Bound::*;
99
use itertools::Itertools;
10-
use rustc_hash::{FxHashMap, FxHashSet};
10+
use rustc_hash::FxHashSet;
1111
use smartstring::alias::String;
1212
use smartstring::{LazyCompact, SmartString};
1313
use std::borrow::Cow;
@@ -22,13 +22,13 @@ struct SparseAnnotation {
2222
val: usize,
2323
}
2424

25-
type ValueItemMap<T> = FxHashMap<usize, Vec<T>>;
25+
type ValueItemMap<T> = HashMap<usize, Vec<T>>;
2626

2727
#[derive(Serialize, Deserialize, Clone, Default)]
2828
pub struct AnnoStorageImpl<T: Ord + Hash + Default> {
29-
by_container: FxHashMap<T, Vec<SparseAnnotation>>,
29+
by_container: HashMap<T, Vec<SparseAnnotation>>,
3030
/// A map from an annotation key symbol to a map of all its values to the items having this value for the annotation key
31-
by_anno: FxHashMap<usize, ValueItemMap<T>>,
31+
by_anno: HashMap<usize, ValueItemMap<T>>,
3232
/// Maps a distinct annotation key to the number of elements having this annotation key.
3333
anno_key_sizes: BTreeMap<AnnoKey, usize>,
3434
anno_keys: SymbolTable<AnnoKey>,
@@ -45,8 +45,8 @@ impl<T: Ord + Hash + Clone + serde::Serialize + serde::de::DeserializeOwned + De
4545
{
4646
pub fn new() -> AnnoStorageImpl<T> {
4747
AnnoStorageImpl {
48-
by_container: FxHashMap::default(),
49-
by_anno: FxHashMap::default(),
48+
by_container: HashMap::default(),
49+
by_anno: HashMap::default(),
5050
anno_keys: SymbolTable::new(),
5151
anno_values: SymbolTable::new(),
5252
anno_key_sizes: BTreeMap::new(),

core/src/annostorage/symboltable.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use crate::errors::{GraphAnnisCoreError, Result};
22
use rustc_hash::FxHashMap;
33
use serde::{Deserialize, Serialize};
4+
use std::collections::HashMap;
45
use std::hash::Hash;
56
use std::sync::Arc;
67

@@ -23,14 +24,14 @@ where
2324
let by_id = Vec::default();
2425
SymbolTable {
2526
by_id,
26-
by_value: FxHashMap::default(),
27+
by_value: HashMap::default(),
2728
empty_slots: Vec::default(),
2829
}
2930
}
3031

3132
pub fn after_deserialization(&mut self) {
3233
// restore the by_value map and make sure the smart pointers point to the same instance
33-
//self.by_value.reserve(self.by_id.len());
34+
self.by_value.reserve(self.by_id.len());
3435
for i in 0..self.by_id.len() {
3536
if let Some(ref existing) = self.by_id[i] {
3637
self.by_value.insert(existing.clone(), i);

core/src/graph/mod.rs

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -793,23 +793,7 @@ impl<CT: ComponentType> Graph<CT> {
793793
}
794794
}
795795

796-
// load missing components in parallel
797-
let loaded_components: Vec<(_, Result<Arc<dyn GraphStorage>>)> = components_to_load
798-
.into_par_iter()
799-
.map(|c| match component_path(&self.location, &c) {
800-
Some(cpath) => {
801-
debug!("loading component {} from {}", c, &cpath.to_string_lossy());
802-
(c, load_component_from_disk(&cpath))
803-
}
804-
None => (c, Err(GraphAnnisCoreError::EmptyComponentPath)),
805-
})
806-
.collect();
807-
808-
// insert all the loaded components
809-
for (c, gs) in loaded_components {
810-
let gs = gs?;
811-
self.components.insert(c, Some(gs));
812-
}
796+
self.ensure_loaded_parallel(&components_to_load)?;
813797
Ok(())
814798
}
815799

@@ -833,6 +817,35 @@ impl<CT: ComponentType> Graph<CT> {
833817
Ok(())
834818
}
835819

820+
/// Ensure that the graph storages for the given components are loaded and ready to use.
821+
/// Loading is done in parallel.
822+
pub fn ensure_loaded_parallel(&mut self, components_to_load: &[Component<CT>]) -> Result<()> {
823+
// We only load known components, so check the map if the entry exists
824+
let components_to_load: Vec<_> = components_to_load
825+
.iter()
826+
.filter(|c| self.components.contains_key(c))
827+
.collect();
828+
829+
// load missing components in parallel
830+
let loaded_components: Vec<(_, Result<Arc<dyn GraphStorage>>)> = components_to_load
831+
.into_par_iter()
832+
.map(|c| match component_path(&self.location, c) {
833+
Some(cpath) => {
834+
debug!("loading component {} from {}", c, &cpath.to_string_lossy());
835+
(c, load_component_from_disk(&cpath))
836+
}
837+
None => (c, Err(GraphAnnisCoreError::EmptyComponentPath)),
838+
})
839+
.collect();
840+
841+
// insert all the loaded components
842+
for (c, gs) in loaded_components {
843+
let gs = gs?;
844+
self.components.insert(c.clone(), Some(gs));
845+
}
846+
Ok(())
847+
}
848+
836849
pub fn optimize_impl(&mut self, disk_based: bool) -> Result<()> {
837850
self.ensure_loaded_all()?;
838851

core/src/graph/storage/adjacencylist.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,20 @@ use crate::{
99

1010
use super::{EdgeContainer, GraphStatistic, GraphStorage, WriteableGraphStorage};
1111
use itertools::Itertools;
12-
use rustc_hash::{FxHashMap, FxHashSet};
12+
use rustc_hash::FxHashSet;
1313
use serde::Deserialize;
14-
use std::collections::BTreeSet;
14+
use std::collections::{BTreeSet, HashMap};
1515
use std::{ops::Bound, path::Path};
1616

1717
#[derive(Serialize, Deserialize, Clone)]
1818
pub struct AdjacencyListStorage {
19-
edges: FxHashMap<NodeID, Vec<NodeID>>,
20-
inverse_edges: FxHashMap<NodeID, Vec<NodeID>>,
19+
edges: HashMap<NodeID, Vec<NodeID>>,
20+
inverse_edges: HashMap<NodeID, Vec<NodeID>>,
2121
annos: AnnoStorageImpl<Edge>,
2222
stats: Option<GraphStatistic>,
2323
}
2424

25-
fn get_fan_outs(edges: &FxHashMap<NodeID, Vec<NodeID>>) -> Vec<usize> {
25+
fn get_fan_outs(edges: &HashMap<NodeID, Vec<NodeID>>) -> Vec<usize> {
2626
let mut fan_outs: Vec<usize> = Vec::new();
2727
if !edges.is_empty() {
2828
for outgoing in edges.values() {
@@ -44,8 +44,8 @@ impl Default for AdjacencyListStorage {
4444
impl AdjacencyListStorage {
4545
pub fn new() -> AdjacencyListStorage {
4646
AdjacencyListStorage {
47-
edges: FxHashMap::default(),
48-
inverse_edges: FxHashMap::default(),
47+
edges: HashMap::default(),
48+
inverse_edges: HashMap::default(),
4949
annos: AnnoStorageImpl::new(),
5050
stats: None,
5151
}

core/src/graph/storage/dense_adjacency.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ use crate::{
99
};
1010
use itertools::Itertools;
1111
use num_traits::ToPrimitive;
12-
use rustc_hash::{FxHashMap, FxHashSet};
12+
use rustc_hash::FxHashSet;
1313
use serde::Deserialize;
14-
use std::{ops::Bound, path::Path};
14+
use std::{collections::HashMap, ops::Bound, path::Path};
1515

1616
#[derive(Serialize, Deserialize, Clone)]
1717
pub struct DenseAdjacencyListStorage {
1818
edges: Vec<Option<NodeID>>,
19-
inverse_edges: FxHashMap<NodeID, Vec<NodeID>>,
19+
inverse_edges: HashMap<NodeID, Vec<NodeID>>,
2020
annos: AnnoStorageImpl<Edge>,
2121
stats: Option<GraphStatistic>,
2222
}
@@ -31,7 +31,7 @@ impl DenseAdjacencyListStorage {
3131
pub fn new() -> DenseAdjacencyListStorage {
3232
DenseAdjacencyListStorage {
3333
edges: Vec::default(),
34-
inverse_edges: FxHashMap::default(),
34+
inverse_edges: HashMap::default(),
3535
annos: AnnoStorageImpl::new(),
3636
stats: None,
3737
}

core/src/graph/storage/linear.rs

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@ use crate::{
88
graph::NODE_NAME_KEY,
99
types::{Edge, NodeID, NumValue},
1010
};
11-
use rustc_hash::FxHashMap;
1211
use rustc_hash::FxHashSet;
1312
use serde::{Deserialize, Serialize};
14-
use std::{clone::Clone, path::Path};
13+
use std::{clone::Clone, collections::HashMap, path::Path};
1514

1615
#[derive(Serialize, Deserialize, Clone)]
1716
struct RelativePosition<PosT> {
@@ -21,8 +20,8 @@ struct RelativePosition<PosT> {
2120

2221
#[derive(Serialize, Deserialize, Clone)]
2322
pub struct LinearGraphStorage<PosT: NumValue> {
24-
node_to_pos: FxHashMap<NodeID, RelativePosition<PosT>>,
25-
node_chains: FxHashMap<NodeID, Vec<NodeID>>,
23+
node_to_pos: HashMap<NodeID, RelativePosition<PosT>>,
24+
node_chains: HashMap<NodeID, Vec<NodeID>>,
2625
annos: AnnoStorageImpl<Edge>,
2726
stats: Option<GraphStatistic>,
2827
}
@@ -33,8 +32,8 @@ where
3332
{
3433
pub fn new() -> LinearGraphStorage<PosT> {
3534
LinearGraphStorage {
36-
node_to_pos: FxHashMap::default(),
37-
node_chains: FxHashMap::default(),
35+
node_to_pos: HashMap::default(),
36+
node_chains: HashMap::default(),
3837
annos: AnnoStorageImpl::new(),
3938
stats: None,
4039
}
@@ -97,6 +96,15 @@ where
9796
Box::from(std::iter::empty())
9897
}
9998

99+
fn has_ingoing_edges(&self, node: NodeID) -> Result<bool> {
100+
let result = self
101+
.node_to_pos
102+
.get(&node)
103+
.map(|pos| !pos.pos.is_zero())
104+
.unwrap_or(false);
105+
Ok(result)
106+
}
107+
100108
fn source_nodes<'a>(&'a self) -> Box<dyn Iterator<Item = Result<NodeID>> + 'a> {
101109
// use the node chains to find source nodes, but always skip the last element
102110
// because the last element is only a target node, not a source node

core/src/graph/storage/mod.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,16 @@ pub trait EdgeContainer: Sync + Send {
8585
node: NodeID,
8686
) -> Box<dyn Iterator<Item = Result<NodeID>> + 'a>;
8787

88+
/// Return true if the given node has any incoming edges.
89+
fn has_ingoing_edges(&self, node: NodeID) -> Result<bool> {
90+
if let Some(ingoing) = self.get_ingoing_edges(node).next() {
91+
ingoing?;
92+
Ok(true)
93+
} else {
94+
Ok(false)
95+
}
96+
}
97+
8898
fn get_statistics(&self) -> Option<&GraphStatistic> {
8999
None
90100
}

core/src/util/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
2+
23
use std::borrow::Cow;
34

45
pub mod disk_collections;

graphannis/src/annis/db/corpusstorage.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -954,7 +954,7 @@ impl CorpusStorage {
954954
// make it known to the cache
955955
cache.insert(
956956
corpus_name.clone(),
957-
Arc::new(RwLock::new(CacheEntry::Loaded(graph))),
957+
Arc::new(RwLock::new(CacheEntry::NotLoaded)),
958958
);
959959
check_cache_size_and_remove_with_cache(
960960
cache,
@@ -1389,9 +1389,7 @@ impl CorpusStorage {
13891389
{
13901390
let mut lock = db_entry.write()?;
13911391
let db = get_write_or_error(&mut lock)?;
1392-
for c in missing_components {
1393-
db.ensure_loaded(&c)?;
1394-
}
1392+
db.ensure_loaded_parallel(&missing_components)?;
13951393
}
13961394
self.check_cache_size_and_remove(vec![corpus_name], true)?;
13971395
};
@@ -2502,8 +2500,13 @@ fn check_cache_size_and_remove_with_cache(
25022500
// but never remove the last loaded entry
25032501
let all_corpus_names: Vec<String> = cache.keys().cloned().collect();
25042502
for corpus_name in all_corpus_names {
2503+
let corpus_is_loaded = if let Some(cache_entry) = cache.get(&corpus_name) {
2504+
matches!(*cache_entry.read()?, CacheEntry::Loaded(_))
2505+
} else {
2506+
false
2507+
};
25052508
if size_sum > max_cache_size {
2506-
if !keep.contains(corpus_name.as_str()) {
2509+
if corpus_is_loaded && !keep.contains(corpus_name.as_str()) {
25072510
cache.remove(&corpus_name);
25082511
// Re-measure the currently used memory size for this process
25092512
size_sum = memory_stats().map(|s| s.physical_mem).unwrap_or(usize::MAX);

0 commit comments

Comments
 (0)