Skip to content

Commit

Permalink
Merge pull request #279 from korpling/feature/export-stable-order
Browse files Browse the repository at this point in the history
Added function that allows to export to GraphML with a guaranteed order.
  • Loading branch information
thomaskrause authored Jan 10, 2024
2 parents fb8bbb2 + 1ca3631 commit 40c5c70
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 13 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
alternative for using a `CorpusStorage` when only one corpus is handled.
- New `Graph::ensure_loaded_parallel` function to load needed graph storages in
parallel.
- Added `graphannis_core::graph::serialization::graphml::export_stable_order`
function that exports to GraphML with a guaranteed order of the
elements.

### Fixed

Expand Down
189 changes: 176 additions & 13 deletions core/src/graph/serialization/graphml.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::{
annostorage::ValueSearch,
annostorage::{Match, ValueSearch},
errors::{GraphAnnisCoreError, Result},
graph::{
update::{GraphUpdate, UpdateEvent},
Expand All @@ -8,13 +8,15 @@ use crate::{
types::{AnnoKey, Annotation, Component, ComponentType, Edge},
util::{join_qname, split_qname},
};
use itertools::Itertools;
use quick_xml::{
events::{
attributes::Attributes, BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event,
},
Reader, Writer,
};
use std::{
cmp::Ordering,
collections::{BTreeMap, BTreeSet, HashMap},
io::{BufReader, BufWriter, Read, Write},
str::FromStr,
Expand All @@ -23,6 +25,7 @@ use std::{
fn write_annotation_keys<CT: ComponentType, W: std::io::Write>(
graph: &Graph<CT>,
has_graph_configuration: bool,
sorted: bool,
writer: &mut Writer<W>,
) -> Result<BTreeMap<AnnoKey, String>> {
let mut key_id_mapping = BTreeMap::new();
Expand All @@ -42,7 +45,11 @@ fn write_annotation_keys<CT: ComponentType, W: std::io::Write>(
}

// Create node annotation keys
for key in graph.get_node_annos().annotation_keys()? {
let mut anno_keys = graph.get_node_annos().annotation_keys()?;
if sorted {
anno_keys.sort_unstable();
}
for key in anno_keys {
if (key.ns != ANNIS_NS || key.name != NODE_NAME) && !key_id_mapping.contains_key(&key) {
let new_id = format!("k{}", id_counter);
id_counter += 1;
Expand All @@ -66,7 +73,11 @@ fn write_annotation_keys<CT: ComponentType, W: std::io::Write>(
CT::update_graph_index_components(graph)
.into_iter()
.collect();
for c in graph.get_all_components(None, None) {
let mut all_components = graph.get_all_components(None, None);
if sorted {
all_components.sort_unstable();
}
for c in all_components {
if !autogenerated_components.contains(&c) {
if let Some(gs) = graph.get_graphstorage(&c) {
for key in gs.get_anno_storage().annotation_keys()? {
Expand Down Expand Up @@ -115,15 +126,37 @@ fn write_data<W: std::io::Write>(
Ok(())
}

/// Compares two fallible values so that a sequence of results can be sorted.
///
/// * Two `Ok` values are ordered by the natural ordering of their contents.
/// * An `Err` is ordered before any `Ok`, so errors end up at the front of a
///   sorted sequence and are hit first by a consumer that unwraps each item
///   with `?`.
/// * Two `Err` values are considered equal.
///
/// The two-error case must yield `Ordering::Equal` (and not `Less`):
/// otherwise `a < b` and `b < a` would both hold for a pair of errors and the
/// comparator would not describe a consistent order for the sort functions it
/// is handed to.
fn compare_results<T: Ord>(a: &Result<T>, b: &Result<T>) -> Ordering {
    match (a, b) {
        (Ok(a), Ok(b)) => a.cmp(b),
        // Treat two errors as equal
        (Err(_), Err(_)) => Ordering::Equal,
        (Err(_), Ok(_)) => Ordering::Less,
        (Ok(_), Err(_)) => Ordering::Greater,
    }
}

fn write_nodes<CT: ComponentType, W: std::io::Write>(
graph: &Graph<CT>,
writer: &mut Writer<W>,
sorted: bool,
key_id_mapping: &BTreeMap<AnnoKey, String>,
) -> Result<()> {
for m in graph
.get_node_annos()
.exact_anno_search(Some(ANNIS_NS), NODE_TYPE, ValueSearch::Any)
{
let base_node_iterator =
graph
.get_node_annos()
.exact_anno_search(Some(ANNIS_NS), NODE_TYPE, ValueSearch::Any);
let node_iterator: Box<dyn Iterator<Item = Result<Match>>> = if sorted {
let it = base_node_iterator.sorted_unstable_by(compare_results);
Box::new(it)
} else {
Box::new(base_node_iterator)
};

for m in node_iterator {
let m = m?;
let mut node_start = BytesStart::new("node");

Expand Down Expand Up @@ -162,6 +195,7 @@ fn write_nodes<CT: ComponentType, W: std::io::Write>(
fn write_edges<CT: ComponentType, W: std::io::Write>(
graph: &Graph<CT>,
writer: &mut Writer<W>,
sorted: bool,
key_id_mapping: &BTreeMap<AnnoKey, String>,
) -> Result<()> {
let mut edge_counter = 0;
Expand All @@ -170,17 +204,35 @@ fn write_edges<CT: ComponentType, W: std::io::Write>(
.into_iter()
.collect();

for c in graph.get_all_components(None, None) {
let mut all_components = graph.get_all_components(None, None);
if sorted {
all_components.sort_unstable();
}

for c in all_components {
// Create edge annotation keys for all components, but skip auto-generated ones
if !autogenerated_components.contains(&c) {
if let Some(gs) = graph.get_graphstorage(&c) {
for source in gs.source_nodes() {
let source_nodes_iterator = if sorted {
Box::new(gs.source_nodes().sorted_unstable_by(compare_results))
} else {
gs.source_nodes()
};
for source in source_nodes_iterator {
let source = source?;
if let Some(source_id) = graph
.get_node_annos()
.get_value_for_item(&source, &NODE_NAME_KEY)?
{
for target in gs.get_outgoing_edges(source) {
let target_nodes_iterator = if sorted {
Box::new(
gs.get_outgoing_edges(source)
.sorted_unstable_by(compare_results),
)
} else {
gs.get_outgoing_edges(source)
};
for target in target_nodes_iterator {
let target = target?;
if let Some(target_id) = graph
.get_node_annos()
Expand Down Expand Up @@ -247,7 +299,75 @@ where

// Define all valid annotation ns/name pairs
progress_callback("exporting all available annotation keys");
let key_id_mapping = write_annotation_keys(graph, graph_configuration.is_some(), &mut writer)?;
let key_id_mapping =
write_annotation_keys(graph, graph_configuration.is_some(), false, &mut writer)?;

// We are writing a single graph
let mut graph_start = BytesStart::new("graph");
graph_start.push_attribute(("edgedefault", "directed"));
// Add parse helper information to allow more efficient parsing
graph_start.push_attribute(("parse.order", "nodesfirst"));
graph_start.push_attribute(("parse.nodeids", "free"));
graph_start.push_attribute(("parse.edgeids", "canonical"));

writer.write_event(Event::Start(graph_start))?;

// If graph configuration is given, add it as data element to the graph
if let Some(config) = graph_configuration {
let mut data_start = BytesStart::new("data");
// This is always the first key ID
data_start.push_attribute(("key", "k0"));
writer.write_event(Event::Start(data_start))?;
// Add the annotation value as internal text node
writer.write_event(Event::CData(BytesCData::new(config)))?;
writer.write_event(Event::End(BytesEnd::new("data")))?;
}

// Write out all nodes
progress_callback("exporting nodes");
write_nodes(graph, &mut writer, false, &key_id_mapping)?;

// Write out all edges
progress_callback("exporting edges");
write_edges(graph, &mut writer, false, &key_id_mapping)?;

writer.write_event(Event::End(BytesEnd::new("graph")))?;
writer.write_event(Event::End(BytesEnd::new("graphml")))?;

// Make sure to flush the buffered writer
writer.into_inner().flush()?;

Ok(())
}

/// Export the GraphML file and ensure a stable order of the XML elements.
///
/// This is slower than [`export`] but can e.g. be used in tests where the
/// output should always be the same.
pub fn export_stable_order<CT: ComponentType, W: std::io::Write, F>(
graph: &Graph<CT>,
graph_configuration: Option<&str>,
output: W,
progress_callback: F,
) -> Result<()>
where
F: Fn(&str),
{
// Always buffer the output
let output = BufWriter::new(output);
let mut writer = Writer::new_with_indent(output, b' ', 4);

// Add XML declaration
let xml_decl = BytesDecl::new("1.0", Some("UTF-8"), None);
writer.write_event(Event::Decl(xml_decl))?;

// Always write the root element
writer.write_event(Event::Start(BytesStart::new("graphml")))?;

// Define all valid annotation ns/name pairs
progress_callback("exporting all available annotation keys");
let key_id_mapping =
write_annotation_keys(graph, graph_configuration.is_some(), true, &mut writer)?;

// We are writing a single graph
let mut graph_start = BytesStart::new("graph");
Expand All @@ -272,11 +392,11 @@ where

// Write out all nodes
progress_callback("exporting nodes");
write_nodes(graph, &mut writer, &key_id_mapping)?;
write_nodes(graph, &mut writer, true, &key_id_mapping)?;

// Write out all edges
progress_callback("exporting edges");
write_edges(graph, &mut writer, &key_id_mapping)?;
write_edges(graph, &mut writer, true, &key_id_mapping)?;

writer.write_event(Event::End(BytesEnd::new("graph")))?;
writer.write_event(Event::End(BytesEnd::new("graphml")))?;
Expand Down Expand Up @@ -655,6 +775,49 @@ value = "test""#;
assert_eq!(expected, actual);
}

#[test]
fn export_graphml_sorted() {
    // Describe the sample graph as an ordered list of update events: two
    // nodes, one annotation on the first node and a single edge between them.
    let events = vec![
        UpdateEvent::AddNode {
            node_name: "1".to_string(),
            node_type: "node".to_string(),
        },
        UpdateEvent::AddNode {
            node_name: "2".to_string(),
            node_type: "node".to_string(),
        },
        UpdateEvent::AddNodeLabel {
            node_name: "1".to_string(),
            anno_ns: DEFAULT_NS.to_string(),
            anno_name: "an_annotation".to_string(),
            anno_value: "something".to_string(),
        },
        UpdateEvent::AddEdge {
            source_node: "1".to_string(),
            target_node: "2".to_string(),
            component_type: "Edge".to_string(),
            layer: "some_ns".to_string(),
            component_name: "test_component".to_string(),
        },
    ];

    // Queue the events in exactly the order listed above.
    let mut update = GraphUpdate::new();
    for event in events {
        update.add_event(event).unwrap();
    }

    // Materialize the graph using the simple default component type.
    let mut graph: Graph<DefaultComponentType> = Graph::new(false).unwrap();
    graph.apply_update(&mut update, |_| {}).unwrap();

    // Serialize with the stable-order exporter into an in-memory buffer.
    let mut buffer = Vec::<u8>::new();
    export_stable_order(&graph, Some(TEST_CONFIG), &mut buffer, |_| {}).unwrap();

    // The generated XML has to match the checked-in fixture exactly.
    let actual = String::from_utf8(buffer).unwrap();
    let expected = include_str!("graphml_example sorted.graphml");
    assert_eq!(expected, actual);
}

#[test]
fn import_graphml() {
let input_xml = std::io::Cursor::new(
Expand Down
22 changes: 22 additions & 0 deletions core/src/graph/serialization/graphml_example sorted.graphml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<graphml>
<key id="k0" for="graph" attr.name="configuration" attr.type="string"/>
<key id="k1" for="node" attr.name="default_ns::an_annotation" attr.type="string"/>
<key id="k2" for="node" attr.name="annis::node_type" attr.type="string"/>
<graph edgedefault="directed" parse.order="nodesfirst" parse.nodeids="free" parse.edgeids="canonical">
<data key="k0"><![CDATA[[some]
key = "<value>"
[some.another]
value = "test"]]></data>
<node id="1">
<data key="k1">something</data>
<data key="k2">node</data>
</node>
<node id="2">
<data key="k2">node</data>
</node>
<edge id="e0" source="1" target="2" label="Edge/some_ns/test_component">
</edge>
</graph>
</graphml>

0 comments on commit 40c5c70

Please sign in to comment.