Merge #57
57: Add a rustfmt config file into the project r=ManyTheFish a=Kerollmops



Co-authored-by: many <[email protected]>
Co-authored-by: Kerollmops <[email protected]>
3 people authored Aug 18, 2021
2 parents 9a2ff20 + 5834fa0 commit c0b5cf7
Showing 24 changed files with 689 additions and 565 deletions.
5 changes: 5 additions & 0 deletions .rustfmt.toml
@@ -0,0 +1,5 @@
+unstable_features = true
+
+use_small_heuristics = "max"
+imports_granularity = "Module"
+group_imports = "StdExternalCrate"
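
For context: unstable_features = true opts in to rustfmt options that are not yet stable, which imports_granularity and group_imports were at the time, so formatting needs a nightly toolchain. use_small_heuristics = "max" keeps any construct on a single line whenever it fits within max_width; imports_granularity = "Module" rewrites use statements so there is one per module path; and group_imports = "StdExternalCrate" sorts imports into three blocks: std/core/alloc first, then external crates, then crate/self/super. A minimal sketch of the import rules (illustrative only, not code from this repository):

// Before formatting:
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, detection::is_cjk};
use std::collections::HashMap;
use fst::Set;

// After `cargo fmt` with this config: std imports first, then external
// crates, and the merged import split so each module path gets its own `use`.
use std::collections::HashMap;

use fst::Set;
use meilisearch_tokenizer::detection::is_cjk;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};

The diffs below show these same rewrites applied to the benches.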
2 changes: 1 addition & 1 deletion benches/bench.rs
@@ -1,7 +1,7 @@
 mod default_run;
 mod initialization;
-mod tokenizer;
 mod normalizer;
+mod tokenizer;
 
 use criterion::{criterion_group, criterion_main, Criterion};
 
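Note: the reordering above comes from rustfmt's reorder_modules option, which is enabled by default, rather than from the new config file; module declarations are sorted alphabetically, which is why mod tokenizer; moves below mod normalizer;.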
12 changes: 7 additions & 5 deletions benches/default_run.rs
@@ -1,6 +1,5 @@
 use criterion::{black_box, BenchmarkId, Criterion};
 use fst::Set;
-
 use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 
 pub fn criterion_benchmark(c: &mut Criterion, data_set: &[(&str, &str)]) {
@@ -15,15 +14,18 @@ pub fn criterion_benchmark(c: &mut Criterion, data_set: &[(&str, &str)]) {
     let mut group = c.benchmark_group("default-run");
 
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("default-run", name), &(&analyzer, text), |b, &(a, s)| b.iter(|| run(a, s)));
+        group.bench_with_input(
+            BenchmarkId::new("default-run", name),
+            &(&analyzer, text),
+            |b, &(a, s)| b.iter(|| run(a, s)),
+        );
     }
 
     group.finish();
 }
 
 fn run(analyzer: &Analyzer<Vec<u8>>, text: &str) {
-
     let analyzed = analyzer.analyze(text);
-    black_box(analyzed.tokens().for_each(|_|{}));
-
+
+    black_box(analyzed.tokens().for_each(|_| {}));
 }
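
The rewritten bench_with_input call above shows how use_small_heuristics = "max" interacts with rustfmt's max_width (100 columns by default): a call that fits is left on a single line, and one that does not is broken up with one argument per line plus a trailing comma. A hypothetical sketch of both cases (the identifiers are made up for illustration):

// Fits within max_width: kept on a single line.
group.bench_with_input(BenchmarkId::new("short", name), text, |b, s| b.iter(|| run(s)));

// Exceeds max_width: broken up, one argument per line, trailing comma added.
group.bench_with_input(
    BenchmarkId::new("a-much-longer-benchmark-identifier", name),
    &(&analyzer, text),
    |b, &(a, s)| b.iter(|| run(a, s)),
);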
47 changes: 32 additions & 15 deletions benches/initialization.rs
@@ -2,26 +2,36 @@ use std::collections::HashMap;
 
 use criterion::{BenchmarkId, Criterion};
 use fst::Set;
-
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, detection::is_cjk};
 use meilisearch_tokenizer::analyzer::{Language, Pipeline, Script};
+use meilisearch_tokenizer::detection::is_cjk;
 use meilisearch_tokenizer::normalizer::{DeunicodeNormalizer, LowercaseNormalizer, Normalizer};
 use meilisearch_tokenizer::processors::ChineseTranslationPreProcessor;
 use meilisearch_tokenizer::tokenizer::{Jieba, LegacyMeilisearch};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 
 pub fn criterion_benchmark(c: &mut Criterion, data_set: &[(&str, &str)]) {
     let mut group = c.benchmark_group("initialization");
 
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("default", name), text, |b, s| b.iter(|| default_init(s)));
+        group.bench_with_input(BenchmarkId::new("default", name), text, |b, s| {
+            b.iter(|| default_init(s))
+        });
     }
 
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("pre:identity-tok:legacy-nor:deunicode+lowercase", name), text, |b, s| b.iter(|| legacy_tok_deunicode_lowercase_norm(s)));
+        group.bench_with_input(
+            BenchmarkId::new("pre:identity-tok:legacy-nor:deunicode+lowercase", name),
+            text,
+            |b, s| b.iter(|| legacy_tok_deunicode_lowercase_norm(s)),
+        );
     }
 
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("pre:translate-tok:jieba-nor:deunicode+lowercase", name), text, |b, s| b.iter(|| translation_pre_jieba_tok_deunicode_lowercase_norm(s)));
+        group.bench_with_input(
+            BenchmarkId::new("pre:translate-tok:jieba-nor:deunicode+lowercase", name),
+            text,
+            |b, s| b.iter(|| translation_pre_jieba_tok_deunicode_lowercase_norm(s)),
+        );
     }
 
     group.finish();
@@ -36,10 +46,12 @@ fn default_init(text: &str) {
 
 fn legacy_tok_deunicode_lowercase_norm(text: &str) {
     let mut pipeline_map: HashMap<(Script, Language), Pipeline> = HashMap::new();
-    let latin_normalizer: Vec<Box<dyn Normalizer>> = vec![Box::new(DeunicodeNormalizer::default()), Box::new(LowercaseNormalizer)];
-    pipeline_map.insert((Script::Other, Language::Other), Pipeline::default()
-        .set_tokenizer(LegacyMeilisearch)
-        .set_normalizer(latin_normalizer));
+    let latin_normalizer: Vec<Box<dyn Normalizer>> =
+        vec![Box::new(DeunicodeNormalizer::default()), Box::new(LowercaseNormalizer)];
+    pipeline_map.insert(
+        (Script::Other, Language::Other),
+        Pipeline::default().set_tokenizer(LegacyMeilisearch).set_normalizer(latin_normalizer),
+    );
 
     let stop_words = Set::default();
     let analyzer = Analyzer::new(AnalyzerConfig::new(pipeline_map, &stop_words));
@@ -49,12 +61,17 @@ fn legacy_tok_deunicode_lowercase_norm(text: &str) {
 
 fn translation_pre_jieba_tok_deunicode_lowercase_norm(text: &str) {
     let mut pipeline_map: HashMap<(Script, Language), Pipeline> = HashMap::new();
-    let chinese_deunicoder = DeunicodeNormalizer::new(&|text: &str| text.chars().next().map_or(false, is_cjk));
-    let chinese_normalizer: Vec<Box<dyn Normalizer>> = vec![Box::new(chinese_deunicoder), Box::new(LowercaseNormalizer)];
-    pipeline_map.insert((Script::Other, Language::Other), Pipeline::default()
-        .set_pre_processor(ChineseTranslationPreProcessor)
-        .set_tokenizer(Jieba::default())
-        .set_normalizer(chinese_normalizer));
+    let chinese_deunicoder =
+        DeunicodeNormalizer::new(&|text: &str| text.chars().next().map_or(false, is_cjk));
+    let chinese_normalizer: Vec<Box<dyn Normalizer>> =
+        vec![Box::new(chinese_deunicoder), Box::new(LowercaseNormalizer)];
+    pipeline_map.insert(
+        (Script::Other, Language::Other),
+        Pipeline::default()
+            .set_pre_processor(ChineseTranslationPreProcessor)
+            .set_tokenizer(Jieba::default())
+            .set_normalizer(chinese_normalizer),
+    );
 
     let stop_words = Set::default();
     let analyzer = Analyzer::new(AnalyzerConfig::new(pipeline_map, &stop_words));
45 changes: 31 additions & 14 deletions benches/normalizer.rs
@@ -1,21 +1,26 @@
 use std::collections::HashMap;
 
-use criterion::{BenchmarkId, Criterion, black_box};
+use criterion::{black_box, BenchmarkId, Criterion};
 use fst::Set;
-
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use meilisearch_tokenizer::analyzer::{Language, Pipeline, Script};
-use meilisearch_tokenizer::normalizer::{DeunicodeNormalizer, IdentityNormalizer, LowercaseNormalizer, Normalizer};
+use meilisearch_tokenizer::normalizer::{
+    DeunicodeNormalizer, IdentityNormalizer, LowercaseNormalizer, Normalizer,
+};
 use meilisearch_tokenizer::tokenizer::LegacyMeilisearch;
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 
-fn init_analyzer_with_normalizer<'a>(normalizer: impl Normalizer + 'static, stop_words: &'a Set<Vec<u8>>) -> Analyzer<'a, Vec<u8>> {
+fn init_analyzer_with_normalizer<'a>(
+    normalizer: impl Normalizer + 'static,
+    stop_words: &'a Set<Vec<u8>>,
+) -> Analyzer<'a, Vec<u8>> {
     let mut pipeline_map: HashMap<(Script, Language), Pipeline> = HashMap::new();
-    pipeline_map.insert((Script::Other, Language::Other), Pipeline::default()
-        .set_tokenizer(LegacyMeilisearch)
-        .set_normalizer(normalizer));
+    pipeline_map.insert(
+        (Script::Other, Language::Other),
+        Pipeline::default().set_tokenizer(LegacyMeilisearch).set_normalizer(normalizer),
+    );
 
     let analyzer = Analyzer::new(AnalyzerConfig::new(pipeline_map, stop_words));
 
     // analyze a first time to trigger lazy initializations
     analyzer.analyze("Hello");
 
@@ -29,24 +34,36 @@ pub fn criterion_benchmark(c: &mut Criterion, data_set: &[(&str, &str)]) {
 
     let analyzer = init_analyzer_with_normalizer(IdentityNormalizer, &stop_words);
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("IdentityNormalizer", name), &(&analyzer, text), |b, &(a, s)| b.iter(|| run(a, s)));
+        group.bench_with_input(
+            BenchmarkId::new("IdentityNormalizer", name),
+            &(&analyzer, text),
+            |b, &(a, s)| b.iter(|| run(a, s)),
+        );
     }
 
     let analyzer = init_analyzer_with_normalizer(DeunicodeNormalizer::default(), &stop_words);
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("DeunicodeNormalizer", name), &(&analyzer, text), |b, &(a, s)| b.iter(|| run(a, s)));
+        group.bench_with_input(
+            BenchmarkId::new("DeunicodeNormalizer", name),
+            &(&analyzer, text),
+            |b, &(a, s)| b.iter(|| run(a, s)),
+        );
     }
 
     let analyzer = init_analyzer_with_normalizer(LowercaseNormalizer, &stop_words);
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("LowercaseNormalizer", name), &(&analyzer, text), |b, &(a, s)| b.iter(|| run(a, s)));
+        group.bench_with_input(
+            BenchmarkId::new("LowercaseNormalizer", name),
+            &(&analyzer, text),
+            |b, &(a, s)| b.iter(|| run(a, s)),
+        );
     }
 
     group.finish();
 }
 
 fn run(analyzer: &Analyzer<Vec<u8>>, text: &str) {
     let analyzed = analyzer.analyze(text);
-    black_box(analyzed.tokens().for_each(|_|{}));
-
+
+    black_box(analyzed.tokens().for_each(|_| {}));
 }
41 changes: 27 additions & 14 deletions benches/tokenizer.rs
@@ -1,20 +1,21 @@
 use std::collections::HashMap;
 
-use criterion::{BenchmarkId, Criterion, black_box};
+use criterion::{black_box, BenchmarkId, Criterion};
 use fst::Set;
-
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use meilisearch_tokenizer::analyzer::{Language, Pipeline, Script};
-use meilisearch_tokenizer::tokenizer::{LegacyMeilisearch, Tokenizer, Jieba, UnicodeSegmenter};
-
+use meilisearch_tokenizer::tokenizer::{Jieba, LegacyMeilisearch, Tokenizer, UnicodeSegmenter};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 
-fn init_analyzer_with_tokenizer<'a>(tokenizer: impl Tokenizer + 'static, stop_words: &'a Set<Vec<u8>>) -> Analyzer<'a, Vec<u8>> {
+fn init_analyzer_with_tokenizer<'a>(
+    tokenizer: impl Tokenizer + 'static,
+    stop_words: &'a Set<Vec<u8>>,
+) -> Analyzer<'a, Vec<u8>> {
     let mut pipeline_map: HashMap<(Script, Language), Pipeline> = HashMap::new();
-    pipeline_map.insert((Script::Other, Language::Other), Pipeline::default()
-        .set_tokenizer(tokenizer));
+    pipeline_map
+        .insert((Script::Other, Language::Other), Pipeline::default().set_tokenizer(tokenizer));
 
     let analyzer = Analyzer::new(AnalyzerConfig::new(pipeline_map, stop_words));
 
     // analyze a first time to trigger lazy initializations
     analyzer.analyze("Hello");
 
@@ -28,24 +29,36 @@ pub fn criterion_benchmark(c: &mut Criterion, data_set: &[(&str, &str)]) {
 
     let analyzer = init_analyzer_with_tokenizer(LegacyMeilisearch, &stop_words);
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("LegacyMeilisearch", name), &(&analyzer, text), |b, &(a, s)| b.iter(|| run(a, s)));
+        group.bench_with_input(
+            BenchmarkId::new("LegacyMeilisearch", name),
+            &(&analyzer, text),
+            |b, &(a, s)| b.iter(|| run(a, s)),
+        );
     }
 
     let analyzer = init_analyzer_with_tokenizer(UnicodeSegmenter, &stop_words);
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("UnicodeSegmenter", name), &(&analyzer, text), |b, &(a, s)| b.iter(|| run(a, s)));
+        group.bench_with_input(
+            BenchmarkId::new("UnicodeSegmenter", name),
+            &(&analyzer, text),
+            |b, &(a, s)| b.iter(|| run(a, s)),
+        );
     }
 
     let analyzer = init_analyzer_with_tokenizer(Jieba, &stop_words);
     for &(name, text) in data_set {
-        group.bench_with_input(BenchmarkId::new("Jieba", name), &(&analyzer, text), |b, &(a, s)| b.iter(|| run(a, s)));
+        group.bench_with_input(
+            BenchmarkId::new("Jieba", name),
+            &(&analyzer, text),
+            |b, &(a, s)| b.iter(|| run(a, s)),
+        );
     }
 
     group.finish();
 }
 
 fn run(analyzer: &Analyzer<Vec<u8>>, text: &str) {
     let analyzed = analyzer.analyze(text);
-    black_box(analyzed.tokens().for_each(|_|{}));
-
+
+    black_box(analyzed.tokens().for_each(|_| {}));
 }