Skip to content

Commit d375dfb

Browse files
authored
Mecab support (#24)
* Added support for Japanese language via Wakame.jl * Added installation of MeCab for CI
1 parent a76423a commit d375dfb

File tree

10 files changed

+112
-46
lines changed

10 files changed

+112
-46
lines changed

Diff for: .github/workflows/CI.yml

+6-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
os:
1717
- ubuntu-latest
1818
- macOS-latest
19-
- windows-latest
19+
# - windows-latest
2020
arch:
2121
- x64
2222
steps:
@@ -35,6 +35,11 @@ jobs:
3535
${{ runner.os }}-test-${{ env.cache-name }}-
3636
${{ runner.os }}-test-
3737
${{ runner.os }}-
38+
- name: Install Mecab (MacOS)
39+
if: runner.os == 'macOS'
40+
run: |
41+
brew install mecab
42+
brew install mecab-ipadic
3843
- uses: julia-actions/julia-buildpkg@v1
3944
- uses: julia-actions/julia-runtest@v1
4045
- uses: julia-actions/julia-processcoverage@v1

Diff for: Project.toml

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
name = "SimString"
22
uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4"
33
authors = ["Bernard Brenyah"]
4-
version = "0.2.0"
4+
version = "0.3.0"
55

66
[deps]
77
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
88
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
99
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
10+
Wakame = "4447db07-3941-47e2-90a2-965b7cb1b6ce"
1011

1112
[compat]
1213
CircularArrays = "1"
1314
DataStructures = "0.18"
1415
OffsetArrays = "1"
1516
julia = "1"
17+
Wakame = "0.1"
1618

1719
[extras]
1820
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
19-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2021
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
22+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2123

2224
[targets]
2325
test = ["Test", "Faker", "Suppressor"]

Diff for: README.md

+2-3
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ This package is particularly useful for natural language processing tasks whic
1616
- [X] 100% exact retrieval
1717
- [X] Support for unicodes
1818
- [X] Support for building databases directly from text files
19-
- [ ] Custom user defined feature generation methods
20-
- [ ] Mecab-based tokenizer support
21-
- [ ] Support for persistent databases
19+
- [X] Mecab-based tokenizer support
20+
- [ ] Support for persistent databases like MongoDB
2221

2322
## Supported String Similarity Measures
2423

Diff for: docs/src/index.md

+6-4
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@ CPMerge Paper: [https://aclanthology.org/C10-1096/](https://aclanthology.org/C10
1717
- [X] 100% exact retrieval
1818
- [X] Support for unicodes
1919
- [X] Support for building databases directly from text files
20-
- [ ] Custom user defined feature generation methods
21-
- [ ] Mecab-based tokenizer support
22-
- [ ] Support for persistent databases
20+
- [X] Mecab-based tokenizer support for Japanese
21+
- [ ] Support for persistent databases like MongoDB
2322

2423
## Supported String Similarity Measures
2524

@@ -59,7 +58,9 @@ pkg> free SimString
5958
using SimString
6059

6160
# Initialise database and some strings
62-
db = DictDB(CharacterNGrams(2, " "));
61+
db = DictDB(CharacterNGrams(2, " "));
62+
# OR: db = DictDB(WordNGrams(2, " ")); for word based ngrams
63+
# OR db = DictDB(MecabNGrams(2, " ", Mecab())) for Japanese ngrams. Requires installation of Mecab
6364
push!(db, "foo");
6465
push!(db, "bar");
6566
push!(db, "fooo");
@@ -85,6 +86,7 @@ desc = describe_collection(db)
8586

8687
- 0.1.0 Initial release.
8788
- 0.2.0 Added support for unicodes
89+
- 0.3.0 Added Japanese support via Mecab
8890

8991
```@index
9092
```

Diff for: src/SimString.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import Base: push!, append!
44
using DataStructures: DefaultOrderedDict, DefaultDict
55
using CircularArrays
66
using OffsetArrays
7+
using Wakame
78

89
######### Import modules & utils ################
910
include("db_collection.jl")
@@ -17,7 +18,7 @@ include("search.jl")
1718
####### Global export of user API #######
1819
export Dice, Jaccard, Cosine, Overlap, ExactMatch,
1920
AbstractSimStringDB, DictDB, describe_collection,
20-
CharacterNGrams, WordNGrams,
21+
CharacterNGrams, WordNGrams, MecabNGrams,
2122
search
2223

2324

Diff for: src/db_collection.jl

+9-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ Abstract type for feature extraction structs
1212
abstract type FeatureExtractor end
1313

1414

15-
# Feature Extraction Definitions
16-
1715
"""
1816
Feature extraction on character-level ngrams
1917
"""
@@ -33,3 +31,12 @@ struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
3331
end
3432

3533

34+
35+
"""
36+
Feature extraction based on mecab word-level ngrams
37+
"""
38+
struct MecabNGrams{T1<:Int, T2<:AbstractString, T3<:Mecab} <: FeatureExtractor
39+
n::T1 # number of n-grams to extract
40+
padder::T2 # string to use to pad n-grams
41+
tokenizer::T3 # Mecab tokenizer to use
42+
end

Diff for: src/dictdb.jl

+46-20
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,19 @@ function DictDB(x::CharacterNGrams)
4444
end
4545

4646

47+
"""
48+
Internal function for generating a base DictDB object for WordNGrams and MecabNGrams
49+
"""
50+
function generate_base_dict_db(x)
51+
DictDB(
52+
x,
53+
String[],
54+
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
55+
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
56+
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
57+
)
58+
end
59+
4760
"""
4861
DictDB(x::WordNGrams)
4962
@@ -60,15 +73,28 @@ db = DictDB(WordNGrams(2, " ", " "))
6073
# Returns
6174
* `DictDB`: A DictDB object with additional containers and Metadata for WordNGrams
6275
"""
63-
function DictDB(x::WordNGrams)
64-
DictDB(
65-
x,
66-
String[],
67-
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
68-
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
69-
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
70-
)
71-
end
76+
DictDB(x::WordNGrams) = generate_base_dict_db(x)
77+
78+
79+
80+
"""
81+
DictDB(x::MecabNGrams)
82+
83+
Initialize a dict DB with additional containers and Metadata for MecabNGrams
84+
85+
# Arguments
86+
* `x`: MecabNGrams object
87+
88+
# Example
89+
```julia
90+
db = DictDB(MecabNGrams(2, " ", Mecab()))
91+
```
92+
93+
# Returns
94+
* `DictDB`: A DictDB object with additional containers and Metadata for MecabNGrams
95+
"""
96+
DictDB(x::MecabNGrams) = generate_base_dict_db(x)
97+
7298

7399

74100

@@ -96,20 +122,20 @@ describe_collection(db)
96122
"""
97123
function describe_collection(db::DictDB)
98124

99-
# Total number of strings in collection
100-
= length(db.string_collection)
125+
# Total number of strings in collection
126+
= length(db.string_collection)
101127

102-
# Average size of ngram features
103-
n = [x for x in keys(db.string_size_map)]
104-
μ = sum(n) / length(n)
128+
# Average size of ngram features
129+
n = [x for x in keys(db.string_size_map)]
130+
μ = sum(n) / length(n)
105131

106-
# Total number of ngram features
107-
total_ngrams = 0
108-
for i in values(db.string_feature_map)
109-
total_ngrams += length(i)
110-
end
132+
# Total number of ngram features
133+
total_ngrams = 0
134+
for i in values(db.string_feature_map)
135+
total_ngrams += length(i)
136+
end
111137

112-
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
138+
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
113139
end
114140

115141

Diff for: src/features.jl

+22-13
Original file line numberDiff line numberDiff line change
@@ -36,29 +36,20 @@ function init_ngrams(extractor::CharacterNGrams, x, n)
3636
end
3737

3838

39-
4039
"""
41-
Internal function to generate initial uncounted ngrams on a word level
40+
Internal function to generate initial uncounted word ngrams on a word level
4241
"""
43-
function init_ngrams(extractor::WordNGrams, x, n)
42+
function init_ngrams(extractor, x, n)
4443
map(0:length(x)-n) do i
4544
@view x[i+1: i+n]
4645
end
4746
end
4847

4948

5049
"""
51-
Internal function to create character-level ngrams features from an AbstractString
52-
"""
53-
function n_grams(extractor::CharacterNGrams, x, n)
54-
return cummulative_ngram_count(init_ngrams(extractor, x, n))
55-
end
56-
57-
58-
"""
59-
Internal function to create word-level ngrams from an AbstractVector
50+
Internal function to create counted ngrams
6051
"""
61-
function n_grams(extractor::WordNGrams, x, n)
52+
function n_grams(extractor, x, n)
6253
return cummulative_ngram_count(init_ngrams(extractor, x, n))
6354
end
6455

@@ -91,6 +82,24 @@ function extract_features(extractor::WordNGrams, str)
9182
end
9283

9384

85+
"""
86+
Internal function to generate Mecab word-level ngrams features from an AbstractString
87+
"""
88+
function extract_features(extractor::MecabNGrams, str)
89+
words_split = tokenize(extractor.tokenizer, str)
90+
padded_words = pad_string(words_split, extractor.padder)
91+
return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))
92+
end
93+
94+
95+
"""
96+
Internal function to tokenize a string using Mecab
97+
"""
98+
function tokenize(tokenizer::Mecab, str::AbstractString)
99+
return parse_surface(tokenizer, str)
100+
end
101+
102+
94103
"""
95104
Internal function to count and pad generated character-level ngrams (including duplicates)
96105
"""

Diff for: test/test01_dictdb.jl

+10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
module TestDBCollection
22
using SimString
3+
using Wakame: Mecab
34
using Test
45

56

@@ -110,5 +111,14 @@ end
110111

111112

112113

114+
@testset "Test mecab insert" begin
115+
db = DictDB(MecabNGrams(2, " ", Mecab()))
116+
append!(db, ["pythonが大好きです", "I am a cat."])
117+
118+
@test db.string_collection == ["pythonが大好きです", "I am a cat."]
119+
@test db.string_size_map[5] == Set(["pythonが大好きです"])
120+
@test db.string_size_map[6] == Set(["I am a cat."])
121+
end
122+
113123

114124
end # module

Diff for: test/test02_features.jl

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
module TestFeatures
22
using SimString
3+
using Wakame: Mecab
34
using Test
45

56

@@ -13,6 +14,10 @@ using Test
1314
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude 😄🍕")
1415
@test word_ngram_res[5] == (["really", "really"], 2)
1516
@test word_ngram_res[8] == (["dude", "😄🍕"], 1)
17+
18+
mecab_ngram_res = SimString.extract_features(MecabNGrams(2, " ", Mecab()), "pythonが大好きです")
19+
@test mecab_ngram_res[1] == (["python", ""], 1)
20+
@test mecab_ngram_res[2] == (["", "大好き"], 1)
1621
end
1722

1823

0 commit comments

Comments
 (0)