Skip to content

Commit d375dfb

Browse files
authored
Mecab support (#24)
* Added support for Japanese language via Wakame.jl * Added installation of MeCab for CI
1 parent a76423a commit d375dfb

File tree

10 files changed

+112
-46
lines changed

10 files changed

+112
-46
lines changed

Diff for: .github/workflows/CI.yml

+6-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
os:
1717
- ubuntu-latest
1818
- macOS-latest
19-
- windows-latest
19+
# - windows-latest
2020
arch:
2121
- x64
2222
steps:
@@ -35,6 +35,11 @@ jobs:
3535
${{ runner.os }}-test-${{ env.cache-name }}-
3636
${{ runner.os }}-test-
3737
${{ runner.os }}-
38+
- name: Install Mecab (MacOS)
39+
if: runner.os == 'macOS'
40+
run: |
41+
brew install mecab
42+
brew install mecab-ipadic
3843
- uses: julia-actions/julia-buildpkg@v1
3944
- uses: julia-actions/julia-runtest@v1
4045
- uses: julia-actions/julia-processcoverage@v1

Diff for: Project.toml

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
name = "SimString"
22
uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4"
33
authors = ["Bernard Brenyah"]
4-
version = "0.2.0"
4+
version = "0.3.0"
55

66
[deps]
77
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
88
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
99
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
10+
Wakame = "4447db07-3941-47e2-90a2-965b7cb1b6ce"
1011

1112
[compat]
1213
CircularArrays = "1"
1314
DataStructures = "0.18"
1415
OffsetArrays = "1"
1516
julia = "1"
17+
Wakame = "0.1"
1618

1719
[extras]
1820
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
19-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2021
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
22+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2123

2224
[targets]
2325
test = ["Test", "Faker", "Suppressor"]

Diff for: README.md

+2-3
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ This package is particularly useful for natural language processing tasks whic
1616
- [X] 100% exact retrieval
1717
- [X] Support for unicodes
1818
- [X] Support for building databases directly from text files
19-
- [ ] Custom user defined feature generation methods
20-
- [ ] Mecab-based tokenizer support
21-
- [ ] Support for persistent databases
19+
- [X] Mecab-based tokenizer support
20+
- [ ] Support for persistent databases like MongoDB
2221

2322
## Supported String Similarity Measures
2423

Diff for: docs/src/index.md

+6-4
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@ CPMerge Paper: [https://aclanthology.org/C10-1096/](https://aclanthology.org/C10
1717
- [X] 100% exact retrieval
1818
- [X] Support for unicodes
1919
- [X] Support for building databases directly from text files
20-
- [ ] Custom user defined feature generation methods
21-
- [ ] Mecab-based tokenizer support
22-
- [ ] Support for persistent databases
20+
- [X] Mecab-based tokenizer support for Japanese
21+
- [ ] Support for persistent databases like MongoDB
2322

2423
## Supported String Similarity Measures
2524

@@ -59,7 +58,9 @@ pkg> free SimString
5958
using SimString
6059

6160
# Initialise database and some strings
62-
db = DictDB(CharacterNGrams(2, " "));
61+
db = DictDB(CharacterNGrams(2, " "));
62+
# OR: db = DictDB(WordNGrams(2, " ")); for word based ngrams
63+
# OR db = DictDB(MecabNGrams(2, " ", Mecab())) for Japanese ngrams. Requires installation of Mecab
6364
push!(db, "foo");
6465
push!(db, "bar");
6566
push!(db, "fooo");
@@ -85,6 +86,7 @@ desc = describe_collection(db)
8586

8687
- 0.1.0 Initial release.
8788
- 0.2.0 Added support for unicodes
89+
- 0.3.0 Added Japanese support via Mecab
8890

8991
```@index
9092
```

Diff for: src/SimString.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import Base: push!, append!
44
using DataStructures: DefaultOrderedDict, DefaultDict
55
using CircularArrays
66
using OffsetArrays
7+
using Wakame
78

89
######### Import modules & utils ################
910
include("db_collection.jl")
@@ -17,7 +18,7 @@ include("search.jl")
1718
####### Global export of user API #######
1819
export Dice, Jaccard, Cosine, Overlap, ExactMatch,
1920
AbstractSimStringDB, DictDB, describe_collection,
20-
CharacterNGrams, WordNGrams,
21+
CharacterNGrams, WordNGrams, MecabNGrams,
2122
search
2223

2324

Diff for: src/db_collection.jl

+9-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ Abstract type for feature extraction structs
1212
abstract type FeatureExtractor end
1313

1414

15-
# Feature Extraction Definitions
16-
1715
"""
1816
Feature extraction on character-level ngrams
1917
"""
@@ -33,3 +31,12 @@ struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
3331
end
3432

3533

34+
35+
"""
36+
Feature extraction based on mecab word-level ngrams
37+
"""
38+
struct MecabNGrams{T1<:Int, T2<:AbstractString, T3<:Mecab} <: FeatureExtractor
39+
n::T1 # number of n-grams to extract
40+
padder::T2 # string to use to pad n-grams
41+
tokenizer::T3 # Mecab tokenizer to use
42+
end

Diff for: src/dictdb.jl

+46-20
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,19 @@ function DictDB(x::CharacterNGrams)
4444
end
4545

4646

47+
"""
48+
Internal function for generating a base DictDB object for WordNGrams and MecabNGrams
49+
"""
50+
function generate_base_dict_db(x)
51+
DictDB(
52+
x,
53+
String[],
54+
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
55+
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
56+
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
57+
)
58+
end
59+
4760
"""
4861
DictDB(x::WordNGrams)
4962
@@ -60,15 +73,28 @@ db = DictDB(WordNGrams(2, " ", " "))
6073
# Returns
6174
* `DictDB`: A DictDB object with additional containers and Metadata for WordNGrams
6275
"""
63-
function DictDB(x::WordNGrams)
64-
DictDB(
65-
x,
66-
String[],
67-
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
68-
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
69-
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
70-
)
71-
end
76+
DictDB(x::WordNGrams) = generate_base_dict_db(x)
77+
78+
79+
80+
"""
81+
DictDB(x::MecabNGrams)
82+
83+
Initialize a dict DB with additional containers and Metadata for MecabNGrams
84+
85+
# Arguments
86+
* `x`: MecabNGrams object
87+
88+
# Example
89+
```julia
90+
db = DictDB(MecabNGrams(2, " ", Mecab()))
91+
```
92+
93+
# Returns
94+
* `DictDB`: A DictDB object with additional containers and Metadata for MecabNGrams
95+
"""
96+
DictDB(x::MecabNGrams) = generate_base_dict_db(x)
97+
7298

7399

74100

@@ -96,20 +122,20 @@ describe_collection(db)
96122
"""
97123
function describe_collection(db::DictDB)
98124

99-
# Total number of strings in collection
100-
= length(db.string_collection)
125+
# Total number of strings in collection
126+
= length(db.string_collection)
101127

102-
# Average size of ngram features
103-
n = [x for x in keys(db.string_size_map)]
104-
μ = sum(n) / length(n)
128+
# Average size of ngram features
129+
n = [x for x in keys(db.string_size_map)]
130+
μ = sum(n) / length(n)
105131

106-
# Total number of ngram features
107-
total_ngrams = 0
108-
for i in values(db.string_feature_map)
109-
total_ngrams += length(i)
110-
end
132+
# Total number of ngram features
133+
total_ngrams = 0
134+
for i in values(db.string_feature_map)
135+
total_ngrams += length(i)
136+
end
111137

112-
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
138+
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
113139
end
114140

115141

Diff for: src/features.jl

+22-13
Original file line numberDiff line numberDiff line change
@@ -36,29 +36,20 @@ function init_ngrams(extractor::CharacterNGrams, x, n)
3636
end
3737

3838

39-
4039
"""
41-
Internal function to generate initial uncounted ngrams on a word level
40+
Internal function to generate initial uncounted word ngrams on a word level
4241
"""
43-
function init_ngrams(extractor::WordNGrams, x, n)
42+
function init_ngrams(extractor, x, n)
4443
map(0:length(x)-n) do i
4544
@view x[i+1: i+n]
4645
end
4746
end
4847

4948

5049
"""
51-
Internal function to create character-level ngrams features from an AbstractString
52-
"""
53-
function n_grams(extractor::CharacterNGrams, x, n)
54-
return cummulative_ngram_count(init_ngrams(extractor, x, n))
55-
end
56-
57-
58-
"""
59-
Internal function to create word-level ngrams from an AbstractVector
50+
Internal function to create counted ngrams
6051
"""
61-
function n_grams(extractor::WordNGrams, x, n)
52+
function n_grams(extractor, x, n)
6253
return cummulative_ngram_count(init_ngrams(extractor, x, n))
6354
end
6455

@@ -91,6 +82,24 @@ function extract_features(extractor::WordNGrams, str)
9182
end
9283

9384

85+
"""
86+
Internal function to generate Mecab word-level ngrams features from an AbstractString
87+
"""
88+
function extract_features(extractor::MecabNGrams, str)
89+
words_split = tokenize(extractor.tokenizer, str)
90+
padded_words = pad_string(words_split, extractor.padder)
91+
return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))
92+
end
93+
94+
95+
"""
96+
Internal function to tokenize a string using Mecab
97+
"""
98+
function tokenize(tokenizer::Mecab, str::AbstractString)
99+
return parse_surface(tokenizer, str)
100+
end
101+
102+
94103
"""
95104
Internal function to count and pad generated character-level ngrams (including duplicates)
96105
"""

Diff for: test/test01_dictdb.jl

+10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
module TestDBCollection
22
using SimString
3+
using Wakame: Mecab
34
using Test
45

56

@@ -110,5 +111,14 @@ end
110111

111112

112113

114+
@testset "Test mecab insert" begin
115+
db = DictDB(MecabNGrams(2, " ", Mecab()))
116+
append!(db, ["pythonが大好きです", "I am a cat."])
117+
118+
@test db.string_collection == ["pythonが大好きです", "I am a cat."]
119+
@test db.string_size_map[5] == Set(["pythonが大好きです"])
120+
@test db.string_size_map[6] == Set(["I am a cat."])
121+
end
122+
113123

114124
end # module

Diff for: test/test02_features.jl

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
module TestFeatures
22
using SimString
3+
using Wakame: Mecab
34
using Test
45

56

@@ -13,6 +14,10 @@ using Test
1314
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude 😄🍕")
1415
@test word_ngram_res[5] == (["really", "really"], 2)
1516
@test word_ngram_res[8] == (["dude", "😄🍕"], 1)
17+
18+
mecab_ngram_res = SimString.extract_features(MecabNGrams(2, " ", Mecab()), "pythonが大好きです")
19+
@test mecab_ngram_res[1] == (["python", ""], 1)
20+
@test mecab_ngram_res[2] == (["", "大好き"], 1)
1621
end
1722

1823

0 commit comments

Comments
 (0)