Commit f822944

Initial public release (#15)
* Initial public release
1 parent 504460e · commit f822944

12 files changed: +203 −23 lines

Diff for: Project.toml

+5-2
@@ -7,14 +7,17 @@ version = "0.1.0"
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 
 [compat]
+CircularArrays = "1"
+DataStructures = "0.18"
+OffsetArrays = "1"
 julia = "1"
 
 [extras]
 Faker = "0efc519c-db33-5916-ab87-703215c3906f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
 
 [targets]
-test = ["Test", "Faker"]
+test = ["Test", "Faker", "Suppressor"]
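Suppressor is declared only under `[extras]` and the `test` target, so it is installed for the test suite but not for regular users of the package. A minimal sketch of exercising this manifest with the standard Pkg workflow (assumes the usual `test/runtests.jl` entry point):

```julia
using Pkg

# Activate the package environment and resolve the new [compat] bounds
Pkg.activate(".")
Pkg.instantiate()

# Runs the test suite with the [targets] test dependencies (Test, Faker, Suppressor)
Pkg.test("SimString")
```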

Diff for: README.md

+3
@@ -17,13 +17,16 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
+- [X] Support for building databases directly from text files
+- [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
 
 - [X] Dice coefficient
 - [X] Jaccard coefficient
 - [X] Cosine coefficient
 - [X] Overlap coefficient
+- [X] Exact match
 
 ## Installation

Diff for: docs/src/index.md

+4-2
@@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [ ] Support for building databases directly from text files
+- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
@@ -64,6 +64,8 @@ push!(db, "fooo");
 
 # Convinient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`
 
+# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");
+
 # Retrieve the closest match(es)
 res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 # 2-element Vector{Tuple{String, Float64}}:
@@ -72,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 
 # Describe a working database collection
 desc = describe_collection(db)
-# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
+# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 ```
 
 ## TODO: Benchmarks
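Putting the documented pieces together, here is a short end-to-end sketch of the file-based workflow the docs now describe (the file name is a placeholder; the `describe_collection` output shown is the three-string example from the docs):

```julia
using SimString

# Build a character-bigram database and bulk-load one string per line from a text file
db = DictDB(CharacterNGrams(2, " "))
append!(db, "YOUR_FILE_NAME.txt")

# Retrieve approximate matches at similarity threshold α
res = search(Dice(), db, "foo"; α=0.8, ranked=true)

# Summary stats; note the renamed avg_size_ngrams field
desc = describe_collection(db)
# e.g. (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
```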

Diff for: extras/benchmark_sim.jl

+48
@@ -0,0 +1,48 @@
+using SimString
+using Faker
+using BenchmarkTools
+using DataStructures
+
+################################# Benchmark Bulk addition #####################
+db = DictDB(CharacterNGrams(3, " "));
+Faker.seed(2020)
+@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];
+
+
+f(d, x) = append!(d, x)
+@time f(db, fake_names)
+
+
+
+################################ Simple Addition ###############################
+
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo");
+push!(db, "bar");
+push!(db, "fooo");
+
+f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
+test = "foo";
+col = db;
+sim = Cosine();
+a = 0.8;
+r = true;
+
+f(Cosine(), db, "foo", 0.8, true)
+
+@btime f($sim, $col, $test, $a, $r)
+@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
+
+
+
+db2 = DictDB(CharacterNGrams(3, " "));
+append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector
+
+results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented
+
+bs = ["foo", "bar", "foo", "foo", "bar"]
+SimString.extract_features(CharacterNGrams(3, " "), "prepress")
+SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
+
+db = DictDB(WordNGrams(2, " ", " "))
+push!(db, "You are a really really really cool dude.")

Diff for: src/SimString.jl

-1
@@ -2,7 +2,6 @@ module SimString
 
 import Base: push!, append!
 using DataStructures: DefaultOrderedDict, DefaultDict
-using ProgressMeter
 using CircularArrays
 using OffsetArrays

Diff for: src/dictdb.jl

+15-2
@@ -87,6 +87,7 @@ Basic summary stats for the DB
 db = DictDB(CharacterNGrams(2, " "));
 append!(db, ["foo", "bar", "fooo"]);
 describe_collection(db)
+(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 
 # Returns
 * NamedTuples: Summary stats for the DB
@@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
     # Total number of strings in collection
     ∑ = length(db.string_collection)
 
-    # Average number of ngram features
+    # Average size of ngram features
     n = [x for x in keys(db.string_size_map)]
     μ = sum(n) / length(n)
 
@@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
         total_ngrams += length(i)
     end
 
-    return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
+    return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
+end
+
+
+"""
+Pretty print summary stats for the DB
+"""
+function Base.show(io::IO, x::DictDB)
+    metrics = describe_collection(x)
+    println(io, "DictDB($(x.feature_extractor))")
+    println(io, "Total collection: ", metrics.total_collection)
+    println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
+    println(io, "Total number of ngram features: ", metrics.total_ngrams)
 end
 
 
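For reference, the output of the new `Base.show` method on the three-string example database, as asserted by the new test in test/test04_search.jl below:

```julia
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);

show(db)
# DictDB(SimString.CharacterNGrams{Int64, String}(2, " "))
# Total collection: 3
# Average number of ngram features: 4.5
# Total number of ngram features: 13
```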

Diff for: src/features.jl

+60
@@ -99,8 +99,25 @@ end
 
 
 """
+    push!(db::AbstractSimStringDB, str::AbstractString)
+
 Add a new item to a new or existing collection of strings using
 the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The collection of strings to add to
+* `str`: AbstractString - The string to add to the collection
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo")
+push!(db, "bar")
+push!(db, "fooo")
+````
+
+# Returns:
+* `db`: AbstractSimStringDB - The collection of strings with the new string added
 """
 function push!(db::AbstractSimStringDB, str::AbstractString)
     # Extract features based on the specified feature extractor
@@ -125,11 +142,54 @@ end
 
 
 """
+    append!(db::AbstractSimStringDB, str::Vector)
+
 Add bulk items to a new or existing collection of strings using
 the custom AbstractSimStringDB type.
+
+# Arguments:
+* db: AbstractSimStringDB - The database to add the strings to
+* str: Vector of AbstractString - Vector/Array of strings to add to the database
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, ["foo", "foo", "fooo"]);
+```
+
+# Returns:
+* db: AbstractSimStringDB - The database with the new strings added
 """
 function append!(db::AbstractSimStringDB, str::Vector)
     @inbounds @simd for i in str
         push!(db, i)
     end
+end
+
+
+"""
+    append!(db::AbstractSimStringDB, file::AbstractString)
+
+Add bulk items to a new or existing collection of strings using
+from a file using the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db``: AbstractSimStringDB - The database to add the items to
+* `file`: AbstractString - Path to the file to read from
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, "./data/test.txt")
+```
+
+# Returns:
+* `db`: AbstractSimStringDB - The database with the items added
+"""
+function append!(db::AbstractSimStringDB, file::AbstractString)
+    open(file) do f
+        for line in eachline(f)
+            push!(db, line)
+        end
+    end
 end
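A short usage sketch of the new file-based `append!` method, mirroring the new tests below (one string per line; the path assumes the `test/dummy_words.txt` fixture added in this commit and a working directory at the repository root):

```julia
using SimString

db = DictDB(CharacterNGrams(3, " "))

# Every line of the file becomes one entry in the database
append!(db, "test/dummy_words.txt")

db.string_collection
# ["foo", "bar", "fooo"]
```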

Diff for: src/search.jl

+6-6
@@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
     results = String[]
 
     for (candidate, match_count) in candidate_match_counts
-        for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
+        for i in (query_feature_length - τ + 1) : query_feature_length
             if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
                 match_count += 1
             end
@@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
     features = extract_features(db_collection.feature_extractor, query)
 
     # Metadata from the generated features (length, min & max sizes)
-    length_of_features = length(features)
-    min_feature_size = minimum_feature_size(measure, length_of_features, α)
-    max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
+    # length_of_features = length(features)
+    # min_feature_size = minimum_feature_size(measure, length_of_features, α)
+    # max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
 
     results = String[]
 
     # Generate and return results from the potential candidate size pool
-    @inbounds for candidate_size in min_feature_size:max_feature_size
+    @inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
         # Minimum overlap
-        τ = minimum_overlap(measure, length_of_features, candidate_size, α)
+        τ = minimum_overlap(measure, length(features), candidate_size, α)
 
         # Generate approximate candidates from the overlap join
         append!(results, overlap_join(db_collection, features, τ, candidate_size))
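For intuition about what this loop is doing: candidates are bucketed by feature-set size, only the sizes allowed by the measure and α are visited, and within each bucket a candidate survives only if it shares at least τ features with the query. A simplified, self-contained sketch of that idea (toy inverted index and hypothetical helper name, not the package's actual implementation):

```julia
# index: candidate size => (feature => set of candidate strings), analogous to string_feature_map
function toy_overlap_join(index::Dict{Int,Dict{String,Set{String}}},
                          features::Vector{String},
                          min_size::Int, max_size::Int, τ::Int)
    results = String[]
    for candidate_size in min_size:max_size          # size-pruned candidate pool
        bucket = get(index, candidate_size, Dict{String,Set{String}}())
        counts = Dict{String,Int}()
        for feature in features                      # count shared features per candidate
            for candidate in get(bucket, feature, Set{String}())
                counts[candidate] = get(counts, candidate, 0) + 1
            end
        end
        # keep candidates that meet the minimum overlap τ
        append!(results, [c for (c, n) in counts if n >= τ])
    end
    return results
end
```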

Diff for: test/dummy_sents.txt

+2
@@ -0,0 +1,2 @@
+You are a really really really cool dude.
+Sometimes you are not really really cool tho

Diff for: test/dummy_words.txt

+3
@@ -0,0 +1,3 @@
+foo
+bar
+fooo

Diff for: test/test01_dictdb.jl

+44-10
@@ -15,8 +15,8 @@ using Test
 
     @test collect(keys(db.string_feature_map)) == [5, 6]
 
-    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
-    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
 end
 
 
@@ -41,10 +41,10 @@ end
 
     @test collect(keys(db.string_feature_map)) == [5, 6]
 
-    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
-    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
 
-    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
+    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
 end
 
 
@@ -59,19 +59,53 @@ end
    @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
    @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
 
-    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
+    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
 end
 
 
 
 @testset "Test describe functionality" begin
-    db = DictDB(CharacterNGrams(2, " "));
-    append!(db, ["foo", "bar", "fooo"]);
+    db = DictDB(CharacterNGrams(2, " "))
+    append!(db, ["foo", "bar", "fooo"])
 
     # Interact with db
-    search(Dice(), db, "zep"; α=0.8, ranked=true)
+    search(Dice(), db, "zep"; α = 0.8, ranked = true)
+
+    @test describe_collection(db) == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
+end
+
+
+@testset "Test bulk insertion from a file using CharacterNGrams" begin
+    db = DictDB(CharacterNGrams(3, " "))
+    append!(db, "dummy_words.txt")
+
+    @test db.string_collection == ["foo", "bar", "fooo"]
+    @test db.string_size_map[5] == Set(["bar", "foo"])
+    @test db.string_size_map[6] == Set(["fooo"])
+
+    @test collect(keys(db.string_feature_map)) == [5, 6]
+
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
+
+    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
+end
+
+
+
+@testset "Test bulk insertion from a file using WordNGrams" begin
+    db = DictDB(WordNGrams(2, " ", " "))
+    append!(db, "dummy_sents.txt")
+
+    @test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
+    @test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+
+    @test collect(keys(db.string_feature_map)) == [9]
+    @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+    @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+
+    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
 
-    @test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
 end
 
 

Diff for: test/test04_search.jl

+13
@@ -2,6 +2,7 @@ module TestMeasures
 using SimString
 using Test
 using Faker
+using Suppressor
 
 
 @testset "Test Dice Search" begin
@@ -54,6 +55,7 @@
 
 end
 
+
 @testset "Test Micro Deep Dive Search" begin
     db = DictDB(CharacterNGrams(2, " "));
     append!(db, ["a", "ab", "abc", "abcd", "abcde"]);
@@ -76,6 +78,17 @@ end
 end
 
 
+@testset "Test output from show" begin
+    db = DictDB(CharacterNGrams(2, " "));
+    append!(db, ["foo", "bar", "fooo"]);
+
+    expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n"
+    r = @capture_out show(db)
+    @test r == expected_out
+end
+
+
+
 
 
 end # module
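`@capture_out` is provided by Suppressor.jl (added to the test dependencies in this commit) and captures whatever the wrapped expression writes to stdout, which is what lets the test compare `show` output against a plain string. A minimal standalone sketch:

```julia
using Suppressor

# Capture stdout produced by the expression as a String
captured = @capture_out println("hello")

captured == "hello\n"   # true
```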
