Skip to content

Commit 8fc4b30

Browse files
committed
Support multiple n-gram lengths
1 parent 04de116 commit 8fc4b30

File tree

8 files changed

+176
-184
lines changed

8 files changed

+176
-184
lines changed

README.md

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,15 @@
22

33
The plan for this package is to have Go implementations of different string distance/similarity functions, like Levenshtein (normalized, weighted, Damerau), Jaro-Winkler, Jaccard index, Euclidean distance, Hamming distance...
44

5-
Currently it has implemented Levenshtein, Jaccard, Hamming, LCS, Q-gram and Cosine distance functions. Work in progress...
5+
Currently it has implemented:
6+
- Levenshtein
7+
- Jaccard
8+
- Hamming
9+
- LCS
10+
- Q-gram
11+
- n-gram based Cosine distance
12+
13+
Work in progress...
614

715
## Import and installation
816

@@ -30,7 +38,7 @@ Currently only Levenshtein, Jaccard, Hamming, LCS string, Q-gram and Cosine dist
3038

3139
#### Levenshtein
3240

33-
Levenshtein distance can be calculated with default parameters (use DefaultSimilarityOptions) where cost of insert, delete and substitute operation are 1. You can also use it with other parameters by using SimilarityOptions type. Setting CaseInsensitive to true in SimilarityOptions the comparison will be done without considering character cases.
41+
Levenshtein distance can be calculated with default parameters (use DefaultSimilarityOptions), where the costs of the insert, delete and substitute operations are 1. You can also use it with other parameters by using the SimilarityOptions type. When CaseInsensitive is set to true in SimilarityOptions, the comparison is done without considering character case.
3442

3543
Example:
3644

@@ -62,9 +70,9 @@ Example:
6270
```go
6371
fmt.Println(stringosim.Jaccard([]rune("stringosim"), []rune("stingobim"), []int{1}))
6472

65-
fmt.Println(stringosim.Jaccard([]rune("stringosim"), []rune("stingobim"), 2))
73+
fmt.Println(stringosim.Jaccard([]rune("stringosim"), []rune("stingobim"), []int{2}))
6674

67-
fmt.Println(stringosim.Jaccard([]rune("stringosim"), []rune("stingobim"), 3))
75+
fmt.Println(stringosim.Jaccard([]rune("stringosim"), []rune("stingobim"), []int{3}))
6876
```
6977

7078
#### Hamming
@@ -97,7 +105,7 @@ Example:
97105
```go
98106
fmt.Println(stringosim.LCS([]rune("testing lcs algorithm"), []rune("another l c s example")))
99107

100-
fmt.Println(stringosim.LCS([]rune("testing lcs algorithm"), []rune("ANOTHER L C S EXAMPLE"),
108+
fmt.Println(stringosim.LCS([]rune("testing lcs algorithm"), []rune("ANOTHER L C S EXAMPLE"),
101109
stringosim.LCSSimilarityOptions{
102110
CaseInsensitive: true,
103111
}))
@@ -139,7 +147,7 @@ Example:
139147
fmt.Println(stringosim.QGram([]rune("abcde"), []rune("ABDCDE"),
140148
stringosim.QGramSimilarityOptions{
141149
CaseInsensitive: true,
142-
NGramLength: 3,
150+
NGramSizes: []int{3},
143151
}))
144152
```
145153

@@ -155,7 +163,6 @@ Example:
155163
fmt.Println(stringosim.Cosine([]rune("abcde"), []rune("ABDCDE"),
156164
stringosim.CosineSimilarityOptions{
157165
CaseInsensitive: true,
158-
NGramLength: 3,
166+
NGramSizes: []int{3},
159167
}))
160168
```
161-

cosine.go

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,43 @@
11
package stringosim
22

33
import (
4-
"strings"
4+
"strings"
55
)
66

77
type CosineSimilarityOptions struct {
8-
CaseInsensitive bool
9-
NGramLength int
8+
CaseInsensitive bool
9+
NGramSizes []int
1010
}
1111

1212
var DefaultCosineSimilarityOptions = CosineSimilarityOptions{
13-
CaseInsensitive: false,
14-
NGramLength: 2,
13+
CaseInsensitive: false,
14+
NGramSizes: []int{2},
1515
}
1616

1717
func Cosine(s []rune, t []rune, options ...CosineSimilarityOptions) float64 {
18-
if len(s) == 0 {
19-
if len(t) == 0 {
20-
return float64(0.0)
21-
} else {
22-
return float64(1.0)
23-
}
24-
} else {
25-
if len(t) == 0 {
26-
return float64(1.0)
27-
}
28-
}
29-
opt := DefaultCosineSimilarityOptions
30-
for _, option := range options {
31-
opt = option
32-
break
33-
}
34-
var sGrams, tGrams map[string]int
35-
if opt.CaseInsensitive {
36-
sGrams = GetNGram(strings.ToLower(string(s)), opt.NGramLength)
37-
tGrams = GetNGram(strings.ToLower(string(t)), opt.NGramLength)
38-
} else {
39-
sGrams = GetNGram(string(s), opt.NGramLength)
40-
tGrams = GetNGram(string(t), opt.NGramLength)
41-
}
42-
return 1.0 - float64(DotProductNGrams(sGrams, tGrams))/NormNGram(sGrams)/NormNGram(tGrams)
18+
if len(s) == 0 {
19+
if len(t) == 0 {
20+
return float64(0.0)
21+
} else {
22+
return float64(1.0)
23+
}
24+
} else {
25+
if len(t) == 0 {
26+
return float64(1.0)
27+
}
28+
}
29+
opt := DefaultCosineSimilarityOptions
30+
for _, option := range options {
31+
opt = option
32+
break
33+
}
34+
var sGrams, tGrams map[string]int
35+
if opt.CaseInsensitive {
36+
sGrams = GetNGram(strings.ToLower(string(s)), opt.NGramSizes)
37+
tGrams = GetNGram(strings.ToLower(string(t)), opt.NGramSizes)
38+
} else {
39+
sGrams = GetNGram(string(s), opt.NGramSizes)
40+
tGrams = GetNGram(string(t), opt.NGramSizes)
41+
}
42+
return 1.0 - float64(DotProductNGrams(sGrams, tGrams))/NormNGram(sGrams)/NormNGram(tGrams)
4343
}

cosine_test.go

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,41 @@
11
package stringosim
22

33
import (
4-
"testing"
4+
"testing"
55
)
66

77
var testCosineOptions1 CosineSimilarityOptions = CosineSimilarityOptions{
8-
CaseInsensitive: true,
9-
NGramLength: 3,
8+
CaseInsensitive: true,
9+
NGramSizes: []int{3},
1010
}
1111

1212
type CosineTest struct {
13-
src string
14-
trg string
15-
dis float64
16-
opt CosineSimilarityOptions
13+
src string
14+
trg string
15+
dis float64
16+
opt CosineSimilarityOptions
1717
}
1818

1919
var CosineTests = []CosineTest{
20-
{"", "", 0.0, DefaultCosineSimilarityOptions},
21-
{"xxxyyy", "xxxyyy", 0.0, DefaultCosineSimilarityOptions},
22-
{"xxxyyy", "yyyxxx", 0.1111111111111, DefaultCosineSimilarityOptions},
23-
{"xxyzxyyzzy", "xyyxzyzxyzyx", 0.2364582844290667, DefaultCosineSimilarityOptions},
24-
{"xxyzxyyzzy", "XYYXZYZXYZYX", 0.5527864045000421, testCosineOptions1},
25-
{"xxyyzz", "xxxzzz", 0.40371520600005606, DefaultCosineSimilarityOptions},
26-
{"asdlkajsdlkasdkj", "fkdsjlkdf", 0.8825559560970593, DefaultCosineSimilarityOptions},
27-
{"STRING", "sting", 1.0, DefaultCosineSimilarityOptions},
28-
{"STRING", "sting", 0.7113248654051871, testCosineOptions1},
29-
{"comparing the string similarity", "this is compared using Cosine similarity", 0.3519132720916943, DefaultCosineSimilarityOptions},
30-
{"comparing the string similarity", "this is compared using Cosine similarity", 0.5291024827582208, testCosineOptions1},
20+
{"", "", 0.0, DefaultCosineSimilarityOptions},
21+
{"xxxyyy", "xxxyyy", 0.0, DefaultCosineSimilarityOptions},
22+
{"xxxyyy", "yyyxxx", 0.1111111111111, DefaultCosineSimilarityOptions},
23+
{"xxyzxyyzzy", "xyyxzyzxyzyx", 0.2364582844290667, DefaultCosineSimilarityOptions},
24+
{"xxyzxyyzzy", "XYYXZYZXYZYX", 0.5527864045000421, testCosineOptions1},
25+
{"xxyyzz", "xxxzzz", 0.40371520600005606, DefaultCosineSimilarityOptions},
26+
{"asdlkajsdlkasdkj", "fkdsjlkdf", 0.8825559560970593, DefaultCosineSimilarityOptions},
27+
{"STRING", "sting", 1.0, DefaultCosineSimilarityOptions},
28+
{"STRING", "sting", 0.7113248654051871, testCosineOptions1},
29+
{"comparing the string similarity", "this is compared using Cosine similarity", 0.3519132720916943, DefaultCosineSimilarityOptions},
30+
{"comparing the string similarity", "this is compared using Cosine similarity", 0.5291024827582208, testCosineOptions1},
3131
}
3232

3333
func TestCosine(t *testing.T) {
34-
for _, test := range CosineTests {
35-
dis := Cosine([]rune(test.src), []rune(test.trg), test.opt)
36-
if !EqualFloat64(dis, test.dis) {
37-
t.Log("Cosine distance between", test.src, "and", test.trg, "is", dis, "but should be", test.dis)
38-
t.Fail()
39-
}
40-
}
34+
for _, test := range CosineTests {
35+
dis := Cosine([]rune(test.src), []rune(test.trg), test.opt)
36+
if !EqualFloat64(dis, test.dis) {
37+
t.Log("Cosine distance between", test.src, "and", test.trg, "is", dis, "but should be", test.dis)
38+
t.Fail()
39+
}
40+
}
4141
}

helper.go

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,19 @@ func EqualFloat64(x, y float64) bool {
3030
return math.Abs(x-y) < EPS
3131
}
3232

33-
func GetNGram(s string, n ...int) map[string]int {
34-
nGram := 1
35-
if len(n) > 0 {
36-
for _, v := range n {
37-
nGram = v
38-
break
39-
}
40-
}
41-
33+
func GetNGram(s string, NGramSizes []int) map[string]int {
4234
regExp := regexp.MustCompile(`\s+`)
4335
t := regExp.ReplaceAllString(s, " ")
4436
m := make(map[string]int)
45-
for i := 0; i <= len(t)-nGram; i++ {
46-
v := string(t[i:(i + nGram)])
47-
cnt, ok := m[v]
48-
if ok {
49-
m[v] = cnt + 1
50-
} else {
51-
m[v] = 1
37+
for _, nGram := range NGramSizes {
38+
for i := 0; i <= len(t)-nGram; i++ {
39+
v := string(t[i:(i + nGram)])
40+
cnt, ok := m[v]
41+
if ok {
42+
m[v] = cnt + 1
43+
} else {
44+
m[v] = 1
45+
}
5246
}
5347
}
5448
return m

jaccard.go

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,16 @@
11
package stringosim
22

3-
import ()
3+
func Jaccard(s []rune, t []rune, NGramSizes []int) float64 {
4+
sGrams := GetNGram(string(s), NGramSizes)
5+
tGrams := GetNGram(string(t), NGramSizes)
46

5-
func Jaccard(s []rune, t []rune, n ...int) float64 {
6-
nGram := 1
7-
if len(n) > 0 {
8-
for _, v := range n {
9-
nGram = v
10-
break
11-
}
12-
}
13-
sGrams := GetNGram(string(s), nGram)
14-
tGrams := GetNGram(string(t), nGram)
15-
16-
total := len(sGrams) + len(tGrams)
17-
intersection := 0
18-
for k, _ := range sGrams {
19-
_, ok := tGrams[k]
20-
if ok {
21-
intersection++
22-
}
23-
}
24-
return 1.0 - float64(intersection)/float64(total-intersection)
7+
total := len(sGrams) + len(tGrams)
8+
intersection := 0
9+
for k, _ := range sGrams {
10+
_, ok := tGrams[k]
11+
if ok {
12+
intersection++
13+
}
14+
}
15+
return 1.0 - float64(intersection)/float64(total-intersection)
2516
}

jaccard_test.go

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,39 @@
11
package stringosim
22

33
import (
4-
"testing"
4+
"testing"
55
)
66

77
type JaccardTest struct {
8-
src string
9-
trg string
10-
nGram int
11-
dis float64
8+
src string
9+
trg string
10+
NGramSizes []int
11+
dis float64
1212
}
1313

1414
var jaccardTests = []JaccardTest{
15-
{"abracadabra", "baccarda", 1, 0.0},
16-
{"abracadabra", "baccarda", 2, 0.7272727272727273},
17-
{"abracadabra", "baccarda", 3, 1.0},
18-
{"this is space test", "this is space test", 3, 0.0},
19-
{"just another test of jaccard", "i will test jaccard", 1, 0.47058823529411764},
20-
{"just another test of jaccard", "i will test jaccard", 2, 0.6129032258064516},
21-
{"just another test of jaccard", "i will test jaccard", 3, 0.6875},
22-
{"book is on the shelv", "buk is on the shelf", 1, 0.2142857142857143},
23-
{"book is on the shelv", "buk is on the shelf", 2, 0.33333333333333337},
24-
{"book is on the shelv", "buk is on the shelf", 3, 0.33333333333333337},
25-
{"cardiogram", "krdiogram", 1, 0.2222222222222222},
26-
{"cardiogram", "krdiogram", 2, 0.30000000000000004},
27-
{"cardiogram", "krdiogram", 3, 0.33333333333333337},
15+
{"abracadabra", "baccarda", []int{1}, 0.0},
16+
{"abracadabra", "baccarda", []int{2}, 0.7272727272727273},
17+
{"abracadabra", "baccarda", []int{3}, 1.0},
18+
{"this is space test", "this is space test", []int{3}, 0.0},
19+
{"just another test of jaccard", "i will test jaccard", []int{1}, 0.47058823529411764},
20+
{"just another test of jaccard", "i will test jaccard", []int{2}, 0.6129032258064516},
21+
{"just another test of jaccard", "i will test jaccard", []int{3}, 0.6875},
22+
{"book is on the shelv", "buk is on the shelf", []int{1}, 0.2142857142857143},
23+
{"book is on the shelv", "buk is on the shelf", []int{2}, 0.33333333333333337},
24+
{"book is on the shelv", "buk is on the shelf", []int{3}, 0.33333333333333337},
25+
{"cardiogram", "krdiogram", []int{1}, 0.2222222222222222},
26+
{"cardiogram", "krdiogram", []int{2}, 0.30000000000000004},
27+
{"cardiogram", "krdiogram", []int{3}, 0.33333333333333337},
2828
}
2929

3030
func TestJaccard(t *testing.T) {
31-
for _, test := range jaccardTests {
32-
dis := Jaccard([]rune(test.src), []rune(test.trg), test.nGram)
33-
if !EqualFloat64(dis, test.dis) {
34-
t.Log("Jaccard distance between", test.src, "and", test.trg, "is", dis, "but should be", test.dis)
35-
t.Fail()
36-
}
31+
for _, test := range jaccardTests {
32+
dis := Jaccard([]rune(test.src), []rune(test.trg), test.NGramSizes)
33+
if !EqualFloat64(dis, test.dis) {
34+
t.Log("Jaccard distance between", test.src, "and", test.trg, "is", dis, "but should be", test.dis)
35+
t.Fail()
36+
}
3737

38-
}
38+
}
3939
}

0 commit comments

Comments
 (0)