120 lines
2.6 KiB
Go
120 lines
2.6 KiB
Go
package ngram
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// Word is a cleaned word together with the n-grams built from it.
//
// The field descriptions below hold for values returned by BuildWord.
type Word struct {
	Value string   // the word with separator characters stripped; letter case is left as given
	Grams []string // distinct upper-cased n-grams of Value, in order of first occurrence
	Size  int      // n-gram length, counted in runes
}

// wordPrepare matches the characters that are stripped from a word before it
// is split into n-grams: . ! @ # $ % ^ & * _ ' " [ ] { } ( ) - + = \ / and
// any whitespace.
var wordPrepare = regexp.MustCompile(`[\.!@#$%^&*_'"\[\]\{\}\(\)\-\+=\\\/\s\t]`)

// BuildWord strips separator characters from value, upper-cases the result
// and splits it into distinct n-grams of size runes.
//
// It returns an error when value is empty, when nothing is left after
// stripping, when size is not positive, or when the stripped word has fewer
// than size runes.
func BuildWord(value string, size int) (*Word, error) {
	if len(value) == 0 {
		return nil, fmt.Errorf(`should be set word for prepare`)
	}
	value = wordPrepare.ReplaceAllString(value, "")
	if len(value) == 0 {
		return nil, fmt.Errorf(`word not contains literal characters`)
	}
	// A non-positive size would produce out-of-range slice bounds (negative)
	// or a single empty gram (zero), so reject it explicitly.
	if size <= 0 {
		return nil, fmt.Errorf("ngram size must be positive, got %d", size)
	}
	runes := []rune(strings.ToUpper(value))
	// Compare rune count, not byte length: multi-byte characters would
	// otherwise let a too-short word slip past this check.
	if len(runes) < size {
		return nil, fmt.Errorf(`length word should be equal or more characters when size ngram`)
	}
	w := &Word{
		Value: value,
		Size:  size,
	}
	// The checks above guarantee at least one n-gram.
	w.Grams = w.prepareGrams(runes)
	return w, nil
}

// prepareGrams slides a window of w.Size runes over v and returns the
// distinct n-grams in order of first occurrence. It returns nil when w.Size
// is not positive or v holds fewer than w.Size runes.
func (w *Word) prepareGrams(v []rune) []string {
	if w.Size <= 0 || len(v) < w.Size {
		return nil
	}
	windows := len(v) - w.Size + 1
	grams := make([]string, 0, windows)
	seen := make(map[string]struct{}, windows)
	for i := 0; i < windows; i++ {
		g := string(v[i : i+w.Size])
		if _, dup := seen[g]; dup {
			continue // keep only the first occurrence of each n-gram
		}
		seen[g] = struct{}{}
		grams = append(grams, g)
	}
	return grams
}

// Match returns a similarity coefficient in [0, 1] between w and value.
//
// value is cleaned and split into n-grams the same way BuildWord does it.
// The result is the number of n-grams of the longer gram list that also occur
// in the shorter one, divided by the length of the longer list. Match returns
// 0 when value yields no n-grams or when w.Size is not positive.
func (w *Word) Match(value string) float32 {
	if w.Size <= 0 || len(value) == 0 {
		return 0
	}
	value = wordPrepare.ReplaceAllString(value, "")
	if len(value) == 0 {
		return 0
	}
	// prepareGrams returns nil when the cleaned value is shorter than w.Size.
	candidate := w.prepareGrams([]rune(strings.ToUpper(value)))
	if len(candidate) == 0 {
		return 0
	}

	// Iterate over the longer list and look each gram up in the shorter one.
	larger, smaller := candidate, w.Grams
	if len(larger) < len(smaller) {
		larger, smaller = smaller, larger
	}
	lookup := make(map[string]struct{}, len(smaller))
	for _, g := range smaller {
		lookup[g] = struct{}{}
	}
	common := 0
	for _, g := range larger {
		if _, ok := lookup[g]; ok {
			common++
		}
	}
	return float32(common) / float32(len(larger))
}
|