go-yt-ngram/word.go

120 lines
2.6 KiB
Go

package ngram
import (
"fmt"
"regexp"
"strings"
)
// Word is a single word together with its n-gram decomposition.
type Word struct {
	// Value is the word itself.
	Value string
	// Grams holds the n-grams of the word.
	Grams []string
	// Size is the length of each n-gram (the "n").
	Size int
}
// wordPrepare matches every character that is removed from a word before it
// is split into n-grams: whitespace and the punctuation characters
// . ! @ # $ % ^ & * _ ' " [ ] { } ( ) - + = \ /
var wordPrepare = regexp.MustCompile(`[\.!@#$%^&*_'"\[\]\{\}\(\)\-\+=\\\/\s\t]`)
// BuildWord cleans value and splits it into n-grams of size runes.
//
// Every character matched by wordPrepare is removed from value first. The
// cleaned text is stored in Word.Value unchanged, while the n-grams are built
// from its upper-cased form with duplicates removed.
//
// An error is returned when value is empty, when nothing is left after
// cleaning, when size is not positive, or when the cleaned word has fewer
// runes than size.
func BuildWord(value string, size int) (word *Word, err error) {
	if len(value) == 0 {
		return nil, fmt.Errorf(`should be set word for prepare`)
	}
	value = wordPrepare.ReplaceAllString(value, "")
	if len(value) == 0 {
		return nil, fmt.Errorf(`word not contains literal characters`)
	}
	if size <= 0 {
		// A zero size yields a single empty gram and a negative size makes
		// the slice expression in prepareGrams panic, so reject both here.
		return nil, fmt.Errorf(`size ngram should be positive, got %d`, size)
	}
	v := []rune(strings.ToUpper(value))
	// Compare the rune count, not the byte length: n-grams are cut from
	// runes, and multi-byte characters would otherwise let a too-short word
	// slip past this check.
	if len(v) < size {
		return nil, fmt.Errorf(`length word should be equal or more characters when size ngram`)
	}
	word = &Word{
		Value: value,
		Size:  size,
	}
	word.Grams = word.prepareGrams(v)
	if len(word.Grams) == 0 {
		return nil, fmt.Errorf(`word not contains grams`)
	}
	return word, nil
}
// prepareGrams cuts v into consecutive windows of word.Size runes and returns
// them as strings in order of first occurrence, with duplicates removed.
//
// It returns nil when word.Size is not positive or when v holds fewer runes
// than word.Size.
func (word *Word) prepareGrams(v []rune) (grams []string) {
	size := word.Size
	// A negative size would make v[i:i+size] panic and a zero size would
	// produce empty grams; neither describes a usable n-gram.
	if size <= 0 || len(v) < size {
		return nil
	}
	count := len(v) - size + 1
	grams = make([]string, 0, count)
	// seen tracks the grams already emitted so each one is kept only once
	// while the original order is preserved.
	seen := make(map[string]struct{}, count)
	for i := 0; i < count; i++ {
		g := string(v[i : i+size])
		if _, dup := seen[g]; dup {
			continue
		}
		seen[g] = struct{}{}
		grams = append(grams, g)
	}
	return grams
}
// Match returns a similarity coefficient in [0;1] between the word and value.
//
// value is cleaned with wordPrepare, upper-cased and split into n-grams of
// word.Size runes. The result is the number of grams of the longer gram list
// that also occur in the shorter one, divided by the length of the longer
// list; on a tie the grams of value are treated as the longer list.
//
// Match returns 0 when word.Size is not positive, when the word has no grams,
// or when the cleaned value has fewer runes than word.Size.
func (word *Word) Match(value string) float32 {
	// Without a positive size or any grams nothing can match; bailing out
	// early also keeps a hand-built Word with a negative Size from panicking.
	if word.Size <= 0 || len(word.Grams) == 0 {
		return 0
	}
	value = wordPrepare.ReplaceAllString(value, "")
	runes := []rune(strings.ToUpper(value))
	if len(runes) < word.Size {
		return 0
	}
	findGrams := word.prepareGrams(runes)
	if len(findGrams) == 0 {
		return 0
	}

	// Score against the longer list so that extra grams on either side
	// lower the coefficient.
	longer, shorter := findGrams, word.Grams
	if len(longer) < len(shorter) {
		longer, shorter = shorter, longer
	}

	known := make(map[string]struct{}, len(shorter))
	for _, g := range shorter {
		known[g] = struct{}{}
	}
	matched := 0
	for _, g := range longer {
		if _, ok := known[g]; ok {
			matched++
		}
	}
	return float32(matched) / float32(len(longer))
}