89 lines
1.8 KiB
Go
89 lines
1.8 KiB
Go
package ngram
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// textPrepare matches a run of two or more whitespace characters; the
// functions in this file replace each such run with a single space.
var textPrepare = regexp.MustCompile(`[\s\t][\s\t]+`)
|
|
|
|
// Обработка текста
|
|
type Text struct {
|
|
Value string // Текст (значение)
|
|
Words []*Word // Слова
|
|
Size int // Размерность n-грамм
|
|
}
|
|
|
|
// Разбивка текста на слова
|
|
func BuildText(value string, size int) (text *Text, err error) {
|
|
if len(value) == 0 {
|
|
return nil, fmt.Errorf(`should be set text for prepare`)
|
|
}
|
|
v := strings.Trim(textPrepare.ReplaceAllString(value, " "), " ")
|
|
if len(v) == 0 {
|
|
return nil, fmt.Errorf(`word not contains literal characters`)
|
|
}
|
|
text = &Text{
|
|
Value: v,
|
|
Size: size,
|
|
}
|
|
text.Words = text.prepareWords(v)
|
|
|
|
if len(text.Words) == 0 {
|
|
err = fmt.Errorf(`not have words`)
|
|
text = nil
|
|
}
|
|
return
|
|
}
|
|
|
|
// Подготовка слов
|
|
func (text *Text) prepareWords(v string) (words []*Word) {
|
|
v1 := strings.Split(v, " ")
|
|
for _, v2 := range v1 {
|
|
var word *Word
|
|
var err error
|
|
word, err = BuildWord(v2, text.Size)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
words = append(words, word)
|
|
}
|
|
return
|
|
}
|
|
|
|
// Определение соответствия такста заданному значению
|
|
func (text *Text) Match(value string) float32 {
|
|
if len(value) == 0 {
|
|
return .0
|
|
}
|
|
v := strings.Trim(textPrepare.ReplaceAllString(value, " "), " ")
|
|
if len(v) == 0 {
|
|
return .0
|
|
}
|
|
|
|
words := text.prepareWords(v)
|
|
var minWords []*Word
|
|
if len(words) < len(text.Words) {
|
|
minWords = words
|
|
words = text.Words
|
|
} else {
|
|
minWords = text.Words
|
|
}
|
|
|
|
sumCoeff := float32(0.0)
|
|
countCoeff := 0
|
|
for _, w1 := range words {
|
|
maxCoeff := float32(0.0)
|
|
for _, w2 := range minWords {
|
|
if res := w1.Match(w2.Value); res > maxCoeff {
|
|
maxCoeff = res
|
|
}
|
|
}
|
|
sumCoeff += maxCoeff
|
|
countCoeff++
|
|
}
|
|
|
|
return sumCoeff / float32(countCoeff)
|
|
}
|