211 lines
6.4 KiB
Go
211 lines
6.4 KiB
Go
package ngram
|
||
|
||
import (
|
||
"fmt"
|
||
)
|
||
|
||
// Индексатор
|
||
type NGram struct {
|
||
Texts []*Text // Тексты
|
||
size int // Размерность n-грамм
|
||
}
|
||
|
||
// NGramSearchResult is a single search result.
type NGramSearchResult struct {
	// Value is the resulting string.
	Value string
	// Coeff is the match coefficient.
	Coeff float32
}
|
||
|
||
// Create new object NGram
|
||
func NewNGram(size int) (ngram *NGram) {
|
||
ngram = &NGram{
|
||
size: size,
|
||
}
|
||
return
|
||
}
|
||
|
||
// Подготовка текста
|
||
func (gram *NGram) AddText(value string) error {
|
||
text, err := BuildText(value, gram.size)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if text == nil {
|
||
err = fmt.Errorf(`text not parsed`)
|
||
} else {
|
||
gram.Texts = append(gram.Texts, text)
|
||
}
|
||
return err
|
||
}
|
||
|
||
// Поиск с установленным порогом совпадения
|
||
func (gram *NGram) SearchLimit(value string, limit float32) (result []NGramSearchResult) {
|
||
if len(gram.Texts) == 0 {
|
||
return
|
||
}
|
||
for _, text := range gram.Texts {
|
||
res := text.Match(value)
|
||
if res >= limit {
|
||
result = append(result, NGramSearchResult{
|
||
Value: text.Value,
|
||
Coeff: res,
|
||
})
|
||
}
|
||
}
|
||
return
|
||
}
|
||
|
||
// Поиск наиболее подходящего значения
|
||
func (gram *NGram) Search(value string) string {
|
||
if len(gram.Texts) == 0 {
|
||
return ""
|
||
}
|
||
results := gram.SearchLimit(value, 0.0)
|
||
if len(results) == 0 {
|
||
return ""
|
||
}
|
||
var maxRes NGramSearchResult
|
||
for _, r := range results {
|
||
if maxRes.Coeff < r.Coeff {
|
||
maxRes = r
|
||
}
|
||
}
|
||
return maxRes.Value
|
||
}
|
||
|
||
// Генерирование SQL-запроса для выполнения в БД
|
||
// Для выполнения данного запроса потребуется примерно следующая структура таблиц:
|
||
//
|
||
// CREATE TABLE texts (id BIGINT PRIMARY KEY, txt TEXT);
|
||
// CREATE TABLE words (id BIGINT PRIMARY KEY, word VARCHAR (150));
|
||
// CREATE TABLE ngram (id BIGINT PRIMARY KEY, ngram VARCHAR (20));
|
||
// CREATE TABLE word_and_text (id_text BIGINT REFERENCES texts (id), id_word BIGINT REFERENCES words (id));
|
||
// CREATE TABLE ngram_and_word (id_word BIGINT REFERENCES words (id), id_ngram BIGINT REFERENCES ngram (id));
|
||
//
|
||
// Соответственно таблицы и поля могут быть переименованы и дополнены в соответствии с нужными требованиями
|
||
func (gram *NGram) SearchLimitSQL(
|
||
tableNameTexts string,
|
||
pkColumnNameTexts string,
|
||
|
||
tableNameWords string,
|
||
pkColumnNameWords string,
|
||
|
||
tableNameTextsAndWords string,
|
||
fkColumnNameWordsForTableWords string,
|
||
fkColumnNameTextsForTableTexts string,
|
||
|
||
tableNameNGrams string,
|
||
pkColumnNameNGrams string,
|
||
|
||
tableNameWordsAndNGrams string,
|
||
columnNameNGram string,
|
||
fkColumnNameWords2ForTableWords string,
|
||
fkColumnNameNGramsForTableNGrams string,
|
||
) string {
|
||
/*
|
||
explain SELECT t.id, sum(ww.w_coeff)/count(ww.w_coeff) AS coeff
|
||
FROM texts AS t
|
||
INNER JOIN word_and_text AS wat ON t.id = wat.id_text
|
||
INNER JOIN (
|
||
|
||
SELECT w.id, naw1.c_naw, count(w.id) AS w_c, (naw1.c_naw/count(w.id)) AS w_coeff
|
||
FROM words AS w
|
||
INNER JOIN ngram_and_word AS naw ON naw.id_word = w.id
|
||
INNER JOIN (
|
||
SELECT naw.id_word, count(naw.id_ngram) as c_naw
|
||
FROM ngram_and_word AS naw
|
||
GROUP BY naw.id_word
|
||
) AS naw1 ON naw1.id_word = w.id
|
||
INNER JOIN (
|
||
SELECT id FROM ngram WHERE ngram.ngram IN ('a','b')
|
||
) AS n ON n.id = naw.id_ngram
|
||
|
||
) AS ww ON ww.id = wat.id_word
|
||
*/
|
||
/*
|
||
SELECT t.Name, sum(ww.w_coeff)/count(ww.w_coeff) AS coeff
|
||
FROM artists AS t
|
||
INNER JOIN ngram_word_and_artists AS wat ON t.Name = wat.id_artist
|
||
INNER JOIN (
|
||
|
||
SELECT w.id, naw1.c_naw, count(w.id) AS w_c, (naw1.c_naw/count(w.id)) AS w_coeff
|
||
FROM ngram_words AS w
|
||
INNER JOIN ngram_and_words AS naw ON naw.id_word = w.id
|
||
INNER JOIN (
|
||
SELECT naw.id_word, count(naw.id_word) as c_naw
|
||
FROM ngram_and_words AS naw
|
||
GROUP BY naw.id_word
|
||
) AS naw1 ON naw1.id_word = w.id
|
||
INNER JOIN (
|
||
SELECT id FROM ngram_ngrams AS ngram WHERE ngram.ngram IN ('POP','OPO','POV')
|
||
) AS n ON n.id = naw.id_ngram
|
||
GROUP BY w.id
|
||
|
||
) AS ww ON ww.id = wat.id_word
|
||
*/
|
||
sql := `
|
||
SELECT t.` + pkColumnNameTexts + `, (sum(ww.w_coeff) * 1.0)/(count(ww.w_coeff) * 1.0) AS coeff
|
||
FROM ` + tableNameTexts + ` AS t
|
||
INNER JOIN ` + tableNameTextsAndWords + ` AS wat ON t.` + pkColumnNameTexts + ` = wat.` + fkColumnNameTextsForTableTexts + `
|
||
INNER JOIN (
|
||
|
||
SELECT w.` + pkColumnNameWords + `, naw1.c_naw, count(w.` + pkColumnNameWords + ` * 1.0) AS w_c, ((count(w.` + pkColumnNameWords + `) * 1.0)/(naw1.c_naw * 1.0)) AS w_coeff
|
||
FROM ` + tableNameWords + ` AS w
|
||
INNER JOIN ` + tableNameWordsAndNGrams + ` AS naw ON naw.` + fkColumnNameWords2ForTableWords + ` = w.` + pkColumnNameWords + `
|
||
INNER JOIN (
|
||
SELECT naw.` + fkColumnNameWords2ForTableWords + `, count(naw.` + fkColumnNameWords2ForTableWords + `) * 1.0 as c_naw
|
||
FROM ` + tableNameWordsAndNGrams + ` AS naw
|
||
GROUP BY naw.` + fkColumnNameWords2ForTableWords + `
|
||
) AS naw1 ON naw1.` + fkColumnNameWords2ForTableWords + ` = w.` + pkColumnNameWords + `
|
||
INNER JOIN (
|
||
SELECT ` + pkColumnNameNGrams + ` FROM ` + tableNameNGrams + ` AS ngram WHERE ngram.ngram IN ?
|
||
) AS n ON n.` + pkColumnNameNGrams + ` = naw.` + fkColumnNameNGramsForTableNGrams + `
|
||
GROUP BY w.` + pkColumnNameWords + `
|
||
) AS ww ON ww.id = wat.` + fkColumnNameWordsForTableWords + `
|
||
INNER JOIN (
|
||
SELECT ` + fkColumnNameTextsForTableTexts + `, count(` + fkColumnNameTextsForTableTexts + `) AS c_t FROM ` + tableNameTextsAndWords + ` GROUP BY ` + fkColumnNameTextsForTableTexts + `
|
||
) AS ct ON ct.` + fkColumnNameTextsForTableTexts + ` = t.id
|
||
GROUP BY t.` + pkColumnNameTexts + `
|
||
ORDER BY 2 DESC
|
||
`
|
||
return sql
|
||
}
|
||
|
||
// Список всех n-грамм
|
||
func (gram *NGram) ListGrams() (grams []string) {
|
||
gs := make(map[string]bool)
|
||
for _, t := range gram.Texts {
|
||
for _, w := range t.Words {
|
||
for _, g := range w.Grams {
|
||
gs[g] = true
|
||
}
|
||
}
|
||
}
|
||
for k := range gs {
|
||
grams = append(grams, k)
|
||
}
|
||
return
|
||
}
|
||
|
||
// Список всех слов
|
||
func (gram *NGram) ListWords() (words []string) {
|
||
ws := make(map[string]bool)
|
||
for _, t := range gram.Texts {
|
||
for _, w := range t.Words {
|
||
ws[w.Value] = true
|
||
}
|
||
}
|
||
for k := range ws {
|
||
words = append(words, k)
|
||
}
|
||
return
|
||
}
|
||
|
||
// Список всех текстов
|
||
func (gram *NGram) ListTexts() (texts []string) {
|
||
for _, t := range gram.Texts {
|
||
texts = append(texts, t.Value)
|
||
}
|
||
return
|
||
}
|