go-yt-ngram/ngram.go

211 lines
6.4 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package ngram
import (
"fmt"
)
// NGram is the indexer: it holds the texts that have been added to it
// together with the n-gram size handed to BuildText for each new text.
type NGram struct {
Texts []*Text // Indexed texts
size int // Size of the n-grams
}
// NGramSearchResult is a single search result.
type NGramSearchResult struct {
Value string // Resulting string
Coeff float32 // Match coefficient
}
// NewNGram returns a new indexer configured with the given n-gram size and
// holding no texts.
func NewNGram(size int) (ngram *NGram) {
	ngram = new(NGram)
	ngram.size = size
	return ngram
}
// AddText prepares value for searching: it builds a Text from it with the
// indexer's n-gram size and appends the result to gram.Texts.
//
// The error reported by BuildText is returned as is. If BuildText reports no
// error but yields a nil text, a "text not parsed" error is returned. In both
// failure cases gram.Texts is left unchanged.
func (gram *NGram) AddText(value string) error {
	parsed, err := BuildText(value, gram.size)
	switch {
	case err != nil:
		return err
	case parsed == nil:
		return fmt.Errorf(`text not parsed`)
	}
	gram.Texts = append(gram.Texts, parsed)
	return nil
}
// SearchLimit searches with a match threshold: value is matched against every
// indexed text, and each text whose match coefficient is greater than or equal
// to limit is reported, in the order the texts are stored.
//
// The result is nil when no text reaches the threshold.
func (gram *NGram) SearchLimit(value string, limit float32) (result []NGramSearchResult) {
	for i := range gram.Texts {
		candidate := gram.Texts[i]
		if score := candidate.Match(value); score >= limit {
			hit := NGramSearchResult{Value: candidate.Value, Coeff: score}
			result = append(result, hit)
		}
	}
	return result
}
// Search returns the value of the indexed text that matches value best.
//
// Only texts with a strictly positive match coefficient are considered; among
// equal coefficients the text stored first wins. An empty string is returned
// when the index is empty or no text has a positive coefficient.
func (gram *NGram) Search(value string) string {
	var (
		bestValue string
		bestCoeff float32
	)
	for _, text := range gram.Texts {
		if coeff := text.Match(value); coeff > bestCoeff {
			bestValue, bestCoeff = text.Value, coeff
		}
	}
	return bestValue
}
// SearchLimitSQL generates an SQL query that performs the search inside a
// database. Running the query requires roughly the following table structure:
//
//	CREATE TABLE texts (id BIGINT PRIMARY KEY, txt TEXT);
//	CREATE TABLE words (id BIGINT PRIMARY KEY, word VARCHAR (150));
//	CREATE TABLE ngram (id BIGINT PRIMARY KEY, ngram VARCHAR (20));
//	CREATE TABLE word_and_text (id_text BIGINT REFERENCES texts (id), id_word BIGINT REFERENCES words (id));
//	CREATE TABLE ngram_and_word (id_word BIGINT REFERENCES words (id), id_ngram BIGINT REFERENCES ngram (id));
//
// Tables and columns may be renamed and extended as needed; the actual names
// are supplied through the parameters:
//
//   - tableNameTexts / pkColumnNameTexts: texts table and its primary key.
//   - tableNameWords / pkColumnNameWords: words table and its primary key.
//   - tableNameTextsAndWords: text-to-word link table, with
//     fkColumnNameWordsForTableWords and fkColumnNameTextsForTableTexts as its
//     references to the words and texts tables.
//   - tableNameNGrams / pkColumnNameNGrams / columnNameNGram: n-grams table,
//     its primary key and the column holding the n-gram itself.
//   - tableNameWordsAndNGrams: word-to-n-gram link table, with
//     fkColumnNameWords2ForTableWords and fkColumnNameNGramsForTableNGrams as
//     its references to the words and n-grams tables.
//
// The returned query selects the texts primary key and a coefficient per text,
// ordered by the coefficient in descending order; it applies no threshold
// itself. It contains a single "?" placeholder after IN for the list of
// n-grams to look up (presumably expanded by the caller's database layer —
// confirm against the driver in use).
//
// All names are concatenated into the query verbatim, so they must come from
// trusted code, never from user input. The receiver is not used.
func (gram *NGram) SearchLimitSQL(
	tableNameTexts string,
	pkColumnNameTexts string,
	tableNameWords string,
	pkColumnNameWords string,
	tableNameTextsAndWords string,
	fkColumnNameWordsForTableWords string,
	fkColumnNameTextsForTableTexts string,
	tableNameNGrams string,
	pkColumnNameNGrams string,
	tableNameWordsAndNGrams string,
	columnNameNGram string,
	fkColumnNameWords2ForTableWords string,
	fkColumnNameNGramsForTableNGrams string,
) string {
	/*
		Reference query for the default schema shown in the doc comment:

		explain SELECT t.id, sum(ww.w_coeff)/count(ww.w_coeff) AS coeff
		FROM texts AS t
		INNER JOIN word_and_text AS wat ON t.id = wat.id_text
		INNER JOIN (
		SELECT w.id, naw1.c_naw, count(w.id) AS w_c, (naw1.c_naw/count(w.id)) AS w_coeff
		FROM words AS w
		INNER JOIN ngram_and_word AS naw ON naw.id_word = w.id
		INNER JOIN (
		SELECT naw.id_word, count(naw.id_ngram) as c_naw
		FROM ngram_and_word AS naw
		GROUP BY naw.id_word
		) AS naw1 ON naw1.id_word = w.id
		INNER JOIN (
		SELECT id FROM ngram WHERE ngram.ngram IN ('a','b')
		) AS n ON n.id = naw.id_ngram
		) AS ww ON ww.id = wat.id_word
	*/
	/*
		The same query against a renamed schema:

		SELECT t.Name, sum(ww.w_coeff)/count(ww.w_coeff) AS coeff
		FROM artists AS t
		INNER JOIN ngram_word_and_artists AS wat ON t.Name = wat.id_artist
		INNER JOIN (
		SELECT w.id, naw1.c_naw, count(w.id) AS w_c, (naw1.c_naw/count(w.id)) AS w_coeff
		FROM ngram_words AS w
		INNER JOIN ngram_and_words AS naw ON naw.id_word = w.id
		INNER JOIN (
		SELECT naw.id_word, count(naw.id_word) as c_naw
		FROM ngram_and_words AS naw
		GROUP BY naw.id_word
		) AS naw1 ON naw1.id_word = w.id
		INNER JOIN (
		SELECT id FROM ngram_ngrams AS ngram WHERE ngram.ngram IN ('POP','OPO','POV')
		) AS n ON n.id = naw.id_ngram
		GROUP BY w.id
		) AS ww ON ww.id = wat.id_word
	*/

	// Every column reference below is built from the corresponding parameter.
	// The join conditions on ww and ct and the n-gram filter must use the
	// caller-supplied names as well, otherwise the query only works when the
	// key columns are literally called "id" and the n-gram column "ngram".
	// The "* 1.0" factors force non-integer division.
	query := `
SELECT t.` + pkColumnNameTexts + `, (sum(ww.w_coeff) * 1.0)/(count(ww.w_coeff) * 1.0) AS coeff
FROM ` + tableNameTexts + ` AS t
INNER JOIN ` + tableNameTextsAndWords + ` AS wat ON t.` + pkColumnNameTexts + ` = wat.` + fkColumnNameTextsForTableTexts + `
INNER JOIN (
SELECT w.` + pkColumnNameWords + `, naw1.c_naw, count(w.` + pkColumnNameWords + `) * 1.0 AS w_c, ((count(w.` + pkColumnNameWords + `) * 1.0)/(naw1.c_naw * 1.0)) AS w_coeff
FROM ` + tableNameWords + ` AS w
INNER JOIN ` + tableNameWordsAndNGrams + ` AS naw ON naw.` + fkColumnNameWords2ForTableWords + ` = w.` + pkColumnNameWords + `
INNER JOIN (
SELECT naw.` + fkColumnNameWords2ForTableWords + `, count(naw.` + fkColumnNameWords2ForTableWords + `) * 1.0 as c_naw
FROM ` + tableNameWordsAndNGrams + ` AS naw
GROUP BY naw.` + fkColumnNameWords2ForTableWords + `
) AS naw1 ON naw1.` + fkColumnNameWords2ForTableWords + ` = w.` + pkColumnNameWords + `
INNER JOIN (
SELECT ` + pkColumnNameNGrams + ` FROM ` + tableNameNGrams + ` AS ngram WHERE ngram.` + columnNameNGram + ` IN ?
) AS n ON n.` + pkColumnNameNGrams + ` = naw.` + fkColumnNameNGramsForTableNGrams + `
GROUP BY w.` + pkColumnNameWords + `
) AS ww ON ww.` + pkColumnNameWords + ` = wat.` + fkColumnNameWordsForTableWords + `
INNER JOIN (
SELECT ` + fkColumnNameTextsForTableTexts + `, count(` + fkColumnNameTextsForTableTexts + `) AS c_t FROM ` + tableNameTextsAndWords + ` GROUP BY ` + fkColumnNameTextsForTableTexts + `
) AS ct ON ct.` + fkColumnNameTextsForTableTexts + ` = t.` + pkColumnNameTexts + `
GROUP BY t.` + pkColumnNameTexts + `
ORDER BY 2 DESC
`
	return query
}
// ListGrams returns every distinct n-gram found in the words of the indexed
// texts.
//
// Each n-gram is listed once, in order of first occurrence (texts, then words,
// then n-grams, in stored order), so repeated calls on the same index return
// the same slice contents. The result is nil when there are no n-grams.
func (gram *NGram) ListGrams() (grams []string) {
	seen := make(map[string]struct{})
	for _, t := range gram.Texts {
		for _, w := range t.Words {
			for _, g := range w.Grams {
				if _, dup := seen[g]; dup {
					continue
				}
				seen[g] = struct{}{}
				grams = append(grams, g)
			}
		}
	}
	return grams
}
// ListWords returns every distinct word value found in the indexed texts.
//
// Each word is listed once, in order of first occurrence (texts, then words,
// in stored order), so repeated calls on the same index return the same slice
// contents. The result is nil when there are no words.
func (gram *NGram) ListWords() (words []string) {
	seen := make(map[string]struct{})
	for _, t := range gram.Texts {
		for _, w := range t.Words {
			if _, dup := seen[w.Value]; dup {
				continue
			}
			seen[w.Value] = struct{}{}
			words = append(words, w.Value)
		}
	}
	return words
}
// ListTexts returns the values of all indexed texts in the order they are
// stored, duplicates included. The result is nil when the index is empty.
func (gram *NGram) ListTexts() (texts []string) {
	for i := 0; i < len(gram.Texts); i++ {
		texts = append(texts, gram.Texts[i].Value)
	}
	return texts
}