Albirew/nyaa-pantsu
Archivé
1
0
Bifurcation 0

Improve search and fix '*' in search box (#871)

* Improve ES search

The new performance is very good.
Some examples on my 1.5gb vm:
INFO[0153] Query 'shingeki' took 6 milliseconds.
INFO[0125] Query 'アニメ' took 17 milliseconds.
INFO[0102] Query 'shingeki -kyojin horrible ' took 12 milliseconds.

Looking at the criteria we wanted here:
https://pad.riseup.net/p/i8DrilHDWRvf, it meets:

1. Fast: sub-100ms for a typical query, sub-50ms is good and sub-20ms is
optimal
2. Prefix match: "horrible" finds horriblesubs
3. Substring match? "アニメ" finds "TVアニメ"
4. Position-independent terms ("shingeki kyojin" finds the same as
"kyojin shingeki")
5. Works with short term lengths correctly and fast (no in "kyoukai no
kanata", 04 in "horrible shingeki 04" etc)
6. (nice to have) search negation: shingeki kyojin -horriblesubs

* Use match_all query instead of *, fix *
Cette révision appartient à :
tomleb 2017-05-31 18:38:29 -04:00 révisé par ewhal
Parent 00a885af28
révision ba683c3bcb
4 fichiers modifiés avec 43 ajouts et 21 suppressions

Voir le fichier

@ -5,6 +5,7 @@ import (
"encoding/json" "encoding/json"
"net/http" "net/http"
"strconv" "strconv"
"strings"
"github.com/gorilla/mux" "github.com/gorilla/mux"
elastic "gopkg.in/olivere/elastic.v5" elastic "gopkg.in/olivere/elastic.v5"
@ -38,10 +39,7 @@ type TorrentParam struct {
func (p *TorrentParam) FromRequest(r *http.Request) { func (p *TorrentParam) FromRequest(r *http.Request) {
var err error var err error
nameLike := r.URL.Query().Get("q") nameLike := strings.TrimSpace(r.URL.Query().Get("q"))
if nameLike == "" {
nameLike = "*"
}
page := mux.Vars(r)["page"] page := mux.Vars(r)["page"]
pagenum, err := strconv.ParseUint(page, 10, 32) pagenum, err := strconv.ParseUint(page, 10, 32)
@ -143,10 +141,15 @@ func (p *TorrentParam) Find(client *elastic.Client) (int64, []model.Torrent, err
// TODO Why is it needed, what does it do ? // TODO Why is it needed, what does it do ?
ctx := context.Background() ctx := context.Background()
query := elastic.NewSimpleQueryStringQuery(p.NameLike). var query elastic.Query
Field("name"). if p.NameLike == "" {
Analyzer(config.Conf.Search.ElasticsearchAnalyzer). query = elastic.NewMatchAllQuery()
DefaultOperator("AND") } else {
query = elastic.NewSimpleQueryStringQuery(p.NameLike).
Field("name").
Analyzer(config.Conf.Search.ElasticsearchAnalyzer).
DefaultOperator("AND")
}
fsc := elastic.NewFetchSourceContext(true). fsc := elastic.NewFetchSourceContext(true).
Include("id") Include("id")

Voir le fichier

@ -34,8 +34,8 @@ cache:
# Size by default for the cache # Size by default for the cache
size: 1024 size: 1024
search: search:
# default analyzer for ES # default search analyzer for ES
es_analyze: nyaapantsu_analyzer es_analyze: nyaapantsu_search_analyzer
# default search index for ES # default search index for ES
es_index: nyaapantsu es_index: nyaapantsu
# Name of the type in the es mapping # Name of the type in the es mapping
@ -163,4 +163,4 @@ models:
# ReportsTableName = "sukebei_torrent_reports" # ReportsTableName = "sukebei_torrent_reports"
# CommentsTableName = "sukebei_comments" # CommentsTableName = "sukebei_comments"
# UploadsOldTableName = "sukebei_user_uploads_old" # UploadsOldTableName = "sukebei_user_uploads_old"
# FilesTableName = "sukebei_files" # FilesTableName = "sukebei_files"

Voir le fichier

@ -4,7 +4,6 @@ nyaapantsu_password: nyaapantsu
nyaapantsu_pgpool_port: 9998 nyaapantsu_pgpool_port: 9998
nyaapantsu_directory: /nyaapantsu/ nyaapantsu_directory: /nyaapantsu/
nyaapantsu_gpg_passphrase_file: "{{ nyaapantsu_directory }}/passphrase" nyaapantsu_gpg_passphrase_file: "{{ nyaapantsu_directory }}/passphrase"
nyaapantsu_elasticsearch_index: nyaapantsu
nyaapantsu_build_directory: go_nyaa/ nyaapantsu_build_directory: go_nyaa/
nyaapantsu_elasticsearch_alias: nyaapantsu nyaapantsu_elasticsearch_alias: nyaapantsu
# nyaapantsu_elasticsearch_alias: sukebei # nyaapantsu_elasticsearch_alias: sukebei

Voir le fichier

@ -2,19 +2,37 @@
settings: settings:
analysis: analysis:
analyzer: analyzer:
nyaapantsu_analyzer: # Don't use ngram for search otherwise 'horribleexample' would match
tokenizer: nyaapantsu_tokenizer # 'horriblesubs'
nyaapantsu_search_analyzer:
tokenizer: standard
filter: filter:
- standard - standard
- lowercase - lowercase
tokenizer: char_filter:
nyaapantsu_tokenizer: - dash_to_underscore
nyaapantsu_index_analyzer:
tokenizer: standard
filter:
- standard
- lowercase
- e_ngram_filter
char_filter:
- dash_to_underscore
filter:
e_ngram_filter:
type: edge_ngram type: edge_ngram
min_gram: 2 min_gram: 2
max_gram: 15 max_gram: 15
token_chars:
- letter char_filter:
- digit dash_to_underscore:
type: pattern_replace
pattern: "([^\\s]+)-(?=[^\\s]+)"
replacement: "$1_"
index: index:
number_of_shards: 1 number_of_shards: 1
number_of_replicas: 0 number_of_replicas: 0
@ -28,8 +46,10 @@ mappings:
type: long type: long
name: name:
type: text type: text
analyzer: nyaapantsu_analyzer analyzer: nyaapantsu_index_analyzer
fielddata: true # Use to sort by id because it is currently a text field fields:
raw:
type: keyword
category: category:
type: text type: text
sub_category: sub_category: