Albirew/nyaa-pantsu
Albirew
/
nyaa-pantsu
Archivé
1
0
Bifurcation 0

Improve search and fix '*' in search box (#871)

* Improve ES search

The new performance is very good.
Some examples on my 1.5gb vm:
INFO[0153] Query 'shingeki' took 6 milliseconds.
INFO[0125] Query 'アニメ' took 17 milliseconds.
INFO[0102] Query 'shingeki -kyojin horrible ' took 12 milliseconds.

Looking at the criteria we wanted here:
https://pad.riseup.net/p/i8DrilHDWRvf, it meets:

1. Fast: sub-100ms for a typical query, sub-50ms is good and sub-20ms is
optimal
2. Prefix match: "horrible" finds horriblesubs
3. Substring match: "アニメ" finds "TVアニメ"
4. Position-independent terms ("shingeki kyojin" finds the same as
"kyojin shingeki")
5. Works with short term lengths correctly and fast (no in "kyoukai no
kanata", 04 in "horrible shingeki 04" etc)
6. (nice to have) search negation: shingeki kyojin -horriblesubs

* Use match_all query instead of *, fix *
Cette révision appartient à :
tomleb 2017-05-31 18:38:29 -04:00 révisé par ewhal
Parent 00a885af28
révision ba683c3bcb
4 fichiers modifiés avec 43 ajouts et 21 suppressions

Voir le fichier

@ -5,6 +5,7 @@ import (
"encoding/json"
"net/http"
"strconv"
"strings"
"github.com/gorilla/mux"
elastic "gopkg.in/olivere/elastic.v5"
@ -38,10 +39,7 @@ type TorrentParam struct {
func (p *TorrentParam) FromRequest(r *http.Request) {
var err error
nameLike := r.URL.Query().Get("q")
if nameLike == "" {
nameLike = "*"
}
nameLike := strings.TrimSpace(r.URL.Query().Get("q"))
page := mux.Vars(r)["page"]
pagenum, err := strconv.ParseUint(page, 10, 32)
@ -143,10 +141,15 @@ func (p *TorrentParam) Find(client *elastic.Client) (int64, []model.Torrent, err
// TODO Why is it needed, what does it do ?
ctx := context.Background()
query := elastic.NewSimpleQueryStringQuery(p.NameLike).
Field("name").
Analyzer(config.Conf.Search.ElasticsearchAnalyzer).
DefaultOperator("AND")
var query elastic.Query
if p.NameLike == "" {
query = elastic.NewMatchAllQuery()
} else {
query = elastic.NewSimpleQueryStringQuery(p.NameLike).
Field("name").
Analyzer(config.Conf.Search.ElasticsearchAnalyzer).
DefaultOperator("AND")
}
fsc := elastic.NewFetchSourceContext(true).
Include("id")

Voir le fichier

@ -34,8 +34,8 @@ cache:
# Size by default for the cache
size: 1024
search:
# default analyzer for ES
es_analyze: nyaapantsu_analyzer
# default search analyzer for ES
es_analyze: nyaapantsu_search_analyzer
# default search index for ES
es_index: nyaapantsu
# Name of the type in the es mapping
@ -163,4 +163,4 @@ models:
# ReportsTableName = "sukebei_torrent_reports"
# CommentsTableName = "sukebei_comments"
# UploadsOldTableName = "sukebei_user_uploads_old"
# FilesTableName = "sukebei_files"
# FilesTableName = "sukebei_files"

Voir le fichier

@ -4,7 +4,6 @@ nyaapantsu_password: nyaapantsu
nyaapantsu_pgpool_port: 9998
nyaapantsu_directory: /nyaapantsu/
nyaapantsu_gpg_passphrase_file: "{{ nyaapantsu_directory }}/passphrase"
nyaapantsu_elasticsearch_index: nyaapantsu
nyaapantsu_build_directory: go_nyaa/
nyaapantsu_elasticsearch_alias: nyaapantsu
# nyaapantsu_elasticsearch_alias: sukebei

Voir le fichier

@ -2,19 +2,37 @@
settings:
analysis:
analyzer:
nyaapantsu_analyzer:
tokenizer: nyaapantsu_tokenizer
# Don't use ngram for search otherwise 'horribleexample' would match
# 'horriblesubs'
nyaapantsu_search_analyzer:
tokenizer: standard
filter:
- standard
- lowercase
tokenizer:
nyaapantsu_tokenizer:
char_filter:
- dash_to_underscore
nyaapantsu_index_analyzer:
tokenizer: standard
filter:
- standard
- lowercase
- e_ngram_filter
char_filter:
- dash_to_underscore
filter:
e_ngram_filter:
type: edge_ngram
min_gram: 2
max_gram: 15
token_chars:
- letter
- digit
char_filter:
dash_to_underscore:
type: pattern_replace
pattern: "([^\\s]+)-(?=[^\\s]+)"
replacement: "$1_"
index:
number_of_shards: 1
number_of_replicas: 0
@ -28,8 +46,10 @@ mappings:
type: long
name:
type: text
analyzer: nyaapantsu_analyzer
fielddata: true # Use to sort by id because it is currently a text field
analyzer: nyaapantsu_index_analyzer
fields:
raw:
type: keyword
category:
type: text
sub_category: