From ba683c3bcbc8c64680f3a7ab5043c646b64c8a7d Mon Sep 17 00:00:00 2001 From: tomleb Date: Wed, 31 May 2017 18:38:29 -0400 Subject: [PATCH] Improve search and fix '*' in search box (#871) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Improve ES search The new performance is very good. Some examples on my 1.5gb vm: INFO[0153] Query 'shingeki' took 6 milliseconds. INFO[0125] Query 'アニメ' took 17 milliseconds. INFO[0102] Query 'shingeki -kyojin horrible ' took 12 milliseconds Looking at the criteria we wanted here: https://pad.riseup.net/p/i8DrilHDWRvf, it meets: 1. Fast: sub-100ms for a typical query, sub-50ms is good and sub-20ms is optimal 2. Prefix match: "horrible" finds horriblesubs 3. Substring match? "アニメ" finds "TVアニメ" 4. Position-independent terms ("shingeki kyojin" finds the same as "kyojin shingeki") 5. Works with short term lengths correctly and fast (no in "kyoukai no kanata", 04 in "horrible shingeki 04" etc) 7. (nice to have) search negation: shingeki kyojin -horriblesubs * Use match_all query instead of *, fix * --- common/torrent.go | 19 ++++++---- config/default_config.yml | 6 +-- deploy/ansible/group_vars/all | 1 - .../files/elasticsearch_settings.yml | 38 ++++++++++++++----- 4 files changed, 43 insertions(+), 21 deletions(-) diff --git a/common/torrent.go b/common/torrent.go index 757c95a0..930e3147 100644 --- a/common/torrent.go +++ b/common/torrent.go @@ -5,6 +5,7 @@ import ( "encoding/json" "net/http" "strconv" + "strings" "github.com/gorilla/mux" elastic "gopkg.in/olivere/elastic.v5" @@ -38,10 +39,7 @@ type TorrentParam struct { func (p *TorrentParam) FromRequest(r *http.Request) { var err error - nameLike := r.URL.Query().Get("q") - if nameLike == "" { - nameLike = "*" - } + nameLike := strings.TrimSpace(r.URL.Query().Get("q")) page := mux.Vars(r)["page"] pagenum, err := strconv.ParseUint(page, 10, 32) @@ -143,10 +141,15 @@ func (p *TorrentParam) Find(client *elastic.Client) (int64, []model.Torrent, err // TODO Why is it needed, what does it do ? ctx := context.Background() - query := elastic.NewSimpleQueryStringQuery(p.NameLike). - Field("name"). - Analyzer(config.Conf.Search.ElasticsearchAnalyzer). - DefaultOperator("AND") + var query elastic.Query + if p.NameLike == "" { + query = elastic.NewMatchAllQuery() + } else { + query = elastic.NewSimpleQueryStringQuery(p.NameLike). + Field("name"). + Analyzer(config.Conf.Search.ElasticsearchAnalyzer). + DefaultOperator("AND") + } fsc := elastic.NewFetchSourceContext(true). Include("id") diff --git a/config/default_config.yml b/config/default_config.yml index 5d77bd26..5bf06eb3 100644 --- a/config/default_config.yml +++ b/config/default_config.yml @@ -34,8 +34,8 @@ cache: # Size by default for the cache size: 1024 search: -# default analyzer for ES - es_analyze: nyaapantsu_analyzer +# default search analyzer for ES + es_analyze: nyaapantsu_search_analyzer # default search index for ES es_index: nyaapantsu # Name of the type in the es mapping @@ -163,4 +163,4 @@ models: # ReportsTableName = "sukebei_torrent_reports" # CommentsTableName = "sukebei_comments" # UploadsOldTableName = "sukebei_user_uploads_old" -# FilesTableName = "sukebei_files" \ No newline at end of file +# FilesTableName = "sukebei_files" diff --git a/deploy/ansible/group_vars/all b/deploy/ansible/group_vars/all index a72ae917..94d0208f 100644 --- a/deploy/ansible/group_vars/all +++ b/deploy/ansible/group_vars/all @@ -4,7 +4,6 @@ nyaapantsu_password: nyaapantsu nyaapantsu_pgpool_port: 9998 nyaapantsu_directory: /nyaapantsu/ nyaapantsu_gpg_passphrase_file: "{{ nyaapantsu_directory }}/passphrase" -nyaapantsu_elasticsearch_index: nyaapantsu nyaapantsu_build_directory: go_nyaa/ nyaapantsu_elasticsearch_alias: nyaapantsu # nyaapantsu_elasticsearch_alias: sukebei diff --git a/deploy/ansible/roles/elasticsearch/files/elasticsearch_settings.yml b/deploy/ansible/roles/elasticsearch/files/elasticsearch_settings.yml index 8e5be802..c59d47e6 100644 --- a/deploy/ansible/roles/elasticsearch/files/elasticsearch_settings.yml +++ b/deploy/ansible/roles/elasticsearch/files/elasticsearch_settings.yml @@ -2,19 +2,37 @@ settings: analysis: analyzer: - nyaapantsu_analyzer: - tokenizer: nyaapantsu_tokenizer + # Don't use ngram for search otherwise 'horribleexample' would match + # 'horriblesubs' + nyaapantsu_search_analyzer: + tokenizer: standard filter: - standard - lowercase - tokenizer: - nyaapantsu_tokenizer: + char_filter: + - dash_to_underscore + + nyaapantsu_index_analyzer: + tokenizer: standard + filter: + - standard + - lowercase + - e_ngram_filter + char_filter: + - dash_to_underscore + + filter: + e_ngram_filter: type: edge_ngram min_gram: 2 max_gram: 15 - token_chars: - - letter - - digit + + char_filter: + dash_to_underscore: + type: pattern_replace + pattern: "([^\\s]+)-(?=[^\\s]+)" + replacement: "$1_" + index: number_of_shards: 1 number_of_replicas: 0 @@ -28,8 +46,10 @@ mappings: type: long name: type: text - analyzer: nyaapantsu_analyzer - fielddata: true # Use to sort by id because it is currently a text field + analyzer: nyaapantsu_index_analyzer + fields: + raw: + type: keyword category: type: text sub_category: