Use send data to process suggestion

Not all suggested documents are created in ELS. Need to refactor send data.
Pylint suggester
2021-08-31 02:31:32 +02:00 · 2021-08-30 19:32:12 +02:00 · 2021-08-30 19:13:30 +02:00 · 2021-08-23 01:22:58 +02:00 · 2021-08-23 01:22:58 +02:00 · 2021-08-23 01:22:58 +02:00
5 changed files with 173 additions and 82 deletions
--- a/iTunesParser.py
+++ b/iTunesParser.py
@@ -147,7 +147,8 @@ class ITunesParser:
                'Play Count': 0,
                'Rating': 0,
                'Genre': set(),
-                'Album': set()
+                'Album': set(),
+                'Album Artist': set()
                }

        # Compute information
@@ -168,6 +169,9 @@ class ITunesParser:
        if 'Album' in track:
            self._artists[akey]['Album'].add(track['Album'])

+        if 'Album Artist' in track:
+            self._artists[akey]['Album Artist'].add(track['Artist'])
+
    def _process_album(self, track):
        """
            Process albums in the track part of library and return a JSON formated for a bulk ELS request
--- a/mapping.suggest.json
+++ b/mapping.suggest.json
@@ -0,0 +1,48 @@
+{
+    "settings": {
+        "index": {
+            "number_of_replicas": 0
+        },
+        "analysis": {
+            "filter": {
+                "french_stop": {
+                    "type": "stop",
+                    "stopwords": "_french_"
+                },
+                "english_stop": {
+                    "type": "stop",
+                    "stopwords": "_english_"
+                }
+            },
+            "analyzer": {
+                "names": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "asciifolding",
+                        "french_stop",
+                        "english_stop"
+                    ]
+                }
+            }
+        }
+    },
+    "mappings": {
+        "properties": {
+            "artist_suggest": {
+                "type": "completion",
+                "search_analyzer": "names"
+            },
+            "artist": {
+                "type": "keyword"
+            },
+            "album_suggest": {
+                "type": "completion",
+                "search_analyzer": "names"
+            },
+            "album": {
+                "type": "keyword"
+            }
+        }
+    }
+}
--- a/send_data.py
+++ b/send_data.py
@@ -10,6 +10,8 @@ import json
 import time
 import requests

+from suggester import process_file
+
 class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
@@ -28,10 +30,12 @@ DEFAULT_ARTIST_FILE = 'es-artists.json'
 DEFAULT_MAPPING_SONGS_FILE = 'mapping.songs.json'
 DEFAULT_MAPPING_ARTISTS_FILE = 'mapping.artists.json'
 DEFAULT_MAPPING_ALBUMS_FILE = 'mapping.albums.json'
+DEFAULT_MAPPING_SUGGEST_FILE = 'mapping.suggest.json'

 SONG_INDEX = 'itunes-songs'
 ALBUM_INDEX = 'itunes-albums'
 ARTIST_INDEX = 'itunes-artists'
+SUGGEST_INDEX = 'itunes-suggest'
 # TODO Put variables in a config files or in a python library

 # Global values / set as default values
@@ -145,6 +149,22 @@ def main():
        else:
            print('Album sent')

+    if args.ALL or args.no_suggest:
+        print("Process suggestion:")
+        if args.DELETE: # TODO Do a method?
+            mapping_suggest = load_file(args.mapping_suggest, DEFAULT_MAPPING_SUGGEST_FILE)
+            if not args.quiet:
+                print("Mapping of suggest index file: '{}'".format(mapping_suggest.name))
+
+            delete_index(SUGGEST_INDEX, args.quiet)
+            put_mapping(SUGGEST_INDEX, mapping_suggest, args.quiet)
+
+        suggs_docs = 0
+        suggs_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
+        print('Created documents: ' + str(suggs_docs))
+        suggs_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
+        print('Created documents: ' + str(suggs_docs))
+
    print("I'm done!")
    if check_is_ok.count(False) > 0:
        print('Some problems occurs')
@@ -199,6 +219,8 @@ def create_args_parser():
                               .format(DEFAULT_ARTIST_FILE))
    song_group.add_argument('-s', '--song', action='store_false',
                            help='Disable sending song data')
+    song_group.add_argument('--no-suggest', action='store_false',
+                            help='Disable sending suggest data')

    # Mode
    mode_group = parser.add_argument_group('Mode')
@@ -211,12 +233,14 @@ def create_args_parser():

    # Mapping
    mapping_group = parser.add_argument_group('Mapping files')
-    mode_group.add_argument('-ms', '--mapping-song', type=argparse.FileType('r'), const=DEFAULT_MAPPING_SONGS_FILE, nargs='?',
+    mapping_group.add_argument('-ms', '--mapping-song', type=argparse.FileType('r'), const=DEFAULT_MAPPING_SONGS_FILE, nargs='?',
                            help='Mapping file for songs (default: \'{}\')'.format(DEFAULT_MAPPING_SONGS_FILE))
-    mode_group.add_argument('-mr', '--mapping-artist', type=argparse.FileType('r'), const=DEFAULT_ARTIST_FILE, nargs='?',
+    mapping_group.add_argument('-mr', '--mapping-artist', type=argparse.FileType('r'), const=DEFAULT_ARTIST_FILE, nargs='?',
                            help='Mapping file for artists (default: \'{}\')'.format(DEFAULT_MAPPING_ARTISTS_FILE))
-    mode_group.add_argument('-ml', '--mapping-album', type=argparse.FileType('r'), const=DEFAULT_MAPPING_ALBUMS_FILE, nargs='?',
+    mapping_group.add_argument('-ml', '--mapping-album', type=argparse.FileType('r'), const=DEFAULT_MAPPING_ALBUMS_FILE, nargs='?',
                            help='Mapping file for albums (default: \'{}\')'.format(DEFAULT_MAPPING_ALBUMS_FILE))
+    mapping_group.add_argument('-mg', '--mapping-suggest', type=argparse.FileType('r'), const=DEFAULT_MAPPING_SUGGEST_FILE, nargs='?',
+                            help='Mapping file for suggest (default: \'{}\')'.format(DEFAULT_MAPPING_SUGGEST_FILE))

    # Global Settings
    g_settings_group = parser.add_argument_group('Global Settings')
--- a/suggester.es
+++ b/suggester.es
@@ -1,51 +1,7 @@
 DELETE itunes-suggest

 PUT /itunes-suggest
-{
-    "settings": {
-        "analysis": {
-            "filter": {
-                "french_stop": {
-                    "type": "stop",
-                    "stopwords": "_french_"
-                },
-                "english_stop": {
-                    "type": "stop",
-                    "stopwords": "_english_"
-                }
-            },
-            "analyzer": {
-                "names": {
-                    "tokenizer": "standard",
-                    "filter": [
-                        "lowercase",
-                        "asciifolding",
-                        "french_stop",
-                        "english_stop"
-                    ]
-                }
-            }
-        }
-    },
-    "mappings": {
-        "properties": {
-            "artist_suggest": {
-                "type": "completion",
-                "search_analyzer": "names"
-            },
-            "artist": {
-                "type": "keyword"
-            },
-            "album_suggest": {
-                "type": "completion",
-                "search_analyzer": "names"
-            },
-            "album": {
-                "type": "keyword"
-            }
-        }
-    }
-}
+!./mapping.suggest.json

 // Also possible to specify analyze for ingesting => https://stackoverflow.com/questions/48304499/elasticsearch-completion-suggester-not-working-with-whitespace-analyzer

@@ -59,7 +15,7 @@ GET itunes-suggest/_analyze

 GET itunes-suggest/_search

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source" : "artist",
    "suggest": {
@@ -72,7 +28,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source" : "album",
    "suggest": {
@@ -86,7 +42,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source": ["album", "artist"],
    "suggest": {
@@ -105,7 +61,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source": ["album", "artist"],
    "suggest": {
@@ -124,7 +80,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "suggest": {
        "ar-suggest": {
--- a/suggester.py
+++ b/suggester.py
@@ -1,37 +1,77 @@
-import requests
-import json
-import sys
+"""
+    Process files generated by iTunesParser to fill a suggester index.
+    Suggester index in ELS must be created before use.

-ELS_URL ='http://localhost:9200'
+    Found suggester.es query to create index.
+"""
+
+import sys
+import json
+import requests
+
+ELS_URL = 'http://localhost:9200'
 INDEX = 'itunes-suggest'

 class NoGoodDataException(Exception):
-    def __init__(self, message):
-        super().__init__(message)
+    """ Raise when data can't be correctly analyzed """

 def get_tokens(data: str) -> list:
+    """
+        Query Elasticsearch to get token for a string with a specific analyzer.
+        Throw an exception if no token found in ELS response.
+        Parameters
+        ----------
+        data: string
+            String to be analysed to obtain the tokens
+        Returns
+        -------
+        list
+            A list of token
+        Raises
+        ------
+        NoGoodDataException
+            If no tokens are found in the ELS responses, consider that the data is not correct for analysis.
+    """
    if not data:
        return []
    query = {
-        "analyzer": "names",
+        "analyzer": "names", # TODO Parameterize analyzer ?
        "text" : data
    }

    url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
-    r = requests.get(url, json=query)
+    req = requests.get(url, json=query)

-    if not 'tokens' in r.json():
+    if not 'tokens' in req.json():
        print('ERROR: Not tokens in result')
        print('Input: ' + str(data))
-        print('Request: ' + str(r.json()))
+        print('Request: ' + str(req.json()))
        raise NoGoodDataException('Data is not correct to get tokens')
-    return [t['token'] for t in r.json()['tokens']]
+    return [t['token'] for t in req.json()['tokens']]

-def post_document(name: str, input: list, field_name: str) -> bool:
-    suggest_name = field_name + '_suggest'
+def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str:
+    """
+        Create suggestion document in Elasticsearch.
+
+        Parameters
+        ----------
+        main_field_value : str
+            Value to put in the main field named by `main_field_name`
+        input_terms : list
+            List of suggestion term to put in document
+        main_field_name : str
+            Name of the main field, to fill with `main_field_value`
+
+        Returns
+        -------
+        str
+            Success: ID of created document
+            Fail (ret. status <> 201): None
+    """
+    suggest_name = main_field_name + '_suggest'
    element = {
-        field_name: name,
-        suggest_name: input
+        main_field_name: main_field_value,
+        suggest_name: input_terms
    }

    # Filter empty keys
@@ -43,16 +83,29 @@ def post_document(name: str, input: list, field_name: str) -> bool:
        print('ELS Response KO')
        print(resp.status_code)
        print(resp.text)
-        return
+        return None

    el_id = resp.json()['_id']
    # print('Post_element - Element created: ' + el_id)
    return el_id

-def process_file(file_name: str, field_name: str) -> int:
+def process_file(file_name: str, field_name: str, array_file: str = None) -> int:
+    """
+        Process a JSON file with data
+
+        Parameters
+        ----------
+        file_name: string
+            Path and name of file to analyze
+        field_name: string
+            Name of the field where to find the data to create the suggestion entries
+        array_file: string, Default: None
+            Name of an array field to analyze to create more suggestion entries.
+            Nothing if None
+    """
    print('Process file: ' + file_name)
    with open(file_name, 'r') as o_file:
-            lines = o_file.readlines()
+        lines = o_file.readlines()

    count = 0
    i = 0
@@ -62,22 +115,28 @@ def process_file(file_name: str, field_name: str) -> int:
        sys.stdout.flush()
        sys.stdout.write("\b" * (40+1)) # return to start of line, after '['
        data = json.loads(line)
-        if "Artist" in data:
-            try :
-                input = get_tokens(data[field_name])
-                post_document(name=data[field_name], input=input, field_name=field_name.lower())
+        if not "index" in data: # Exclude index line
+            try:
+                suggests_entries = get_tokens(data[field_name])
+
+                if array_file and array_file in data and data[array_file]:
+                    for key in data[array_file]:
+                        suggests_entries.extend(get_tokens(key))
+
+                # TODO Input have the same value several times ==> use to process a score
+                post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower())
                count += 1
            except NoGoodDataException:
                print('ERROR WITH DATA')
                print(str(data))
    print('File processed\n')
-
    return count


 if __name__ == '__main__':
-    # Using readlines()
-    count = 0
-    count += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
-    count += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist')
-    print('Created documents: ' + str(count))
+    created_docs = 0
+    created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
+    print('Created documents: ' + str(created_docs))
+    created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
+    print('Created documents: ' + str(created_docs))
+    # TODO Created doc <> nb doc in ELS
Author	SHA1	Message	Date
Maxence G. de Montauzan	928efb659e	Use send data to process suggestion. Not all suggested documents are created in ELS. Need to refactor send data.	2021-08-31 02:31:32 +02:00
Maxence G. de Montauzan	88025347ec	Pylint suggester	2021-08-30 19:32:12 +02:00
Maxence G. de Montauzan	67e1f8bd0c	Working suggester ingester	2021-08-30 19:13:30 +02:00
Maxence G. de Montauzan	042c2558ae	Update ES files - POST -> get	2021-08-23 01:22:58 +02:00
Maxence G. de Montauzan	ad0487943a	Process for album: adds too many uninteresting results, e.g. all albums for one artist => prevents finding interesting information	2021-08-23 01:22:58 +02:00
Maxence G. de Montauzan	56050d0a49	Suggester: take Album Artist. But it's not OK in the dashboard -> for example, searching 'ayache' (for Superbus) => the result displays 'Superbus' and we don't understand why	2021-08-23 01:22:58 +02:00