From 88025347ec4bad2d82f9ac6eae081ebd06a9d23b Mon Sep 17 00:00:00 2001
From: "Maxence G. de Montauzan"
Date: Mon, 30 Aug 2021 19:32:12 +0200
Subject: [PATCH] Pylint suggester

---
 suggester.py | 101 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 31 deletions(-)

diff --git a/suggester.py b/suggester.py
index beda073..6ebbc90 100644
--- a/suggester.py
+++ b/suggester.py
@@ -1,37 +1,77 @@
-import requests
-import json
-import sys
+"""
+    Process files generated by iTunesParser to fill a suggester index.
+    The suggester index must be created in ELS before use.
 
-ELS_URL ='http://localhost:9200'
+    See suggester.es for the query to create the index.
+"""
+
+import sys
+import json
+import requests
+
+ELS_URL = 'http://localhost:9200'
 INDEX = 'itunes-suggest'
 
 class NoGoodDataException(Exception):
-    def __init__(self, message):
-        super().__init__(message)
+    """ Raised when data can't be correctly analyzed """
 
 def get_tokens(data: str) -> list:
+    """
+    Query Elasticsearch to get the tokens for a string with a specific analyzer.
+    Throws an exception if no tokens are found in the ELS response.
+
+    Parameters
+    ----------
+    data: string
+        String to be analyzed to obtain the tokens
+
+    Returns
+    -------
+    list
+        A list of tokens
+
+    Raises
+    ------
+    NoGoodDataException
+        If no tokens are found in the ELS response, the data is considered unsuitable for analysis.
+    """
     if not data:
         return []
     query = {
-        "analyzer": "names",
+        "analyzer": "names", # TODO Parameterize analyzer ?
         "text" : data
     }
     url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
-    r = requests.get(url, json=query)
+    req = requests.get(url, json=query)
 
-    if not 'tokens' in r.json():
+    if not 'tokens' in req.json():
         print('ERROR: Not tokens in result')
         print('Input: ' + str(data))
-        print('Request: ' + str(r.json()))
+        print('Request: ' + str(req.json()))
         raise NoGoodDataException('Data is not correct to get tokens')
 
-    return [t['token'] for t in r.json()['tokens']]
+    return [t['token'] for t in req.json()['tokens']]
 
-def post_document(name: str, input: list, field_name: str) -> bool:
-    suggest_name = field_name + '_suggest'
+def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str:
+    """
+    Create a suggestion document in Elasticsearch.
+
+    Parameters
+    ----------
+    main_field_value : str
+        Value to put in the main field named by `main_field_name`
+    input_terms : list
+        List of suggestion terms to put in the document
+    main_field_name : str
+        Name of the main field, to fill with `main_field_value`
+
+    Returns
+    -------
+    str
+        Success: ID of the created document
+        Failure (HTTP status != 201): None
+    """
+    suggest_name = main_field_name + '_suggest'
     element = {
-        field_name: name,
-        suggest_name: input
+        main_field_name: main_field_value,
+        suggest_name: input_terms
     }
 
     # Filter empty keys
@@ -43,7 +83,7 @@
         print('ELS Response KO')
         print(resp.status_code)
         print(resp.text)
-        return
+        return None
 
     el_id = resp.json()['_id']
     # print('Post_element - Element created: ' + el_id)
@@ -56,15 +96,16 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
     Parameters
     ----------
     file_name: string
-        Name and path of file to open for analyze
+        Path and name of the file to analyze
     field_name: string
-        Name of field where found data to analyze and process suggest input
+        Name of the field containing the data used to create the suggestion entries
    array_file: string, Default: None
-        A name of a field with array data to analyze. Nothing if None
+        Name of an array field to analyze to create more suggestion entries.
+        Ignored if None
     """
     print('Process file: ' + file_name)
     with open(file_name, 'r') as o_file:
-        lines = o_file.readlines()
+        lines = o_file.readlines()
 
     count = 0
     i = 0
@@ -75,28 +116,26 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
             sys.stdout.write("\b" * (40+1)) # return to start of line, after '['
         data = json.loads(line)
         if not "index" in data: # Exclude index line
-            try :
-                input = get_tokens(data[field_name])
+            try:
+                suggests_entries = get_tokens(data[field_name])
                 if array_file and array_file in data and data[array_file]:
                     for key in data[array_file]:
-                        input.extend(get_tokens(key))
+                        suggests_entries.extend(get_tokens(key))
                 # TODO Input have the same value several times ==> use to process a score
-                post_document(name=data[field_name], input=input, field_name=field_name.lower())
+                post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower())
                 count += 1
             except NoGoodDataException:
                 print('ERROR WITH DATA')
                 print(str(data))
 
     print('File processed\n')
-
     return count
 
 if __name__ == '__main__':
-    # Using readlines()
-    count = 0
-    count += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
-    print('Created documents: ' + str(count))
-    count += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
-    print('Created documents: ' + str(count))
+    created_docs = 0
+    created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
+    print('Created documents: ' + str(created_docs))
+    created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
+    print('Created documents: ' + str(created_docs))
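
The module docstring above refers to suggester.es for the index-creation query,
but that file is not included in this patch. Purely as an illustrative sketch,
an index-creation request consistent with the field names that post_document()
produces (e.g. 'album'/'album_suggest', 'artist'/'artist_suggest') could look
like the Python below. The 'names' analyzer definition and the Elasticsearch
7.x mapping syntax are assumptions here, not the real contents of suggester.es:

    import requests

    ELS_URL = 'http://localhost:9200'
    INDEX = 'itunes-suggest'

    index_body = {
        "settings": {
            "analysis": {
                "analyzer": {
                    # Assumed definition: the real 'names' analyzer in
                    # suggester.es may differ.
                    "names": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "asciifolding"]
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                # Field names follow post_document(): the main field is the
                # lower-cased field name, the suggest field adds '_suggest'.
                "album": {"type": "keyword"},
                "album_suggest": {"type": "completion", "analyzer": "names"},
                "artist": {"type": "keyword"},
                "artist_suggest": {"type": "completion", "analyzer": "names"}
            }
        }
    }

    resp = requests.put('{}/{}'.format(ELS_URL, INDEX), json=index_body)
    print(resp.status_code, resp.text)

The completion-typed fields back Elasticsearch's completion suggester, which is
what the '_suggest' input lists posted by post_document() are matched against.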