From 88025347ec4bad2d82f9ac6eae081ebd06a9d23b Mon Sep 17 00:00:00 2001
From: "Maxence G. de Montauzan"
Date: Mon, 30 Aug 2021 19:32:12 +0200
Subject: [PATCH] Pylint suggester

---
 suggester.py | 101 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 31 deletions(-)

diff --git a/suggester.py b/suggester.py
index beda073..6ebbc90 100644
--- a/suggester.py
+++ b/suggester.py
@@ -1,37 +1,77 @@
-import requests
-import json
-import sys
+"""
+    Process files generated by iTunesParser to fill a suggester index.
+    The suggester index must be created in ELS before use.
 
-ELS_URL ='http://localhost:9200'
+    See suggester.es for the query to create the index.
+"""
+
+import sys
+import json
+import requests
+
+ELS_URL = 'http://localhost:9200'
 INDEX = 'itunes-suggest'
 
 class NoGoodDataException(Exception):
-    def __init__(self, message):
-        super().__init__(message)
+    """ Raised when data can't be correctly analyzed """
 
 def get_tokens(data: str) -> list:
+    """
+    Query Elasticsearch to get the tokens for a string with a specific analyzer.
+    Throws an exception if no tokens are found in the ELS response.
+
+    Parameters
+    ----------
+    data: string
+        String to be analyzed to obtain the tokens
+
+    Returns
+    -------
+    list
+        A list of tokens
+
+    Raises
+    ------
+    NoGoodDataException
+        If no tokens are found in the ELS response, the data is considered unsuitable for analysis.
+    """
     if not data:
         return []
     query = {
-        "analyzer": "names",
+        "analyzer": "names", # TODO Parameterize analyzer ?
         "text" : data
     }
     url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
-    r = requests.get(url, json=query)
+    req = requests.get(url, json=query)
 
-    if not 'tokens' in r.json():
+    if not 'tokens' in req.json():
         print('ERROR: Not tokens in result')
         print('Input: ' + str(data))
-        print('Request: ' + str(r.json()))
+        print('Request: ' + str(req.json()))
         raise NoGoodDataException('Data is not correct to get tokens')
 
-    return [t['token'] for t in r.json()['tokens']]
+    return [t['token'] for t in req.json()['tokens']]
 
-def post_document(name: str, input: list, field_name: str) -> bool:
-    suggest_name = field_name + '_suggest'
+def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str:
+    """
+    Create a suggestion document in Elasticsearch.
+
+    Parameters
+    ----------
+    main_field_value : str
+        Value to put in the main field named by `main_field_name`
+    input_terms : list
+        List of suggestion terms to put in the document
+    main_field_name : str
+        Name of the main field, to fill with `main_field_value`
+
+    Returns
+    -------
+    str
+        Success: ID of the created document
+        Failure (HTTP status != 201): None
+    """
+    suggest_name = main_field_name + '_suggest'
     element = {
-        field_name: name,
-        suggest_name: input
+        main_field_name: main_field_value,
+        suggest_name: input_terms
     }
 
     # Filter empty keys
@@ -43,7 +83,7 @@
         print('ELS Response KO')
         print(resp.status_code)
         print(resp.text)
-        return
+        return None
 
     el_id = resp.json()['_id']
     # print('Post_element - Element created: ' + el_id)
@@ -56,15 +96,16 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
     Parameters
     ----------
     file_name: string
-        Name and path of file to open for analyze
+        Path and name of the file to analyze
     field_name: string
-        Name of field where found data to analyze and process suggest input
+        Name of the field containing the data used to create the suggestion entries
    array_file: string, Default: None
-        A name of a field with array data to analyze. Nothing if None
+        Name of an array field to analyze to create more suggestion entries.
+        Ignored if None
     """
     print('Process file: ' + file_name)
     with open(file_name, 'r') as o_file:
-        lines = o_file.readlines()
+        lines = o_file.readlines()
 
     count = 0
     i = 0
@@ -75,28 +116,26 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
             sys.stdout.write("\b" * (40+1)) # return to start of line, after '['
         data = json.loads(line)
         if not "index" in data: # Exclude index line
-            try :
-                input = get_tokens(data[field_name])
+            try:
+                suggests_entries = get_tokens(data[field_name])
                 if array_file and array_file in data and data[array_file]:
                     for key in data[array_file]:
-                        input.extend(get_tokens(key))
+                        suggests_entries.extend(get_tokens(key))
                 # TODO Input have the same value several times ==> use to process a score
-                post_document(name=data[field_name], input=input, field_name=field_name.lower())
+                post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower())
                 count += 1
             except NoGoodDataException:
                 print('ERROR WITH DATA')
                 print(str(data))
 
     print('File processed\n')
-
     return count
 
 if __name__ == '__main__':
-    # Using readlines()
-    count = 0
-    count += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
-    print('Created documents: ' + str(count))
-    count += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
-    print('Created documents: ' + str(count))
+    created_docs = 0
+    created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
+    print('Created documents: ' + str(created_docs))
+    created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
+    print('Created documents: ' + str(created_docs))
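
The module docstring above refers to suggester.es for the index-creation query,
but that file is not included in this patch. Purely as an illustrative sketch,
an index-creation request consistent with the field names that post_document()
produces (e.g. 'album'/'album_suggest', 'artist'/'artist_suggest') could look
like the Python below. The 'names' analyzer definition and the Elasticsearch
7.x mapping syntax are assumptions here, not the real contents of suggester.es:

    import requests

    ELS_URL = 'http://localhost:9200'
    INDEX = 'itunes-suggest'

    index_body = {
        "settings": {
            "analysis": {
                "analyzer": {
                    # Assumed definition: the real 'names' analyzer in
                    # suggester.es may differ.
                    "names": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "asciifolding"]
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                # Field names follow post_document(): the main field is the
                # lower-cased field name, the suggest field adds '_suggest'.
                "album": {"type": "keyword"},
                "album_suggest": {"type": "completion", "analyzer": "names"},
                "artist": {"type": "keyword"},
                "artist_suggest": {"type": "completion", "analyzer": "names"}
            }
        }
    }

    resp = requests.put('{}/{}'.format(ELS_URL, INDEX), json=index_body)
    print(resp.status_code, resp.text)

The completion-typed fields back Elasticsearch's completion suggester, which is
what the '_suggest' input lists posted by post_document() are matched against.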