""" Process files generated by iTunesParser to fill a suggester index. Suggester index in ELS must be created before use. Found suggester.es query to create index. """ import sys import json import requests ELS_URL = 'http://localhost:9200' INDEX = 'itunes-suggest' class NoGoodDataException(Exception): """ Raise when data can't be correctly analyzed """ def get_tokens(data: str) -> list: """ Query Elasticsearch to get token for a string with a specific analyzer. Throw an exception if no token found in ELS response. Parameters ---------- data: string String to be analysed to obtain the tokens Returns ------- list A list of token Raises ------ NoGoodDataException If no tokens are found in the ELS responses, consider that the data is not correct for analysis. """ if not data: return [] query = { "analyzer": "names", # TODO Parameterize analyzer ? "text" : data } url = '{}/{}/_analyze'.format(ELS_URL, INDEX) req = requests.get(url, json=query) if not 'tokens' in req.json(): print('ERROR: Not tokens in result') print('Input: ' + str(data)) print('Request: ' + str(req.json())) raise NoGoodDataException('Data is not correct to get tokens') return [t['token'] for t in req.json()['tokens']] def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str: """ Create suggestion document in Elasticsearch. Parameters ---------- main_field_value : str Value to put in the main field named by `main_field_name` input_terms : list List of suggestion term to put in document main_field_name : str Name of the main field, to fill with `main_field_value` Returns ------- str Success: ID of created document Fail (ret. status <> 201): None """ suggest_name = main_field_name + '_suggest' element = { main_field_name: main_field_value, suggest_name: input_terms } # Filter empty keys # element = {k: v for k, v in element.items() if v} url = '{}/{}/_doc'.format(ELS_URL, INDEX) resp = requests.post(url, json=element) if resp.status_code != 201: print('ELS Response KO') print(resp.status_code) print(resp.text) return None el_id = resp.json()['_id'] # print('Post_element - Element created: ' + el_id) return el_id def process_file(file_name: str, field_name: str, array_file: str = None) -> int: """ Process a JSON file with data Parameters ---------- file_name: string Path and name of file to analyze field_name: string Name of the field where to find the data to create the suggestion entries array_file: string, Default: None Name of an array field to analyze to create more suggestion entries. Nothing if None """ print('Process file: ' + file_name) with open(file_name, 'r') as o_file: lines = o_file.readlines() count = 0 i = 0 for line in lines: i += 1 sys.stdout.write(str(int((i/len(lines))*100)) + '%') sys.stdout.flush() sys.stdout.write("\b" * (40+1)) # return to start of line, after '[' data = json.loads(line) if not "index" in data: # Exclude index line try: suggests_entries = get_tokens(data[field_name]) if array_file and array_file in data and data[array_file]: for key in data[array_file]: suggests_entries.extend(get_tokens(key)) # TODO Input have the same value several times ==> use to process a score post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower()) count += 1 except NoGoodDataException: print('ERROR WITH DATA') print(str(data)) print('File processed\n') return count if __name__ == '__main__': created_docs = 0 created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album') print('Created documents: ' + str(created_docs)) created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist') print('Created documents: ' + str(created_docs)) # TODO Created doc <> nb doc in ELS