Pylint suggester

This commit is contained in:
2021-08-30 19:32:12 +02:00
parent 67e1f8bd0c
commit 88025347ec

View File

@@ -1,37 +1,77 @@
import requests """
import json Process files generated by iTunesParser to fill a suggester index.
import sys Suggester index in ELS must be created before use.
ELS_URL ='http://localhost:9200' See suggester.es for the query used to create the index.
"""
import sys
import json
import requests
ELS_URL = 'http://localhost:9200'
INDEX = 'itunes-suggest' INDEX = 'itunes-suggest'
class NoGoodDataException(Exception): class NoGoodDataException(Exception):
def __init__(self, message): """ Raise when data can't be correctly analyzed """
super().__init__(message)
def get_tokens(data: str) -> list: def get_tokens(data: str) -> list:
"""
Query Elasticsearch to get the tokens for a string with a specific analyzer.
Throw an exception if no tokens are found in the ELS response.
Parameters
----------
data: string
String to be analysed to obtain the tokens
Returns
-------
list
A list of tokens
Raises
------
NoGoodDataException
If no tokens are found in the ELS response, consider that the data is not correct for analysis.
"""
if not data: if not data:
return [] return []
query = { query = {
"analyzer": "names", "analyzer": "names", # TODO Parameterize analyzer ?
"text" : data "text" : data
} }
url = '{}/{}/_analyze'.format(ELS_URL, INDEX) url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
r = requests.get(url, json=query) req = requests.get(url, json=query)
if not 'tokens' in r.json(): if not 'tokens' in req.json():
print('ERROR: Not tokens in result') print('ERROR: Not tokens in result')
print('Input: ' + str(data)) print('Input: ' + str(data))
print('Request: ' + str(r.json())) print('Request: ' + str(req.json()))
raise NoGoodDataException('Data is not correct to get tokens') raise NoGoodDataException('Data is not correct to get tokens')
return [t['token'] for t in r.json()['tokens']] return [t['token'] for t in req.json()['tokens']]
def post_document(name: str, input: list, field_name: str) -> bool: def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str:
suggest_name = field_name + '_suggest' """
Create suggestion document in Elasticsearch.
Parameters
----------
main_field_value : str
Value to put in the main field named by `main_field_name`
input_terms : list
List of suggestion terms to put in the document
main_field_name : str
Name of the main field, to fill with `main_field_value`
Returns
-------
str
Success: ID of created document
Fail (ret. status <> 201): None
"""
suggest_name = main_field_name + '_suggest'
element = { element = {
field_name: name, main_field_name: main_field_value,
suggest_name: input suggest_name: input_terms
} }
# Filter empty keys # Filter empty keys
@@ -43,7 +83,7 @@ def post_document(name: str, input: list, field_name: str) -> bool:
print('ELS Response KO') print('ELS Response KO')
print(resp.status_code) print(resp.status_code)
print(resp.text) print(resp.text)
return return None
el_id = resp.json()['_id'] el_id = resp.json()['_id']
# print('Post_element - Element created: ' + el_id) # print('Post_element - Element created: ' + el_id)
@@ -56,15 +96,16 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
Parameters Parameters
---------- ----------
file_name: string file_name: string
Name and path of file to open for analyze Path and name of file to analyze
field_name: string field_name: string
Name of field where found data to analyze and process suggest input Name of the field where to find the data to create the suggestion entries
array_file: string, Default: None array_file: string, Default: None
A name of a field with array data to analyze. Nothing if None Name of an array field to analyze to create more suggestion entries.
Nothing if None
""" """
print('Process file: ' + file_name) print('Process file: ' + file_name)
with open(file_name, 'r') as o_file: with open(file_name, 'r') as o_file:
lines = o_file.readlines() lines = o_file.readlines()
count = 0 count = 0
i = 0 i = 0
@@ -75,28 +116,26 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
sys.stdout.write("\b" * (40+1)) # return to start of line, after '[' sys.stdout.write("\b" * (40+1)) # return to start of line, after '['
data = json.loads(line) data = json.loads(line)
if not "index" in data: # Exclude index line if not "index" in data: # Exclude index line
try : try:
input = get_tokens(data[field_name]) suggests_entries = get_tokens(data[field_name])
if array_file and array_file in data and data[array_file]: if array_file and array_file in data and data[array_file]:
for key in data[array_file]: for key in data[array_file]:
input.extend(get_tokens(key)) suggests_entries.extend(get_tokens(key))
# TODO Input have the same value several times ==> use to process a score # TODO Input have the same value several times ==> use to process a score
post_document(name=data[field_name], input=input, field_name=field_name.lower()) post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower())
count += 1 count += 1
except NoGoodDataException: except NoGoodDataException:
print('ERROR WITH DATA') print('ERROR WITH DATA')
print(str(data)) print(str(data))
print('File processed\n') print('File processed\n')
return count return count
if __name__ == '__main__': if __name__ == '__main__':
# Using readlines() created_docs = 0
count = 0 created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
count += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album') print('Created documents: ' + str(created_docs))
print('Created documents: ' + str(count)) created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
count += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist') print('Created documents: ' + str(created_docs))
print('Created documents: ' + str(count))