Pylint suggester
suggester.py (101 changed lines)
@@ -1,37 +1,77 @@
-import requests
-import json
-import sys
+"""
+Process files generated by iTunesParser to fill a suggester index.
+Suggester index in ELS must be created before use.
 
-ELS_URL ='http://localhost:9200'
+Found suggester.es query to create index.
+"""
 
+import sys
+import json
+import requests
 
+ELS_URL = 'http://localhost:9200'
 INDEX = 'itunes-suggest'
 
 class NoGoodDataException(Exception):
-    def __init__(self, message):
-        super().__init__(message)
+    """ Raise when data can't be correctly analyzed """
 
 def get_tokens(data: str) -> list:
+    """
+    Query Elasticsearch to get token for a string with a specific analyzer.
+    Throw an exception if no token found in ELS response.
+    Parameters
+    ----------
+    data: string
+        String to be analysed to obtain the tokens
+    Returns
+    -------
+    list
+        A list of token
+    Raises
+    ------
+    NoGoodDataException
+        If no tokens are found in the ELS responses, consider that the data is not correct for analysis.
+    """
     if not data:
         return []
     query = {
-        "analyzer": "names",
+        "analyzer": "names", # TODO Parameterize analyzer ?
         "text" : data
     }
 
     url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
-    r = requests.get(url, json=query)
+    req = requests.get(url, json=query)
 
-    if not 'tokens' in r.json():
+    if not 'tokens' in req.json():
         print('ERROR: Not tokens in result')
         print('Input: ' + str(data))
-        print('Request: ' + str(r.json()))
+        print('Request: ' + str(req.json()))
         raise NoGoodDataException('Data is not correct to get tokens')
-    return [t['token'] for t in r.json()['tokens']]
+    return [t['token'] for t in req.json()['tokens']]
 
-def post_document(name: str, input: list, field_name: str) -> bool:
-    suggest_name = field_name + '_suggest'
+def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str:
+    """
+    Create suggestion document in Elasticsearch.
+
+    Parameters
+    ----------
+    main_field_value : str
+        Value to put in the main field named by `main_field_name`
+    input_terms : list
+        List of suggestion term to put in document
+    main_field_name : str
+        Name of the main field, to fill with `main_field_value`
+
+    Returns
+    -------
+    str
+        Success: ID of created document
+        Fail (ret. status <> 201): None
+    """
+    suggest_name = main_field_name + '_suggest'
     element = {
-        field_name: name,
-        suggest_name: input
+        main_field_name: main_field_value,
+        suggest_name: input_terms
     }
 
     # Filter empty keys
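For reference, get_tokens() drives the standard Elasticsearch _analyze API. A minimal sketch of the round trip it performs, assuming the index and its custom 'names' analyzer already exist (the sample text is made up; the response shape is the documented _analyze format):

    import requests

    # Hypothetical call equivalent to what get_tokens() sends.
    resp = requests.get(
        'http://localhost:9200/itunes-suggest/_analyze',
        json={'analyzer': 'names', 'text': 'Abbey Road'},  # sample text, assumed
    )
    # A typical response body:
    # {"tokens": [{"token": "abbey", "start_offset": 0, "end_offset": 5,
    #              "type": "<ALPHANUM>", "position": 0}, {"token": "road", ...}]}
    print([t['token'] for t in resp.json()['tokens']])  # e.g. ['abbey', 'road']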
@@ -43,7 +83,7 @@ def post_document(name: str, input: list, field_name: str) -> bool:
         print('ELS Response KO')
         print(resp.status_code)
         print(resp.text)
-        return
+        return None
 
     el_id = resp.json()['_id']
     # print('Post_element - Element created: ' + el_id)
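The hunk above only changes the response handling in post_document(); the POST itself sits outside the diff context. For orientation, a sketch of the document the function builds and the response fields it reads (values are hypothetical):

    # Roughly what post_document('Abbey Road', ['abbey', 'road'], 'album') would index:
    element = {
        'album': 'Abbey Road',               # main_field_name: main_field_value
        'album_suggest': ['abbey', 'road'],  # suggest_name: input_terms
    }
    # On success Elasticsearch replies with HTTP 201 and a body such as
    # {"_index": "itunes-suggest", "_id": "wW0u...", "result": "created", ...};
    # the function returns that '_id', or None for any other status code.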
@@ -56,15 +96,16 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
     Parameters
     ----------
     file_name: string
-        Name and path of file to open for analyze
+        Path and name of file to analyze
     field_name: string
-        Name of field where found data to analyze and process suggest input
+        Name of the field where to find the data to create the suggestion entries
     array_file: string, Default: None
-        A name of a field with array data to analyze. Nothing if None
+        Name of an array field to analyze to create more suggestion entries.
+        Nothing if None
     """
     print('Process file: ' + file_name)
     with open(file_name, 'r') as o_file:
         lines = o_file.readlines()
 
     count = 0
     i = 0
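process_file() reads newline-delimited JSON in Elasticsearch bulk style: lines carrying an "index" key are action lines and are skipped, the others are documents. A hypothetical pair of lines for es-artists.json, consistent with how the loop uses data[field_name] and data[array_file] (the real iTunesParser output is not shown in this commit):

    sample_lines = [
        '{"index": {}}',                                               # action line, skipped
        '{"Artist": "The Beatles", "Album Artist": ["The Beatles"]}',  # document line
    ]
    # With field_name='Artist' and array_file='Album Artist', the loop calls
    # get_tokens('The Beatles') and then get_tokens() on each 'Album Artist' entry.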
@@ -75,28 +116,26 @@ def process_file(file_name: str, field_name: str, array_file: str = None) -> int
         sys.stdout.write("\b" * (40+1)) # return to start of line, after '['
         data = json.loads(line)
         if not "index" in data: # Exclude index line
-            try :
-                input = get_tokens(data[field_name])
+            try:
+                suggests_entries = get_tokens(data[field_name])
 
                 if array_file and array_file in data and data[array_file]:
                     for key in data[array_file]:
-                        input.extend(get_tokens(key))
+                        suggests_entries.extend(get_tokens(key))
 
                 # TODO Input have the same value several times ==> use to process a score
-                post_document(name=data[field_name], input=input, field_name=field_name.lower())
+                post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower())
                 count += 1
             except NoGoodDataException:
                 print('ERROR WITH DATA')
                 print(str(data))
     print('File processed\n')
 
     return count
 
 
 if __name__ == '__main__':
-    # Using readlines()
-    count = 0
-    count += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
-    print('Created documents: ' + str(count))
-    count += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
-    print('Created documents: ' + str(count))
+    created_docs = 0
+    created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
+    print('Created documents: ' + str(created_docs))
+    created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
+    print('Created documents: ' + str(created_docs))
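Nothing in this commit creates the itunes-suggest index; the module docstring only points at a suggester.es query kept elsewhere. Purely as an assumption-laden sketch of what that setup could look like (Elasticsearch 7+ syntax, a custom 'names' analyzer for get_tokens() and completion-typed <field>_suggest fields matching post_document()'s naming):

    import requests

    # Hypothetical index creation; the real mapping lives in suggester.es and is
    # not part of this diff.
    requests.put('http://localhost:9200/itunes-suggest', json={
        'settings': {
            'analysis': {
                'analyzer': {
                    'names': {
                        'type': 'custom',
                        'tokenizer': 'standard',
                        'filter': ['lowercase', 'asciifolding'],
                    }
                }
            }
        },
        'mappings': {
            'properties': {
                'album': {'type': 'keyword'},
                'album_suggest': {'type': 'completion'},
                'artist': {'type': 'keyword'},
                'artist_suggest': {'type': 'completion'},
            }
        },
    })

With the index in place, the script is run directly (python suggester.py) and prints the number of created documents after each input file.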