Adds too many uninteresting results, e.g. all albums for one artist, which prevents finding interesting information.
104 lines
3.3 KiB
Python
104 lines
3.3 KiB
Python
import json
import sys
from typing import Optional

import requests
|
|
|
|
# Base URL of the Elasticsearch server; every request URL is built from it.
ELS_URL = 'http://localhost:9200'
# Name of the index used for both the _analyze and the _doc requests.
INDEX = 'itunes-suggest'
|
|
|
|
class NoGoodDataException(Exception):
    """Raised when the analyze response does not contain any token list."""

    def __init__(self, message):
        # Forward the single message argument to the base Exception.
        Exception.__init__(self, message)
|
|
|
|
def get_tokens(data: str) -> list:
    """
    Split a text into tokens with the "names" analyzer of the index

    Parameters
    ----------
    data: string
        Text to analyze. An empty or None value is not sent to the server

    Returns
    -------
    list
        The "token" value of every entry returned by the _analyze endpoint,
        in response order. An empty list when data is empty

    Raises
    ------
    NoGoodDataException
        When the response body has no "tokens" key
    """
    if not data:
        return []

    query = {
        "analyzer": "names",
        "text": data
    }

    url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
    r = requests.get(url, json=query)
    # Decode the body once: it is needed for the check, the log and the result
    body = r.json()

    if 'tokens' not in body:
        print('ERROR: Not tokens in result')
        print('Input: ' + str(data))
        print('Request: ' + str(body))
        raise NoGoodDataException('Data is not correct to get tokens')

    return [t['token'] for t in body['tokens']]
|
|
|
|
def post_document(name: str, input: list, field_name: str) -> Optional[str]:
    """
    Create one suggestion document in the index

    Parameters
    ----------
    name: string
        Value stored under the field_name key of the document
    input: list
        Values stored under the "<field_name>_suggest" key of the document
    field_name: string
        Name of the document field that receives name

    Returns
    -------
    string or None
        The "_id" found in the server response, or None when the response
        status code is not 201 (the response is printed in that case)
    """
    suggest_name = field_name + '_suggest'
    element = {
        field_name: name,
        suggest_name: input
    }
    # NOTE: empty values are sent as-is; no filtering is applied to the document

    url = '{}/{}/_doc'.format(ELS_URL, INDEX)
    resp = requests.post(url, json=element)
    if resp.status_code != 201:
        print('ELS Response KO')
        print(resp.status_code)
        print(resp.text)
        return None

    return resp.json()['_id']
|
|
|
|
def process_file(file_name: str, field_name: str, array_file: Optional[str] = None) -> int:
    """
    Process a JSON file with data

    The file is read line by line; every non-blank line must be a JSON value.
    Lines containing an "index" key are skipped. For every other line the
    value of field_name is tokenized and posted as a suggestion document.

    Parameters
    ----------
    file_name: string
        Name and path of file to open for analyze
    field_name: string
        Name of field where found data to analyze and process suggest input
    array_file: string, Default: None
        A name of a field with array data to analyze. Nothing if None

    Returns
    -------
    int
        Number of documents actually created (posts that failed are not counted)
    """
    print('Process file: ' + file_name)
    # JSON text is UTF-8: do not depend on the platform default encoding
    with open(file_name, 'r', encoding='utf-8') as o_file:
        lines = o_file.readlines()

    total = len(lines)
    count = 0
    for i, line in enumerate(lines, start=1):
        # Progress display: print the percentage, then move the cursor back
        # to the start of the line so the next write overwrites it
        sys.stdout.write('{}%\r'.format(i * 100 // total))
        sys.stdout.flush()

        if not line.strip():
            # A blank line is not a record; json.loads would fail on it
            continue

        data = json.loads(line)
        if "index" in data:  # Exclude index line
            continue

        try:
            tokens = get_tokens(data[field_name])

            if array_file and array_file in data and data[array_file]:
                for key in data[array_file]:
                    # NOTE (original author): this filter does not work for
                    # albums, and blocks a scoring for artists
                    if key != data[field_name]:
                        tokens.extend(get_tokens(key))
        except NoGoodDataException:
            print('ERROR WITH DATA')
            print(str(data))
            continue

        # TODO Input have the same value several times ==> use to process a score
        created_id = post_document(name=data[field_name], input=tokens, field_name=field_name.lower())
        # post_document returns None when the server refused the document
        if created_id is not None:
            count += 1

    print('File processed\n')

    return count
|
|
|
|
|
|
if __name__ == '__main__':
    # Each entry: (file to import, field holding the name, field with related names)
    sources = (
        ('/home/budd/workspace/iTunes/es-albums.json', 'Album', 'Artist'),
        ('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist'),
    )

    count = 0
    for source_path, name_field, related_field in sources:
        count += process_file(source_path, name_field, related_field)
        # Running total over all files processed so far
        print('Created documents: ' + str(count))
|