# iTunes/suggester.py

import json
import sys
from typing import Optional

import requests

# Base URL of the Elasticsearch server that every request is sent to
ELS_URL = 'http://localhost:9200'
# Index used both for text analysis and for the created documents
INDEX = 'itunes-suggest'

class NoGoodDataException(Exception):
    """
        Error raised when a value cannot be turned into tokens

        Parameters
        ----------
        message: string
            Human readable description of the problem
    """

    def __init__(self, message):
        # Forward the message to the base class so str(exc) and exc.args carry it
        Exception.__init__(self, message)

def get_tokens(data: str) -> list:
    """
        Analyze a text with the "names" analyzer of the index and return its tokens

        Parameters
        ----------
        data: string
            Text to analyze. A falsy value (empty string, None) is not sent
            to the server

        Returns
        -------
        list
            The 'token' value of every entry of the 'tokens' list found in the
            response, in response order. Empty list when data is falsy

        Raises
        ------
        NoGoodDataException
            If the response body does not contain a 'tokens' key
    """
    if not data:
        return []

    query = {
        "analyzer": "names",
        "text": data,
    }

    url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
    response = requests.get(url, json=query)

    # Decode the body a single time and reuse it for the check, the
    # diagnostics and the result
    payload = response.json()

    if 'tokens' not in payload:
        print('ERROR: Not tokens in result')
        print('Input: ' + str(data))
        print('Request: ' + str(payload))
        raise NoGoodDataException('Data is not correct to get tokens')
    return [t['token'] for t in payload['tokens']]

def post_document(name: str, input: list, field_name: str) -> Optional[str]:
    """
        Create one document in the index

        Parameters
        ----------
        name: string
            Value stored under the field_name key of the document
        input: list
            Values stored under the '<field_name>_suggest' key of the document.
            The name shadows the builtin but is kept because callers pass it
            by keyword
        field_name: string
            Name of the document field that receives name

        Returns
        -------
        string or None
            The '_id' value of the response when the status code is 201.
            None for any other status; the status code and the body are
            printed in that case
    """
    element = {
        field_name: name,
        field_name + '_suggest': input,
    }

    url = '{}/{}/_doc'.format(ELS_URL, INDEX)
    resp = requests.post(url, json=element)
    if resp.status_code != 201:
        print('ELS Response KO')
        print(resp.status_code)
        print(resp.text)
        return None

    return resp.json()['_id']

def process_file(file_name: str, field_name: str, array_file: Optional[str] = None) -> int:
    """
        Process a JSON file with data

        The file is read line by line, each non-blank line being one JSON
        value. Lines containing an "index" key are skipped; for every other
        line the tokens of the field_name value are computed and a document
        is posted.

        Parameters
        ----------
        file_name: string
            Name and path of file to open for analyze
        field_name: string
            Name of field where found data to analyze and process suggest input
        array_file: string, Default: None
            A name of a field with array data to analyze. Nothing if None

        Returns
        -------
        int
            Number of documents for which post_document returned an id
    """
    print('Process file: ' + file_name)
    # JSON text is UTF-8 by specification, so do not depend on the locale
    with open(file_name, 'r', encoding='utf-8') as o_file:
        lines = o_file.readlines()

    total = len(lines)
    count = 0
    for i, line in enumerate(lines, start=1):
        # Progress as a percentage; the backspaces move the cursor back so
        # the next percentage is written over this one
        sys.stdout.write(str(int((i / total) * 100)) + '%')
        sys.stdout.flush()
        sys.stdout.write("\b" * (40 + 1))

        # A blank line (e.g. a trailing newline) is not a JSON value
        if not line.strip():
            continue

        data = json.loads(line)
        if "index" in data:  # Exclude index line
            continue

        try:
            tokens = get_tokens(data[field_name])

            if array_file and array_file in data and data[array_file]:
                for key in data[array_file]:
                    # NOTE (original author): this filter does not work for
                    # albums and blocks a scoring for artists
                    if key != data[field_name]:
                        tokens.extend(get_tokens(key))
        except NoGoodDataException:
            print('ERROR WITH DATA')
            print(str(data))
            continue

        # TODO Input have the same value several times ==> use to process a score
        created_id = post_document(name=data[field_name], input=tokens, field_name=field_name.lower())
        # Count only the documents that were really created
        if created_id is not None:
            count += 1
    print('File processed\n')

    return count


if __name__ == '__main__':
    # Each job: (file to read, field to index, field holding extra values)
    jobs = (
        ('/home/budd/workspace/iTunes/es-albums.json', 'Album', 'Artist'),
        ('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist'),
    )

    # Running total over all files, printed after each one
    count = 0
    for path, field, extra_field in jobs:
        count = count + process_file(path, field, extra_field)
        print('Created documents: ' + str(count))