8 Commits

Author SHA1 Message Date
95534c92b2 Refactor send data argparse 2021-09-05 02:21:24 +02:00
0230bf260b Process suggestion according to option 2021-09-05 02:13:47 +02:00
928efb659e Use send data to process suggestion
Not all suggested documents are created in ELS
Need to refactor send data
2021-08-31 02:31:32 +02:00
88025347ec Pylint suggester 2021-08-30 19:32:12 +02:00
67e1f8bd0c Working suggester ingester 2021-08-30 19:13:30 +02:00
042c2558ae Update ES files - POST -> get 2021-08-23 01:22:58 +02:00
ad0487943a Process for album
Adds too many uninteresting results
Eg. all albums for one artist
=> prevents finding interesting information
2021-08-23 01:22:58 +02:00
56050d0a49 Suggester: take Album Artist
But it's not OK in dashboard ->
For example, search 'ayache' (for Superbus)
=> Result displays 'Superbus' and we don't understand why
2021-08-23 01:22:58 +02:00
5 changed files with 218 additions and 121 deletions

View File

@@ -147,7 +147,8 @@ class ITunesParser:
'Play Count': 0,
'Rating': 0,
'Genre': set(),
'Album': set()
'Album': set(),
'Album Artist': set()
}
# Compute information
@@ -168,6 +169,9 @@ class ITunesParser:
if 'Album' in track:
self._artists[akey]['Album'].add(track['Album'])
if 'Album Artist' in track:
self._artists[akey]['Album Artist'].add(track['Artist'])
def _process_album(self, track):
"""
Process albums in the track part of the library and return JSON formatted for a bulk ELS request

48
mapping.suggest.json Normal file
View File

@@ -0,0 +1,48 @@
{
"settings": {
"index": {
"number_of_replicas": 0
},
"analysis": {
"filter": {
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"english_stop": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"names": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"french_stop",
"english_stop"
]
}
}
}
},
"mappings": {
"properties": {
"artist_suggest": {
"type": "completion",
"search_analyzer": "names"
},
"artist": {
"type": "keyword"
},
"album_suggest": {
"type": "completion",
"search_analyzer": "names"
},
"album": {
"type": "keyword"
}
}
}
}

View File

@@ -10,6 +10,8 @@ import json
import time
import requests
from suggester import process_file
class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
@@ -22,16 +24,18 @@ class bcolors:
UNDERLINE = '\033[4m'
# Default file names
DEFAULT_SONG_FILE = 'es-songs.json'
DEFAULT_ALBUM_FILE = 'es-albums.json'
DEFAULT_ARTIST_FILE = 'es-artists.json'
DEFAULT_MAPPING_SONGS_FILE = 'mapping.songs.json'
DEFAULT_MAPPING_ARTISTS_FILE = 'mapping.artists.json'
DEFAULT_MAPPING_ALBUMS_FILE = 'mapping.albums.json'
SONG_FILE = 'es-songs.json'
ALBUM_FILE = 'es-albums.json'
ARTIST_FILE = 'es-artists.json'
MAPPING_SONGS_FILE = 'mapping.songs.json'
MAPPING_ARTISTS_FILE = 'mapping.artists.json'
MAPPING_ALBUMS_FILE = 'mapping.albums.json'
MAPPING_SUGGEST_FILE = 'mapping.suggest.json'
SONG_INDEX = 'itunes-songs'
ALBUM_INDEX = 'itunes-albums'
ARTIST_INDEX = 'itunes-artists'
SUGGEST_INDEX = 'itunes-suggest'
# TODO Put variables in a config file or in a Python library
# Global values / set as default values
@@ -47,8 +51,8 @@ def main():
args = create_args_parser().parse_args()
if not args.song and args.ALL:
print(__file__ + ': error: argument -A/--ALL: not allowed with argument -s/--song')
if args.ALL and args.no_song:
print(__file__ + ': error: argument -A/--ALL: not allowed with argument --no-song')
sys.exit(-1)
# Overloaded setting value
@@ -68,16 +72,16 @@ def main():
check_is_ok = []
# Send song data
if args.song or args.ALL:
if not args.no_song:
if args.DELETE:
mapping_song = load_file(args.mapping_song, DEFAULT_MAPPING_SONGS_FILE)
mapping_song = load_file(args.mapping_song, MAPPING_SONGS_FILE)
if not args.quiet:
print("Mapping of song index file: '{}'".format(mapping_song.name))
delete_index(SONG_INDEX, args.quiet)
put_mapping(SONG_INDEX, mapping_song, args.quiet)
song_file = load_file(args.song_file, DEFAULT_SONG_FILE)
song_file = load_file(args.song_file, SONG_FILE)
if not args.quiet:
print("Song file: '{}'".format(song_file.name))
@@ -96,7 +100,7 @@ def main():
# Send artist data
if args.artist_file or args.ALL:
if args.DELETE:
mapping_artist = load_file(args.mapping_artist, DEFAULT_MAPPING_ARTISTS_FILE)
mapping_artist = load_file(args.mapping_artist, MAPPING_ARTISTS_FILE)
if not args.quiet:
print("Mapping of artist index file: '{}'".format(mapping_artist.name))
@@ -107,7 +111,7 @@ def main():
if not artist_file:
if not args.quiet:
print('No artist file specified, take default file...')
artist_file = open(DEFAULT_ARTIST_FILE, 'r')
artist_file = open(ARTIST_FILE, 'r')
if not args.quiet:
print("Artist file: '{}'".format(artist_file.name))
@@ -122,7 +126,7 @@ def main():
if args.album_file or args.ALL:
if args.DELETE:
mapping_album = load_file(args.mapping_album, DEFAULT_MAPPING_ALBUMS_FILE)
mapping_album = load_file(args.mapping_album, MAPPING_ALBUMS_FILE)
if not args.quiet:
print("Mapping of artist index file: '{}'".format(mapping_album.name))
@@ -133,7 +137,7 @@ def main():
if not album_file:
if not args.quiet:
print('No album file specified, take default file...')
album_file = open(DEFAULT_ALBUM_FILE, 'r')
album_file = open(ALBUM_FILE, 'r')
if not args.quiet:
print("Take file '{}' to send song data".format(album_file.name))
@@ -145,6 +149,28 @@ def main():
else:
print('Album sent')
if not args.no_suggest:
print("Process suggestion:")
if args.DELETE:
delete_index(SUGGEST_INDEX, args.quiet)
if not args.ALL and not args.album_file and not args.artist_file:
print('Only song file processed. No suggestion to process.')
else:
if args.DELETE:
mapping_suggest = load_file(args.mapping_suggest, MAPPING_SUGGEST_FILE)
if not args.quiet:
print("Mapping of suggest index file: '{}'".format(mapping_suggest.name))
put_mapping(SUGGEST_INDEX, mapping_suggest, args.quiet)
suggs_docs = 0
if args.album_file or args.ALL:
suggs_docs += process_file(ALBUM_FILE, 'Album')
print('Created suggestion documents: ' + str(suggs_docs))
if args.artist_file or args.ALL:
suggs_docs += process_file(ARTIST_FILE, 'Artist', 'Album Artist')
print('Created suggestion documents: ' + str(suggs_docs))
print("I'm done!")
if check_is_ok.count(False) > 0:
print('Some problems occurs')
@@ -175,13 +201,12 @@ def create_args_parser():
description='''
Send JSON files formated for bulk Elasticsearch operation to an Elasticsearch.
By default: send song data enable, send album & artist data disabled.
Check that all the data has been sent.
By default: send only song data. See option to send album/artist/suggest data.
Detect if index doesn't exist and create it with a mapping file (see -map and -idx argument).
Remeber : it's cumulative! If you want to remove songs/artits/albums,
you have to delete and re-create the index (use -D option).
'''
Create index if -D option activated with a mapping file (see -map).
It's cumulative! If you want to remove songs/artits/albums, you have to delete and re-create the index (use -D option).''',
formatter_class=argparse.RawTextHelpFormatter
)
# Bulk
parser.add_argument('-q', '--quiet', action='store_true',
@@ -190,38 +215,43 @@ def create_args_parser():
sending_group = parser.add_argument_group("Sending options")
song_group = sending_group.add_mutually_exclusive_group()
song_group.add_argument('-sf', '--song-file', type=argparse.FileType('r'),
help='Song file data to send (default: \'{}\').'.format(DEFAULT_SONG_FILE))
sending_group.add_argument('-al', '--album-file', nargs='?', type=argparse.FileType('r'), const=DEFAULT_ALBUM_FILE,
help='Song file data to send (default: \'{}\').'.format(SONG_FILE))
sending_group.add_argument('-al', '--album-file', nargs='?', type=argparse.FileType('r'), const=ALBUM_FILE,
help='Enable sending album data. Optionally, precise the album data file (default: \'{}\')'
.format(DEFAULT_ALBUM_FILE))
sending_group.add_argument('-ar', '--artist-file', nargs='?', type=argparse.FileType('r'), const=DEFAULT_ARTIST_FILE,
.format(ALBUM_FILE))
sending_group.add_argument('-ar', '--artist-file', nargs='?', type=argparse.FileType('r'), const=ARTIST_FILE,
help='Enable sending artist data. Optionally, precise the artist data file (default: \'{}\')'
.format(DEFAULT_ARTIST_FILE))
song_group.add_argument('-s', '--song', action='store_false',
help='Disable sending song data')
.format(ARTIST_FILE))
# Mode
mode_group = parser.add_argument_group('Mode')
mode_group.add_argument('-A', '--ALL', action='store_true',
help='Send all possible data: song, artist and album')
help='Send all possible data: song, artist, album and suggest. Use default file if not specified')
mode_group.add_argument('-D', '--DELETE', action='store_true',
help='''Delete old index and create a new.
See -idx argument to set index name.
See -map arguement to set mapping file.''')
help='Delete index and create new. See -map arguement to set mapping file')
mode_group.add_argument('--no-song', action='store_true',
help='''Disable sending song data.
Not allowed with -A option.''')
mode_group.add_argument('--no-suggest', action='store_true',
help='Disable sending suggest data. Allowed with -A option')
# Mapping
mapping_group = parser.add_argument_group('Mapping files')
mode_group.add_argument('-ms', '--mapping-song', type=argparse.FileType('r'), const=DEFAULT_MAPPING_SONGS_FILE, nargs='?',
help='Mapping file for songs (default: \'{}\')'.format(DEFAULT_MAPPING_SONGS_FILE))
mode_group.add_argument('-mr', '--mapping-artist', type=argparse.FileType('r'), const=DEFAULT_ARTIST_FILE, nargs='?',
help='Mapping file for artists (default: \'{}\')'.format(DEFAULT_MAPPING_ARTISTS_FILE))
mode_group.add_argument('-ml', '--mapping-album', type=argparse.FileType('r'), const=DEFAULT_MAPPING_ALBUMS_FILE, nargs='?',
help='Mapping file for albums (default: \'{}\')'.format(DEFAULT_MAPPING_ALBUMS_FILE))
# CAUTION default values cannot be used because they necessarily activate the option
# QUESTION Use a for with a list of default mapping file?
mapping_group.add_argument('-ms', '--mapping-song', type=argparse.FileType('r'), const=MAPPING_SONGS_FILE, nargs='?',
help='Mapping file for songs (default: \'{}\')'.format(MAPPING_SONGS_FILE))
mapping_group.add_argument('-mr', '--mapping-artist', type=argparse.FileType('r'), const=ARTIST_FILE, nargs='?',
help='Mapping file for artists (default: \'{}\')'.format(MAPPING_ARTISTS_FILE))
mapping_group.add_argument('-ml', '--mapping-album', type=argparse.FileType('r'), const=MAPPING_ALBUMS_FILE, nargs='?',
help='Mapping file for albums (default: \'{}\')'.format(MAPPING_ALBUMS_FILE))
mapping_group.add_argument('-mg', '--mapping-suggest', type=argparse.FileType('r'), const=MAPPING_SUGGEST_FILE, nargs='?',
help='Mapping file for suggest (default: \'{}\')'.format(MAPPING_SUGGEST_FILE))
# Global Settings
g_settings_group = parser.add_argument_group('Global Settings')
g_settings_group.add_argument('-els', '--elasticsearch-url', default=ELASTICSEARCH_URL, nargs='?',
help="Elasticsearch URL (default: \'{}\')".format(ELASTICSEARCH_URL))
help="Elasticsearch URL.")
return parser
@@ -253,7 +283,7 @@ def delete_index(index_name, quiet=False):
res = requests.delete(url=ELASTICSEARCH_URL + index_name)
if res.status_code == 200:
if not quiet:
print(bcolors.OKGREEN + "Index deleted!" + bcolors.ENDC)
print(bcolors.OKGREEN + "Index '{}' deleted!".format(index_name) + bcolors.ENDC)
else:
print(bcolors.FAIL + "An error occured" + bcolors.ENDC)
if res.json()['error']['type'] == 'index_not_found_exception':
@@ -276,7 +306,7 @@ def put_mapping(index_name, mapping_file, quiet=False):
print(res.text + bcolors.ENDC)
else:
if not quiet:
print(bcolors.OKGREEN + "Mapping sent" + bcolors.ENDC)
print(bcolors.OKGREEN + "Mapping for '{}' sent".format(index_name) + bcolors.ENDC)
put_setting(index_name, 0, quiet)

View File

@@ -1,51 +1,7 @@
DELETE itunes-suggest
PUT /itunes-suggest
{
"settings": {
"analysis": {
"filter": {
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"english_stop": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"names": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"french_stop",
"english_stop"
]
}
}
}
},
"mappings": {
"properties": {
"artist_suggest": {
"type": "completion",
"search_analyzer": "names"
},
"artist": {
"type": "keyword"
},
"album_suggest": {
"type": "completion",
"search_analyzer": "names"
},
"album": {
"type": "keyword"
}
}
}
}
!./mapping.suggest.json
// It is also possible to specify an analyzer for ingesting => https://stackoverflow.com/questions/48304499/elasticsearch-completion-suggester-not-working-with-whitespace-analyzer
@@ -59,7 +15,7 @@ GET itunes-suggest/_analyze
GET itunes-suggest/_search
POST itunes-suggest/_search
GET itunes-suggest/_search
{
"_source" : "artist",
"suggest": {
@@ -72,7 +28,7 @@ POST itunes-suggest/_search
}
}
POST itunes-suggest/_search
GET itunes-suggest/_search
{
"_source" : "album",
"suggest": {
@@ -86,7 +42,7 @@ POST itunes-suggest/_search
}
}
POST itunes-suggest/_search
GET itunes-suggest/_search
{
"_source": ["album", "artist"],
"suggest": {
@@ -105,7 +61,7 @@ POST itunes-suggest/_search
}
}
POST itunes-suggest/_search
GET itunes-suggest/_search
{
"_source": ["album", "artist"],
"suggest": {
@@ -124,7 +80,7 @@ POST itunes-suggest/_search
}
}
POST itunes-suggest/_search
GET itunes-suggest/_search
{
"suggest": {
"ar-suggest": {

View File

@@ -1,37 +1,77 @@
import requests
import json
"""
Process files generated by iTunesParser to fill a suggester index.
Suggester index in ELS must be created before use.
The query to create the index can be found in suggester.es.
"""
import sys
import json
import requests
ELS_URL = 'http://localhost:9200'
INDEX = 'itunes-suggest'
class NoGoodDataException(Exception):
    """Raised when data can't be correctly analyzed."""
def get_tokens(data: str) -> list:
    """
    Query Elasticsearch to get the tokens of a string with a specific analyzer.

    Parameters
    ----------
    data: string
        String to be analysed to obtain the tokens

    Returns
    -------
    list
        A list of tokens. Empty list when `data` is empty or None
        (no request is sent in that case).

    Raises
    ------
    NoGoodDataException
        If no tokens are found in the ELS response, consider that the data
        is not correct for analysis.
    """
    # Nothing to analyse: skip the round trip to Elasticsearch
    if not data:
        return []

    query = {
        "analyzer": "names",  # TODO Parameterize analyzer ?
        "text": data
    }

    url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
    req = requests.get(url, json=query)
    # Decode the response body once and reuse it for the check, the error
    # report and the result.
    payload = req.json()

    if 'tokens' not in payload:
        print('ERROR: Not tokens in result')
        print('Input: ' + str(data))
        print('Request: ' + str(payload))
        raise NoGoodDataException('Data is not correct to get tokens')

    return [t['token'] for t in payload['tokens']]
def post_document(name: str, input: list, field_name: str) -> bool:
suggest_name = field_name + '_suggest'
def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str:
"""
Create suggestion document in Elasticsearch.
Parameters
----------
main_field_value : str
Value to put in the main field named by `main_field_name`
input_terms : list
List of suggestion term to put in document
main_field_name : str
Name of the main field, to fill with `main_field_value`
Returns
-------
str
Success: ID of created document
Fail (ret. status <> 201): None
"""
suggest_name = main_field_name + '_suggest'
element = {
field_name: name,
suggest_name: input
main_field_name: main_field_value,
suggest_name: input_terms
}
# Filter empty keys
@@ -43,13 +83,26 @@ def post_document(name: str, input: list, field_name: str) -> bool:
print('ELS Response KO')
print(resp.status_code)
print(resp.text)
return
return None
el_id = resp.json()['_id']
# print('Post_element - Element created: ' + el_id)
return el_id
def process_file(file_name: str, field_name: str) -> int:
def process_file(file_name: str, field_name: str, array_file: str = None) -> int:
"""
Process a JSON file with data
Parameters
----------
file_name: string
Path and name of file to analyze
field_name: string
Name of the field where to find the data to create the suggestion entries
array_file: string, Default: None
Name of an array field to analyze to create more suggestion entries.
Nothing if None
"""
print('Process file: ' + file_name)
with open(file_name, 'r') as o_file:
lines = o_file.readlines()
@@ -62,22 +115,28 @@ def process_file(file_name: str, field_name: str) -> int:
sys.stdout.flush()
sys.stdout.write("\b" * (40+1)) # return to start of line, after '['
data = json.loads(line)
if "Artist" in data:
if not "index" in data: # Exclude index line
try:
input = get_tokens(data[field_name])
post_document(name=data[field_name], input=input, field_name=field_name.lower())
suggests_entries = get_tokens(data[field_name])
if array_file and array_file in data and data[array_file]:
for key in data[array_file]:
suggests_entries.extend(get_tokens(key))
# TODO Input may contain the same value several times ==> use it to compute a score
post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower())
count += 1
except NoGoodDataException:
print('ERROR WITH DATA')
print(str(data))
print('File processed\n')
return count
if __name__ == '__main__':
    # Standalone run: process the album file, then the artist file, and
    # print the running total of processed documents after each one.
    created_docs = 0
    created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
    print('Created documents: ' + str(created_docs))
    # For artists, the 'Album Artist' array field is also analysed to
    # create more suggestion entries.
    created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
    print('Created documents: ' + str(created_docs))
    # TODO Created doc <> nb doc in ELS