Refactor send data argparse

Process suggestion according to option
Use send data to process suggestion
2021-09-05 02:21:24 +02:00 · 2021-09-05 02:13:47 +02:00 · 2021-08-31 02:31:32 +02:00 · 2021-08-30 19:32:12 +02:00 · 2021-08-30 19:13:30 +02:00 · 2021-08-23 01:22:58 +02:00
5 changed files with 218 additions and 121 deletions
--- a/iTunesParser.py
+++ b/iTunesParser.py
@@ -147,7 +147,8 @@ class ITunesParser:
                'Play Count': 0,
                'Rating': 0,
                'Genre': set(),
-                'Album': set()
+                'Album': set(),
+                'Album Artist': set()
                }

        # Compute information
@@ -168,6 +169,9 @@ class ITunesParser:
        if 'Album' in track:
            self._artists[akey]['Album'].add(track['Album'])

+        if 'Album Artist' in track:
+            self._artists[akey]['Album Artist'].add(track['Artist'])
+
    def _process_album(self, track):
        """
            Process albums in the track part of library and return a JSON formated for a bulk ELS request
--- a/mapping.suggest.json
+++ b/mapping.suggest.json
@@ -0,0 +1,48 @@
+{
+    "settings": {
+        "index": {
+            "number_of_replicas": 0
+        },
+        "analysis": {
+            "filter": {
+                "french_stop": {
+                    "type": "stop",
+                    "stopwords": "_french_"
+                },
+                "english_stop": {
+                    "type": "stop",
+                    "stopwords": "_english_"
+                }
+            },
+            "analyzer": {
+                "names": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "asciifolding",
+                        "french_stop",
+                        "english_stop"
+                    ]
+                }
+            }
+        }
+    },
+    "mappings": {
+        "properties": {
+            "artist_suggest": {
+                "type": "completion",
+                "search_analyzer": "names"
+            },
+            "artist": {
+                "type": "keyword"
+            },
+            "album_suggest": {
+                "type": "completion",
+                "search_analyzer": "names"
+            },
+            "album": {
+                "type": "keyword"
+            }
+        }
+    }
+}
--- a/send_data.py
+++ b/send_data.py
@@ -10,6 +10,8 @@ import json
 import time
 import requests

+from suggester import process_file
+
 class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
@@ -22,16 +24,18 @@ class bcolors:
    UNDERLINE = '\033[4m'

 # Default file names
-DEFAULT_SONG_FILE = 'es-songs.json'
-DEFAULT_ALBUM_FILE = 'es-albums.json'
-DEFAULT_ARTIST_FILE = 'es-artists.json'
-DEFAULT_MAPPING_SONGS_FILE = 'mapping.songs.json'
-DEFAULT_MAPPING_ARTISTS_FILE = 'mapping.artists.json'
-DEFAULT_MAPPING_ALBUMS_FILE = 'mapping.albums.json'
+SONG_FILE = 'es-songs.json'
+ALBUM_FILE = 'es-albums.json'
+ARTIST_FILE = 'es-artists.json'
+MAPPING_SONGS_FILE = 'mapping.songs.json'
+MAPPING_ARTISTS_FILE = 'mapping.artists.json'
+MAPPING_ALBUMS_FILE = 'mapping.albums.json'
+MAPPING_SUGGEST_FILE = 'mapping.suggest.json'

 SONG_INDEX = 'itunes-songs'
 ALBUM_INDEX = 'itunes-albums'
 ARTIST_INDEX = 'itunes-artists'
+SUGGEST_INDEX = 'itunes-suggest'
 # TODO Put variables in a config files or in a python library

 # Global values / set as default values
@@ -47,8 +51,8 @@ def main():

    args = create_args_parser().parse_args()

-    if not args.song and args.ALL:
-        print(__file__ + ': error: argument -A/--ALL: not allowed with argument -s/--song')
+    if args.ALL and args.no_song:
+        print(__file__ + ': error: argument -A/--ALL: not allowed with argument --no-song')
        sys.exit(-1)

    # Overloaded setting value
@@ -68,16 +72,16 @@ def main():
    check_is_ok = []

    # Send song data
-    if args.song or args.ALL:
+    if not args.no_song:
        if args.DELETE:
-            mapping_song = load_file(args.mapping_song, DEFAULT_MAPPING_SONGS_FILE)
+            mapping_song = load_file(args.mapping_song, MAPPING_SONGS_FILE)
            if not args.quiet:
                print("Mapping of song index file: '{}'".format(mapping_song.name))

            delete_index(SONG_INDEX, args.quiet)
            put_mapping(SONG_INDEX, mapping_song, args.quiet)

-        song_file = load_file(args.song_file, DEFAULT_SONG_FILE)
+        song_file = load_file(args.song_file, SONG_FILE)
        if not args.quiet:
            print("Song file: '{}'".format(song_file.name))

@@ -96,7 +100,7 @@ def main():
    # Send artist data
    if args.artist_file or args.ALL:
        if args.DELETE:
-            mapping_artist = load_file(args.mapping_artist, DEFAULT_MAPPING_ARTISTS_FILE)
+            mapping_artist = load_file(args.mapping_artist, MAPPING_ARTISTS_FILE)
            if not args.quiet:
                print("Mapping of artist index file: '{}'".format(mapping_artist.name))

@@ -107,7 +111,7 @@ def main():
        if not artist_file:
            if not args.quiet:
                print('No artist file specified, take default file...')
-            artist_file = open(DEFAULT_ARTIST_FILE, 'r')
+            artist_file = open(ARTIST_FILE, 'r')

        if not args.quiet:
            print("Artist file: '{}'".format(artist_file.name))
@@ -122,7 +126,7 @@ def main():

    if args.album_file or args.ALL:
        if args.DELETE:
-            mapping_album = load_file(args.mapping_album, DEFAULT_MAPPING_ALBUMS_FILE)
+            mapping_album = load_file(args.mapping_album, MAPPING_ALBUMS_FILE)
            if not args.quiet:
                print("Mapping of artist index file: '{}'".format(mapping_album.name))

@@ -133,7 +137,7 @@ def main():
        if not album_file:
            if not args.quiet:
                print('No album file specified, take default file...')
-            album_file = open(DEFAULT_ALBUM_FILE, 'r')
+            album_file = open(ALBUM_FILE, 'r')

        if not args.quiet:
            print("Take file '{}' to send song data".format(album_file.name))
@@ -145,6 +149,28 @@ def main():
        else:
            print('Album sent')

+    if not args.no_suggest:
+        print("Process suggestion:")
+        if args.DELETE:
+            delete_index(SUGGEST_INDEX, args.quiet)
+
+        if not args.ALL and not args.album_file and not args.artist_file:
+            print('Only song file processed. No suggestion to process.')
+        else:
+            if args.DELETE:
+                mapping_suggest = load_file(args.mapping_suggest, MAPPING_SUGGEST_FILE)
+                if not args.quiet:
+                    print("Mapping of suggest index file: '{}'".format(mapping_suggest.name))
+                put_mapping(SUGGEST_INDEX, mapping_suggest, args.quiet)
+
+            suggs_docs = 0
+            if args.album_file or args.ALL:
+                suggs_docs += process_file(ALBUM_FILE, 'Album')
+            print('Created suggestion documents: ' + str(suggs_docs))
+            if args.artist_file or args.ALL:
+                suggs_docs += process_file(ARTIST_FILE, 'Artist', 'Album Artist')
+            print('Created suggestion documents: ' + str(suggs_docs))
+
    print("I'm done!")
    if check_is_ok.count(False) > 0:
        print('Some problems occurs')
@@ -175,13 +201,12 @@ def create_args_parser():
        description='''
 Send JSON files formated for bulk Elasticsearch operation to an Elasticsearch.

-            By default: send song data enable, send album & artist data disabled.
-            Check that all the data has been sent.
+By default: send only song data. See option to send album/artist/suggest data.

-            Detect if index doesn't exist and create it with a mapping file (see -map and -idx argument).
-            Remeber : it's cumulative! If you want to remove songs/artits/albums,
-            you have to delete and re-create the index (use -D option).
-        '''
+Create index if -D option activated with a mapping file (see -map).
+
+It's cumulative! If you want to remove songs/artits/albums, you have to delete and re-create the index (use -D option).''',
+        formatter_class=argparse.RawTextHelpFormatter
    )
    # Bulk
    parser.add_argument('-q', '--quiet', action='store_true',
@@ -190,38 +215,43 @@ def create_args_parser():
    sending_group = parser.add_argument_group("Sending options")
    song_group = sending_group.add_mutually_exclusive_group()
    song_group.add_argument('-sf', '--song-file', type=argparse.FileType('r'),
-                            help='Song file data to send (default: \'{}\').'.format(DEFAULT_SONG_FILE))
-    sending_group.add_argument('-al', '--album-file', nargs='?', type=argparse.FileType('r'), const=DEFAULT_ALBUM_FILE,
+                            help='Song file data to send (default: \'{}\').'.format(SONG_FILE))
+    sending_group.add_argument('-al', '--album-file', nargs='?', type=argparse.FileType('r'), const=ALBUM_FILE,
                               help='Enable sending album data. Optionally, precise the album data file (default: \'{}\')'
-                               .format(DEFAULT_ALBUM_FILE))
-    sending_group.add_argument('-ar', '--artist-file', nargs='?', type=argparse.FileType('r'), const=DEFAULT_ARTIST_FILE,
+                               .format(ALBUM_FILE))
+    sending_group.add_argument('-ar', '--artist-file', nargs='?', type=argparse.FileType('r'), const=ARTIST_FILE,
                               help='Enable sending artist data. Optionally, precise the artist data file (default: \'{}\')'
-                               .format(DEFAULT_ARTIST_FILE))
-    song_group.add_argument('-s', '--song', action='store_false',
-                            help='Disable sending song data')
+                               .format(ARTIST_FILE))

    # Mode
    mode_group = parser.add_argument_group('Mode')
    mode_group.add_argument('-A', '--ALL', action='store_true',
-                            help='Send all possible data: song, artist and album')
+                            help='Send all possible data: song, artist, album and suggest. Use default file if not specified')
    mode_group.add_argument('-D', '--DELETE', action='store_true',
-                            help='''Delete old index and create a new.
-                            See -idx argument to set index name.
-                             See -map arguement to set mapping file.''')
+                            help='Delete index and create new. See -map arguement to set mapping file')
+    mode_group.add_argument('--no-song', action='store_true',
+                            help='''Disable sending song data.
+Not allowed with -A option.''')
+    mode_group.add_argument('--no-suggest', action='store_true',
+                            help='Disable sending suggest data. Allowed with -A option')

    # Mapping
    mapping_group = parser.add_argument_group('Mapping files')
-    mode_group.add_argument('-ms', '--mapping-song', type=argparse.FileType('r'), const=DEFAULT_MAPPING_SONGS_FILE, nargs='?',
-                            help='Mapping file for songs (default: \'{}\')'.format(DEFAULT_MAPPING_SONGS_FILE))
-    mode_group.add_argument('-mr', '--mapping-artist', type=argparse.FileType('r'), const=DEFAULT_ARTIST_FILE, nargs='?',
-                            help='Mapping file for artists (default: \'{}\')'.format(DEFAULT_MAPPING_ARTISTS_FILE))
-    mode_group.add_argument('-ml', '--mapping-album', type=argparse.FileType('r'), const=DEFAULT_MAPPING_ALBUMS_FILE, nargs='?',
-                            help='Mapping file for albums (default: \'{}\')'.format(DEFAULT_MAPPING_ALBUMS_FILE))
+    # CAUTION default values cannot be used because they necessarily activate the option
+    # QUESTION Use a for loop over a list of default mapping files?
+    mapping_group.add_argument('-ms', '--mapping-song', type=argparse.FileType('r'), const=MAPPING_SONGS_FILE, nargs='?',
+                               help='Mapping file for songs (default: \'{}\')'.format(MAPPING_SONGS_FILE))
+    mapping_group.add_argument('-mr', '--mapping-artist', type=argparse.FileType('r'), const=ARTIST_FILE, nargs='?',
+                               help='Mapping file for artists (default: \'{}\')'.format(MAPPING_ARTISTS_FILE))
+    mapping_group.add_argument('-ml', '--mapping-album', type=argparse.FileType('r'), const=MAPPING_ALBUMS_FILE, nargs='?',
+                               help='Mapping file for albums (default: \'{}\')'.format(MAPPING_ALBUMS_FILE))
+    mapping_group.add_argument('-mg', '--mapping-suggest', type=argparse.FileType('r'), const=MAPPING_SUGGEST_FILE, nargs='?',
+                               help='Mapping file for suggest (default: \'{}\')'.format(MAPPING_SUGGEST_FILE))

    # Global Settings
    g_settings_group = parser.add_argument_group('Global Settings')
    g_settings_group.add_argument('-els', '--elasticsearch-url', default=ELASTICSEARCH_URL, nargs='?',
-                                  help="Elasticsearch URL (default: \'{}\')".format(ELASTICSEARCH_URL))
+                                  help="Elasticsearch URL.")

    return parser

@@ -253,7 +283,7 @@ def delete_index(index_name, quiet=False):
    res = requests.delete(url=ELASTICSEARCH_URL + index_name)
    if res.status_code == 200:
        if not quiet:
-            print(bcolors.OKGREEN + "Index deleted!" + bcolors.ENDC)
+            print(bcolors.OKGREEN + "Index '{}' deleted!".format(index_name) + bcolors.ENDC)
    else:
        print(bcolors.FAIL + "An error occured" + bcolors.ENDC)
        if res.json()['error']['type'] == 'index_not_found_exception':
@@ -276,7 +306,7 @@ def put_mapping(index_name, mapping_file, quiet=False):
        print(res.text + bcolors.ENDC)
    else:
        if not quiet:
-            print(bcolors.OKGREEN + "Mapping sent" + bcolors.ENDC)
+            print(bcolors.OKGREEN + "Mapping for '{}' sent".format(index_name) + bcolors.ENDC)

    put_setting(index_name, 0, quiet)

--- a/suggester.es
+++ b/suggester.es
@@ -1,51 +1,7 @@
 DELETE itunes-suggest

 PUT /itunes-suggest
-{
-    "settings": {
-        "analysis": {
-            "filter": {
-                "french_stop": {
-                    "type": "stop",
-                    "stopwords": "_french_"
-                },
-                "english_stop": {
-                    "type": "stop",
-                    "stopwords": "_english_"
-                }
-            },
-            "analyzer": {
-                "names": {
-                    "tokenizer": "standard",
-                    "filter": [
-                        "lowercase",
-                        "asciifolding",
-                        "french_stop",
-                        "english_stop"
-                    ]
-                }
-            }
-        }
-    },
-    "mappings": {
-        "properties": {
-            "artist_suggest": {
-                "type": "completion",
-                "search_analyzer": "names"
-            },
-            "artist": {
-                "type": "keyword"
-            },
-            "album_suggest": {
-                "type": "completion",
-                "search_analyzer": "names"
-            },
-            "album": {
-                "type": "keyword"
-            }
-        }
-    }
-}
+!./mapping.suggest.json

 // Also possible to specify analyze for ingesting => https://stackoverflow.com/questions/48304499/elasticsearch-completion-suggester-not-working-with-whitespace-analyzer

@@ -59,7 +15,7 @@ GET itunes-suggest/_analyze

 GET itunes-suggest/_search

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source" : "artist",
    "suggest": {
@@ -72,7 +28,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source" : "album",
    "suggest": {
@@ -86,7 +42,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source": ["album", "artist"],
    "suggest": {
@@ -105,7 +61,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "_source": ["album", "artist"],
    "suggest": {
@@ -124,7 +80,7 @@ POST itunes-suggest/_search
    }
 }

-POST itunes-suggest/_search
+GET itunes-suggest/_search
 {
    "suggest": {
        "ar-suggest": {
--- a/suggester.py
+++ b/suggester.py
@@ -1,37 +1,77 @@
-import requests
-import json
+"""
+    Process files generated by iTunesParser to fill a suggester index.
+    Suggester index in ELS must be created before use.
+
+    See suggester.es for the query that creates the index.
+"""
+
 import sys
+import json
+import requests

 ELS_URL = 'http://localhost:9200'
 INDEX = 'itunes-suggest'

 class NoGoodDataException(Exception):
-    def __init__(self, message):
-        super().__init__(message)
+    """ Raise when data can't be correctly analyzed """

 def get_tokens(data: str) -> list:
+    """
+        Query Elasticsearch to get token for a string with a specific analyzer.
+        Throw an exception if no token found in ELS response.
+        Parameters
+        ----------
+        data: string
+            String to be analysed to obtain the tokens
+        Returns
+        -------
+        list
+            A list of token
+        Raises
+        ------
+        NoGoodDataException
+            If no tokens are found in the ELS responses, consider that the data is not correct for analysis.
+    """
    if not data:
        return []
    query = {
-        "analyzer": "names",
+        "analyzer": "names", # TODO Parameterize analyzer ?
        "text" : data
    }

    url = '{}/{}/_analyze'.format(ELS_URL, INDEX)
-    r = requests.get(url, json=query)
+    req = requests.get(url, json=query)

-    if not 'tokens' in r.json():
+    if not 'tokens' in req.json():
        print('ERROR: Not tokens in result')
        print('Input: ' + str(data))
-        print('Request: ' + str(r.json()))
+        print('Request: ' + str(req.json()))
        raise NoGoodDataException('Data is not correct to get tokens')
-    return [t['token'] for t in r.json()['tokens']]
+    return [t['token'] for t in req.json()['tokens']]

-def post_document(name: str, input: list, field_name: str) -> bool:
-    suggest_name = field_name + '_suggest'
+def post_document(main_field_value: str, input_terms: list, main_field_name: str) -> str:
+    """
+        Create suggestion document in Elasticsearch.
+
+        Parameters
+        ----------
+        main_field_value : str
+            Value to put in the main field named by `main_field_name`
+        input_terms : list
+            List of suggestion term to put in document
+        main_field_name : str
+            Name of the main field, to fill with `main_field_value`
+
+        Returns
+        -------
+        str
+            Success: ID of created document
+            Fail (ret. status <> 201): None
+    """
+    suggest_name = main_field_name + '_suggest'
    element = {
-        field_name: name,
-        suggest_name: input
+        main_field_name: main_field_value,
+        suggest_name: input_terms
    }

    # Filter empty keys
@@ -43,13 +83,26 @@ def post_document(name: str, input: list, field_name: str) -> bool:
        print('ELS Response KO')
        print(resp.status_code)
        print(resp.text)
-        return
+        return None

    el_id = resp.json()['_id']
    # print('Post_element - Element created: ' + el_id)
    return el_id

-def process_file(file_name: str, field_name: str) -> int:
+def process_file(file_name: str, field_name: str, array_file: str = None) -> int:
+    """
+        Process a JSON file with data
+
+        Parameters
+        ----------
+        file_name: string
+            Path and name of file to analyze
+        field_name: string
+            Name of the field where to find the data to create the suggestion entries
+        array_file: string, Default: None
+            Name of an array field to analyze to create more suggestion entries.
+            Nothing if None
+    """
    print('Process file: ' + file_name)
    with open(file_name, 'r') as o_file:
        lines = o_file.readlines()
@@ -62,22 +115,28 @@ def process_file(file_name: str, field_name: str) -> int:
        sys.stdout.flush()
        sys.stdout.write("\b" * (40+1)) # return to start of line, after '['
        data = json.loads(line)
-        if "Artist" in data:
+        if not "index" in data: # Exclude index line
            try:
-                input = get_tokens(data[field_name])
-                post_document(name=data[field_name], input=input, field_name=field_name.lower())
+                suggests_entries = get_tokens(data[field_name])
+
+                if array_file and array_file in data and data[array_file]:
+                    for key in data[array_file]:
+                        suggests_entries.extend(get_tokens(key))
+
+                # TODO Input have the same value several times ==> use to process a score
+                post_document(main_field_value=data[field_name], input_terms=suggests_entries, main_field_name=field_name.lower())
                count += 1
            except NoGoodDataException:
                print('ERROR WITH DATA')
                print(str(data))
    print('File processed\n')
-
    return count


 if __name__ == '__main__':
-    # Using readlines()
-    count = 0
-    count += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
-    count += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist')
-    print('Created documents: ' + str(count))
+    created_docs = 0
+    created_docs += process_file('/home/budd/workspace/iTunes/es-albums.json', 'Album')
+    print('Created documents: ' + str(created_docs))
+    created_docs += process_file('/home/budd/workspace/iTunes/es-artists.json', 'Artist', 'Album Artist')
+    print('Created documents: ' + str(created_docs))
+    # TODO Created doc <> nb doc in ELS
Author	SHA1	Message	Date
Maxence G. de Montauzan	95534c92b2	Refactor send data argparse	2021-09-05 02:21:24 +02:00
Maxence G. de Montauzan	0230bf260b	Process suggestion according to option	2021-09-05 02:13:47 +02:00
Maxence G. de Montauzan	928efb659e	Use send data to process suggestion. Not all suggestion documents are created in ELS. Need to refactor send data.	2021-08-31 02:31:32 +02:00
Maxence G. de Montauzan	88025347ec	Pylint suggester	2021-08-30 19:32:12 +02:00
Maxence G. de Montauzan	67e1f8bd0c	Working suggester ingester	2021-08-30 19:13:30 +02:00
Maxence G. de Montauzan	042c2558ae	Update ES files - POST -> get	2021-08-23 01:22:58 +02:00
Maxence G. de Montauzan	ad0487943a	Process for album. Adds too many uninteresting results, e.g. all albums for one artist => prevents finding interesting information.	2021-08-23 01:22:58 +02:00
Maxence G. de Montauzan	56050d0a49	Suggester: take Album Artist. But it's not OK in the dashboard: for example, searching 'ayache' (for Superbus) displays 'Superbus' as the result, and we don't understand why.	2021-08-23 01:22:58 +02:00