iTunes/iTunesParser.py

#!/usr/bin/env python

"""
Parse iTunes library and produce JSON adapted files to send to Elasticsearch

Rating note:
    For album and artist data, 'Rating' is the average rating over *all* songs in the album or by the artist.
    So, if 10 songs of an album are rated and 2 are not, 'Rating' will be the sum of the ratings divided by 12.

    TODO: Add information to store the number of rated songs, and a 'Rating' computed over rated songs only.

Parses an iTunes library XML file and generates a JSON file
for use in the D3.js JavaScript library.

    Example Track info:
    {
        'Album': 'Nirvana',
        'Persistent ID': 'A50FE1436726815C',
        'Track Number': 4,
        'Location': 'file://localhost/Users/foo/Music/iTunes/iTunes%20Music/Nirvana/Nirvana/04%20Sliver.mp3',
        'File Folder Count': 4,
        'Album Rating Computed': True,
        'Total Time': 134295,
        'Sample Rate': 44100,
        'Genre': 'Rock/Alternative',
        'Bit Rate': 236,
        'Kind': 'MPEG audio file',
        'Name': 'Sliver',
        'Artist': 'Nirvana',
        'Date Added': datetime.datetime(2006, 10, 11, 4, 31, 38),
        'Album Rating': 60,
        'Rating': 40,
        'Date Modified': datetime.datetime(2009, 7, 18, 4, 57, 41),
        'Library Folder Count': 1,
        'Year': 2002,
        'Track ID': 7459,
        'Size': 3972838,
        'Track Type': 'File',
        'Play Count': 2,
        'Play Date UTC': datetime.datetime(2009, 7, 18, 5, 00, 00)
    }

"""

import datetime
import io
import json
import os
import plistlib
import sys
import argparse
import hashlib


class SetEncoder(json.JSONEncoder):
    """
        JSON encoder that additionally understands sets and datetimes.

        A set is emitted as a JSON array and a datetime.datetime as its
        ISO 8601 string; every other unsupported type is left to the base
        encoder, which raises TypeError.
    """

    def default(self, obj):
        """
            Return a JSON-serializable replacement for obj
        """
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        if isinstance(obj, set):
            return [member for member in obj]
        # Not one of ours: let the stock encoder report the error.
        return super().default(obj)


class ITunesParser:
    """
        Parse an iTunes Library and produce JSON - for ELS

        Songs are returned by to_json() as the text of an Elasticsearch bulk
        request: one action line followed by one document line per song.
        Per-artist and per-album aggregates are collected while the songs are
        processed and written, in the same layout, to artists_file and
        albums_file.
    """

    def __init__(self, library_file, artists_file='es-artist-data.json',
                 albums_file='es-albums-data.json'):
        """
            library_file: path of the iTunes library plist to read.
            artists_file: path the artist aggregates are written to.
            albums_file: path the album aggregates are written to.
            Relative paths are resolved against the current working directory.
        """
        self._albums = {}
        self._artists = {}
        self.library_file = library_file
        self.artists_file = artists_file
        self.albums_file = albums_file

    def to_json(self):
        """
            Process the whole library.

            Returns the bulk request text for the songs and, as a side effect,
            writes the artist and album bulk files.
        """
        ret = self._process_songs()

        self._write_artists()
        self._write_albums()

        return ret

    def _read_tracks(self):
        """
            Read library and return Tracks key of dict
        """
        # The context manager closes the library even if parsing fails.
        with open(self.library_file, 'rb') as library:
            plist = plistlib.load(library)
        return plist['Tracks']

    def _process_songs(self):
        """
            Return the songs as the text of an ELS Bulk request (newline
            delimited JSON: an action line then a document line per song).

            Also rebuilds the artist and album aggregates from scratch, so
            calling it again does not count any track twice.
            Raises KeyError if a music track has no 'Persistent ID'.
        """
        tracks = self._read_tracks()

        # Start from a clean slate: aggregates describe this run only.
        self._artists = {}
        self._albums = {}

        lines = []
        for track in tracks.values():
            # Filter out any non-music (a track without a type is skipped too)
            if track.get('Track Type') != 'File':
                continue
            if 'Podcast' in track or 'Has Video' in track:
                continue

            json_track_index = {
                "index": {"_index": "itunessongs", "_type": "song", "_id": track['Persistent ID']}
            }

            # Retrieve for each track artist information
            self._process_artist(track)
            # Retrieve for each track album information
            self._process_album(track)

            lines.append(json.dumps(json_track_index, indent=None, cls=SetEncoder))
            lines.append(json.dumps(track, indent=None, cls=SetEncoder))

        # Every line, including the last one, is newline terminated.
        return ''.join(line + "\n" for line in lines)

    def _update_stats(self, entry, track):
        """
            Fold one track into an artist or album entry: running average of
            'Rating' (an unrated track counts as 0), 'Track Count',
            'Play Count' and the set of genres.
        """
        # The average must be computed before 'Track Count' is incremented.
        rating = self.calc_rating(track.get('Rating', 0), entry['Rating'], entry['Track Count'])

        entry['Track Count'] += 1
        entry['Rating'] = rating
        entry['Play Count'] += track.get('Play Count', 0)

        if 'Genre' in track:
            # Split up the Genres ('Rock/Alternative' -> 'Rock', 'Alternative')
            entry['Genre'].update(track['Genre'].split('/'))

    def _process_artist(self, track):
        """
            Add the track to the aggregate of its artist.
            Tracks without an 'Artist' key are ignored. Returns nothing.
        """
        if 'Artist' not in track:
            return

        akey = track['Artist']
        # Add artist
        if akey not in self._artists:
            self._artists[akey] = {
                'Persistent ID': self.calc_id(akey),
                'Name': akey,
                'Track Count': 0,
                'Play Count': 0,
                'Rating': 0,
                'Genre': set()
            }

        self._update_stats(self._artists[akey], track)

    def _process_album(self, track):
        """
            Add the track to the aggregate of its album.
            Tracks without an 'Album' key are ignored. Returns nothing.

            NOTE: albums are keyed by title only, so albums sharing a title
            are merged into one entry whatever their artist.
        """
        if 'Album' not in track:
            return

        akey = track['Album']
        if akey not in self._albums:
            self._albums[akey] = {
                'Persistent ID': self.calc_id(akey),
                'Name': akey,
                'Track Count': 0,
                'Play Count': 0,
                'Rating': 0,
                'Genre': set(),
                'Artist': set()
            }

        album = self._albums[akey]
        self._update_stats(album, track)

        if 'Artist' in track:
            album['Artist'].add(track['Artist'])

        if 'Album Rating' in track:
            # The value of the last processed track of the album wins.
            album['Album Rating'] = track['Album Rating']
            album['Album Rating Computed'] = True

    def _write_bulk_file(self, path, doc_type, entries):
        """
            Write entries (artist or album aggregates) to path as an ELS bulk
            request, UTF-8 encoded, using doc_type as the document '_type'.

            The 'Rating' of every entry is rounded in place before writing.
        """
        # 'with' guarantees the file is closed even if a write fails.
        with io.open(path, 'wb') as bulk_file:
            for entry in entries.values():
                entry['Rating'] = round(entry['Rating'])
                json_track_index = {
                    "index": {"_index": "itunessongs", "_type": doc_type, "_id": entry['Persistent ID']}
                }
                for document in (json_track_index, entry):
                    line = json.dumps(document, indent=None, cls=SetEncoder) + "\n"
                    bulk_file.write(line.encode('UTF-8'))

    def _write_artists(self):
        """
            Write artists data to another JSON file
        """
        self._write_bulk_file(self.artists_file, "artist", self._artists)

    def _write_albums(self):
        """
            Write albums data to another JSON file
        """
        self._write_bulk_file(self.albums_file, "album", self._albums)

    @classmethod
    def calc_rating(cls, added_value, current_rating, count):
        """
            Calculate average rating from a current rating, a rating value to add and the number of elements

            count is the number of elements already averaged in current_rating.
        """
        return (current_rating * count + added_value) / (count + 1)

    @classmethod
    def calc_id(cls, key):
        """
            Calculate a MD5 sum from a key as ID

            The key is UTF-8 encoded; the hexadecimal digest is returned.
        """
        return hashlib.md5(key.encode('UTF-8')).hexdigest()

#### main block ####

# Default input & output files
# The library is looked up in the current working directory; the output file
# is placed next to this script (its name keeps the leading path separator).
DEFAULT_LIBRARY_FILE_NAME = 'iTunesLibrary.xml'
DEFAULT_OUTPUT_FILE_NAME = '/es-music-data.json'
DEFAULT_LIBRARY_FILE = os.path.expanduser(DEFAULT_LIBRARY_FILE_NAME)
DEFAULT_OUTPUT_FILE = os.path.dirname(os.path.realpath(__file__)) + DEFAULT_OUTPUT_FILE_NAME

# Get options
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file', default=DEFAULT_LIBRARY_FILE,
                    help='iTunes Library XML file path (default: ./' + DEFAULT_LIBRARY_FILE_NAME + ')')
# The help text names the real default location: the script's directory,
# not the current working directory.
parser.add_argument('-o', '--output', default=DEFAULT_OUTPUT_FILE,
                    help='Output to file (default: ' + DEFAULT_OUTPUT_FILE_NAME.lstrip('/') +
                         " in this script's directory)")
parser.add_argument('-c', '--console', action='store_true',
                    help='Output to console instead of file')
# parser.add_argument('-v', '--verbose', action='store_true',
                    # help='Verbose output')

if __name__ == '__main__':
    args = parser.parse_args()

    itunes_parser = ITunesParser(args.file)
    output = itunes_parser.to_json()

    if args.console:
        print(output)
    else:
        # Encode unconditionally: branching on the interpreter major version
        # could leave the file empty while still reporting success.
        with io.open(args.output, 'wb') as outfile:
            outfile.write(output.encode('UTF-8'))
        print('JSON data written to: ' + args.output)