Music Recommendation Engine Project code in Python

Music Recommendation Engine in Python

Visit pythonforbiginners.com to discover python tutorials

About the project: Building a Music Recommendation Engine is a great way to demonstrate fundamental machine learning concepts like Content-Based Filtering and Cosine Similarity.

Since we cannot rely on external databases, we've created a complete, single-file Python script that uses a simulated music catalog. The engine works by analyzing the metadata (Genre, Artist, Mood, Keywords) of a song, vectorizing that data, and then finding the other songs in the catalog whose vectors are most similar to it, as measured by cosine similarity.

You will need the standard data science libraries:


  pip install pandas numpy scikit-learn

Project Level: Advanced

Below is the complete project code:

This code performs the essential steps of a Content-Based Recommendation Engine:

Feature Combination: Merges the genre, mood, keywords, and artist into a single descriptive string for each song.
Vectorization (TfidfVectorizer): Converts those text descriptions into numerical vectors.
Similarity Matrix (cosine_similarity): Calculates how similar every song's vector is to every other song's vector.
Recommendation: Finds the highest similarity scores for a chosen song and returns the corresponding titles.

Here is the complete code snippet:


import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Silence FutureWarning messages for cleaner console output.
# NOTE: the filter carries no module restriction, so it hides FutureWarning
# from every library used in this process, not only scikit-learn.
warnings.filterwarnings("ignore", category=FutureWarning)

# --- 1. Synthetic Music Catalog Data ---
# Stand-in for data that would normally come from a database or CSV file.
# Songs are written one per row so that a song's fields cannot drift out of
# alignment; the rows are then transposed into the column-oriented dict
# (one list per field) that the rest of the script consumes.
_CATALOG_FIELDS = ('Title', 'Artist', 'Genre', 'Mood', 'Keywords')

_CATALOG_ROWS = (
    ("Galactic Drift", "Astro Funk", "Funk, Electronic",
     "Energetic, Danceable", "synthesizer bass groove retro"),
    ("Ocean View", "Ella Groove", "Jazz, Bossa Nova",
     "Relaxing, Calm", "trumpet saxophone smooth rhythm"),
    ("Urban Pulse", "The City Vibe", "Hip Hop, Electronic",
     "Driving, Confident", "heavy beat street vibe synth"),
    ("Coffee Shop Jive", "Jazzy Keys", "Jazz",
     "Cozy, Mellow", "piano soft warm vinyl"),
    ("Desert Wind Song", "Cactus Jack", "Folk, Acoustic",
     "Wanderlust, Reflective", "guitar harmonica simple roadtrip"),
    ("Rainy Day Blues", "Sorrowful Seven", "Blues, Jazz",
     "Melancholy, Pensive", "slow minor key smoky bar"),
    ("Power Surge", "Metal Head", "Rock, Metal",
     "Aggressive, Powerful", "fast distorted heavy drums"),
    ("Midnight Stroll", "Lofi Loop", "Lofi, Chill",
     "Peaceful, Focused", "lo-fi study concentration beats"),
    ("Ancient Echoes", "World Fusion", "World, Instrumental",
     "Spiritual, Epic", "percussion flute global chant"),
    ("Future Bass Drop", "DJ Cypher", "EDM, Dubstep",
     "Excited, Hypnotic", "sub bass heavy drop dance"),
)

# zip(*rows) turns the per-song tuples into per-field columns.
MUSIC_CATALOG = {
    field: list(column)
    for field, column in zip(_CATALOG_FIELDS, zip(*_CATALOG_ROWS))
}

def create_recommendation_engine(catalog=None):
    """
    Build the content-based recommendation engine from a song catalog.

    The text metadata of each song is joined into one string, vectorized
    with TF-IDF, and compared pairwise with cosine similarity.

    :param catalog: Optional catalog as a dict of equal-length columns or a
        DataFrame. It must provide 'Title', 'Artist', 'Genre', 'Mood' and
        'Keywords'. Defaults to the module-level ``MUSIC_CATALOG``.
    :return: A tuple ``(df, indices, cosine_sim)`` where ``df`` is the
        catalog DataFrame (with an added 'combined_features' column),
        ``indices`` is a Series mapping each title to its row position
        (first occurrence wins for repeated titles), and ``cosine_sim`` is
        the square song-by-song similarity matrix.
    """
    source = MUSIC_CATALOG if catalog is None else catalog

    # reset_index returns a fresh frame with a 0..n-1 index, so index labels
    # double as row positions in the similarity matrix and a DataFrame
    # supplied by the caller is not modified in place.
    df = pd.DataFrame(source).reset_index(drop=True)

    # --- 2. Feature Engineering ---
    # Join the text fields into a single string per song. All four fields
    # are concatenated once each, so they carry equal weight before TF-IDF.
    feature_columns = ['Genre', 'Mood', 'Keywords', 'Artist']
    df['combined_features'] = df[feature_columns].astype(str).agg(' '.join, axis=1)

    print("Catalog loaded. Total songs:", len(df))
    print("Combined features created successfully.")

    # --- 3. Vectorization (TF-IDF) ---
    # TF-IDF (Term Frequency-Inverse Document Frequency) weighs how important
    # a word is to one song relative to the whole catalog.
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['combined_features'])

    print(f"TF-IDF Matrix created. Shape: {tfidf_matrix.shape}")

    # --- 4. Similarity Calculation ---
    # Cosine of the angle between every pair of song vectors.
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    print("Cosine Similarity Matrix computed.")

    # Map each title to its row position. De-duplicate on the *titles* (the
    # Series index): calling drop_duplicates() here would compare the row
    # numbers, which are always unique, and so would never remove a repeated
    # title.
    indices = pd.Series(df.index, index=df['Title'])
    indices = indices[~indices.index.duplicated(keep='first')]

    return df, indices, cosine_sim

def recommend_songs(song_title, df, indices, cosine_sim, num_recommendations=5):
    """
    Print and return the catalog songs most similar to ``song_title``.

    :param song_title: Title of the song the user likes; it must match a key
        of ``indices`` exactly.
    :param df: Catalog DataFrame with at least 'Title', 'Artist', 'Genre'
        and 'Mood' columns; rows are addressed by position.
    :param indices: Mapping (typically a pandas Series) from song title to
        the row position of that song in ``df`` and ``cosine_sim``.
    :param cosine_sim: Square matrix in which ``cosine_sim[i][j]`` is the
        similarity between the songs at row positions ``i`` and ``j``.
    :param num_recommendations: Maximum number of songs to return. Zero or a
        negative value yields an empty list.
    :return: A list of dicts, best match first, with the keys 'rank',
        'title', 'artist', 'genre' and 'similarity_score' (a string
        formatted to four decimals). An empty list is returned when the
        title is not in the catalog.
    """
    print(f"\n--- Generating Recommendations for: '{song_title}' ---")

    if song_title not in indices:
        print(f"Error: Song '{song_title}' not found in the catalog.")
        print("Please check the spelling or choose from the catalog.")
        return []

    idx = indices[song_title]
    # A Series with repeated titles returns several positions for one key;
    # fall back to the first occurrence instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every *other* song with its similarity to the chosen song. The
    # chosen song is excluded by position rather than by assuming it sorts
    # first: a song with identical features ties at the top score and could
    # otherwise push the chosen song into its own recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Stable sort: songs with equal scores keep their catalog order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp at zero so a negative request cannot turn into a "drop from the
    # end" slice that returns nearly the whole catalog.
    top_matches = candidates[:max(num_recommendations, 0)]

    base_song = df.iloc[idx]
    print(f"Base Genre: {base_song['Genre']}")
    print(f"Base Mood: {base_song['Mood']}")
    print("\nTop Recommendations:")

    recommended_list = []
    for rank, (position, score) in enumerate(top_matches, 1):
        # Read the row by position directly; looking it up again by title
        # would pick the wrong row whenever two songs share a title.
        song = df.iloc[position]
        recommended_list.append({
            'rank': rank,
            'title': song['Title'],
            'artist': song['Artist'],
            'genre': song['Genre'],
            'similarity_score': f"{score:.4f}"
        })
        print(f"{rank}. {song['Title']} by {song['Artist']} (Genre: {song['Genre']}, Score: {score:.4f})")

    return recommended_list


if __name__ == "__main__":
    # Build the engine once; every demo query below reuses the same catalog,
    # title lookup and similarity matrix.
    df_catalog, indices_map, similarity_matrix = create_recommendation_engine()

    # Demo queries, run in order:
    #   "Galactic Drift"  - Electronic/Dance (Funk, Electronic, Energetic)
    #   "Midnight Stroll" - Chill/Study (Lofi, Chill)
    #   "Rainy Day Blues" - Blues/Jazz
    demo_titles = ("Galactic Drift", "Midnight Stroll", "Rainy Day Blues")

    for demo_title in demo_titles:
        recommend_songs(
            demo_title,
            df_catalog,
            indices_map,
            similarity_matrix,
            num_recommendations=3,
        )