Speech Emotion Recognition Project Code in Python


Speech Emotion Recognition in Python

About the project: Speech Emotion Recognition (SER) is a classic machine learning problem that involves digital signal processing to extract meaningful audio features, followed by deep learning or traditional ML to classify those features.

Since training a complex neural network requires a large dataset and significant time, I will provide a single, complete Python file that sets up the standard workflow: feature extraction (MFCCs, Chroma, Mel spectrogram) followed by model training (a Multi-Layer Perceptron).
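To make that workflow concrete, here is a minimal sketch of what the feature-extraction step produces (it assumes the libraries installed below, and "some_clip.wav" is just a placeholder path, not part of the project). With librosa's default settings, the 40 MFCCs, 12 chroma bins, and 128 mel bands are averaged over time into a single 180-value vector per clip, which is exactly the kind of fixed-length input the MLP expects.

import librosa
import numpy as np

# Minimal sketch: one fixed-length feature vector per audio clip.
# "some_clip.wav" is a placeholder path, not part of the dataset.
y, sr = librosa.load("some_clip.wav", duration=2.5)

mfccs  = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)   # 40 values
chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)       # 12 values
mel    = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)    # 128 values

features = np.hstack([mfccs, chroma, mel])
print(features.shape)  # (180,) with librosa's defaults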

You will need to install a few specialized libraries for audio processing:


  pip install librosa soundfile numpy scikit-learn
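As an optional sanity check that the imports resolve before running the full script (the printed version numbers will vary on your machine):

  python -c "import librosa, soundfile, sklearn; print(librosa.__version__, soundfile.__version__, sklearn.__version__)"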
  

Below is the complete project file, which includes a synthetic-data fallback so you can run it immediately. Replace the placeholder path with the actual path to your dataset (such as the public RAVDESS or TESS datasets) to get meaningful accuracy.

How to Use This Project

  1. Save the Code: Save the code below as emotion_recognizer.py.
  2. Get a Dataset: The key to SER is the data. You need a publicly available dataset such as RAVDESS (Ryerson Audio-Visual Database of Emotional Speech and Song) or TESS (Toronto Emotional Speech Set). The RAVDESS filename convention is sketched below.
  3. Configure Path: Create a folder named audio_dataset in the same directory as your script and place the .wav files inside, or update the DATASET_PATH variable to point to your dataset's root directory.
  4. Run: Execute the script: python emotion_recognizer.py.

The script will handle the feature extraction, train the small neural network, and give you an accuracy score on the test set.
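Steps 2 and 3 work because the script infers the label from each filename. RAVDESS names every file with seven hyphen-separated two-digit codes, and the third code is the emotion; the minimal sketch below (using an example filename from that convention) shows the same parsing the script performs. TESS and other datasets use different naming, so adjust the parsing in load_data() accordingly.

import os

# RAVDESS convention: modality-channel-emotion-intensity-statement-repetition-actor.wav
filename = "03-01-05-02-02-02-12.wav"   # example filename only

emotion_code = os.path.basename(filename).split("-")[2]   # -> '05'
EMOTIONS = {'01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
            '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'}
print(EMOTIONS[emotion_code])   # angry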


import librosa
import soundfile
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# --- Configuration ---
# You need to replace this with the path to your audio dataset.
# The code assumes your files are named or structured such that the emotion 
# can be inferred from the filename or path (e.g., '03-01-05-02-02-02-12.wav' -> 'angry').
DATASET_PATH = "./audio_dataset/"
SAMPLE_RATE = 22050  # Not used directly below; soundfile reads each file at its native rate

# List of emotions supported by the model
EMOTIONS = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad', 
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}
# The specific emotion codes depend on the dataset (e.g., RAVDESS uses these codes)

def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    """
    Extracts features (MFCC, Chroma, MEL) from an audio file.

    :param file_name: Path to the audio file.
    :return: A numpy array of concatenated features.
    """
    try:
        with soundfile.SoundFile(file_name) as sound_file:
            X = sound_file.read(dtype="float32")
            # Mix down to mono if the file has more than one channel
            if X.ndim > 1:
                X = np.mean(X, axis=1)

            # Work on a fixed 2.5-second window so every clip yields the same feature length
            target_samples = int(sound_file.samplerate * 2.5)

            # Pad or truncate the audio data to that fixed length
            if len(X) < target_samples:
                X = np.pad(X, (0, target_samples - len(X)), mode='constant')
            elif len(X) > target_samples:
                X = X[:target_samples]
                
            sample_rate = sound_file.samplerate
            result = np.array([])
            
            # Mel-Frequency Cepstral Coefficients (MFCC) - Captures vocal tract shape
            if mfcc:
                mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
                result = np.hstack((result, mfccs))
            
            # Chroma Feature - Captures pitch and tone
            if chroma:
                stft = np.abs(librosa.stft(X))
                chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
                result = np.hstack((result, chroma))
            
            # Mel Spectrogram (MEL) - Captures energy distribution
            if mel:
                mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
                result = np.hstack((result, mel))
                
            return result
            
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")
        return None

def load_data(test_size=0.25):
    """
    Loads data and extracts features from all audio files in the dataset path.
    
    NOTE: This is set up to parse filenames from datasets like RAVDESS. 
    You may need to adjust the filename parsing logic for your specific dataset.
    """
    X, y = [], []
    
    # Use glob to find all .wav files recursively
    print(f"Searching for audio files in: {DATASET_PATH}...")
    file_paths = glob.glob(os.path.join(DATASET_PATH, '**/*.wav'), recursive=True)

    if not file_paths:
        print("--- WARNING: NO AUDIO FILES FOUND ---")
        print(f"Please create a directory named '{DATASET_PATH}' and place WAV files inside.")
        print("Using synthetic data for demonstration, but accuracy will be zero.")
        return create_synthetic_data(test_size)


    for file in file_paths:
        # Get the base filename (e.g., '03-01-05-02-02-02-12.wav')
        basename = os.path.basename(file)
        
        # Parse the emotion from the filename (e.g., '05' for 'angry' in RAVDESS)
        try:
            emotion_code = basename.split('-')[2]
            emotion = EMOTIONS[emotion_code]
        except (IndexError, KeyError):
            # Skip files that don't match the expected naming convention
            print(f"Skipping file: {basename} (cannot determine emotion from name)")
            continue
        
        # Extract features and append to lists
        feature = extract_feature(file)
        if feature is not None:
            X.append(feature)
            y.append(emotion)
            print(f"  Processed {file} -> Emotion: {emotion}")
            
    if not X:
        print("No valid data processed. Using synthetic data as fallback.")
        return create_synthetic_data(test_size)
    
    print(f"\nSuccessfully loaded {len(X)} samples across {len(set(y))} emotions.")
    return train_test_split(np.array(X), y, test_size=test_size, random_state=42)

def create_synthetic_data(test_size):
    """Fallback function to create synthetic data if no audio files are found."""
    np.random.seed(42)
    # 40 features (40 MFCCs) is a common starting point
    X_synth = np.random.rand(100, 40) 
    y_synth = np.random.choice(list(EMOTIONS.values()), 100)
    print("Synthetic data generated.")
    return train_test_split(X_synth, y_synth, test_size=test_size, random_state=42)

def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    """
    Initializes, trains, and evaluates the MLPClassifier model.
    """
    print("\n--- Training Model (MLPClassifier) ---")
    
    # Initialize the Multi-Layer Perceptron Classifier
    # hidden_layer_sizes: Defines the number and size of the hidden layers (e.g., two layers of 200 neurons each)
    # max_iter: Maximum number of iterations for the optimizer
    # alpha: L2 penalty (regularization term) parameter
    # solver: The optimizer used for weight optimization
    model = MLPClassifier(
        hidden_layer_sizes=(200, 200),
        max_iter=500,
        alpha=0.01,
        solver='adam',
        verbose=True,
        random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    print("\n--- Model Training Complete ---")

    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"\nModel Accuracy on Test Set: {accuracy * 100:.2f}%")
    
    return model

def predict_emotion(model, audio_path):
    """
    Processes a single new audio file and predicts its emotion.
    """
    print(f"\n--- Predicting Emotion for: {audio_path} ---")
    
    # 1. Extract features from the new audio file
    feature = extract_feature(audio_path, mfcc=True, chroma=True, mel=True)
    
    if feature is None:
        print("Prediction failed due to feature extraction error.")
        return None
        
    # 2. Reshape the single sample for prediction (needs to be 2D array)
    feature = feature.reshape(1, -1)
    
    # 3. Predict the emotion
    prediction = model.predict(feature)
    
    print(f"Predicted Emotion: {prediction[0].upper()}")
    return prediction[0]


if __name__ == '__main__':
    # 1. Load Data & Split
    X_train, X_test, y_train, y_test = load_data()
    
    # Abort if loading produced no usable data
    if len(X_train) == 0 or len(X_test) == 0:
        print("\n[CRITICAL ERROR] No data available after loading. Cannot proceed with training.")
        exit()
    
    # 2. Train and Evaluate
    trained_model = train_and_evaluate_model(X_train, X_test, y_train, y_test)
    
    # 3. Sample Prediction (Use the first item in the test set as a demonstration)
    if X_test.size > 0:
        print("\n--- Demonstration: Predicting a Sample from the Test Set ---")
        
        # Create a single feature set for prediction
        sample_feature = X_test[0].reshape(1, -1)
        
        # Predict
        sample_pred = trained_model.predict(sample_feature)
        sample_true = y_test[0]
        
        print(f"True Emotion: {sample_true.upper()}")
        print(f"Predicted Emotion: {sample_pred[0].upper()}")
        print("------------------------------------------------------------")
        
    # Note: For real-world use, you would run the predict_emotion function 
    # with a path to a brand new audio file here.
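    # For example (hypothetical path, uncomment and adjust to try it):
    # predict_emotion(trained_model, "path/to/new_recording.wav")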

  


