import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.covariance import LedoitWolf

# Seed NumPy's global RNG so every random draw below is reproducible, and
# print arrays with two decimals and no scientific notation.
seed = 42
np.set_printoptions(precision=2, suppress=True)
np.random.seed(seed)

# m observations of n features, drawn as independent standard normals.
m = 10_000
n = 3
Z = np.random.randn(m, n)

# NOTE(review): this second draw overwrites the Z above, so the first draw is
# unused. It still advances the global RNG, so removing either line changes
# every random number generated afterwards (and the printed outputs).
Z = np.random.randn(m, n)

# Lower-triangular mixing matrix; X = mu + L Z then has covariance L L^T.
L = np.array([
    [1, 0, 0], 
    [2, 1, 0], 
    [-3, 1, 1]
])
sigma = L @ L.T
# NOTE(review): L already has ones on its diagonal, so this call changes
# nothing; it also runs after sigma has been computed.
np.fill_diagonal(L, 1)
mu = np.random.randn(n, 1)

# X has shape (n, m): one feature per row, which is the layout np.cov expects
# by default. mu (n, 1) broadcasts across the m columns.
X = mu + L @ Z.T
print("Theoretical covariance matrix:")
print(sigma)
print("Sample covariance matrix:")
print(np.cov(X))

Theoretical covariance matrix:
[[ 1  2 -3]
 [ 2  5 -5]
 [-3 -5 11]]
Sample covariance matrix:
[[ 1.01  2.04 -3.02]
 [ 2.04  5.08 -5.06]
 [-3.02 -5.06 10.99]]

def bootstrap_eigenvalues(
    X: np.ndarray, 
    B: int = 100, 
    na_fractions: np.ndarray = np.linspace(0.0, 0.8, 9), 
    shrinkage: bool = False, 
    imputation: bool = False, 
    imputation_method: str = "mean",
    bootstrap: bool = False
):
    """
    Estimate covariance eigenvalues of X under randomly injected missing values.

    For every fraction in ``na_fractions`` the function repeats ``B`` times:
    blank out entries of X independently with that probability, optionally
    impute them, estimate the covariance matrix and compute its eigenvalues.

    Args:
        X: The data matrix of shape (n, m): one feature per row, one
            observation per column.
        B: The number of repetitions per missing-value fraction.
        na_fractions: The fractions of missing values to use.
        shrinkage: Whether to use Ledoit Wolf shrinkage, fitted on the rows
            without any missing value. Takes precedence over ``bootstrap``.
        imputation: Whether to impute missing values before estimating.
        imputation_method: The method to impute missing values, "mean" or
            "median" (per feature).
        bootstrap: Whether to average the eigenvalues over 100 random
            half-size subsamples of the rows without any missing value.
    Returns:
        A tuple ``(eigenvalues_samples, na_fraction_when_negative,
        removed_fraction_when_negative)``:
        eigenvalues_samples: array of shape (n, B, len(na_fractions)).
            Eigenvalues are in the order returned by ``np.linalg.eigvals``,
            which is not guaranteed to be sorted.
        na_fraction_when_negative: the missing-value fraction of every
            repetition that produced at least one negative eigenvalue.
        removed_fraction_when_negative: for those same repetitions, the
            fraction of observations that contain at least one missing value
            (i.e. that ``dropna`` would remove).
    Raises:
        AssertionError: If ``imputation_method`` is not "mean" or "median".
    """
    assert imputation_method in ["mean", "median"], "Invalid imputation method"

    # X holds features in rows, so this is the number of eigenvalues per sample.
    n_features = X.shape[0]
    # Number of subsamples averaged in the bootstrap branch.
    B_prime = 100

    eigenvalues_samples = []
    removed_fraction_when_negative, na_fraction_when_negative = [], []
    for na_frac in na_fractions:
        eigenvalues_na_fraction = []
        for _ in range(B):
            # Introduce missing values: True keeps the entry, False blanks it.
            keep_mask = np.random.choice([True, False], size=X.shape, p=[1-na_frac, na_frac])
            X_na = np.where(keep_mask, X, np.nan)

            # Observations as rows, features as columns.
            df = pd.DataFrame(X_na.T)

            # Perform imputation if needed
            if imputation:
                if imputation_method == "mean":
                    df = df.fillna(df.mean())
                elif imputation_method == "median":
                    df = df.fillna(df.median())

            # Calculate eigenvalues. All three estimators share one fallback:
            # if the covariance matrix cannot be decomposed (e.g. it contains
            # NaN), report it and use a zero vector with one entry per feature.
            try:
                if shrinkage: # Ledoit-Wolf Shrinkage
                    cov = LedoitWolf().fit(df.dropna()).covariance_
                    eigenvalues = np.linalg.eigvals(cov)
                elif bootstrap: # Bootstrap the Data with Missing Values
                    # Keep `df` untouched so the removed fraction computed
                    # below still refers to the data before dropping rows.
                    df_complete = df.dropna()
                    bootstrapped_eigenvalues = [
                        np.linalg.eigvals(df_complete.sample(frac=0.5).cov().to_numpy())
                        for _ in range(B_prime)
                    ]
                    eigenvalues = np.array(bootstrapped_eigenvalues).mean(axis=0)
                else:
                    # DataFrame.cov() skips missing values pair by pair.
                    cov = df.cov().to_numpy()
                    eigenvalues = np.linalg.eigvals(cov)
            except np.linalg.LinAlgError as e:
                print(f"Error calculating eigenvalues for na fraction {na_frac}: {e}")
                eigenvalues = np.zeros(n_features)

            if np.any(eigenvalues < 0):
                na_fraction_when_negative.append(na_frac)
                removed_fraction_when_negative.append(1-df.dropna().size/df.size)

            eigenvalues_na_fraction.append(eigenvalues)

        eigenvalues_samples.append(np.array(eigenvalues_na_fraction))

    # (len(na_fractions), B, n) -> (n, B, len(na_fractions))
    eigenvalues_samples = np.array(eigenvalues_samples)
    eigenvalues_samples = np.swapaxes(eigenvalues_samples, 0, 2)
    return eigenvalues_samples, na_fraction_when_negative, removed_fraction_when_negative

# Base example: standard-normal draws mixed by a random lower-triangular
# matrix whose diagonal is set to 1.
Z1 = np.random.randn(m, n)

L1 = np.tril(np.random.randn(n, n))
np.fill_diagonal(L1, 1)
mu1 = np.random.randn(n, 1)

# Z1 is transposed so that X1 has shape (n, m), i.e. one feature per row.
# np.cov and np.corrcoef treat each row as a variable by default.
X1 = mu1 + L1 @ Z1.T

# High correlation example: the below-diagonal entries of L2 are drawn from
# [9, 11) while the diagonal is 1, so the shared components dominate.
Z2 = np.random.randn(m, n)

L2 = 10*np.tril(np.random.uniform(0.9, 1.1, size=(n, n)))
np.fill_diagonal(L2, 1)
mu2 = np.random.randn(n, 1)

# Same layout as X1: shape (n, m), one feature per row.
X2 = mu2 + L2 @ Z2.T

print("Correlation matrix example 1:")
print(np.corrcoef(X1))

print("Correlation matrix example 2:")
print(np.corrcoef(X2))

Correlation matrix example 1:
[[ 1.    0.24 -0.93]
 [ 0.24  1.   -0.38]
 [-0.93 -0.38  1.  ]]
Correlation matrix example 2:
[[1.   1.   0.69]
 [1.   1.   0.76]
 [0.69 0.76 1.  ]]

# Box-plot tick labels: the default missing-value fractions, one decimal each.
tick_labels = [f"{fraction:.1f}" for fraction in np.linspace(0.0, 0.8, 9)]

# Reference eigenvalues from the complete data (these consume no random numbers).
true_eigenvalues_X1 = np.linalg.eigvals(np.cov(X1))
true_eigenvalues_X2 = np.linalg.eigvals(np.cov(X2))

# Baseline runs. X1 is processed before X2 so the global RNG is consumed in
# the same order as before.
eigenvalues_samples_X1, _, _ = bootstrap_eigenvalues(X1)

(
    eigenvalues_samples_X2,
    na_fraction_when_negative_X2,
    removed_fraction_when_negative_X2,
) = bootstrap_eigenvalues(X2)

# Base example: one figure per eigenvalue, one box per missing-value fraction,
# with the complete-data eigenvalue drawn as a dashed reference line.
for feature_idx in range(n):
    samples_per_fraction = eigenvalues_samples_X1[feature_idx]
    reference_eigenvalue = true_eigenvalues_X1[feature_idx]

    plt.figure(figsize=(10, 5))
    plt.boxplot(samples_per_fraction, tick_labels=tick_labels)
    plt.axhline(y=reference_eigenvalue, color='r', linestyle='--', label='True Eigenvalue')
    plt.title(f'Eigenvalues for Feature {feature_idx+1} for Different Fractions of Missing Values')
    plt.xlabel('Fraction of Missing Values')
    plt.ylabel('Eigenvalue')
    plt.legend()
    plt.show()

# High correlation example: one figure per eigenvalue, one box per
# missing-value fraction, with the complete-data eigenvalue as a dashed line.
for feature_idx in range(n):
    samples_per_fraction = eigenvalues_samples_X2[feature_idx]
    reference_eigenvalue = true_eigenvalues_X2[feature_idx]

    plt.figure(figsize=(10, 5))
    plt.boxplot(samples_per_fraction, tick_labels=tick_labels)
    plt.axhline(y=reference_eigenvalue, color='r', linestyle='--', label='True Eigenvalue')
    plt.title(f'Eigenvalues for Feature {feature_idx+1} for Different Fractions of Missing Values')
    plt.xlabel('Fraction of Missing Values')
    plt.ylabel('Eigenvalue')
    plt.legend()
    plt.show()

# Summary statistics of the high-correlation runs that produced at least one
# negative eigenvalue.
negative_eigenvalue_records = {
    "na_fraction_when_negative": na_fraction_when_negative_X2,
    "removed_fraction_when_negative": removed_fraction_when_negative_X2,
}
df = pd.DataFrame(negative_eigenvalue_records)
df.describe()

shrinkage_samples_X1, _, _ = bootstrap_eigenvalues(X1, shrinkage=True)

# One figure per eigenvalue: baseline estimate (left) next to the Ledoit-Wolf
# estimate (right). The loop runs over the features of X1, the data set both
# sample arrays were computed from.
for i in range(X1.shape[0]):
    fig, axs = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
    panels = [
        (axs[0], eigenvalues_samples_X1[i], f'Eigenvalues for Feature {i+1} for Different Fractions of Missing Values'),
        (axs[1], shrinkage_samples_X1[i], f'Shrinkage Eigenvalues for Feature {i+1} for Different Fractions of Missing Values'),
    ]
    for ax, samples, title in panels:
        ax.boxplot(samples, tick_labels=tick_labels)
        ax.axhline(y=true_eigenvalues_X1[i], color='r', linestyle='--', label='True Eigenvalue')
        ax.set_title(title)
        ax.set_xlabel('Fraction of Missing Values')
        ax.set_ylabel('Eigenvalue')
        ax.legend()
    plt.show()

bootstrapped_samples_X1, _, _ = bootstrap_eigenvalues(X1, bootstrap=True)

# One figure per eigenvalue: baseline estimate (left) next to the
# subsample-averaged estimate (right). The loop runs over the features of X1,
# the data set both sample arrays were computed from.
for i in range(X1.shape[0]):
    fig, axs = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
    panels = [
        (axs[0], eigenvalues_samples_X1[i], f'Eigenvalues for Feature {i+1} for Different Fractions of Missing Values'),
        (axs[1], bootstrapped_samples_X1[i], f'Bootstrapped Eigenvalues for Feature {i+1} for Different Fractions of Missing Values'),
    ]
    for ax, samples, title in panels:
        ax.boxplot(samples, tick_labels=tick_labels)
        ax.axhline(y=true_eigenvalues_X1[i], color='r', linestyle='--', label='True Eigenvalue')
        ax.set_title(title)
        ax.set_xlabel('Fraction of Missing Values')
        ax.set_ylabel('Eigenvalue')
        ax.legend()
    plt.show()

eigenvalues_samples_X1_imputed, _, _ = bootstrap_eigenvalues(X1, imputation=True, imputation_method="mean")

# One figure per eigenvalue: baseline estimate (left) next to the estimate
# after mean imputation (right). The loop runs over the features of X1, the
# data set both sample arrays were computed from.
for i in range(X1.shape[0]):
    fig, axs = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
    panels = [
        (axs[0], eigenvalues_samples_X1[i], f'Eigenvalues for Feature {i+1} for Different Fractions of Missing Values'),
        (axs[1], eigenvalues_samples_X1_imputed[i], f'Eigenvalues for Feature {i+1} for Different Fractions of Missing Values with Imputation'),
    ]
    for ax, samples, title in panels:
        ax.boxplot(samples, tick_labels=tick_labels)
        ax.axhline(y=true_eigenvalues_X1[i], color='r', linestyle='--', label='True Eigenvalue')
        ax.set_title(title)
        ax.set_xlabel('Fraction of Missing Values')
        ax.set_ylabel('Eigenvalue')
        ax.legend()
    plt.show()

	na_fraction_when_negative	removed_fraction_when_negative
count	416.000000	416.000000
mean	0.463221	0.762566
std	0.225630	0.235474
min	0.100000	0.261100
25%	0.300000	0.654075
50%	0.500000	0.872800
75%	0.700000	0.970700
max	0.800000	0.993800

Estimating Eigenvalues in Empirical Data

Background

Positive Semi-Definite Matrices

Data Simulation

Introducing Missing Values and Calculating Eigenvalues

Examples

Dealing with Noisy Estimates

Do Nothing (But Set Negative Eigenvalues to $0$)

Ledoit-Wolf Shrinkage

Deleting Columns With Too Many Missing Values

Bootstrap the Data with Missing Values

Imputation

Conclusion