import sys, os, math
sys.path.insert(0, os.path.abspath('..'))

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from src.plotting import apply_style
from src.tiny_models import toy_mlp, tiny_mlp, count_params
from src.data import load_mnist_7x7
from solutions._01_krylov import hvp_double_backward
from solutions._02_randomized import entk_matvec

# Apply the project-wide plot styling from src.plotting (presumably matplotlib
# rcParams -- confirm in src/plotting.py).
apply_style()
# Seed torch's global RNG so the cells below are reproducible run-to-run.
torch.manual_seed(0)
print('environment ready')

def hutchinson_trace(matvec, n, m, probe_type='rademacher', seed=0):
    """Estimate ``tr(A)`` with Hutchinson's estimator, using only matvecs.

    Draws ``m`` random probes ``z`` with ``E[z z^T] = I`` and averages the
    quadratic forms ``z^T A z``; each one is an unbiased estimate of the trace.

    Args:
        matvec: Callable mapping a length-``n`` 1-D tensor ``v`` to ``A @ v``.
            Probes are created on the CPU in torch's default dtype, so the
            callable is assumed to accept such tensors.
        n: Dimension of the (implicit) square matrix ``A``.
        m: Number of probe vectors, i.e. number of ``matvec`` calls (>= 1).
        probe_type: ``'rademacher'`` (i.i.d. +/-1 entries) or ``'gaussian'``
            (i.i.d. standard-normal entries).
        seed: Seed for a private ``torch.Generator``. The global RNG state is
            left untouched, so equal seeds give identical estimates.

    Returns:
        ``(estimate, samples)``: the trace estimate as a Python float, and a
        float64 tensor of shape ``(m,)`` holding the per-probe values
        ``z^T A z`` (their spread gives a standard error for the estimate).

    Raises:
        ValueError: If ``m < 1`` or ``probe_type`` is not recognised.
    """
    if m < 1:
        raise ValueError(f'm must be >= 1, got {m}')
    if probe_type not in ('rademacher', 'gaussian'):
        raise ValueError(
            f"probe_type must be 'rademacher' or 'gaussian', got {probe_type!r}"
        )

    rng = torch.Generator().manual_seed(seed)
    dtype = torch.get_default_dtype()

    samples = []
    for _ in range(m):
        if probe_type == 'rademacher':
            # {0, 1} -> {-1, +1}
            z = torch.randint(0, 2, (n,), generator=rng).to(dtype) * 2 - 1
        else:
            z = torch.randn(n, generator=rng, dtype=dtype)
        # detach: a matvec built on autograd (e.g. an HVP) may return a tensor
        # that still carries a graph; only its values are needed here.
        Az = matvec(z).detach().reshape(-1).to(dtype)
        samples.append(torch.dot(z, Az).to(torch.float64))

    samples = torch.stack(samples)
    return samples.mean().item(), samples

# Sanity check: compare the estimator against the exact trace of a random
# 30x30 symmetric matrix.
torch.manual_seed(0)
n = 30
A = torch.randn(n, n)
A = A + A.T  # symmetrize
true_trace = A.diagonal().sum().item()

est, _ = hutchinson_trace(
    lambda v: A @ v,
    n=n,
    m=1000,
    probe_type='rademacher',
    seed=0,
)
print(f'true trace  = {true_trace:.3f}')
print(f'Hutchinson  = {est:.3f}')

# Shift the diagonal so the Rademacher-vs-Gaussian variance gap is visible.
A_diag = A + 3 * torch.eye(n)
A_fro_sq = A_diag.pow(2).sum().item()

ms = [10, 30, 100, 300, 1000, 3000]
n_trials = 50
fig, ax = plt.subplots()
for ptype, color in (('rademacher', 'C0'), ('gaussian', 'C1')):
    emp_var = []
    for m in ms:
        # One estimate per seed; the spread across seeds is the empirical variance.
        ts = []
        for s in range(n_trials):
            trace_hat, _ = hutchinson_trace(
                lambda v: A_diag @ v, n=n, m=m, probe_type=ptype, seed=s
            )
            ts.append(trace_hat)
        emp_var.append(torch.tensor(ts).var().item())
    ax.loglog(ms, emp_var, 'o-', color=color, label=ptype)

# Reference curve for Gaussian probes: 2 * ||A||_F^2 / m.
gaussian_ref = [2 * A_fro_sq / m for m in ms]
ax.loglog(ms, gaussian_ref, 'k--', alpha=0.6, label=r'$2\|A\|_F^2/m$ (Gaussian)')
ax.set_xlabel('m (probes)')
ax.set_ylabel('Var(estimate)')
ax.set_title('Hutchinson variance: Rademacher vs Gaussian')
ax.legend()
plt.show()

# Set up tiny_mlp + small training data.
torch.manual_seed(0)
model = tiny_mlp(seed=0)
X_train, y_train = load_mnist_7x7(n=300, seed=0)
# P is used below as the dimension of the Hessian operator
# (presumably the number of trainable parameters -- see src.tiny_models.count_params).
P = count_params(model)
print(f'tiny_mlp: P = {P} params')

def H_matvec(v):
    """Return the Hessian-vector product for ``v`` via ``hvp_double_backward``.

    Reads the module-level ``model``, ``X_train`` and ``y_train`` at call
    time, so the result reflects whatever state ``model`` is in when called.
    """
    return hvp_double_backward(model, X_train, y_train, v)

# Hutchinson estimate vs explicit trace.
est, _ = hutchinson_trace(H_matvec, n=P, m=200, probe_type='rademacher', seed=0)
print(f'Hutchinson trace (m=200): {est:.4f}')

# Ground truth via materialized Hessian: one HVP per standard basis vector.
# The P x P identity is built once; iterating over it yields its rows, so no
# fresh identity matrix is allocated per HVP.
basis = torch.eye(P)
H_full = torch.stack([H_matvec(e_i) for e_i in basis])
# Symmetrizing does not change the diagonal; it only guards against tiny
# asymmetries in the materialized matrix.
true_trace = torch.diagonal((H_full + H_full.T) / 2).sum().item()
print(f'true trace:               {true_trace:.4f}')

from solutions._01_krylov import lanczos

def _slq_quadrature_rule(matvec, z, steps):
    """Run Lanczos from ``z`` and return Gauss-quadrature nodes and weights.

    Returns ``(nodes, weights)`` as float64 tensors: the eigenvalues of the
    Lanczos tridiagonal matrix and the squared first components of its
    eigenvectors (the weights sum to 1).
    """
    steps = max(1, min(int(steps), z.numel()))
    eps = torch.finfo(z.dtype).eps
    basis = [z / z.norm()]
    alphas, betas = [], []
    for j in range(steps):
        q = basis[j]
        w = matvec(q).detach().reshape(-1).to(z.dtype)
        w_scale = w.norm()
        alphas.append(torch.dot(w, q))
        if j == steps - 1:
            break
        # Full reorthogonalization (two Gram-Schmidt passes) keeps the basis
        # orthonormal in finite precision and avoids spurious "ghost" nodes.
        Q = torch.stack(basis, dim=1)
        for _ in range(2):
            w = w - Q @ (Q.T @ w)
        beta = w.norm()
        if beta <= 10 * eps * w_scale:
            # The Krylov space is (numerically) invariant: stop early.
            break
        betas.append(beta)
        basis.append(w / beta)

    T = torch.diag(torch.stack(alphas).to(torch.float64))
    if betas:
        off = torch.stack(betas).to(torch.float64)
        T = T + torch.diag(off, 1) + torch.diag(off, -1)
    nodes, vecs = torch.linalg.eigh(T)
    return nodes, vecs[0] ** 2


def slq_density(matvec, n, m_probes, s_lanczos, grid, sigma, seed=0):
    """Estimate a Gaussian-smoothed spectral density by stochastic Lanczos quadrature.

    For each Rademacher probe ``z`` an ``s_lanczos``-step Lanczos run yields
    quadrature nodes ``theta_k`` and weights ``w_k``; the density is the probe
    average of ``n * sum_k w_k * N(grid; theta_k, sigma^2)``. With this
    normalisation the curve integrates to approximately ``n`` (one unit of
    mass per eigenvalue), i.e. it is comparable to a sum of Gaussian bumps
    centred on the eigenvalues.

    Lanczos is implemented locally (with full reorthogonalization) so the
    function does not depend on the signature of any external helper.

    Args:
        matvec: Callable mapping a length-``n`` 1-D tensor ``v`` to ``A @ v``
            for a symmetric ``A``. Probes are CPU tensors in torch's default
            dtype, so the callable is assumed to accept those.
        n: Dimension of the operator.
        m_probes: Number of random probes (>= 1).
        s_lanczos: Lanczos steps per probe (>= 1; capped at ``n``).
        grid: 1-D tensor of evaluation points.
        sigma: Standard deviation of the Gaussian smoothing kernel (> 0).
        seed: Seed for a private ``torch.Generator``; the global RNG state is
            not touched.

    Returns:
        Tensor with the shape and dtype of ``grid`` holding the density values.

    Raises:
        ValueError: If ``n``, ``m_probes`` or ``s_lanczos`` is < 1, or
            ``sigma`` is not positive.
    """
    if n < 1 or m_probes < 1 or s_lanczos < 1:
        raise ValueError('n, m_probes and s_lanczos must all be >= 1')
    if sigma <= 0:
        raise ValueError(f'sigma must be positive, got {sigma}')

    rng = torch.Generator().manual_seed(seed)
    dtype = torch.get_default_dtype()
    # Accumulate in float64 so summing many narrow bumps does not lose precision.
    grid64 = grid.detach().to(torch.float64)
    norm_const = sigma * math.sqrt(2 * math.pi)

    total = torch.zeros_like(grid64)
    for _ in range(m_probes):
        # Rademacher probe: ||z||^2 == n exactly, which is where the factor n
        # in the final normalisation comes from.
        z = torch.randint(0, 2, (n,), generator=rng).to(dtype) * 2 - 1
        nodes, weights = _slq_quadrature_rule(matvec, z, s_lanczos)
        diffs = grid64[:, None] - nodes[None, :]
        bumps = torch.exp(-diffs ** 2 / (2 * sigma ** 2)) / norm_const
        total += bumps @ weights

    return (n * total / m_probes).to(grid.dtype)

# Sanity check: a diagonal matrix with known spectrum.
torch.manual_seed(0)
n = 40
A_diag = torch.diag(torch.linspace(-2, 4, n))
grid = torch.linspace(-3, 5, 400)
density = slq_density(
    lambda v: A_diag @ v,
    n=n,
    m_probes=30,
    s_lanczos=30,
    grid=grid,
    sigma=0.15,
    seed=0,
)

# Reference curve: one Gaussian bump (same width as the SLQ smoothing) per
# true eigenvalue, i.e. a smoothed version of the exact Dirac comb.
true_eigs = torch.linalg.eigvalsh(A_diag)
kde_two_var = 2*0.15**2
kde_norm = 0.15 * math.sqrt(2*math.pi)
true_kde = torch.zeros_like(grid)
for lam in true_eigs:
    true_kde += torch.exp(-((grid - lam)**2)/kde_two_var) / kde_norm

grid_np = grid.numpy()
fig, ax = plt.subplots()
ax.plot(grid_np, density.numpy(), label='SLQ estimate')
ax.plot(grid_np, true_kde.numpy(), '--', alpha=0.7, label='true (KDE on eigs)')
ax.set_xlabel(r'$\lambda$')
ax.set_ylabel(r'$\rho(\lambda)$')
ax.set_title('SLQ vs true spectral density (40 eigvals, linspace(-2, 4))')
ax.legend()
plt.show()

# === DOS at init ===
def H_init_matvec(v):
    """Hessian-vector product via ``hvp_double_backward`` on the module-level model.

    The body reads ``model`` at call time; it describes the initial Hessian
    only because it is used before the training loop below modifies ``model``.
    """
    return hvp_double_backward(model, X_train, y_train, v)

# Evaluation grid for the density; reused for the post-training estimate.
grid = torch.linspace(-2.0, 5.0, 500)
dos_init = slq_density(H_init_matvec, n=P, m_probes=10, s_lanczos=40,
                       grid=grid, sigma=0.05, seed=0)
# The integrated density is printed next to P for comparison.
print(f'integrated DOS at init: {torch.trapezoid(dos_init, grid).item():.1f}  (P = {P})')

# Train a bit: 200 SGD steps on random mini-batches of 64 training examples.
opt = torch.optim.SGD(model.parameters(), lr=0.1)
for step in range(200):
    # Sampled with replacement from the global torch RNG.
    idx = torch.randint(0, len(X_train), (64,))
    opt.zero_grad()
    F.cross_entropy(model(X_train[idx]), y_train[idx]).backward()
    opt.step()

# Evaluation only: no autograd graph is needed for the reported loss.
with torch.no_grad():
    final_loss = F.cross_entropy(model(X_train), y_train).item()
print(f'trained, final loss = {final_loss:.3f}')

def H_trained_matvec(v):
    """Hessian-vector product via ``hvp_double_backward`` on the module-level model.

    Reads ``model`` at call time, so when called after the SGD loop it uses
    the updated parameters.
    """
    return hvp_double_backward(model, X_train, y_train, v)

# Same probes, Lanczos depth, grid and smoothing as the at-init estimate, so
# the two curves are directly comparable.
dos_trained = slq_density(H_trained_matvec, n=P, m_probes=10, s_lanczos=40,
                          grid=grid, sigma=0.05, seed=0)

# Overlay both densities on a log y-axis; values are floored at 1e-3 so that
# (near-)zero densities remain plottable.
fig, ax = plt.subplots(figsize=(8, 4.5))
grid_np = grid.numpy()
for curve, tag in ((dos_init, 'at init'), (dos_trained, 'after training')):
    ax.semilogy(grid_np, curve.clamp(min=1e-3).numpy(), label=tag)
ax.set_xlabel(r'$\lambda$')
ax.set_ylabel(r'$\rho(\lambda)$ (log)')
ax.set_title('Money plot: Hessian DOS — bulk near zero + outliers after training')
ax.legend()
plt.show()

# Weyl experiment: perturb a symmetric A by alpha * E and track the largest
# shift among the sorted eigenvalues.
torch.manual_seed(0)
A = torch.randn(50, 50)
A = A + A.T
E = torch.randn(50, 50)
E = E + E.T
E_norm = torch.linalg.matrix_norm(E, ord=2).item()
A_eigs = torch.linalg.eigvalsh(A)

alphas = torch.linspace(0, 1, 21)
max_dlams = [
    (torch.linalg.eigvalsh(A + alpha * E) - A_eigs).abs().max().item()
    for alpha in alphas
]

# x-axis: spectral norm of the applied perturbation, alpha * ||E||_2.
xs = (alphas * E_norm).tolist()
x_hi = max(xs)
plt.figure()
plt.plot(xs, max_dlams, 'o-', label=r'observed $\max_k |\Delta\lambda_k|$')
plt.plot([0, x_hi], [0, x_hi], 'k--', alpha=0.6, label='Weyl bound $y=x$')
plt.xlabel(r'$\alpha \|E\|_2$')
plt.ylabel(r'$\max_k |\Delta \lambda_k|$')
plt.title("Weyl's inequality: eigenvalues are 1-Lipschitz in the matrix")
plt.legend()
plt.show()

def make_A_with_gap(gap, n=50, seed=0):
    """Build a random symmetric ``n x n`` matrix with a planted top-3 spectrum.

    The three largest eigenvalues are ``10``, ``10 - gap`` and ``10 - 2*gap``;
    the remaining ``n - 3`` are drawn uniformly from ``[0, 1)``. The
    eigenvectors are the Q factor of a seeded Gaussian matrix.

    Args:
        gap: Spacing between consecutive planted eigenvalues.
        n: Matrix dimension.
        seed: Seed for the private generator that drives both random draws.

    Returns:
        The ``n x n`` tensor ``Q @ diag(eigs) @ Q.T``.
    """
    rng = torch.Generator().manual_seed(seed)
    # Draw order is part of the contract for a given seed: the Gaussian
    # matrix first, then the bulk eigenvalues.
    basis, _ = torch.linalg.qr(torch.randn(n, n, generator=rng))
    planted = torch.tensor([10.0, 10.0 - gap, 10.0 - 2*gap])
    bulk = torch.rand(n - 3, generator=rng)
    spectrum = torch.cat([planted, bulk])
    return basis @ torch.diag(spectrum) @ basis.T

from solutions._03_estimation import principal_angle

# Fixed symmetric perturbation, rescaled so that ||E||_2 is 0.05.
torch.manual_seed(1)
E = torch.randn(50, 50)
E = E + E.T
E = 0.05 * E / torch.linalg.matrix_norm(E, ord=2)
E_norm = torch.linalg.matrix_norm(E, ord=2).item()

gaps = [0.1, 0.3, 1.0, 3.0]
angles, ceilings = [], []
for gap in gaps:
    A = make_A_with_gap(gap)
    # eigh returns eigenvalues in ascending order, so the last three columns
    # span the top-3 eigenspace.
    top3_A = torch.linalg.eigh(A).eigenvectors[:, -3:]
    top3_pert = torch.linalg.eigh(A + E).eigenvectors[:, -3:]
    angles.append(math.sin(principal_angle(top3_A, top3_pert)))
    ceilings.append(E_norm / gap)

fig, ax = plt.subplots()
ax.loglog(gaps, angles, 'o-', label=r'observed $\sin\Theta_3$')
ax.loglog(gaps, ceilings, 'k--', alpha=0.6, label=r'Davis-Kahan: $\|E\|_2/\mathrm{gap}$')
ax.set_xlabel('spectral gap $g$')
ax.set_ylabel(r'$\sin \Theta_3$')
ax.set_title(r'Davis-Kahan: small gap $\Rightarrow$ eigenvector instability')
ax.legend()
plt.show()

from solutions._02_randomized import randomized_eigh

# Bootstrap the eNTK spectrum: resample the inputs with replacement and
# recompute the top eigenpairs for each resample.
torch.manual_seed(0)
model_n = tiny_mlp(seed=0)
X_full, _ = load_mnist_7x7(n=200, seed=0)
N = X_full.shape[0]

n_boot = 20
boot_eigvals, boot_eigvecs = [], []
for b in range(n_boot):
    # Per-replicate generator: replicate b is reproducible on its own.
    resample_rng = torch.Generator().manual_seed(b)
    X_boot = X_full[torch.randint(0, N, (N,), generator=resample_rng)]

    # X is bound as a default argument so the operator keeps this replicate's
    # data even though X_boot is rebound on the next iteration.
    def Kv(v, X=X_boot):
        return entk_matvec(model_n, X, v)

    eigvals, eigvecs = randomized_eigh(
        Kv, n=N, k=5, oversample=10, n_power=1, seed=b
    )
    boot_eigvals.append(eigvals)
    # Only the first three eigenvector columns are kept for the angle analysis.
    boot_eigvecs.append(eigvecs[:, :3])

eigval_arr = torch.stack(boot_eigvals)  # one row per bootstrap replicate
print(f'top-5 eigenvalue means across {n_boot} bootstraps:')
print(eigval_arr.mean(dim=0).tolist())
print('top-5 eigenvalue stds:')
print(eigval_arr.std(dim=0).tolist())

# Plot one: boxplot of bootstrap eigenvalue distributions.
fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
axes[0].boxplot([eigval_arr[:, k].tolist() for k in range(5)])
axes[0].set_xticklabels([f'λ_{k+1}' for k in range(5)])
axes[0].set_ylabel('eigenvalue'); axes[0].set_title('Eigenvalues: stable across bootstraps')

# Plot two: pairwise principal angles between bootstrap eigenspaces.
# Still best-effort (a failing pair is skipped), but failures are counted and
# reported rather than silently dropped, so a systematically broken
# principal_angle cannot masquerade as an empty or sparse histogram.
angles = []
n_failed = 0
first_error = None
for i in range(n_boot):
    for j in range(i+1, n_boot):
        try:
            a = principal_angle(boot_eigvecs[i], boot_eigvecs[j])
            angles.append(math.sin(a))
        except Exception as exc:
            n_failed += 1
            if first_error is None:
                first_error = exc
if n_failed:
    print(f'warning: principal_angle failed for {n_failed} of '
          f'{n_failed + len(angles)} bootstrap pairs; first error: {first_error!r}')

axes[1].hist(angles, bins=20)
axes[1].set_xlabel(r'$\sin \Theta_3$ (pairwise)'); axes[1].set_ylabel('count')
axes[1].set_title('Eigenvectors: rotate freely (small spectral gaps)')
plt.tight_layout(); plt.show()

| Tool | Returns | Cost |
| --- | --- | --- |
| Hutchinson | scalar trace | m matvecs |
| SLQ | smoothed DOS curve | m·s matvecs |
| Bootstrap + rSVD | CI on top-k eigenpairs | (k+p) × bootstraps matvecs |

Notebook 3 — Trace, density of states, and eigenvalue perturbation

1. Hutchinson's trick

Variance

Exercise 1.1: Implement Hutchinson (🔴🔴⚪⚪⚪, 8 min)

Exercise 1.2: Variance plot (🔴🔴⚪⚪⚪, 8 min)

2. The Hessian's trace

3. From point estimates to the spectral density

Exercise 3.1: Implement SLQ (🔴🔴🔴🔴⚪, 25 min)

4. Density of states in practice — the bulk + outliers picture

5. Eigenvalue perturbation: how much do you trust this?

Weyl's inequality

Davis-Kahan $\sin \Theta$

Exercise 5.1: Weyl in action (🔴🔴⚪⚪⚪, 8 min)

Exercise 5.2: Davis-Kahan and the gap (🔴🔴🔴⚪⚪, 12 min)

Exercise 5.3: Bootstrap on the eNTK — the ML punchline (🔴🔴🔴⚪⚪, 10 min)

Wrap-up