import sys, os
sys.path.insert(0, os.path.abspath('..'))

import torch
import matplotlib.pyplot as plt
import numpy as np

from src import tests
from src.plotting import apply_style, semilog_convergence, eigenvalue_compare
from src.tiny_models import toy_mlp, tiny_mlp, count_params

# Notebook-wide setup: call the project's plotting-style hook (presumably
# configures matplotlib defaults — see src.plotting) and seed torch's global RNG.
apply_style()
torch.manual_seed(0)
print('environment ready')

# Exercise 0.1 (🔴⚪⚪⚪⚪, 3 min)
# Compute both norms of a 20x20 random matrix, verify the inequality
# ||A||_2 <= ||A||_F <= sqrt(n) * ||A||_2 (here n = 20).
torch.manual_seed(0)  # re-seed so this cell draws the same A on every run
A = torch.randn(20, 20)

# Placeholders: the first assert below fails until both are filled in.
op_norm = None  # YOUR CODE HERE: compute ||A||_2
fro_norm = None  # YOUR CODE HERE: compute ||A||_F

assert op_norm is not None and fro_norm is not None, 'fill in op_norm and fro_norm'
# 20 ** 0.5 is sqrt(n) for the 20x20 matrix above.
assert op_norm <= fro_norm <= (20 ** 0.5) * op_norm
print(f'||A||_2 = {op_norm:.3f}, ||A||_F = {fro_norm:.3f}')

# Exercise 0.2 (🔴🔴⚪⚪⚪, 8 min)
# Build a symmetric PSD A with engineered eigenvalues {1, 1, ..., 1, 1/kappa},
# solve Ax = b in fp32, plot relative residual vs kappa on log-log axes.

def make_conditioned_spd(n: int, kappa: float, seed: int = 0):
    """Build an n x n matrix ``Q diag(1, ..., 1, 1/kappa) Q^T``.

    ``Q`` is the orthogonal factor of the QR decomposition of a Gaussian
    matrix drawn from a private generator seeded with ``seed``, so the
    result does not touch (or depend on) the global torch RNG.

    Args:
        n: matrix dimension.
        kappa: the last engineered eigenvalue is set to ``1 / kappa``; all
            others are 1.
        seed: seed for the private random generator.

    Returns:
        The n x n tensor ``Q @ diag(eigs) @ Q.T``.
    """
    rng = torch.Generator().manual_seed(seed)
    gaussian = torch.randn(n, n, generator=rng)
    basis, _ = torch.linalg.qr(gaussian)

    # Engineered spectrum: all ones except the final entry.
    spectrum = torch.ones(n)
    spectrum[-1] = 1.0 / kappa

    return basis @ torch.diag(spectrum) @ basis.T

n = 30
# Nine condition numbers, log-spaced from 10^0 to 10^8.
kappas = torch.logspace(0, 8, 9)
residuals = []  # one relative residual ||A x_hat - b|| / ||b|| per kappa
for kappa in kappas:
    # make_conditioned_spd is called with its default seed, so only the
    # engineered eigenvalue changes between iterations.
    A = make_conditioned_spd(n, kappa.item()).float()
    # Manufacture b from a known solution; x_true comes from the global RNG,
    # which this cell does not re-seed.
    x_true = torch.randn(n)
    b = A @ x_true
    x_hat = None  # YOUR CODE HERE: solve A x_hat = b in fp32
    assert x_hat is not None, 'solve A x_hat = b using torch.linalg.solve'
    residuals.append(((A @ x_hat - b).norm() / b.norm()).item())

plt.figure()
plt.loglog(kappas.numpy(), residuals, 'o-')
plt.xlabel(r'$\kappa(A)$'); plt.ylabel('relative residual')
plt.title('fp32 solve precision vs condition number')
plt.show()

# Exercise 0.3 (🔴⚪⚪⚪⚪, 3 min)
# Find epsilon_machine empirically for fp32 and fp64.

def find_eps(dtype):
    """Empirically find machine epsilon for ``dtype`` by repeated halving.

    Starting from 1, the candidate is halved for as long as adding half of
    it to 1 (in ``dtype`` arithmetic) still gives something greater than 1.

    Args:
        dtype: torch dtype in which the arithmetic is carried out.

    Returns:
        The final candidate as a Python number.
    """
    unit = torch.tensor(1.0, dtype=dtype)
    candidate = torch.tensor(1.0, dtype=dtype)
    while True:
        halved = candidate / 2
        # Stop as soon as the halved step is absorbed by rounding.
        if not (unit + halved > unit):
            break
        candidate = halved
    return candidate.item()

# Placeholders for the student's answers; the assert below fails until both
# are set (both are later formatted with ':.3e', so they must be numbers).
eps32 = None  # YOUR CODE HERE
eps64 = None  # YOUR CODE HERE
assert eps32 is not None and eps64 is not None
print(f'eps(fp32) ≈ {eps32:.3e};  eps(fp64) ≈ {eps64:.3e}')

# Exercise 0.4 (🔴⚪⚪⚪⚪, 2 min)
# Verify: R(v) is bounded by [lambda_min, lambda_max] for random unit v.

torch.manual_seed(0)
# A + A.T makes the matrix symmetric, so eigvalsh applies.
A = torch.randn(15, 15); A = A + A.T
eigs = torch.linalg.eigvalsh(A)
lam_min, lam_max = eigs.min().item(), eigs.max().item()

for _ in range(20):
    # v is normalised to unit length, so v @ A @ v is the Rayleigh quotient.
    v = torch.randn(15); v = v / v.norm()
    R = (v @ A @ v).item()
    # The 1e-6 slack on each side absorbs floating-point rounding.
    assert lam_min - 1e-6 <= R <= lam_max + 1e-6, f'R={R} outside [{lam_min},{lam_max}]'
print(f'all 20 Rayleigh quotients in [{lam_min:.3f}, {lam_max:.3f}] ✓')

# Setup: globals shared by the HVP cells below (model, X, y, P, v).
torch.manual_seed(0)
model = toy_mlp(seed=1)
X = torch.randn(8, 20)           # 8 inputs with 20 features each
y = torch.randint(0, 4, (8,))    # 8 integer labels drawn from {0, 1, 2, 3}
P = count_params(model)
v = torch.randn(P)               # random flat direction, one entry per parameter
print(f'model: {P} params')

import torch.nn.functional as F
from torch.func import functional_call

def flat_params(model):
    """Concatenate all of ``model``'s parameters into one flat 1-D tensor.

    Each parameter is detached from the autograd graph and flattened, in the
    order produced by ``model.parameters()``.
    """
    pieces = []
    for param in model.parameters():
        pieces.append(param.detach().reshape(-1))
    return torch.cat(pieces)

def unflatten_into_dict(flat, ref_named):
    """Split a flat vector into named tensors shaped like ``ref_named``'s values.

    Consecutive slices of ``flat`` are assigned to the entries of
    ``ref_named`` in iteration order, each viewed with the shape of the
    corresponding reference tensor.

    Args:
        flat: tensor holding all entries back to back.
        ref_named: mapping from name to a reference tensor that supplies the
            shape (and element count) of each slice.

    Returns:
        A dict with the same keys, in the same order, as ``ref_named``.

    Raises:
        ValueError: if ``flat`` does not contain exactly as many elements as
            the reference tensors combined. Without this check, surplus
            entries would be dropped silently.
    """
    expected = sum(p.numel() for p in ref_named.values())
    if flat.numel() != expected:
        raise ValueError(
            f'flat has {flat.numel()} elements but the reference tensors '
            f'need {expected}'
        )

    out, offset = {}, 0
    for name, p in ref_named.items():
        size = p.numel()
        out[name] = flat[offset:offset + size].view_as(p)
        offset += size
    return out

def hvp_double_backward(model, X, y, v):
    """Exercise stub: Hessian-vector product via double backward.

    Callers in this file pass ``v`` as a flat tensor with one entry per model
    parameter and use the result as a flat tensor (compared elementwise with
    the other HVP implementations, stacked into a dense Hessian).

    NOTE(review): the loss whose Hessian is meant is not visible here —
    presumably cross-entropy of ``model(X)`` against ``y``; confirm against
    ``tests.test_hvp``.

    Raises:
        NotImplementedError: always, until the body is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError

# Project-provided check of the implementation above.
tests.test_hvp(hvp_double_backward)

from torch.func import grad, jvp

def hvp_jvp_of_grad(model, X, y, v):
    """Exercise stub: Hessian-vector product as a JVP of the gradient.

    Same call signature as ``hvp_double_backward``; the agreement check later
    in this file expects the two results to match to ``atol=1e-5``.
    Presumably meant to be built from the ``grad`` / ``jvp`` imports just
    above — confirm against the exercise text.

    Raises:
        NotImplementedError: always, until the body is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError

# Project-provided check of the implementation above.
tests.test_hvp(hvp_jvp_of_grad)

def hvp_finite_difference(model, X, y, v, eps=1e-3):
    """Exercise stub: Hessian-vector product via finite differences.

    Same call signature as the other HVP functions plus ``eps`` — presumably
    the finite-difference step size (confirm against the exercise text). The
    agreement check in this file compares the result to the double-backward
    HVP with the looser tolerance ``atol=5e-3``.

    Raises:
        NotImplementedError: always, until the body is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError

# Verify all three agree.
# A fresh random direction, independent of the global `v` used for timing.
v_test = torch.randn(P)
h_dbl = hvp_double_backward(model, X, y, v_test)
h_jvp = hvp_jvp_of_grad(model, X, y, v_test)
h_fd  = hvp_finite_difference(model, X, y, v_test)

print(f'||double - jvp||_inf = {(h_dbl - h_jvp).abs().max():.2e}')
print(f'||double - fd||_inf  = {(h_dbl - h_fd).abs().max():.2e}  (FD is noisier)')
# The finite-difference comparison gets a looser tolerance than the
# double-backward vs JVP comparison.
assert torch.allclose(h_dbl, h_jvp, atol=1e-5)
assert torch.allclose(h_dbl, h_fd,  atol=5e-3)
print('all three agree ✓')

import time

def time_hvp(fn, n_calls=20):
    """Return the mean wall-clock seconds per call of ``fn(model, X, y, v)``.

    The arguments are the notebook-level globals ``model``, ``X``, ``y`` and
    ``v``. One untimed warm-up call is made first.

    Args:
        fn: HVP implementation to time.
        n_calls: number of timed calls; must be non-zero because the total
            is divided by it.

    Returns:
        Elapsed ``time.perf_counter()`` seconds divided by ``n_calls``.
    """
    args = (model, X, y, v)
    fn(*args)  # warmup
    start = time.perf_counter()
    for _ in range(n_calls):
        fn(*args)
    elapsed = time.perf_counter() - start
    return elapsed / n_calls

# Mean seconds per call for each implementation; converted to ms when printed.
t_dbl = time_hvp(hvp_double_backward)
t_jvp = time_hvp(hvp_jvp_of_grad)
t_fd  = time_hvp(hvp_finite_difference)
print(f'double backward: {t_dbl*1e3:.2f} ms')
print(f'jvp of grad:     {t_jvp*1e3:.2f} ms')
print(f'finite diff:     {t_fd*1e3:.2f} ms (≈ 2× the others)')

def power_iteration(matvec, dim, num_iters=200, tol=1e-10, seed=0):
    """Exercise stub: power iteration driven by a matrix-vector callable.

    Callers in this file unpack three return values: an eigenvalue estimate
    (used as a number), a vector, and a per-iteration history that is plotted
    as |lambda^(k) - lambda^(k-1)|.

    Args:
        matvec: callable mapping a length-``dim`` vector to the product with it.
        dim: length of the vectors ``matvec`` accepts.
        num_iters: presumably the iteration cap — confirm against the exercise text.
        tol: presumably an early-stopping threshold — confirm.
        seed: presumably seeds the random start vector — confirm.

    Raises:
        NotImplementedError: always, until the body is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError

# The project test receives only the first two return values (history dropped).
tests.test_power_iteration(lambda mv, dim, num_iters, seed:
                            power_iteration(mv, dim, num_iters, seed=seed)[:2])

def make_diag_spd(eigs):
    """Return a float32 diagonal matrix with ``eigs`` on its diagonal.

    The entries are used as given; positivity is not checked.
    """
    diagonal = torch.tensor(eigs, dtype=torch.float32)
    return torch.diag(diagonal)

# Three 50-entry spectra that differ only in the second-largest eigenvalue,
# i.e. in the ratio λ₂/λ₁ named in each label.
specs = {
    'wide gap (λ₂/λ₁ = 0.1)':   [10.0] + [1.0]*49,
    'narrow gap (λ₂/λ₁ = 0.95)': [10.0, 9.5] + [1.0]*48,
    'tied (λ₂/λ₁ = 1.0)':        [10.0, 10.0] + [1.0]*48,
}

fig, ax = plt.subplots()
for name, eigs in specs.items():
    A = make_diag_spd(eigs)
    # A=A binds the current matrix at definition time (avoids late binding).
    matvec = lambda v, A=A: A @ v
    _, _, hist = power_iteration(matvec, dim=50, num_iters=120, seed=0)
    # Avoid log(0) on semilogy when exact convergence hits.
    hist_safe = [max(h, 1e-30) for h in hist]
    ax.semilogy(hist_safe, label=name)
ax.set_xlabel('iteration'); ax.set_ylabel(r'$|\lambda^{(k)} - \lambda^{(k-1)}|$')
ax.legend(); ax.set_title('Power iteration: convergence vs spectral gap')
plt.show()

def hessian_matvec_factory(model, X, y):
    """Return a one-argument callable ``v -> hvp_double_backward(model, X, y, v)``.

    ``hvp_double_backward`` is looked up when the returned callable is
    invoked, not when the factory is called.
    """
    return lambda v: hvp_double_backward(model, X, y, v)

# Estimate the dominant Hessian eigenvalue of the toy model by power iteration.
matvec_H = hessian_matvec_factory(model, X, y)
top_eig, top_vec, _ = power_iteration(matvec_H, dim=P, num_iters=200, seed=0)
print(f'power iteration top |λ| = {top_eig:.4f}')

# Ground truth: materialize the Hessian using HVP on each basis vector.
# The identity is built once and iterated row by row, rather than allocating
# a fresh P x P identity for every basis vector.
H_full = torch.stack([matvec_H(e_i) for e_i in torch.eye(P)])
# Average with the transpose so eigvalsh receives an exactly symmetric matrix.
true_eigs = torch.linalg.eigvalsh((H_full + H_full.T) / 2)
print(f'true top |λ| = {true_eigs.abs().max():.4f}')
# Compare magnitudes: the dominant eigenvalue may be negative.
assert abs(abs(top_eig) - true_eigs.abs().max().item()) < 1e-3
print('✓ matches')

def deflated_matvec(v):
    """Apply ``matvec`` to ``v``, then subtract ``lam * (u @ v) * u`` per found pair.

    NOTE(review): ``found_vals`` and ``found_vecs`` are free names that are
    not defined anywhere in the visible file, and ``matvec`` resolves to
    whatever module-level ``matvec`` is bound when this runs. This looks like
    a template meant to be nested inside ``power_iteration_deflated`` —
    confirm before calling it directly.
    """
    out = matvec(v)
    for lam, u in zip(found_vals, found_vecs):
        # Remove the component along an already-found eigenpair (lam, u).
        out = out - lam * (u @ v) * u
    return out

def power_iteration_deflated(matvec, dim, k, num_iters_per=300, seed=0):
    """Exercise stub: top-``k`` eigenpairs via power iteration with deflation.

    Callers in this file unpack two return values, ``(eigvals, eigvecs)``,
    and iterate over ``eigvals`` as signed numbers (formatted with ``+.3f``
    and passed to ``abs``).

    Args:
        matvec: callable mapping a length-``dim`` vector to the product with it.
        dim: length of the vectors ``matvec`` accepts.
        k: number of eigenpairs requested.
        num_iters_per: presumably the iteration budget per eigenpair — confirm.
        seed: presumably seeds the random start vectors — confirm.

    Raises:
        NotImplementedError: always, until the body is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError

eigvals, eigvecs = power_iteration_deflated(matvec_H, dim=P, k=3, num_iters_per=400, seed=0)
# Compare by magnitude — power iteration finds largest-|λ|, and the Hessian
# may be indefinite at this point in training.
top3_by_mag = true_eigs.abs().sort(descending=True).values[:3]
print(f'top-3 by power+deflation (signed):    {[f"{x:+.3f}" for x in eigvals]}')
print(f'top-3 by torch.linalg (|λ|, sorted):  {top3_by_mag.tolist()}')
# Sort the estimated magnitudes too, so the comparison does not depend on the
# order in which deflation found them.
assert torch.allclose(
    torch.tensor([abs(e) for e in eigvals]).sort(descending=True).values,
    top3_by_mag, atol=1e-2,
)
print('✓ magnitudes match')

# Repeat with k = 10 and plot the per-eigenvalue relative error.
k_max = 10
eigvals_k, _ = power_iteration_deflated(matvec_H, dim=P, k=k_max, num_iters_per=300, seed=0)
true_topk = true_eigs.abs().sort(descending=True).values[:k_max].tolist()

# Pairs the i-th estimate with the i-th largest true magnitude; assumes every
# true magnitude is non-zero (it is the divisor).
rel_err = [abs(abs(e) - t) / t for e, t in zip(eigvals_k, true_topk)]
plt.figure()
plt.semilogy(range(1, k_max+1), rel_err, 'o-')
plt.xlabel('eigenvalue index'); plt.ylabel('relative error')
plt.title('Naive deflation degrades past ~5 eigenvalues')
plt.show()

def lanczos_no_reorth(matvec, dim, k, seed=0):
    """Exercise stub: ``k``-step Lanczos without reorthogonalization.

    Callers in this file unpack two return values, ``(ritz, Q)``; ``ritz`` is
    used as a tensor of Ritz values (``.sort(descending=True).values``).
    ``Q`` is presumably the Lanczos basis — confirm against the exercise text.

    Args:
        matvec: callable mapping a length-``dim`` vector to the product with it.
        dim: length of the vectors ``matvec`` accepts.
        k: number of Lanczos steps.
        seed: presumably seeds the random start vector — confirm.

    Raises:
        NotImplementedError: always, until the body is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError

# Verify on a small dense matrix where we can compute ground truth.
torch.manual_seed(42)
# A + A.T makes the 60x60 test matrix symmetric.
A_dense = torch.randn(60, 60); A_dense = A_dense + A_dense.T
matvec_dense = lambda v: A_dense @ v
true_eigs_dense = torch.linalg.eigvalsh(A_dense)

# 20 Lanczos steps on a 60-dimensional problem; compare the five largest
# (signed) Ritz values with the five largest true eigenvalues.
ritz, Q = lanczos_no_reorth(matvec_dense, dim=60, k=20)
top5 = ritz.sort(descending=True).values[:5]
true5 = true_eigs_dense.sort(descending=True).values[:5]
print(f'top-5 Ritz:  {top5.tolist()}')
print(f'top-5 true:  {true5.tolist()}')
print(f'max |Δ|: {(top5 - true5).abs().max():.2e}')

# One panel per Krylov dimension k, each comparing Ritz values with the true
# spectrum via the project's eigenvalue_compare helper.
fig, axes = plt.subplots(1, 4, figsize=(15, 4), sharey=True)
for ax, k in zip(axes, [5, 10, 20, 40]):
    ritz, _ = lanczos_no_reorth(matvec_dense, dim=60, k=k)
    eigenvalue_compare(true_eigs_dense, ritz, ax=ax)
    ax.set_title(f'k = {k}')
plt.suptitle('Lanczos Ritz values converge to extremes first')
plt.tight_layout(); plt.show()

def lanczos_track_orth(matvec, dim, k, reorth='none', seed=0):
    '''Exercise stub: Lanczos that also returns an orthogonality-error history.

    Callers in this file unpack three return values; the first is used as the
    Ritz values and the third is plotted per step as ||Q^T Q - I||_inf.
    ``reorth`` is passed as 'none', 'full' or 'selective' by those callers.

    Raises:
        NotImplementedError: always, until the body is filled in.
    '''
    # YOUR CODE HERE: copy your lanczos_no_reorth and add per-step orth tracking.
    raise NotImplementedError

# Symmetric 40x40 fp32 test matrix from a private generator (seed 7), so it
# does not depend on the global RNG state.
A40 = (torch.randn(40, 40, generator=torch.Generator().manual_seed(7))).float()
A40 = A40 + A40.T

# k = dim = 40: run Lanczos for as many steps as the matrix has dimensions.
_, _, orth_none = lanczos_track_orth(lambda v: A40 @ v, dim=40, k=40, reorth='none')

plt.figure()
plt.semilogy(orth_none, label='no reorth (fp32)')
plt.xlabel('step'); plt.ylabel(r'$\| Q^\top Q - I \|_\infty$')
plt.title('Orthogonality loss in classical Lanczos')
plt.legend(); plt.show()

# Same run with full reorthogonalization, overlaid on the previous curve.
_, _, orth_full = lanczos_track_orth(lambda v: A40 @ v, dim=40, k=40, reorth='full')

plt.figure()
plt.semilogy(orth_none, label='no reorth')
plt.semilogy(orth_full, label='full reorth (twice)')
plt.xlabel('step'); plt.ylabel(r'$\| Q^\top Q - I \|_\infty$')
plt.legend(); plt.title('Full reorth keeps orthogonality at fp32 epsilon')
plt.show()

# Third strategy: selective reorthogonalization, overlaid on the other two runs.
_, _, orth_sel = lanczos_track_orth(lambda v: A40 @ v, dim=40, k=40, reorth='selective')

plt.figure(figsize=(8, 4.5))
plt.semilogy(orth_none, label='none')
plt.semilogy(orth_full, label='full (twice-is-enough)')
plt.semilogy(orth_sel,  label='selective (Paige)')
# Draw the reference line at the actual fp32 machine epsilon (about 1.19e-7)
# instead of a rounded 1e-7, so it matches its 'fp32 eps' label.
plt.axhline(torch.finfo(torch.float32).eps, color='k', linestyle=':', alpha=0.6, label='fp32 eps')
plt.xlabel('Lanczos step'); plt.ylabel(r'$\| Q^\top Q - I \|_\infty$')
plt.title('Money plot: orthogonality loss across reorth strategies')
plt.legend(); plt.show()

from src.data import load_mnist_7x7

# Set up tiny_mlp + a small training run (just enough to see Hessian change).
torch.manual_seed(0)
model_mnist = tiny_mlp(seed=0)
X_train, y_train = load_mnist_7x7(n=500, seed=0)
P_mnist = count_params(model_mnist); print(f'tiny_mlp: {P_mnist} params')

# === Lanczos top-10 at init ===
# The lambda reads model_mnist when it is called, so it must be used before
# the training loop further down to measure the Hessian at initialization.
matvec_init = lambda v: hvp_double_backward(model_mnist, X_train, y_train, v)
# YOUR CODE HERE: call lanczos_track_orth with reorth='selective', k=30,
# unpack the first element (Ritz values) into ritz_init.
ritz_init = None
assert ritz_init is not None and len(ritz_init) >= 10
print(f'top-10 |λ| at init (Lanczos): {sorted(ritz_init.abs().tolist(), reverse=True)[:10]}')

# Ground-truth check: materialize the Hessian (P x P), eigendecompose.
# The identity is built once and iterated row by row, rather than allocating
# a fresh P_mnist x P_mnist identity for every basis vector.
H_full_init = torch.stack([hvp_double_backward(model_mnist, X_train, y_train, e_i)
                           for e_i in torch.eye(P_mnist)])
# Average with the transpose so eigvalsh receives an exactly symmetric matrix.
H_full_init = (H_full_init + H_full_init.T) / 2
true_eigs_init = torch.linalg.eigvalsh(H_full_init)
# Rank by magnitude to match how the Lanczos Ritz values are reported.
true_top10 = true_eigs_init.abs().sort(descending=True).values[:10]
print(f'top-10 |λ| at init (explicit): {true_top10.tolist()}')

# 200 steps of plain SGD (lr = 0.1) on random mini-batches of 64 examples.
opt = torch.optim.SGD(model_mnist.parameters(), lr=0.1)
for step in range(200):
    # Batch indices are sampled with replacement from the global RNG.
    idx = torch.randint(0, len(X_train), (64,))
    opt.zero_grad()
    loss = F.cross_entropy(model_mnist(X_train[idx]), y_train[idx])
    loss.backward()
    opt.step()
# Reports the loss of the last mini-batch only, not of the full training set.
print(f'final loss: {loss.item():.4f}')

# Same closure as at init, but model_mnist now holds the trained weights.
matvec_trained = lambda v: hvp_double_backward(model_mnist, X_train, y_train, v)
ritz_trained = None  # YOUR CODE HERE: same as ritz_init but for the trained model.
assert ritz_trained is not None

# Ten largest Ritz-value magnitudes before and after training.
top10_init = sorted(ritz_init.abs().tolist(), reverse=True)[:10]
top10_trained = sorted(ritz_trained.abs().tolist(), reverse=True)[:10]

fig, ax = plt.subplots()
ax.plot(range(1, 11), top10_init,    'o-', label='at init')
ax.plot(range(1, 11), top10_trained, 's-', label='after training')
ax.set_xlabel('rank'); ax.set_ylabel(r'$|\lambda_k|$')
ax.set_yscale('log'); ax.legend()
ax.set_title('Hessian top-10 eigenvalues: init vs trained')
plt.show()

Cost	Notebook 1
matvec ≈ O(forward + backward)	HVP, three implementations
power iter top-1	k = O(log(1/ε)/log(λ₁/λ₂)) matvecs
power iter + deflation top-k	k × the top-1 cost; accuracy degrades past ~5 eigenvalues as deflation errors accumulate
Lanczos top-k	O(k) matvecs + O(k²) for reorth

Notebook 1 — Krylov methods for the Hessian

0. NLA you'll need

0.1 Operator vs Frobenius norm

0.2 Condition number

0.3 Machine epsilon

0.4 Rayleigh quotient & symmetric eigenvalues

1. The matvec is the unit of cost

Exercise 1.1: HVP via double backward (🔴🔴⚪⚪⚪, 10 min)

Exercise 1.2: HVP via JVP of grad (🔴🔴🔴⚪⚪, 12 min)

Exercise 1.3: HVP via finite differences (🔴⚪⚪⚪⚪, 5 min)

Exercise 1.4: Timing (🔴⚪⚪⚪⚪, 5 min)

2. Power iteration

Exercise 2.1: Implement power iteration (🔴🔴⚪⚪⚪, 8 min)

Exercise 2.2: Convergence rate vs spectral gap (🔴🔴⚪⚪⚪, 10 min)

Exercise 2.3: Power iteration on the Hessian (🔴🔴⚪⚪⚪, 7 min)

3. Deflation: getting top-k from power iteration

Exercise 3.1: Deflated power iteration (🔴🔴⚪⚪⚪, 10 min)

Exercise 3.2: Watch deflation degrade (🔴🔴⚪⚪⚪, 7 min)

4. Lanczos: the three-term recurrence

Exercise 4.1: Implement Lanczos without reorthogonalization (🔴🔴🔴⚪⚪, 20 min)

Exercise 4.2: Watch Ritz values converge (🔴🔴⚪⚪⚪, 8 min)

5. Loss of orthogonality — and how to fix it

Exercise 5.1: Watch orthogonality decay (🔴🔴⚪⚪⚪, 8 min)

Exercise 5.2: Full reorthogonalization (🔴🔴⚪⚪⚪, 7 min)

Exercise 5.3: Selective reorthogonalization (🔴🔴🔴⚪⚪, 12 min)

6. Hessian top-k in practice

Exercise 6.1: Train a bit, recompute (🔴🔴⚪⚪⚪, 10 min)

Wrap-up