Binary file added class02/ISYE_8803___Lecture_2___Slides.pdf
Binary file not shown.
7 changes: 7 additions & 0 deletions class02/Manifest.toml
@@ -0,0 +1,7 @@
# This file is machine-generated - editing it directly is not advised

julia_version = "1.11.6"
manifest_format = "2.0"
project_hash = "da39a3ee5e6b4b0d3255bfef95601890afd80709"

[deps]
1 change: 1 addition & 0 deletions class02/Project.toml
@@ -0,0 +1 @@
[deps]
123 changes: 123 additions & 0 deletions class02/SQP.tex
@@ -0,0 +1,123 @@
\section{Sequential Quadratic Programming (SQP)}

% ------------------------------------------------
\begin{frame}{What is SQP?}
\textbf{Idea:} Solve a nonlinear, constrained problem by repeatedly solving a \emph{quadratic program (QP)} built from local models.\\[4pt]
\begin{itemize}
\item Linearize constraints; quadratic model of the Lagrangian/objective.
\item Each iteration: solve a QP to get a step \(d\), update \(x \leftarrow x + \alpha d\).
\item Strength: strong local convergence (often superlinear) with good Hessian info.
\end{itemize}
\end{frame}

% ------------------------------------------------
\begin{frame}{Target Problem (NLP)}
\[
\min_{x \in \R^n} \ f(x)
\quad
\text{s.t.}\quad
g(x)=0,\quad h(x)\le 0
\]
\begin{itemize}
\item \(f:\R^n\!\to\!\R\), \(g:\R^n\!\to\!\R^{m}\) (equalities), \(h:\R^n\!\to\!\R^{p}\) (inequalities).
\item KKT recap (at candidate optimum \(x^\star\)):
\[
\exists \ \lambda \in \R^{m},\ \mu \in \R^{p}_{\ge 0}:
\ \grad f(x^\star) + \nabla g(x^\star)^T\lambda + \nabla h(x^\star)^T \mu = 0,
\]
\[
g(x^\star)=0,\quad h(x^\star)\le 0,\quad \mu \ge 0,\quad \mu \odot h(x^\star) = 0.
\]
\end{itemize}
\end{frame}

% ------------------------------------------------
\begin{frame}{From NLP to a QP (Local Model)}
At iterate \(x_k\) with multipliers \((\lambda_k,\mu_k)\):\\[4pt]
\textbf{Quadratic model of the Lagrangian}
\[
m_k(d) = \ip{\grad f(x_k)}{d} + \tfrac{1}{2} d^T B_k d
\]
with \(B_k \approx \nabla^2_{xx}\Lag(x_k,\lambda_k,\mu_k)\).\\[6pt]
\textbf{Linearized constraints}
\[
g(x_k) + \nabla g(x_k)\, d = 0,\qquad
h(x_k) + \nabla h(x_k)\, d \le 0.
\]
\end{frame}

% ------------------------------------------------
\begin{frame}{The SQP Subproblem (QP)}
\[
\begin{aligned}
\min_{d \in \R^n}\quad & \grad f(x_k)^T d + \tfrac{1}{2} d^T B_k d \\
\text{s.t.}\quad & \nabla g(x_k)\, d + g(x_k) = 0, \\
& \nabla h(x_k)\, d + h(x_k) \le 0.
\end{aligned}
\]
\begin{itemize}
\item Solve QP \(\Rightarrow\) step \(d_k\) and updated multipliers \((\lambda_{k+1},\mu_{k+1})\).
\item Update \(x_{k+1} = x_k + \alpha_k d_k\) (line search or trust-region).
\end{itemize}
\end{frame}

% ------------------------------------------------
\begin{frame}{Algorithm Sketch (SQP)}
\begin{enumerate}
\item Start with \(x_0\), multipliers \((\lambda_0,\mu_0)\), and \(B_0 \succ 0\).
\item Build QP at \(x_k\) with \(B_k\), linearized constraints.
\item Solve QP \(\Rightarrow\) get \(d_k\), \((\lambda_{k+1},\mu_{k+1})\).
\item Globalize: line search on merit or use filter/TR to choose \(\alpha_k\).
\item Update \(x_{k+1} = x_k + \alpha_k d_k\), update \(B_{k+1}\) (e.g., BFGS).
\end{enumerate}
\end{frame}
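
% ------------------------------------------------
\begin{frame}[fragile]{Algorithm Sketch in Code (Equality-Only Case)}
A minimal Julia sketch of the loop above, restricted to equality constraints so that each QP subproblem reduces to a single KKT solve; the name \texttt{sqp\_eq} and the fixed choice \(B_k=I\) are illustrative, not the notebook implementation.
{\scriptsize
\begin{verbatim}
using LinearAlgebra

function sqp_eq(grad_f, g, Jg, x; iters=20, alpha=1.0, tol=1e-8)
    lam = zeros(length(g(x)))
    for _ in 1:iters
        B = Matrix{Float64}(I, length(x), length(x))   # crude model B_k = I
        A = Jg(x)                                      # constraint Jacobian
        K = [B A'; A zeros(size(A, 1), size(A, 1))]    # QP optimality system
        sol = K \ (-[grad_f(x); g(x)])
        d, lam = sol[1:length(x)], sol[length(x)+1:end]
        x += alpha * d            # full step; use a line search in practice
        norm(d) < tol && break
    end
    return x, lam
end

# Equality part of the toy problem on the next slides:
x, lam = sqp_eq(x -> x, x -> [x[1]^2 + x[2] - 1],
                x -> [2*x[1] 1.0], [1.0, 0.0])
\end{verbatim}
}
\end{frame}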

% ------------------------------------------------
\begin{frame}{Toy Example (Local Models)}
\textbf{Problem:}
\[
\min_{x\in\R^2} \ \tfrac{1}{2}\norm{x}^2
\quad \text{s.t.} \quad g(x)=x_1^2 + x_2 - 1 = 0,\ \ h(x)=x_2 - 0.2 \le 0.
\]
At \(x_k\), build QP with
\[
\grad f(x_k)=x_k,\quad B_k=I,\quad
\nabla g(x_k) = \begin{bmatrix} 2x_{k,1} & 1 \end{bmatrix},\
\nabla h(x_k) = \begin{bmatrix} 0 & 1 \end{bmatrix}.
\]
Solve for \(d_k\), then \(x_{k+1}=x_k+\alpha_k d_k\).
\end{frame}
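
% ------------------------------------------------
\begin{frame}{Toy Example: One Worked QP Step}
Take the illustrative iterate \(x_k=(1,\,0)\) (an illustrative choice). Then
\[
\grad f(x_k)=\begin{bmatrix}1\\0\end{bmatrix},\quad
g(x_k)=0,\quad
h(x_k)=-0.2,\quad
\nabla g(x_k)=\begin{bmatrix}2 & 1\end{bmatrix},\quad
\nabla h(x_k)=\begin{bmatrix}0 & 1\end{bmatrix},
\]
so the QP subproblem reads
\[
\min_{d}\ d_1+\tfrac{1}{2}\norm{d}^2
\quad\text{s.t.}\quad 2d_1+d_2=0,\qquad d_2\le 0.2.
\]
Its solution has the inequality active: \(d_k=(-0.1,\,0.2)\), \(\lambda_{k+1}=-0.45\), \(\mu_{k+1}=0.25\ge 0\).
A full step \(\alpha_k=1\) gives \(x_{k+1}=(0.9,\,0.2)\), already close to the nearby constrained minimizer \((\sqrt{0.8},\,0.2)\).
\end{frame}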


% ------------------------------------------------
\begin{frame}{Globalization: Making SQP Robust}
SQP is an important method; several issues must be addressed to obtain an \textbf{efficient} and \textbf{reliable} implementation:
\begin{itemize}
\item Efficient solution of the linear systems at each Newton iteration (the matrix block structure can be exploited).
\item Quasi-Newton approximations to the Hessian.
\item Trust region, line search, etc.\ to improve robustness (e.g., TR: restrict \(\norm{d}\) to keep the local model valid); a minimal line-search sketch follows on the next slide.
\item Treatment of constraints (equality and inequality) during the iterative process.
\item Selection of a good starting guess for the multipliers $\lambda$.
\end{itemize}
\end{frame}
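
% ------------------------------------------------
\begin{frame}[fragile]{Globalization Sketch: Merit-Function Line Search}
A minimal Julia sketch of a backtracking line search on an \(\ell_1\) merit function \(\phi(x)=f(x)+\nu\,\lVert g(x)\rVert_1\); the penalty weight \texttt{nu} and the halving rule are illustrative choices.
{\scriptsize
\begin{verbatim}
function merit_linesearch(f, g, x, d; nu=10.0, alpha=1.0, shrink=0.5, maxit=20)
    phi(z) = f(z) + nu * sum(abs, g(z))   # l1 merit: objective + feasibility
    phi0 = phi(x)
    for _ in 1:maxit
        phi(x + alpha*d) < phi0 && return alpha   # accept first improving step
        alpha *= shrink                           # otherwise backtrack
    end
    return alpha                                  # fall back to smallest trial
end
\end{verbatim}
}
\end{frame}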






% ------------------------------------------------
\begin{frame}{Final Takeaways on SQP}
\textbf{When SQP vs.\ Interior-Point?}
\begin{itemize}
\item \textbf{SQP}: strong local convergence; warm-start friendly; natural for NMPC.
\item \textbf{IPM}: very robust for large problems with a strict interior; handles many inequality constraints well.
\item In practice: both are valuable—choose to match problem structure and runtime needs.
\end{itemize}
\textbf{Takeaways of SQP}
\begin{itemize}
\item SQP = Newton-like method using a sequence of structured QPs.
\item Globalization (merit/filter/TR) makes it reliable from poor starts.
\item Excellent fit for control (NMPC/trajectory optimization) due to sparsity and warm starts.
\end{itemize}
\end{frame}
47 changes: 46 additions & 1 deletion class02/class02.md
@@ -6,5 +6,50 @@

---

Add notes, links, and resources below.
## Overview

This class covers the fundamental numerical optimization techniques essential for optimal control problems. We explore gradient-based methods, Sequential Quadratic Programming (SQP), and various approaches to handling constraints including Augmented Lagrangian Methods (ALM), interior-point methods, and penalty methods.

## Interactive Materials

The class is structured around one slide deck and four interactive Jupyter notebooks:

1. **[Part 1a: Root Finding & Backward Euler](part1_root_finding.html)**
- Root-finding algorithms for implicit integration
- Fixed-point iteration vs. Newton's method
- Application to pendulum dynamics


2. **[Part 1b: Minimization via Newton's Method](part1_minimization.html)**
- Unconstrained optimization fundamentals
- Newton's method implementation
   - Globalization strategies: Hessian regularization

3. **[Part 2: Equality Constraints](part2_eq_constraints.html)**
- Lagrange multiplier theory
- KKT conditions for equality constraints
- Quadratic programming implementation

4. **[Part 3: Interior-Point Methods](part3_ipm.html)**
- Inequality constraint handling
- Barrier methods and log-barrier functions
- Comparison with penalty methods

## Additional Resources

- **[Lecture Slides (PDF)](ISYE_8803___Lecture_2___Slides.pdf)** - Complete slide deck
- **[LaTeX Source](main.tex)** - Source code for lecture slides

## Key Learning Outcomes

- Understand gradient-based optimization methods
- Implement Newton's method for minimization
- Apply root-finding techniques for implicit integration
- Solve equality-constrained optimization problems
- Compare different constraint handling methods
- Implement Sequential Quadratic Programming (SQP)

## Next Steps

This class provides the foundation for advanced topics in subsequent classes, including Pontryagin's Maximum Principle, nonlinear trajectory optimization, and stochastic optimal control.

205 changes: 205 additions & 0 deletions class02/eq_constraints.tex
@@ -0,0 +1,205 @@

%\section{Part II -- Equality constraints: KKT, Newton vs. Gauss–Newton}
\section{Constrained Optimization}

% ==== Equality constraints: KKT, Newton vs. Gauss–Newton ====

\begin{frame}{Equality-constrained minimization: geometry and conditions}
\textbf{Problem.} $\min_{x\in\mathbb{R}^n} f(x)\quad \text{s.t.}\quad C(x)=0,\qquad C:\mathbb{R}^n\to\mathbb{R}^m$.

\medskip
\textbf{Geometric picture.} At an optimum on the manifold $C(x)=0$, the gradient must be orthogonal to the tangent space (no first-order descent direction along the constraint surface):

$$
\grad f(x^\star)\ \perp\ \mathcal{T}_{x^\star}=\{p:\; J_C(x^\star)p=0\}.
$$

Equivalently, the gradient is a linear combination of constraint normals:

$$
\grad f(x^\star)+J_C(x^\star)^{\!T}\lambda^\star=0,\qquad C(x^\star)=0\quad(\lambda^\star\in\mathbb{R}^m).
$$

\medskip
\textbf{Lagrangian.} $L(x,\lambda)=f(x)+\lambda^{\!T}C(x)$.
\end{frame}
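
\begin{frame}{A one-line worked example (equality case)}
For the illustrative problem \(\min \tfrac{1}{2}\lVert x\rVert^2\) s.t.\ \(x_1+x_2-1=0\): stationarity gives \(x^\star=-\lambda^\star\,(1,\,1)^{\!T}\), feasibility gives \(-2\lambda^\star-1=0\), so \(\lambda^\star=-\tfrac{1}{2}\) and \(x^\star=(\tfrac{1}{2},\tfrac{1}{2})\); the gradient \(\nabla f(x^\star)=x^\star\) is exactly normal to the constraint line, as the picture predicts.
\end{frame}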

\begin{frame}{A nicer visual explanation/derivation of KKT conditions}
\begin{center}
Quick little whiteboard derivation
\end{center}

\end{frame}




% ==== Slide 1: Picture-first intuition ====
\begin{frame}[t]{Equality constraints: picture first}
\setbeamercovered{invisible}

\textbf{Goal.} Minimize $f(x)$ while staying on the surface $C(x)=0$.

\uncover<2->{\textbf{Feasible set as a surface.} Think of $C(x)=0$ as a smooth surface embedded in $\mathbb{R}^n$ (a manifold).}

\uncover<3->{\textbf{Move without breaking the constraint.} Tangent directions are the “along-the-surface” moves that keep $C(x)$ unchanged to first order. Intuitively: tiny steps that slide on the surface.}

\uncover<4->{\textbf{What must be true at the best point.} At $x^\star$, there is no downhill direction that stays on the surface. Equivalently, the usual gradient of $f$ has \emph{no component along the surface}.}

\uncover<5->{\textbf{Normals enter the story.} If the gradient can’t point along the surface, it must point \emph{through} it—i.e., it aligns with a combination of the surface’s normal directions (one normal per constraint).}
\end{frame}

% ==== Slide 2: From picture to KKT ====
\begin{frame}[t]{From the picture to KKT (equality case)}
\setbeamercovered{invisible}

\textbf{KKT conditions at a regular local minimum (equality only):}

\uncover<1->{\textbf{1) Feasibility:} $C(x^\star)=0$. \emph{(We’re on the surface.)}}

\uncover<2->{\textbf{2) Stationarity:} $\nabla f(x^\star) + J_C(x^\star)^{\!T}\lambda^\star = 0$. \emph{(The gradient is a linear combination of the constraint normals.)}}

\uncover<3->{\textbf{Lagrangian viewpoint.} Define $L(x,\lambda)=f(x)+\lambda^{\!T}C(x)$. At a solution, $x^\star$ is a stationary point of $L$ w.r.t.\ $x$ (that’s the stationarity equation), while $C(x^\star)=0$ enforces feasibility.}

\uncover<4->{\textbf{What the multipliers mean.} The vector $\lambda^\star$ tells how strongly each constraint “pushes back” at the optimum; it also measures sensitivity of the optimal value to small changes in the constraints.}

\end{frame}


\begin{frame}{KKT system for equalities (first-order necessary conditions)}
\textbf{KKT (FOC).}

$$
\grad_x L(x,\lambda)=\grad f(x)+J_C(x)^{\!T}\lambda=0,\qquad \grad_\lambda L(x,\lambda)=C(x)=0.
$$

\textbf{Solve by Newton on KKT:} linearize both optimality and feasibility:

$$
\begin{bmatrix}
\hess f(x) + \sum_{i=1}^m \lambda_i\,\hess C_i(x) & J_C(x)^{\!T}\\[2pt]
J_C(x) & 0
\end{bmatrix}
\begin{bmatrix}\Delta x\\ \Delta\lambda\end{bmatrix}
=-
\begin{bmatrix}
\grad f(x)+J_C(x)^{\!T}\lambda\\ C(x)
\end{bmatrix}.
$$

\textit{Notes.} This is a symmetric \emph{saddle-point} system; typical solves use block elimination (Schur complement) or sparse factorizations.
\end{frame}
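
\begin{frame}[fragile]{Newton on KKT: a tiny Julia transcription}
One possible transcription of the saddle-point iteration above, specialized to \(f(x)=\tfrac{1}{2}\lVert x\rVert^2\), \(C(x)=x_1^2+x_2-1\) with hand-coded derivatives; names and constants are illustrative, and the notebook demo follows on the next slide.
{\scriptsize
\begin{verbatim}
using LinearAlgebra

grad_f(x) = x                      # gradient of f(x) = 0.5*||x||^2
C(x)      = [x[1]^2 + x[2] - 1]    # single equality constraint
Jc(x)     = [2*x[1] 1.0]           # 1x2 constraint Jacobian
HC        = [2.0 0.0; 0.0 0.0]     # Hessian of C_1 (constant here)

function newton_kkt(x, lam; iters=10, tol=1e-10)
    for _ in 1:iters
        H = Matrix{Float64}(I, 2, 2) + lam[1]*HC  # Hess f + sum lam_i Hess C_i
        K = [H Jc(x)'; Jc(x) zeros(1, 1)]         # saddle-point KKT matrix
        r = [grad_f(x) + Jc(x)'*lam; C(x)]        # KKT residual
        norm(r) < tol && break
        delta = K \ (-r)
        x += delta[1:2]; lam += delta[3:3]
    end
    return x, lam
end

newton_kkt([1.0, 0.0], [0.0])   # -> x close to (0.707, 0.5), lam close to -0.5
\end{verbatim}
}
\end{frame}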






\begin{frame}{Move to Julia Code}
\begin{center}
\textbf{Quick Demo of Julia Notebook: part2\_eq\_constraints.ipynb}
\end{center}
\end{frame}

\begin{frame}{Numerical practice: Newton on KKT}
\setbeamercovered{invisible}


\textbf{When it works best.}
\begin{itemize}
\item Near a regular solution with $J_{C}(x^\star)$ full row rank and positive-definite reduced Hessian.
\item With a globalization (line search on a merit function) and mild regularization for robustness.
\end{itemize}

% --- Part 2: appears on the 2nd click only ---
\uncover<2->{%
\textbf{Common safeguards.}
\begin{itemize}
\item \emph{Regularize} the $(1,1)$ block to ensure a good search direction (e.g., add $\beta I$; minimal snippet on the next slide).
\item \emph{Merit/penalty} line search to balance feasibility vs.\ optimality during updates.
\item \emph{Scaling} constraints to improve conditioning of the KKT system.
\end{itemize}
}
\end{frame}
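
\begin{frame}[fragile]{Safeguard Sketch: Regularized KKT Step}
A sketch of the \((1,1)\)-block shift from the previous slide; the value of \texttt{beta} and the helper name are illustrative.
{\scriptsize
\begin{verbatim}
using LinearAlgebra

# Solve the (regularized) KKT system for a step (dx, dlam), given the
# Lagrangian Hessian H, constraint Jacobian A, and residuals rL, rC.
function regularized_kkt_step(H, A, rL, rC; beta=1e-6)
    n, m = size(H, 1), size(A, 1)
    K = [(H + beta*I) A'; A zeros(m, m)]   # shift the (1,1) block by beta*I
    delta = K \ (-[rL; rC])
    return delta[1:n], delta[n+1:end]
end
\end{verbatim}
}
\end{frame}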


\begin{frame}{Gauss--Newton vs. full Newton on KKT}

\uncover<1->{
\textbf{Full Newton Hessian of the Lagrangian:}\quad
$\nabla_{xx}^2 L(x,\lambda) = \nabla^2 f(x) + \sum_{i=1}^m \lambda_i\, \nabla^2 C_i(x)$
}

\vspace{0.6em}

\uncover<2->{
\textbf{Gauss--Newton approximation:} drop the \emph{constraint-curvature} term
$\sum_{i=1}^m \lambda_i\, \nabla^2 C_i(x)$:
\begin{align*}
H_{\text{GN}}(x) &\approx \nabla^2 f(x).
\end{align*}
}

\uncover<3->{
\textbf{Trade-offs (high level).}
\begin{itemize}
\item \emph{Full Newton:} fewer iterations near the solution, but each step is costlier and can be less robust far from it.
\item \emph{Gauss--Newton:} cheaper per step and often more stable; may need more iterations but wins in wall-clock on many problems.
\end{itemize}
}

\end{frame}
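
\begin{frame}{Gauss--Newton vs.\ full Newton: a concrete instance}
For the constraint \(C(x)=x_1^2+x_2-1\) used in the Newton-on-KKT sketch a few slides back, with \(\nabla^2 f = I\) and the illustrative multiplier value \(\lambda=-\tfrac{1}{2}\):

$$
\nabla_{xx}^2 L = I + \lambda\begin{bmatrix}2&0\\0&0\end{bmatrix}
= \begin{bmatrix}0&0\\0&1\end{bmatrix},
\qquad
H_{\text{GN}} = I .
$$

The curvature term can change the \((1,1)\) block substantially; Gauss--Newton simply ignores it.
\end{frame}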


% ==== Inequalities & KKT: complementarity ====

\begin{frame}{Inequality-constrained minimization and KKT}
\textbf{Problem.} $\min f(x)\quad\text{s.t.}\quad c(x)\ge 0,\qquad c:\mathbb{R}^n\to\mathbb{R}^p$.

\textbf{KKT conditions (first-order).}

$$
\begin{aligned}
&\text{Stationarity:} && \grad f(x)-J_c(x)^{\!T}\lambda=0,\\
&\text{Primal feasibility:} && c(x)\ge 0,\\
&\text{Dual feasibility:} && \lambda\ge 0,\\
&\text{Complementarity:} && \lambda^{\!T}c(x)=0\quad(\text{i.e., }\lambda_i c_i(x)=0\ \forall i).
\end{aligned}
$$

\textbf{Interpretation.}
\begin{itemize}
\item \emph{Active} constraints: $c_i(x)=0 \Rightarrow \lambda_i\ge 0$ can be nonzero (acts like an equality).
\item \emph{Inactive} constraints: $c_i(x)>0 \Rightarrow \lambda_i=0$ (no influence on optimality).
\end{itemize}
\end{frame}
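
\begin{frame}{Inequality KKT: two tiny worked cases}
Two illustrative one-dimensional instances of the conditions above:
\begin{itemize}
\item \(\min_x x\) s.t.\ \(c(x)=x-1\ge 0\): at \(x^\star=1\), stationarity \(1-\lambda^\star=0\) gives \(\lambda^\star=1\ge 0\); the constraint is \emph{active}, \(c(x^\star)=0\), and complementarity holds with a nonzero multiplier.
\item \(\min_x (x-2)^2\) s.t.\ \(c(x)=x-1\ge 0\): the unconstrained minimizer \(x^\star=2\) is feasible with \(c(x^\star)=1>0\), so complementarity forces \(\lambda^\star=0\) (\emph{inactive} constraint, no influence on optimality).
\end{itemize}
\end{frame}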




\begin{frame}{Complementarity in plain English (and why Newton is tricky)}
\footnotesize

\textbf{What $\lambda_i c_i(x)=0$ means.}
\begin{itemize}
\item Tight constraint ($c_i=0$) $\Rightarrow$ can press back ($\lambda_i\ge0$).
\item Loose constraint ($c_i>0$) $\Rightarrow$ no force ($\lambda_i=0$).
\end{itemize}

\textbf{Why naive Newton fails.}
\begin{itemize}
\item Complementarity = nonsmooth + inequalities ($\lambda\ge0$, $c(x)\ge0$).
\item Equality-style Newton can violate nonnegativity or bounce across boundary.
\end{itemize}

\textbf{Two main strategies (preview).}
\begin{itemize}
\item \emph{Active-set:} guess actives $\Rightarrow$ solve equality-constrained subproblem, update set.
\item \emph{Barrier/PDIP/ALM:} smooth or relax complementarity, damped Newton, drive relaxation $\to 0$ (1-D barrier sketch on the next slide).
\end{itemize}
\end{frame}
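
\begin{frame}[fragile]{Barrier idea in one dimension (sketch)}
A toy sketch of the barrier strategy for \(\min_x x\) s.t.\ \(x\ge 1\) (the active-constraint example above): minimize \(\phi_\rho(x)=x-\rho\log(x-1)\) by damped Newton, then shrink \(\rho\). The exact subproblem minimizer is \(x=1+\rho\), so the iterates approach \(x^\star=1\) as \(\rho\to 0\). The schedule and damping rule below are illustrative.
{\scriptsize
\begin{verbatim}
function barrier_solve(; x=2.0, rho=1.0, shrink=0.1, outer=5)
    for _ in 1:outer
        for _ in 1:20                       # Newton on the barrier subproblem
            g = 1 - rho / (x - 1)           # phi'(x)
            abs(g) < 1e-12 && break
            H = rho / (x - 1)^2             # phi''(x) > 0
            step = -g / H
            while x + step <= 1             # damp: never cross the boundary
                step /= 2
            end
            x += step
        end
        rho *= shrink                       # drive the relaxation toward 0
    end
    return x                                # approaches the solution x* = 1
end
\end{verbatim}
}
\end{frame}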




Binary file added class02/figures/log_barrier.png
Binary file added class02/figures/quadratic_penalty.png
Binary file added class02/figures/tri_paper.png