\documentclass[11pt]{article}

\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage{amsmath}

\renewcommand{\baselinestretch}{1.}
\usepackage[margin=1in]{geometry}

\usepackage{mathptmx}

\newcommand{\E}{\mathbb{E}}
\def\Var{{\rm Var}}

\def\T{{\top}}


\begin{document}

\begin{center}
{\huge From Mechanics to Control} \\
Ying Nian Wu, UCLA Statistics Department
\end{center}


\section{Classical mechanics} 

\subsection{Lagrangian} 

Suppose there are $n$ particles whose positions and velocities are $x(t) = (x_i(t), i = 1, ..., n)$ and $v(t) = (v_i(t), i = 1, ..., n)$, with $v_i(t) = dx_i(t)/dt$.  Sometimes, for notional simplicity, we may simply write $x(t)$ as $x$, and $v(t)$ as $v$. 

The Lagrangian is 
\begin{eqnarray} 
   L(x, v, t) = \frac{1}{2} \sum_{i=1}^{n} m_i v_i^2 - V(x), 
\end{eqnarray}
which is the difference between the kinetic energy and the potential energy. It can be more general. 

\subsection{Action} 

The action from 0 to $T$ is 
\begin{eqnarray} 
    S = \int_{0}^{T} L(x, v) dt, 
\end{eqnarray} 
which is a function (or functional) of the path $x(t), t  \in [0, T]$. 

\subsection{Least action} 

Nature controls the path $x(t)$ by minimizing the action, i.e., nature does barely enough for the kinetic energy to overcome the potential energy. 

In general, we only require the path $x(t)$ to be a stationary point of the action $S$, i.e., a minimum, a maximum, or a saddle point. But usually it should be a minimum. 

\subsection{Discretization} 

A valuable method to understand the continuous process is to always discretize the time into intervals of $\Delta t$, very much like making a movie where the time between every two consecutive frames is $\Delta t$.

In discrete time, we write $S$ as
\begin{eqnarray} 
    S = \sum_{0}^{T} L\left(x(t),  v(t)\right) \Delta t, 
\end{eqnarray} 
where 
\begin{eqnarray}
v(t) = \frac{x(t+\Delta t)-x(t)}{\Delta t}.
\end{eqnarray}
Then for a fixed $t$, the part of $S$ that depends on $x(t)$ is 
\begin{eqnarray} 
    L(x(t), v(t)) \Delta t + L(x(t-\Delta t), v(t-\Delta t)) \Delta t, 
\end{eqnarray} 
where 
\begin{eqnarray} 
v(t-\Delta t) = \frac{x(t)-x(t-\Delta t)}{\Delta t}.
\end{eqnarray} 

\subsection{Euler-Lagrange} 

Thus 
\begin{eqnarray} 
    \frac{\partial S}{\partial x(t)}  = L^{(1)}\left(x(t),  v(t)\right)  \Delta t  - L^{(2)}\left(x(t),  v(t)\right) + L^{(2)}(x(t-\Delta t), v(t-\Delta t)) = 0, 
\end{eqnarray} 
which leads to the Euler-Lagrange equation 
\begin{eqnarray} 
\frac{\partial L}{\partial x} = \frac{d}{dt} \frac{\partial L}{\partial v}.
\end{eqnarray} 
The above derivation of the Euler-Lagrange is unconventional, but is the most direct. 

Note: for $f(x, y)$, we write $f^{(1)}(x, y) = \partial f(x, y)/\partial x$, and $f^{(2)}(x, y) = \partial f(x, y)/\partial y$. 

\subsection{Hamiltonian} 

Define the momentum 
\begin{eqnarray}
    p = \frac{\partial L}{\partial v}. 
\end{eqnarray}
Usually $p = mv$. 

Define the Hamiltonian $H(x, p)$ to be 
\begin{eqnarray} 
   H = p v - L. 
\end{eqnarray} 
$H$ is usually the total energy, i.e., the sum of the kinetic energy and the potential energy. 

We will derive the Hamiltonian equation in the next subsection: 
\begin{eqnarray}
&&\frac{\partial p}{\partial t} =   -\frac{\partial H}{\partial x}; \\
&&\frac{\partial x}{\partial t} =   \frac{\partial H}{\partial p}.
\end{eqnarray}

 
\subsection{Lagrange multiplier and Hamiltonian} 

In discretize time. 
\begin{eqnarray} 
    S = \sum_{0}^{T} L(x(t), v(t)) \Delta t, 
\end{eqnarray} 
where the constraint is 
\begin{eqnarray}
   x(t+\Delta t) = x(t) + v(t)\Delta t. 
\end{eqnarray}
To minimize $S$ under the constraint, we can use Lagrange multiplier, to get 
\begin{eqnarray} 
    \tilde{S} = \sum_{0}^{T} \left\{ L(x(t), v(t)) \Delta t + \lambda(t) [x(t+\Delta t)-x(t) - v(t)\Delta t]\right\}. 
\end{eqnarray}
At the minimum, we should have 
\begin{eqnarray} 
     \frac{\partial \tilde{S}}{\partial x(t)}=  0;  \frac{\partial \tilde{S}}{\partial v(t)}=  0;  \frac{\partial \tilde{S}}{\partial \lambda(t)}=  0.
\end{eqnarray}
First, 
\begin{eqnarray} 
     \frac{\partial \tilde{S}}{\partial v(t)}= L^{(2)}(x(t), v(t)) \Delta t - \lambda(t) \Delta t= 0.
\end{eqnarray}
Thus 
\begin{eqnarray}
   \lambda(t) = L^{(2)}(x(t), v(t)). 
\end{eqnarray}
Compare to the above subsection, $\lambda$ is the momentum $p$. 

Define the Hamiltonian 
\begin{eqnarray}
  H(x(t), \lambda(t))  = \lambda(t) v(t) - L(x(t), v(t)). 
\end{eqnarray}
Then 
\begin{eqnarray} 
    \tilde{S} = \sum_{t_1}^{t_2} \lambda(t) (x(t+\Delta t)-x(t)) - H(x(t), \lambda(t))\Delta t. 
\end{eqnarray}
Thus, 
\begin{eqnarray} 
&& \frac{\partial \tilde{S}}{\partial \lambda(t)} = 0 \rightarrow  \frac{\partial H}{\partial \lambda} = \frac{\partial x}{\partial t}  \\
&& \frac{\partial \tilde{S}}{\partial x(t)} = 0 \rightarrow   \frac{\partial H}{\partial x} = -  \frac{\partial \lambda}{\partial t}.
\end{eqnarray} 
Thus we get the Hamiltonian equation with $\lambda = p$. 


\section{Quantum mechanics} 

\subsection{Hamiltonian and rotation of belief vector} 

Let $b(x)$ be the belief vector. In quantum mechanics, $b(x)$ is a complex number for each $x$. An observer makes a measurement, the probability density (or mass) function of observing $x$ is $p(x) = |b(x)|^2$ for each $x$,  i.e., the sum of squares of the real and imaginary parts of $b(x)$. 

For simplicity, let us first assume $x$ can only take a finite number of values (e.g., the spin of the electron). Then $b = (b(x), \forall x)$ forms a finite dimensional vector.  $b$ keeps rotating over time, i.e., $b_t(x)$ keeps changing over time, but the total squared length $|b_t|^2 = \sum_x |b_t(x)|^2$ remains the same. We can normalize $|b_t|^2$ to 1, so that $\sum_x p_t(x) = |b_t(x)|^2 = 1$. Thus $p_t(x)$ is always a legitimate probability distribution. 

The change of $b_t$ is driven by the Hamiltonian $H$, according to the Schrodinger equation: 
\[
    ih \frac{\partial b_t}{\partial t}  = H b_t, 
\]
where  $h$ is the Planck constant. In other words, 
\[
    b_{t+\Delta t} = (I - i H \Delta t /h) b_t \approx \exp(- i H \Delta t /h), 
\]
where we use $e^x = 1 + x + O(x^2)$ for small $x$, and for matrix $M$, $e^M = 1 + M + M^2/2 + M^3/3! + ...$. Because of $i$, $e^{-ix} = \cos x - i \sin x$ is a rotation in the complex domain, and similarly $\exp(- i H \Delta t /h)$ is a rotation matrix, so that the squared length of $b_t$ is preserved. In fact, $b_t =\exp(- i H t/h) b_0$. 

Now  let us assume that $x$ is continuous, e.g., 1-dimensional, such as the position of the electron. Then $b_t = (b_t(x), \forall x)$ is an infinite dimensional vector, in fact, the dimension is not even countable since $x$ is continuous, and $H$ is an infinite dimensional matrix, or more precisely an operator. But we still have the Schrodinger equation
\[
    ih \frac{\partial b_t(x)}{\partial t}  = H b_t(x), 
\]
where $b_t(x)$ is now a wave function, and $H$ is an operator on this function. 

How do we get $H$? The recipe is as follows. Just take the classical Hamiltonian, $H(x, p)$, which is a function of position $x$ and momentum $p$, and change $p$ to the operator $\hat{p} =  ih \partial/\partial x$ (so that $p^2$ is changed to $\hat{p}^2 = - h^2 \partial^2/\partial x^2$). This is inspired by de Broglie's hypothesis that any matter is also a wave whose wavelength is inverse proportional to $p$. 

\subsection{Discreteness of Hamiltonian eigenvalues}

Where does the name ``quantum'' come from? It turns out the operator (matrix) $H$ has a discrete list of eigen values, i.e., $H = Q \Lambda Q^\T$, where $\Lambda = {\rm diag}(\lambda_1, \lambda_2, ...)$ is a diagonal matrix of eigen values, $Q = (q_1, q_2, ...)$ collects the eigen vectors. $\lambda_i$ are the discrete energy levels. Because of the discreteness of $\lambda_i$, we say that the energy is quantized. This assumption was originally made by Planck (as well as Einstein, Bohr) in an ad hoc manner to account for certain experimental results. But in the above formulation, it is a natural consequence. 

Why do we take $\hat{p} = ih \partial /\partial x$? We can think of $\hat{p}$ as an infinite matrix (with uncountable entries). Its eigen vectors are $v_p = \exp(-i p x/h)$, so that $\hat{p} v_p = ih \partial \exp(-ipx/h)/\partial x = p \exp(-ipx/h) = p v_p$. Thus $p$ is the eigen value of $\hat{p}$. 

As to the position $x$, its operator version $\hat{x}$  is actually a diagonal matrix whose diagonal elements are different values of $x$, so that the eigen vectors of $\hat{x}$ are $\delta_x$, i.e., the Dirac one-hot vector for $x$, and the eigen values are $x$. 

Now we have three sets of basis, position basis $(\delta_x, \forall x)$, momentum basis $(v_p, \forall p)$, and energy basis $(q_i, \forall i)$. For a belief vector $b_t$, we can expand it in the three sets of basis, 
\[
     b_t = \int_x b_t(x) \delta_x = \int_p c_t(p) v_p = \sum_i a_{ti} q_i. 
\]
Thus the probability (density) we find the electron at position $x$ is $|b_t(x)|^2$, the probability (density) we find the electron with momentum $p$ is $|c_t(p)|^2$, and the probability we find the electron at energy level $i$ is $|a_{ti}|^2$. 

For the uncertainty principle, it can be prove that for any $b_t$, the variance of the probability distribution $|b_t(x)|^2$ and the variance of the probability distribution $|c_t(p)|^2$ satisfy the constraint that their product must be bigger than a certain constant. 

Now you can see that any quantity, be it position, momentum, or energy, is a matrix (or operator), whose eigen values are the possible values of that quantity, and whose eigen vectors form a basis. The belief vector can be expanded in this basis, and the squares of the coordinates become probabilities. 

\subsection{Lagrangian and Feynman path integral} 

Suppose at time $t = 0$, the system is in $x_0$. Then the probability amplitude that the system is in $x$ at time $T$, i.e., $b_T(x)$, is 
\begin{eqnarray} 
 \int \exp\left(i \frac{S(x(t))}{h}\right) D(x(t)), 
\end{eqnarray}
where $x(t)$ is the path over $[0, T]$ with $x(0) = x_0$ and $x(T) = x$. Intuitively, the above is 
\begin{eqnarray} 
 \sum_{\rm path} \exp\left(i \frac{S({\rm path})}{h}\right), 
\end{eqnarray}
which is the sum over all possible paths, i.e., sum over histories. 

The least action principle can be derived from the above path integral. If $S'({\rm path}) = 0$ at a particular path, then $S$ is locally constant around this path, then the sum of $e^{i S/h}$ around this path is the sum of a constant complex number. If $S'({\rm path}) \neq 0$ at a particular path, then $S$ is locally linear around this path, then the sum of $e^{i S/h}$ around this path is the sum of complex numbers of very different phases if $h$ is small, and they cancel each other. So in the end, only the stationary paths survive. 

If we change $it$ to $t$, then the above path integral becomes the normalizing constant or partition function $Z$ for a statistical mechanical system. This is how MCMC is used for computing the path integral in QM. 

Quantum mechanics is simpler than classical mechanics because it is linear. The mathematical language is simply vector matrix multiplication. For instance, in quantum field theory, the creation and annihilation operators act on vectors that correspond to different numbers of particles, so that the creation and annihilation of particles can be expressed by vector matrix multiplication. 

\section{Statistical mechanics} 

Consider a system $(x, v) = (x_i, v_i, i = 1, ..., n)$ evolves over time. According to the classical mechanics, we get a deterministic video $(x(t), v(t), t \in [0, T])$. Suppose $T$ is large, and we randomly sample a frame $t \sim {\rm Unif}[0, T]$. Then $(x(t), v(t))$ follows a probability distribution $p(x, v)$. Marginally, $x(t) \sim p(x)$. This is the conceptual basis of statistical mechanics. For large system, the statistical properties of $x(t)$ tend to have little fluctuations, i.e., $\Var[\phi(x(t))]$ tends to be small for a summary $\phi$. Thus when we measure $\phi$, it is close to $\E[\phi(x(t))]$. The closeness of $\phi(x(t))$ to its expectation is often characterized by concentration inequality or large deviation result, e.g., $\Pr(|\phi(x(t)) - \E[\phi(x(t))]|>\epsilon) \leq c_1\exp(-n c_2 \epsilon^2)$, where $c_1$ and $c_2$ are constants.  

In the above, the video is deterministic, and the randomness comes from the random sampling of $t$. This is very different from quantum mechanics, where the randomness is intrinsic. 

In Boltzmann's formation, $\E[\phi(x(t))]$ is considered time average, which is what we did. In Gibbs' thinking, $\E[\phi(x(t))]$ is considered ensemble average, i.e., one can imagine an ensemble (or population) of systems evolve in parallel, and we average the ensemble. Regardless, one needs concentration of the probability (i.e., fluctuations diminish) to explain the success of statistical mechanics. 

For Hamiltonian Monte Carlo (or hybrid Monte Carlo), we assume $p(x, v) = p(x) q(v) \propto \exp(-H(x, v))$, where $q$ is Gaussian. Given the current $x$, we randomly sample $v$ from $q(v)$. Then we let the system evolve according to Hamiltonian equation based on $H(x, v)$. This is like randomly pick a short video segment. 

\section{Optimal control} 

The optimal control for deterministic system, in particular, the Pontryagin's principle, follows the treatment of classical mechanics. 

Here we continue to use the notation in mechanics following the Russian school, so that state and action are denoted by $(x, u)$. In reinforcement learning, we use $(s, a)$. 

\subsection{Deterministic dynamics} 

The optimal control problem has a dynamics 
\begin{eqnarray}
\dot{x} = F(x, u), 
\end{eqnarray}
where the Newtonian notation $\dot{x} = dx/dt$, $u(t)$ is the control variable. In discrete time, it means 
\begin{eqnarray}
x(t+\Delta t) - x(t) = F(x(t), u(t)) \Delta t. 
\end{eqnarray}
We want to find $u(t), t \in [0, T]$, to minimize the cost 
\begin{eqnarray}
S = \phi(x(T)) + \int_{0}^{T} l(x, u) dt. 
\end{eqnarray}
where $\phi(x)$ is the cost of the final state and $l(x, u)$ is the loss rate. In discrete time, 
\begin{eqnarray}
S = \phi(x(T)) + \sum_{0}^{T} l(x(t), u(t)) \Delta t. 
\end{eqnarray}
We can plug in $x(t+\Delta t) = x(t) + F(x(t), u(t)) \Delta t$, so that $S$ is a function of $u(t), t \in [0, T]$. We can minimize $S$ by gradient descent over $u(t), t \in [0, T]$, where the gradient can be computed by back-propagation through time. 

\subsection{Lagrange multiplier and Pontryagin} 

Instead of back-propagation through time after plugging in $x(t+\Delta t) = x(t) + F(x(t), u(t)) \Delta t$, we can also treat it as a constraint, and use Lagrange multiplier. 
\begin{eqnarray}
\tilde{S} = \phi(x(T)) + \sum_{0}^{T} \left\{ l(x(t), u(t)) \Delta t + \lambda(t)[x(t+\Delta t) - x(t) - F(x(t), u(t)) \Delta t]\right\}. 
\end{eqnarray}

Define the Hamiltonian 
\begin{eqnarray}
  H(x(t), u(t), \lambda(t)) =  \lambda(t) F(x(t), u(t)) - l(x(t), u(t)), 
\end{eqnarray}   
then 
\begin{eqnarray}
\tilde{S} = \phi(x(T)) + \sum_{0}^{T} \left\{  \lambda(t)[x(t+\Delta t) - x(t)] - H(x(t), u(t), \lambda(t)) \Delta t\right\}. 
\end{eqnarray}
Thus we get the Hamiltonian equation: 
\begin{eqnarray} 
&& \frac{\partial \tilde{S}}{\partial \lambda(t)} = 0 \rightarrow  \frac{\partial H}{\partial \lambda} = \frac{\partial x}{\partial t}  \\
&& \frac{\partial \tilde{S}}{\partial x(t)} = 0 \rightarrow   \frac{\partial H}{\partial x} = -  \frac{\partial \lambda}{\partial t}.
\end{eqnarray} 
The first equation is just the dynamics. We can run it forward to get $x(t)$. The second equation   enables us to update 
\begin{eqnarray}
\lambda(t) = \lambda(t+\Delta t) - \frac{ \partial H}{\partial x} \Delta t,
\end{eqnarray} 
backward, starting from 
\begin{eqnarray}
\lambda(T) = \phi'(x(T)). 
\end{eqnarray}
With $x(t)$ and $\lambda(t)$, we can then minimize $\tilde{S}$ with respect to $u(t)$. This is an unconstrained minimization, so that we only need to maximize each $H(x(t), u(t), \lambda(t))$ with respect to $u(t)$ separately, without the need for back-propagation through time. 

The above is the Pontryagin maximum principle. One can see that the above optimal control method follows the classical mechanics closely. The classical mechanics can be considered as nature controls the trajectory, where the control variable is $v(t)$. 

\subsection{Dynamic programming and Bellman} 

The dynamic programming approach is based on the value function $v(x, t)$, with 
\begin{eqnarray} 
v(x, T) = \phi(x),
\end{eqnarray} 
and 
\begin{eqnarray} 
v(x, t) &=&  \max_{u} [l(x, u) \Delta t + v(x(t+\Delta t), t+\Delta t)\\
&=&  \max_{u} [l(x, u) \Delta t + v(x + F(x, u)\Delta t, t+\Delta t).
\end{eqnarray} 
The above is the Bellman equation for value iteration. Directly applying the above equation to continuous $x$ can be inconvenient because we need to memorize $v(x, t)$ for all $x$, and if $x$ is high dimensional, there is what Bellman called ``curse of dimensionality.''

Bellman is more convenient for finite discrete state space. It underlies reinforcement learning and in particular $Q$-learning. 

\subsection{Hamilton-Jacobi-Bellman} 

Applying Taylor expansion to Bellman, 
\begin{eqnarray} 
v(x, t) &=&    \max_{u} [-l(x, u) \Delta t + v(x + F(x, u)\Delta t, t+\Delta t) \\
&=& \max_{u} [-l(x, u) \Delta t + v(x, t+\Delta t) + v_x(x, t+\Delta t) F(x, u)\Delta t].
\end{eqnarray} 
Thus 
\begin{eqnarray} 
- v_t(x, t) = \max_{u} [-l(x, u)  +  v_x(x, t) F(x, u)],
\end{eqnarray} 
where $v_x$ and $v_t$ are derivatives of $v(x, t)$ with respect to $x$ and $t$ respectively. 

Compared to Pontryagin, $\lambda(t) = v_x(x, t)$, and the right hand side of the above equation is the Hamiltonian $H(x, u, v_x)$, i.e., 
\begin{eqnarray} 
- v_t = H(x, v_x, t). 
\end{eqnarray} 
Again the above equation follows the Hamiton-Jacobi equation of classical mechanics. This equation was used by Schrodinger to justify his wave equation by making analogy to optics, where there are both wave and particle treatments. 

The LQR (linear quadratic regulation) can be derived from HJB. 

\subsection{Stochastic dynamics} 

For stochastic dynamics, we need to use expectation with respect to the random transition. Then we get into the Markov decision process. In continuous time, the dynamics can also be modeled by adding Brownian motion to dynamics, so that 
\begin{eqnarray} 
  x(t+\Delta t) = x(t) + F(x(t), u(t)) \Delta t + \sigma(x(t), u(t)) \sqrt{\Delta t} \epsilon_t, 
\end{eqnarray}
where $\epsilon_t \sim {\rm N}(0, I)$ independently over $t$, and $\sigma(x, u)$ accounts for diffusion. Optimal control in this situation can be made parallel to Kalman filter. 


\end{document}