\[ f(\mathbf{ x}) = \mathbf{ w}^\top \mathbf{ x} \]
\[ \begin{align*} \phi_1(x) & = 1, \\ \phi_2(x) & = x, \\ \phi_3(x) & = x^2. \end{align*} \]
\[ \boldsymbol{ \phi}(x) = \begin{bmatrix} 1\\ x\\ x^2\end{bmatrix}. \]
\[ \boldsymbol{ \Phi}(\mathbf{ x}) = \begin{bmatrix} 1 & x_1 & x_1^2 \\ 1 & x_2 & x_2^2\\ \vdots & \vdots & \vdots \\ 1 & x_n& x_n^2 \end{bmatrix} \]
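A minimal sketch of building this design matrix, assuming the inputs sit in a one dimensional numpy array `x`; the function name is mine, for illustration:

```python
import numpy as np

def quadratic_design_matrix(x):
    """Stack the basis functions 1, x and x^2 as the columns of Phi."""
    x = np.asarray(x).reshape(-1)
    return np.column_stack([np.ones_like(x), x, x**2])
```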
\[ f(x) = {\color{cyan}{w_0}} + {\color{green}{w_1 x}} + {\color{yellow}{w_2 x^2}} \]
\[ \phi_j(x_i) = x_i^j \]
\[ \phi_j(x) = x^j \]
\[ f(x) = {\color{cyan}{w_0}} + {\color{green}{w_1 x}} + {\color{yellow}{w_2 x^2}} + {\color{magenta}{w_3 x^3}} + {\color{red}{w_4 x^4}} \]
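This generalises directly; a sketch where `degree=4` reproduces the quartic above:

```python
import numpy as np

def polynomial_basis(x, degree):
    """Columns phi_j(x) = x**j for j = 0, ..., degree."""
    x = np.asarray(x).reshape(-1, 1)
    return x ** np.arange(degree + 1)
```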
\[ f(x) = \color{cyan}{w_1 e^{-2(x+1)^2}} + \color{green}{w_2e^{-2x^2}} + \color{yellow}{w_3 e^{-2(x-1)^2}} \]
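A sketch of the same idea in code, defaulting to the centres \(-1, 0, 1\) and the exponent factor \(2\) used above:

```python
import numpy as np

def rbf_basis(x, centres=(-1.0, 0.0, 1.0), width=2.0):
    """Columns exp(-width * (x - centre)^2), one per centre."""
    x = np.asarray(x).reshape(-1, 1)
    return np.exp(-width * (x - np.asarray(centres)) ** 2)
```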
\[ \phi_j(x) = xH(v_j x+ v_0) \]
\[ f(x) = \color{cyan}{w_0} + \color{green}{w_1 xH(x+1.0) } + \color{yellow}{w_2 xH(x+0.33) } + \color{magenta}{w_3 xH(x-0.33)} + \color{red}{w_4 xH(x-1.0)} \]
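A sketch using `np.heaviside` for \(H\), with the four shifts from the example as defaults and a constant column for \(w_0\):

```python
import numpy as np

def relu_basis(x, shifts=(1.0, 0.33, -0.33, -1.0)):
    """Constant column followed by x * H(x + shift) for each shift."""
    x = np.asarray(x).reshape(-1, 1)
    Phi = x * np.heaviside(x + np.asarray(shifts), 0.0)
    return np.column_stack([np.ones(len(x)), Phi])
```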
\[ \phi_j(x) = \tanh(v_j x+ v_0) \]
\[ f(x) = {\color{cyan}{w_0}} + {\color{green}{w_1 \tanh\left(x+1\right)}} + {\color{yellow}{w_2 \tanh\left(x+0.33\right)}} + {\color{magenta}{w_3 \tanh\left(x-0.33\right)}} + {\color{red}{w_4 \tanh\left(x-1\right)}} \]
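The analogous sketch for the hyperbolic tangent basis:

```python
import numpy as np

def tanh_basis(x, shifts=(1.0, 0.33, -0.33, -1.0)):
    """Constant column followed by tanh(x + shift) for each shift."""
    x = np.asarray(x).reshape(-1, 1)
    return np.column_stack([np.ones(len(x)), np.tanh(x + np.asarray(shifts))])
```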
\[ f(x) = {\color{cyan}{w_0}} + {\color{green}{w_1 \sin(x)}} + {\color{yellow}{w_2 \cos(x)}} + {\color{magenta}{w_3 \sin(2x)}} + {\color{red}{w_4 \cos(2x)}} \]
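A sketch that pairs sines and cosines of increasing frequency, matching the expansion above:

```python
import numpy as np

def fourier_basis(x, num_frequencies=2):
    """Constant column then sin(jx) and cos(jx) for j = 1, ..., num_frequencies."""
    x = np.asarray(x).reshape(-1)
    cols = [np.ones_like(x)]
    for j in range(1, num_frequencies + 1):
        cols.append(np.sin(j * x))
        cols.append(np.cos(j * x))
    return np.column_stack(cols)
```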
Now we are going to consider how these basis functions can be adjusted to fit a particular data set. We will return to the Olympic marathon data from last time. First we scale the output of the data to have zero mean and unit variance.
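A minimal sketch of that preprocessing (the helper name is mine; loading the marathon data itself is omitted):

```python
import numpy as np

def standardise(y):
    """Scale outputs to zero mean, unit variance; return the statistics to undo it."""
    offset = y.mean()
    scale = np.sqrt(y.var())
    return (y - offset) / scale, offset, scale
```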
\[ \begin{align} E(\mathbf{ w},\sigma^2) = &\frac{n}{2}\log \sigma^2 + \frac{1}{2\sigma^2}\sum_{i=1}^{n}y_i^{2}-\frac{1}{\sigma^2}\sum_{i=1}^{n}y_i\mathbf{ w}^{\top}\boldsymbol{ \phi}_i\\ &+\frac{1}{2\sigma^2}\sum_{i=1}^{n}\mathbf{ w}^{\top}\boldsymbol{ \phi}_i\boldsymbol{ \phi}_i^{\top}\mathbf{ w}+\text{const}. \end{align} \]
\[\begin{align} E(\mathbf{ w}, \sigma^2) = & \frac{n}{2}\log \sigma^2 + \frac{1}{2\sigma^2}\sum_{i=1}^{n}y_i^{2}-\frac{1}{\sigma^2} \mathbf{ w}^\top\sum_{i=1}^{n}\boldsymbol{ \phi}_i y_i\\ & +\frac{1}{2\sigma^2}\mathbf{ w}^{\top}\left[\sum_{i=1}^{n}\boldsymbol{ \phi}_i\boldsymbol{ \phi}_i^{\top}\right]\mathbf{ w}+\text{const}.\end{align}\]
Differentiating with respect to \(\mathbf{ w}\) gives \[\frac{\text{d} E\left(\mathbf{ w},\sigma^2 \right)}{\text{d}\mathbf{ w}}=-\frac{1}{\sigma^2} \sum_{i=1}^{n}\boldsymbol{ \phi}_iy_i+\frac{1}{\sigma^2} \left[\sum_{i=1}^{n}\boldsymbol{ \phi}_i\boldsymbol{ \phi}_i^{\top}\right]\mathbf{ w}.\] Setting this derivative to zero leads to \[\mathbf{ w}^{*}=\left[\sum_{i=1}^{n}\boldsymbol{ \phi}_i\boldsymbol{ \phi}_i^{\top}\right]^{-1}\sum_{i=1}^{n}\boldsymbol{ \phi}_iy_i.\]
\[ \sum_{i=1}^{n}\boldsymbol{ \phi}_i\boldsymbol{ \phi}_i^\top = \boldsymbol{ \Phi}^\top \boldsymbol{ \Phi}\] \[\sum _{i=1}^{n}\boldsymbol{ \phi}_iy_i = \boldsymbol{ \Phi}^\top \mathbf{ y} \]
\[ \mathbf{A}\mathbf{x} = \mathbf{b}. \]
In numpy, systems of this form are solved with `np.linalg.solve`, which is faster and more numerically stable than forming the matrix inverse explicitly.
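A sketch of the maximum likelihood fit using the matrix form of the normal equations above; `Phi` is the design matrix \(\boldsymbol{ \Phi}\), `y` the vector of targets, and the function name is mine:

```python
import numpy as np

def fit_normal_equations(Phi, y):
    """Maximum likelihood weights: solve Phi^T Phi w = Phi^T y."""
    return np.linalg.solve(Phi.T @ Phi, Phi.T @ y)
```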
\[ \boldsymbol{ \Phi}^\top \boldsymbol{ \Phi}\boldsymbol{\beta} = \boldsymbol{ \Phi}^\top \mathbf{ y}. \] Substitute \(\boldsymbol{ \Phi}= \mathbf{Q}\mathbf{R}\): \[ (\mathbf{Q}\mathbf{R})^\top (\mathbf{Q}\mathbf{R})\boldsymbol{\beta} = (\mathbf{Q}\mathbf{R})^\top \mathbf{ y} \] \[ \mathbf{R}^\top (\mathbf{Q}^\top \mathbf{Q}) \mathbf{R} \boldsymbol{\beta} = \mathbf{R}^\top \mathbf{Q}^\top \mathbf{ y} \]
Because \(\mathbf{Q}\) has orthonormal columns, \(\mathbf{Q}^\top \mathbf{Q} = \mathbf{I}\), so \[ \mathbf{R}^\top \mathbf{R} \boldsymbol{\beta} = \mathbf{R}^\top \mathbf{Q}^\top \mathbf{ y}, \] and cancelling \(\mathbf{R}^\top\) leaves the upper triangular system \[ \mathbf{R} \boldsymbol{\beta} = \mathbf{Q}^\top \mathbf{ y}, \] which can be solved by back substitution.
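A sketch of the QR approach, assuming scipy is available for the triangular solve, with `Phi` and `y` as before:

```python
import numpy as np
import scipy.linalg as sla

def fit_qr(Phi, y):
    """Least squares weights via the thin QR decomposition of the design matrix."""
    Q, R = np.linalg.qr(Phi)               # Phi = QR with Q^T Q = I
    return sla.solve_triangular(R, Q.T @ y)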
The payoff appears with, for example, a high degree polynomial basis, where \(\boldsymbol{ \Phi}^\top \boldsymbol{ \Phi}\) becomes very badly conditioned while QR factorises \(\boldsymbol{ \Phi}\) directly. A sketch of the effect on synthetic inputs (illustrative only):
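```python
import numpy as np

# Illustrative comparison: the normal equations square the condition
# number of the design matrix, while QR works with Phi itself.
x = np.linspace(0.0, 1.0, 50)
Phi = x[:, None] ** np.arange(11)       # degree 10 polynomial basis

print(np.linalg.cond(Phi))              # conditioning seen by QR
print(np.linalg.cond(Phi.T @ Phi))      # roughly its square
```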
Section 1.4 of Rogers and Girolami (2011)
Chapter 1, pg 1-6 of Bishop (2006)
Chapter 3, Section 3.1 up to pg 143 of Bishop (2006)
twitter: @lawrennd
podcast: The Talking Machines
newspaper: Guardian Profile Page