Numerical Algorithms, Piyush Srivastava, Tata Institute of Fundamental Research. February 2018

In [1]:
using Distributions
using Plots
using LaTeXStrings


# Steepest descent

Our goal is to minimize

\begin{gather} f(x) = \frac{1}{2}x^TAx - b^Tx\\ \text{with } A \succ 0 \end{gather}

Starting with $x_0$, we define, for $k \geq 0$

\begin{gather} r_k = Ax_k - b\\ \alpha_k = \frac{\Vert r_k \Vert^2}{\Vert r_k \Vert_A^2}\\ x_{k+1} = x_k - \alpha_k r_k \end{gather}

where $\Vert x \Vert_A^2 = x^TAx$.
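The formula for $\alpha_k$ comes from exact line minimization of $f$ along the steepest descent direction $-r_k$: differentiating,

\begin{gather} \frac{d}{d\alpha} f(x_k - \alpha r_k) = -r_k^T \left( A(x_k - \alpha r_k) - b \right) = \alpha \Vert r_k \Vert_A^2 - \Vert r_k \Vert^2, \end{gather}

which vanishes exactly at $\alpha = \Vert r_k \Vert^2 / \Vert r_k \Vert_A^2$.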

In [2]:
function gradient_descent(A::Array{T,2}, b::Array{T,1}, numSteps::Int64) where {T <: Number}
    x = b                             # starting point x_0 = b
    r = A*x - b                       # initial residual r_0
    errors = zeros(numSteps)
    for k = 1:numSteps
        Ar = A*r
        α = -dot(r, r)/dot(r, Ar)     # α = -α_k, so x += α*r implements x_{k+1} = x_k - α_k r_k
        x += α*r
        r += α*Ar                     # update residual without recomputing A*x - b
        errors[k] = norm(r)
    end
    (errors, x)
end

Out[2]:
gradient_descent (generic function with 1 method)
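As a quick sanity check (a made-up $2 \times 2$ example, not one of the test matrices used below), the iterates should converge to the solution of $Ax = b$:

Atest = [4.0 1.0; 1.0 3.0]    # a small symmetric positive definite matrix (hypothetical)
btest = [1.0, 2.0]
(errs, xhat) = gradient_descent(Atest, btest, 50)
norm(Atest*xhat - btest)      # should be close to zero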

# Conjugate gradient

Here, starting with $x_0$, we define, for $k \geq 0$,

\begin{gather} r_k = Ax_k - b,\; d_0 = r_0\\ \alpha_k = - \frac{r_0^T d_k}{\Vert d_k \Vert_A^2}\\ x_{k+1} = x_k + \alpha_k d_k\\ \beta_k = - \frac{r_{k+1}^T A d_k}{\Vert d_k \Vert_A^2}\\ d_{k+1} = r_{k+1} + \beta_k d_k \end{gather}

Note that in exact arithmetic $r_k^T d_k = r_0^T d_k$, since $x_k - x_0$ lies in the span of the earlier directions, to which $d_k$ is $A$-conjugate; this is why $\alpha_k$ can be computed using $r_0$. The choice of $\beta_k$ makes $d_{k+1}$ $A$-conjugate to $d_k$.

In [3]:
function conjugate_gradient(A::Array{T,2}, b::Array{T, 1}, numSteps::Int64, restart = false) where {T <: Number}
    x = b                       # starting point x_0 = b
    r0 = A*x - b                # residual at the start of the current set of directions
    r = r0
    d = r                       # d_0 = r_0
    errors = zeros(numSteps)
    for k = 1:numSteps
        Ad = A*d
        d_A = dot(d, Ad)        # squared A-norm of d
        α = -dot(r0, d)/d_A
        x += α*d
        r += α*Ad               # r_{k+1} = r_k + α_k A d_k
        β = -dot(r, Ad)/d_A     # makes the new direction A-conjugate to d
        d = β*d + r             # d_{k+1} = r_{k+1} + β_k d_k
        errors[k] = norm(r)
        if restart && k > 1 && errors[k] >= errors[k-1]
            # Restart: take the current point as x_0 and the current
            # residual as a fresh starting direction
            r0 = r
            d = r
        end
    end
    (errors, x)
end

Out[3]:
conjugate_gradient (generic function with 2 methods)
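In exact arithmetic, conjugate gradient on an $n \times n$ system terminates in at most $n$ steps, so on the same made-up $2 \times 2$ system from above, two steps should already drive the residual down to roughly machine precision:

(errs, xhat) = conjugate_gradient(Atest, btest, 2)
norm(Atest*xhat - btest)      # expected to be near machine precision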
In [4]:
function condition(A::Array{T, 2}) where {T <: Number}
    (d, d1) = size(A)
    @assert (d == d1) "A should be a square matrix in call to `condition`"
    # eigenvalues are returned in ascending order for a symmetric matrix
    extremes = eigfact(A)[:values][[1, d]]
    extremes[2]/extremes[1]
end

Out[4]:
condition (generic function with 1 method)
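The condition number $\kappa = \lambda_{\max}/\lambda_{\min}$ is what governs the convergence rates we should expect to see. Writing $e_k = x_k - x^\ast$, the standard bounds are

\begin{gather} \Vert e_k \Vert_A \leq \left( \frac{\kappa - 1}{\kappa + 1} \right)^k \Vert e_0 \Vert_A \quad \text{(steepest descent)},\\ \Vert e_k \Vert_A \leq 2 \left( \frac{\sqrt{\kappa} - 1}{\sqrt{\kappa} + 1} \right)^k \Vert e_0 \Vert_A \quad \text{(conjugate gradient)}, \end{gather}

so for ill-conditioned $A$ we expect conjugate gradient to converge markedly faster.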

To evaluate the two algorithms, we will consider a random matrix $A$ from a Wishart distribution, and set $b = \mathbf{1}$.

In [5]:
function WishartMatrix(dimension::Int64)
    A_dim = dimension
    W = Wishart(2*A_dim, eye(A_dim, A_dim))
    rand(W)     # a random positive definite matrix
end

function simulate(A::Array{Float64, 2}, numSteps::T, restart = false) where {T <: Integer}
    (dimension, d1) = size(A)
    @assert (dimension == d1) "A should be a square matrix in call to `simulate`"
    b = ones(dimension)
    @printf "Dimension of A: %d\n" dimension
    g = gradient_descent(A, b, numSteps);
    cg = conjugate_gradient(A, b, numSteps, restart);
    plot(g[1], yscale = :log10, label = "Steepest",
         xlab = L"Number of steps $(k)$",
         ylab = L"\Vert r_k \Vert_2",
         title = latexstring("A \\sim W($(dimension), I_{$(dimension)})"))
    plot!(cg[1], yscale = :log10, label = "Conjugate")
end

Out[5]:
simulate (generic function with 2 methods)
In [6]:
# Generate test Matrices
A100 = WishartMatrix(100);
A500 = WishartMatrix(500);
A1000 = WishartMatrix(1000);
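We can use `condition` (defined above) to see how well-conditioned these samples are; the value depends on the random draw:

condition(A100)   # ratio of the extreme eigenvalues of the 100-dimensional sample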

In [7]:
numSteps = 30
restart = false;

In [8]:
simulate(A100, numSteps)

Dimension of A: 100

Out[8]: [plot: residual norms, steepest vs. conjugate gradient, dimension 100, 30 steps]
In [9]:
simulate(A500, numSteps)

Dimension of A: 500

Out[9]: [plot: residual norms, steepest vs. conjugate gradient, dimension 500, 30 steps]
In [10]:
simulate(A1000, numSteps)

Dimension of A: 1000

Out[10]: [plot: residual norms, steepest vs. conjugate gradient, dimension 1000, 30 steps]

Question: What happens when the number of steps is increased in the above simulations?

In [11]:
numSteps = 100
restart = false;

In [12]:
simulate(A100, numSteps)

Dimension of A: 100

Out[12]: [plot: residual norms, steepest vs. conjugate gradient, dimension 100, 100 steps]
In [13]:
simulate(A500, numSteps)

Dimension of A: 500

Out[13]: [plot: residual norms, steepest vs. conjugate gradient, dimension 500, 100 steps]
In [14]:
simulate(A1000, numSteps)

Dimension of A: 1000

Out[14]: [plot: residual norms, steepest vs. conjugate gradient, dimension 1000, 100 steps]

We saw in class that one possible reason for the errors above is that the signs of the dot products become unreliable as the conjugate gradient iterations progress. One fix is to take the current vector as the starting vector and "restart" the conjugate gradient method, finding a new set of conjugate directions; this is what the `restart` branch in `conjugate_gradient` above implements. We now try this method.

In [15]:
numSteps = 100
restart = true;

In [16]:
simulate(A100, numSteps, restart)

Dimension of A: 100

Out[16]: [plot: residual norms, steepest vs. conjugate gradient, dimension 100, 100 steps, with restarts]
In [17]:
simulate(A500, numSteps, restart)

Dimension of A: 500

Out[17]: [plot: residual norms, steepest vs. conjugate gradient, dimension 500, 100 steps, with restarts]
In [18]:
simulate(A1000, numSteps, restart)

Dimension of A: 1000

Out[18]: [plot: residual norms, steepest vs. conjugate gradient, dimension 1000, 100 steps, with restarts]