Polynomial Regression
The following example features the LMO for polynomial regression on the $\ell_1$ norm ball. Given input/output pairs $\{x_i,y_i\}_{i=1}^N$ and sparse coefficients $c_j$, where
\[y_i=\sum_{j=1}^m c_j f_j(x_i)\]
and $f_j: \mathbb{R}^n\to\mathbb{R}$, the task is to recover those $c_j$ that are non-zero alongside their corresponding values. Under certain assumptions, this problem can be convexified into
\[\min_{c\in\mathcal{C}} \|y - Ac\|^2\]
for a convex set $\mathcal{C}$. It can also be found as Example 4.1 in the paper. To generate the data, we draw 500 training points $\{x_i\}$ from a scaled multivariate Gaussian and evaluate the true polynomial at them; the resulting outputs $\{y_i\}$ are then contaminated with additive Gaussian noise. We run the (lazified) away_frank_wolfe and blended_conditional_gradient algorithms and compare them to projected gradient descent, which uses an estimate of the smoothness constant for its step size. The solutions are evaluated on 1000 test points drawn in the same way as the training points, but with a larger spread.
using FrankWolfe
using LinearAlgebra
import Random
using MultivariatePolynomials
using DynamicPolynomials
using Plots
using LaTeXStrings
const N = 15  # input dimension; must match the number of polynomial variables below
DynamicPolynomials.@polyvar X[1:15]
const max_degree = 4
coefficient_magnitude = 10
noise_magnitude = 1
const var_monomials = MultivariatePolynomials.monomials(X, 0:max_degree)
Random.seed!(42)
const all_coeffs = map(var_monomials) do m
    # draw a random coefficient and zero it out with a probability that grows
    # with the degree of the monomial, producing a sparse ground-truth polynomial
    d = MultivariatePolynomials.degree(m)
    return coefficient_magnitude * rand() * (rand() > 0.95 * d / max_degree)
end
const true_poly = dot(all_coeffs, var_monomials)
const training_data = map(1:500) do _
    # 500 training samples: inputs from a scaled Gaussian, outputs from the true
    # polynomial plus Gaussian noise
    x = 0.1 * randn(N)
    y = MultivariatePolynomials.subs(true_poly, Pair(X, x)) + noise_magnitude * randn()
    return (x, y.a[1])  # y is a constant polynomial; extract its numeric value
end
const extended_training_data = map(training_data) do (x, y)
    # feature vector: every monomial evaluated at x (one row of the design matrix)
    x_ext = MultivariatePolynomials.coefficient.(MultivariatePolynomials.subs.(var_monomials, X => x))
    return (x_ext, y)
end
const test_data = map(1:1000) do _
    x = 0.4 * randn(N)
    y = MultivariatePolynomials.subs(true_poly, Pair(X, x)) + noise_magnitude * randn()
    return (x, y.a[1])
end
const extended_test_data = map(test_data) do (x, y)
    x_ext = MultivariatePolynomials.coefficient.(MultivariatePolynomials.subs.(var_monomials, X => x))
    return (x_ext, y)
end
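Stacking these feature vectors row-wise recovers the design matrix $A$ of the convexified problem above,
\[A_{ij} = f_j(x_i),\]
so the prediction for a sample is simply the inner product of the coefficient vector with its extended feature vector.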
function f(coefficients)
    # half mean squared error on the training set
    return 0.5 / length(extended_training_data) * sum(extended_training_data) do (x, y)
        return (dot(coefficients, x) - y)^2
    end
end
function f_test(coefficients)
    return 0.5 / length(extended_test_data) * sum(extended_test_data) do (x, y)
        return (dot(coefficients, x) - y)^2
    end
end
function coefficient_errors(coeffs)
    return 0.5 * sum(eachindex(all_coeffs)) do idx
        return (all_coeffs[idx] - coeffs[idx])^2
    end
end
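All methods below use the gradient of the training loss, which for this least-squares objective is
\[\nabla f(c) = \frac{1}{|\mathcal{D}|}\sum_{(x, y)\in\mathcal{D}} \big(\langle c, x\rangle - y\big)\, x,\]
where $\mathcal{D}$ denotes the extended training data; grad! accumulates exactly this sum in place.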
function grad!(storage, coefficients)
    # in-place gradient of f: mean of (⟨c, x⟩ - y) * x over the training data
    storage .= 0
    for (x, y) in extended_training_data
        p_i = dot(coefficients, x) - y
        @. storage += x * p_i
    end
    storage ./= length(training_data)
    return nothing
end
function build_callback(trajectory_arr)
    # store the standard callback state and append the test loss and the
    # coefficient recovery error at every iteration
    return function callback(state, args...)
        return push!(
            trajectory_arr,
            (FrankWolfe.callback_state(state)..., f_test(state.x), coefficient_errors(state.x)),
        )
    end
end
gradient = similar(all_coeffs)
max_iter = 10000
random_initialization_vector = rand(length(all_coeffs))
lmo = FrankWolfe.LpNormLMO{1}(0.95 * norm(all_coeffs, 1))  # ℓ1 ball slightly smaller than the true coefficient norm
# Estimating smoothness parameter
num_pairs = 1000
L_estimate = -Inf
gradient_aux = similar(gradient)
function projnorm1(x, τ)
    # Euclidean projection onto the ℓ1 ball of radius τ via soft-thresholding;
    # the threshold is obtained by sorting |x| (simplex projection)
    n = length(x)
    if norm(x, 1) ≤ τ
        return x
    end
    u = abs.(x)
    # simplex projection
    bget = false
    s_indices = sortperm(u, rev=true)
    tsum = zero(τ)
    @inbounds for i in 1:n-1
        tsum += u[s_indices[i]]
        tmax = (tsum - τ) / i
        if tmax ≥ u[s_indices[i+1]]
            bget = true
            break
        end
    end
    if !bget
        tmax = (tsum + u[s_indices[n]] - τ) / n
    end
    @inbounds for i in 1:n
        u[i] = max(u[i] - tmax, 0)
        u[i] *= sign(x[i])
    end
    return u
end
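In closed form, this projection soft-thresholds every entry:
\[\Pi_{\{\|c\|_1\le\tau\}}(x)_i = \operatorname{sign}(x_i)\,\max(|x_i| - \lambda,\, 0),\]
where $\lambda = 0$ if $\|x\|_1 \le \tau$ and otherwise $\lambda$ (the tmax computed above) is the unique threshold for which the result has $\ell_1$ norm exactly $\tau$.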
# sample pairs of extreme points and lower-bound L by the gradient difference quotient
for i in 1:num_pairs
    global L_estimate
    x = compute_extreme_point(lmo, randn(size(all_coeffs)))
    y = compute_extreme_point(lmo, randn(size(all_coeffs)))
    grad!(gradient, x)
    grad!(gradient_aux, y)
    new_L = norm(gradient - gradient_aux) / norm(x - y)
    if new_L > L_estimate
        L_estimate = new_L
    end
end
We can now perform projected gradient descent:
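Each iteration takes a gradient step with the constant step size $1/L$ given by the estimate above and projects back onto the feasible $\ell_1$ ball:
\[c_{k+1} = \Pi_{\{\|c\|_1\le\tau\}}\!\Big(c_k - \tfrac{1}{L}\,\nabla f(c_k)\Big).\]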
xgd = FrankWolfe.compute_extreme_point(lmo, random_initialization_vector)
training_gd = Float64[]
test_gd = Float64[]
coeff_error = Float64[]
time_start = time_ns()
gd_times = Float64[]
for iter in 1:max_iter
    global xgd
    grad!(gradient, xgd)
    # projected gradient step with step size 1/L
    xgd = projnorm1(xgd - gradient / L_estimate, lmo.right_hand_side)
    # record training loss, test loss, coefficient error, and elapsed time
    push!(training_gd, f(xgd))
    push!(test_gd, f_test(xgd))
    push!(coeff_error, coefficient_errors(xgd))
    push!(gd_times, (time_ns() - time_start) * 1e-9)
end
x00 = FrankWolfe.compute_extreme_point(lmo, random_initialization_vector)
x0 = deepcopy(x00)
trajectory_lafw = []
callback = build_callback(trajectory_lafw)
# lazified Away-step Frank-Wolfe (L-AFW in the plots)
x_lafw, v, primal, dual_gap, _ = FrankWolfe.away_frank_wolfe(
    f,
    grad!,
    lmo,
    x0,
    max_iteration=max_iter,
    line_search=FrankWolfe.Adaptive(L_est=L_estimate),
    print_iter=max_iter ÷ 10,
    memory_mode=FrankWolfe.InplaceEmphasis(),
    verbose=false,
    lazy=true,
    gradient=gradient,
    callback=callback,
)
trajectory_bcg = []
callback = build_callback(trajectory_bcg)
x0 = deepcopy(x00)
# Blended Conditional Gradient (BCG in the plots)
x_bcg, v, primal, dual_gap, _, _ = FrankWolfe.blended_conditional_gradient(
    f,
    grad!,
    lmo,
    x0,
    max_iteration=max_iter,
    line_search=FrankWolfe.Adaptive(L_est=L_estimate),
    print_iter=max_iter ÷ 10,
    memory_mode=FrankWolfe.InplaceEmphasis(),
    verbose=false,
    weight_purge_threshold=1e-10,
    callback=callback,
)
x0 = deepcopy(x00)
trajectory_lafw_ref = []
callback = build_callback(trajectory_lafw_ref)
# longer reference run of lazified AFW to obtain a tight estimate of the optimal primal value
_, _, primal_ref, _, _ = FrankWolfe.away_frank_wolfe(
    f,
    grad!,
    lmo,
    x0,
    max_iteration=2 * max_iter,
    line_search=FrankWolfe.Adaptive(L_est=L_estimate),
    print_iter=max_iter ÷ 10,
    memory_mode=FrankWolfe.InplaceEmphasis(),
    verbose=false,
    lazy=true,
    gradient=gradient,
    callback=callback,
)
iteration_list = [
    [x[1] + 1 for x in trajectory_lafw],
    [x[1] + 1 for x in trajectory_bcg],
    collect(eachindex(training_gd)),
]
time_list = [[x[5] for x in trajectory_lafw], [x[5] for x in trajectory_bcg], gd_times]
primal_list = [
    [x[2] - primal_ref for x in trajectory_lafw],
    [x[2] - primal_ref for x in trajectory_bcg],
    [x - primal_ref for x in training_gd],
]
test_list = [[x[6] for x in trajectory_lafw], [x[6] for x in trajectory_bcg], test_gd]
label = [L"\textrm{L-AFW}", L"\textrm{BCG}", L"\textrm{GD}"]
coefficient_error_values =
    [[x[7] for x in trajectory_lafw], [x[7] for x in trajectory_bcg], coeff_error]
plot_results(
    [primal_list, primal_list, test_list, test_list],
    [iteration_list, time_list, iteration_list, time_list],
    label,
    [L"\textrm{Iteration}", L"\textrm{Time}", L"\textrm{Iteration}", L"\textrm{Time}"],
    [L"\textrm{Primal Gap}", L"\textrm{Primal Gap}", L"\textrm{Test loss}", L"\textrm{Test loss}"],
    xscalelog=[:log, :identity, :log, :identity],
    legend_position=[:bottomleft, nothing, nothing, nothing],
)
This page was generated using Literate.jl.