@article{6748,
  abstract     = {Fitting a function by using linear combinations of a large number N of `simple' components is one of the most fruitful ideas in statistical learning. This idea lies at the core of a variety of methods, from two-layer neural networks to kernel regression, to boosting. In general, the resulting risk minimization problem is non-convex and is solved by gradient descent or its variants. Unfortunately, little is known about global convergence properties of these approaches.
Here we consider the problem of learning a concave function f on a compact convex domain Ω⊆ℝd, using linear combinations of `bump-like' components (neurons). The parameters to be fitted are the centers of N bumps, and the resulting empirical risk minimization problem is highly non-convex. We prove that, in the limit in which the number of neurons diverges, the evolution of gradient descent converges to a Wasserstein gradient flow in the space of probability distributions over Ω. Further, when the bump width δ tends to 0, this gradient flow has a limit which is a viscous porous medium equation. Remarkably, the cost function optimized by this gradient flow exhibits a special property known as displacement convexity, which implies exponential convergence rates for N→∞, δ→0. Surprisingly, this asymptotic theory appears to capture well the behavior for moderate values of δ,N. Explaining this phenomenon, and understanding the dependence on δ,N in a quantitative manner remains an outstanding challenge.},
  author       = {Javanmard, Adel and Mondelli, Marco and Montanari, Andrea},
  issn         = {1941-7330},
  journal      = {Annals of Statistics},
  number       = {6},
  pages        = {3619--3642},
  publisher    = {Institute of Mathematical Statistics},
  title        = {{Analysis of a two-layer neural network via displacement convexity}},
  doi          = {10.1214/20-AOS1945},
  volume       = {48},
  year         = {2020},
}