[{"status":"public","year":"2019","date_published":"2019-01-05T00:00:00Z","creator":{"id":"4435EBFC-F248-11E8-B48F-1D18A9856A87","login":"apreinsp"},"page":"70","publication_status":"submitted","author":[{"last_name":"Javanmard","full_name":"Javanmard, Adel","first_name":"Adel"},{"last_name":"Mondelli","full_name":"Mondelli, Marco","id":"27EB676C-8706-11E9-9510-7717E6697425","orcid":"0000-0002-3242-7020","first_name":"Marco"},{"full_name":"Montanari, Andrea","first_name":"Andrea","last_name":"Montanari"}],"main_file_link":[{"open_access":"1","url":"https://arxiv.org/abs/1901.01375"}],"type":"preprint","publisher":"ArXiv","publication":"arXiv:1901.01375","external_id":{"arxiv":["1901.01375"]},"extern":"1","language":[{"iso":"eng"}],"_id":"6748","title":"Analysis of a two-layer neural network via displacement convexity","date_updated":"2020-05-12T14:54:36Z","day":"05","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","abstract":[{"lang":"eng","text":"Fitting a function by using linear combinations of a large number N of `simple' components is one of the most fruitful ideas in statistical learning. This idea lies at the core of a variety of methods, from two-layer neural networks to kernel regression, to boosting. In general, the resulting risk minimization problem is non-convex and is solved by gradient descent or its variants. Unfortunately, little is known about global convergence properties of these approaches.\r\nHere we consider the problem of learning a concave function f on a compact convex domain Ω⊆ℝ^d, using linear combinations of `bump-like' components (neurons). The parameters to be fitted are the centers of N bumps, and the resulting empirical risk minimization problem is highly non-convex. We prove that, in the limit in which the number of neurons diverges, the evolution of gradient descent converges to a Wasserstein gradient flow in the space of probability distributions over Ω. Further, when the bump width δ tends to 0, this gradient flow has a limit which is a viscous porous medium equation. Remarkably, the cost function optimized by this gradient flow exhibits a special property known as displacement convexity, which implies exponential convergence rates for N→∞, δ→0. Surprisingly, this asymptotic theory appears to capture well the behavior for moderate values of δ,N. Explaining this phenomenon, and understanding the dependence on δ,N in a quantitative manner remains an outstanding challenge."}],"month":"01","oa":1,"citation":{"ista":"Javanmard A, Mondelli M, Montanari A. Analysis of a two-layer neural network via displacement convexity. arXiv:1901.01375.","ieee":"A. Javanmard, M. Mondelli, and A. Montanari, “Analysis of a two-layer neural network via displacement convexity,” *arXiv:1901.01375*. ArXiv.","short":"A. Javanmard, M. Mondelli, A. Montanari, ArXiv:1901.01375 (n.d.).","apa":"Javanmard, A., Mondelli, M., & Montanari, A. (n.d.). Analysis of a two-layer neural network via displacement convexity. *ArXiv:1901.01375*. ArXiv.","ama":"Javanmard A, Mondelli M, Montanari A. Analysis of a two-layer neural network via displacement convexity. *arXiv:190101375*.","mla":"Javanmard, Adel, et al. “Analysis of a Two-Layer Neural Network via Displacement Convexity.” *ArXiv:1901.01375*, ArXiv.","chicago":"Javanmard, Adel, Marco Mondelli, and Andrea Montanari. “Analysis of a Two-Layer Neural Network via Displacement Convexity.” *ArXiv:1901.01375*. ArXiv, n.d."},"oa_version":"Preprint","date_created":"2019-07-31T09:39:42Z","article_processing_charge":"No","_version":5}]