@article{10738, abstract = {We prove an adiabatic theorem for the Landau–Pekar equations. This allows us to derive new results on the accuracy of their use as effective equations for the time evolution generated by the Fröhlich Hamiltonian with large coupling constant α. In particular, we show that the time evolution of Pekar product states with coherent phonon field and the electron being trapped by the phonons is well approximated by the Landau–Pekar equations until times short compared to α^2.}, author = {Leopold, Nikolai K and Rademacher, Simone Anna Elvira and Schlein, Benjamin and Seiringer, Robert}, issn = {1948-206X}, journal = {Analysis and PDE}, number = {7}, pages = {2079--2100}, publisher = {Mathematical Sciences Publishers}, title = {{The Landau–Pekar equations: Adiabatic theorem and accuracy}}, doi = {10.2140/APDE.2021.14.2079}, volume = {14}, year = {2021}, } @phdthesis{10429, abstract = {The scalability of concurrent data structures and distributed algorithms strongly depends on reducing the contention for shared resources and the costs of synchronization and communication. We show how such cost reductions can be attained by relaxing the strict consistency conditions required by sequential implementations. In the first part of the thesis, we consider relaxation in the context of concurrent data structures. Specifically, in data structures such as priority queues, imposing strong semantics renders scalability impossible, since a correct implementation of the remove operation should return only the element with the highest priority. Intuitively, attempting to invoke remove operations concurrently creates a race condition. This bottleneck can be circumvented by relaxing the semantics of the affected data structure, thus allowing removal of elements that are no longer required to have the highest priority. We prove that randomized implementations of relaxed data structures provide rigorous guarantees on the priority of the removed elements, even under concurrency. Additionally, we show that in some cases relaxed data structures can be used to scale classical algorithms that are usually implemented with their exact counterparts. In the second part, we study parallel variants of the stochastic gradient descent (SGD) algorithm, which distribute computation among multiple processors, thus reducing the running time. Unfortunately, in order for standard parallel SGD to succeed, each processor has to maintain a local copy of the model parameters that is identical to the local copies of the other processors; the overheads of this perfect consistency, in terms of communication and synchronization, can negate the speedup gained by distributing the computation. We show that the consistency conditions required by SGD can be relaxed, allowing the algorithm to be more flexible in terms of tolerating quantized communication, asynchrony, or even crash faults, while its convergence remains asymptotically the same.}, author = {Nadiradze, Giorgi}, issn = {2663-337X}, pages = {132}, publisher = {Institute of Science and Technology Austria}, title = {{On achieving scalability through relaxation}}, doi = {10.15479/at:ista:10429}, year = {2021}, } @inproceedings{10435, abstract = {Decentralized optimization is emerging as a viable alternative for scalable distributed machine learning, but also introduces new challenges in terms of synchronization costs.
To this end, several communication-reduction techniques, such as non-blocking communication, quantization, and local steps, have been explored in the decentralized setting. Due to the complexity of analyzing optimization in such a relaxed setting, this line of work often assumes \emph{global} communication rounds, which require additional synchronization. In this paper, we consider decentralized optimization in the simpler, but harder to analyze, \emph{asynchronous gossip} model, in which communication occurs in discrete, randomly chosen pairings among nodes. Perhaps surprisingly, we show that a variant of SGD called \emph{SwarmSGD} still converges in this setting, even if \emph{non-blocking communication}, \emph{quantization}, and \emph{local steps} are all applied \emph{in conjunction}, and even if the node data distributions and the underlying graph topology are both \emph{heterogeneous}. Our analysis is based on a new connection with multi-dimensional load-balancing processes. We implement this algorithm and deploy it in a supercomputing environment, showing that it can outperform previous decentralized methods in terms of end-to-end training time, and that it can even rival carefully tuned large-batch SGD for certain tasks.}, author = {Nadiradze, Giorgi and Sabour, Amirmojtaba and Davies, Peter and Li, Shigang and Alistarh, Dan-Adrian}, booktitle = {35th Conference on Neural Information Processing Systems}, location = {Virtual}, publisher = {Neural Information Processing Systems Foundation}, title = {{Asynchronous decentralized SGD with quantized and local updates}}, year = {2021}, } @inproceedings{10593, abstract = {We study the problem of estimating a rank-$1$ signal in the presence of rotationally invariant noise, a class of perturbations more general than Gaussian noise. Principal Component Analysis (PCA) provides a natural estimator, and sharp results on its performance have been obtained in the high-dimensional regime. Recently, an Approximate Message Passing (AMP) algorithm has been proposed as an alternative estimator with the potential to improve the accuracy of PCA. However, the existing analysis of AMP requires an initialization that is both correlated with the signal and independent of the noise, which is often unrealistic in practice. In this work, we combine the two methods and propose to initialize AMP with PCA. Our main result is a rigorous asymptotic characterization of the performance of this estimator. Both the AMP algorithm and its analysis differ from those previously derived in the Gaussian setting: at every iteration, our AMP algorithm requires a specific term to account for the PCA initialization, while in the Gaussian case, the PCA initialization affects only the first iteration of AMP. The proof is based on a two-phase artificial AMP that first approximates the PCA estimator and then mimics the true AMP.
Our numerical simulations show excellent agreement between the AMP results and the theoretical predictions, and suggest an interesting open direction towards achieving Bayes-optimal performance.}, author = {Mondelli, Marco and Venkataramanan, Ramji}, booktitle = {35th Conference on Neural Information Processing Systems}, isbn = {9781713845393}, issn = {1049-5258}, location = {Virtual}, pages = {29616--29629}, publisher = {Neural Information Processing Systems Foundation}, title = {{PCA initialization for approximate message passing in rotationally invariant models}}, volume = {35}, year = {2021}, } @inproceedings{10594, abstract = {The question of how and why the phenomenon of mode connectivity occurs in training deep neural networks has gained considerable attention in the research community. From a theoretical perspective, two possible explanations have been proposed: (i) the loss function has connected sublevel sets, and (ii) the solutions found by stochastic gradient descent are dropout stable. While these explanations provide insights into the phenomenon, their assumptions are not always satisfied in practice. In particular, the first approach requires the network to have one layer with on the order of N neurons (N being the number of training samples), while the second one requires the loss to be almost invariant after removing half of the neurons at each layer (up to some rescaling of the remaining ones). In this work, we improve both conditions by exploiting the quality of the features at every intermediate layer together with a milder over-parameterization condition. More specifically, we show that: (i) under generic assumptions on the features of the intermediate layers, it suffices that the last two hidden layers have on the order of $\sqrt{N}$ neurons, and (ii) if subsets of features at each layer are linearly separable, then no over-parameterization is needed to show the connectivity. Our experiments confirm that the proposed condition ensures the connectivity of solutions found by stochastic gradient descent, even in settings where the previous requirements do not hold.}, author = {Nguyen, Quynh and Bréchet, Pierre and Mondelli, Marco}, booktitle = {35th Conference on Neural Information Processing Systems}, isbn = {9781713845393}, issn = {1049-5258}, location = {Virtual}, publisher = {Neural Information Processing Systems Foundation}, title = {{When are solutions connected in deep networks?}}, volume = {35}, year = {2021}, }
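
The relaxed remove operation described in entry 10429 above is, in many randomized designs, realized by spreading elements over several sub-heaps and popping from the better of a few randomly sampled ones. Below is a minimal single-threaded sketch of that idea; the class and parameter names are our own, and it illustrates the general technique rather than the implementations studied in the thesis.

```python
import heapq
import random


class RelaxedPriorityQueue:
    """Illustrative relaxed priority queue: delete_min returns an element
    that is only approximately the global minimum."""

    def __init__(self, num_queues=8):
        self.queues = [[] for _ in range(num_queues)]

    def insert(self, priority, item):
        # Push onto a uniformly random sub-heap.
        heapq.heappush(random.choice(self.queues), (priority, item))

    def delete_min(self):
        # Sample two sub-heaps and pop from the one whose head is smaller.
        candidates = [q for q in random.sample(self.queues, 2) if q]
        if not candidates:
            # Fallback: scan all sub-heaps if both samples were empty.
            candidates = [q for q in self.queues if q]
            if not candidates:
                raise IndexError("delete_min from empty queue")
        return heapq.heappop(min(candidates, key=lambda q: q[0]))
```

Under concurrency, each sub-heap would additionally be guarded by its own try-lock, which is what removes the single point of contention of an exact priority queue.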
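Entry 10435 describes training in the asynchronous gossip model, where progress is made through local SGD steps plus pairwise averaging of (possibly quantized) models. The following single-process simulation of that interaction pattern is an illustrative sketch only, assuming a user-supplied grad_fn(w, x) and per-node data shards; it is not the authors' SwarmSGD implementation.

```python
import random

import numpy as np


def local_sgd_steps(w, shard, grad_fn, lr=0.01, steps=4):
    # A few plain SGD steps on this node's private data shard ("local steps").
    for _ in range(steps):
        x = random.choice(shard)
        w = w - lr * grad_fn(w, x)
    return w


def quantize(v, levels=256):
    # Toy stochastic-rounding quantizer standing in for compressed communication.
    scale = np.max(np.abs(v)) + 1e-12
    return np.floor(v / scale * levels + np.random.rand(*v.shape)) * scale / levels


def gossip_interaction(models, shards, grad_fn):
    # One interaction: a uniformly random pair of nodes does local work,
    # then the two nodes average their quantized models.
    i, j = random.sample(range(len(models)), 2)
    models[i] = local_sgd_steps(models[i], shards[i], grad_fn)
    models[j] = local_sgd_steps(models[j], shards[j], grad_fn)
    avg = 0.5 * (quantize(models[i]) + quantize(models[j]))
    models[i], models[j] = avg, avg.copy()
    return models
```

In a real deployment each node runs its loop independently and the pairwise exchange is non-blocking and asynchronous; that relaxation is precisely what the paper's convergence analysis addresses.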
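For entry 10593, the PCA initialization itself is simple to state: the leading eigenvector of the observed matrix, suitably rescaled. A minimal sketch for an assumed symmetric spike-plus-noise model Y = (a/n) x x^T + W is given below; the AMP iterations, whose extra correction terms for rotationally invariant noise are the paper's technical contribution, are not reproduced here.

```python
import numpy as np


def pca_initialization(Y):
    # Leading eigenvector of the symmetric observation Y, rescaled to norm
    # sqrt(n); this signal-correlated vector is the starting point from
    # which AMP is then run.
    n = Y.shape[0]
    _, eigvecs = np.linalg.eigh(Y)  # eigenvalues returned in ascending order
    return np.sqrt(n) * eigvecs[:, -1]
```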