@article{10211,
abstract = {We study the problem of recovering an unknown signal 𝑥𝑥 given measurements obtained from a generalized linear model with a Gaussian sensing matrix. Two popular solutions are based on a linear estimator 𝑥𝑥^L and a spectral estimator 𝑥𝑥^s. The former is a data-dependent linear combination of the columns of the measurement matrix, and its analysis is quite simple. The latter is the principal eigenvector of a data-dependent matrix, and a recent line of work has studied its performance. In this paper, we show how to optimally combine 𝑥𝑥^L and 𝑥𝑥^s. At the heart of our analysis is the exact characterization of the empirical joint distribution of (𝑥𝑥,𝑥𝑥^L,𝑥𝑥^s) in the high-dimensional limit. This allows us to compute the Bayes-optimal combination of 𝑥𝑥^L and 𝑥𝑥^s, given the limiting distribution of the signal 𝑥𝑥. When the distribution of the signal is Gaussian, then the Bayes-optimal combination has the form 𝜃𝑥𝑥^L+𝑥𝑥^s and we derive the optimal combination coefficient. In order to establish the limiting distribution of (𝑥𝑥,𝑥𝑥^L,𝑥𝑥^s), we design and analyze an approximate message passing algorithm whose iterates give 𝑥𝑥^L and approach 𝑥𝑥^s. Numerical simulations demonstrate the improvement of the proposed combination with respect to the two methods considered separately.},
author = {Mondelli, Marco and Thrampoulidis, Christos and Venkataramanan, Ramji},
issn = {1615-3375},
journal = {Foundations of Computational Mathematics},
keywords = {Applied Mathematics, Computational Theory and Mathematics, Computational Mathematics, Analysis},
publisher = {Springer},
title = {{Optimal combination of linear and spectral estimators for generalized linear models}},
doi = {10.1007/s10208-021-09531-x},
year = {2021},
}
@inproceedings{10598,
abstract = { We consider the problem of estimating a signal from measurements obtained via a generalized linear model. We focus on estimators based on approximate message passing (AMP), a family of iterative algorithms with many appealing features: the performance of AMP in the high-dimensional limit can be succinctly characterized under suitable model assumptions; AMP can also be tailored to the empirical distribution of the signal entries, and for a wide class of estimation problems, AMP is conjectured to be optimal among all polynomial-time algorithms. However, a major issue of AMP is that in many models (such as phase retrieval), it requires an initialization correlated with the ground-truth signal and independent from the measurement matrix. Assuming that such an initialization is available is typically not realistic. In this paper, we solve this problem by proposing an AMP algorithm initialized with a spectral estimator. With such an initialization, the standard AMP analysis fails since the spectral estimator depends in a complicated way on the design matrix. Our main contribution is a rigorous characterization of the performance of AMP with spectral initialization in the high-dimensional limit. The key technical idea is to define and analyze a two-phase artificial AMP algorithm that first produces the spectral estimator, and then closely approximates the iterates of the true AMP. We also provide numerical results that demonstrate the validity of the proposed approach. },
author = {Mondelli, Marco and Venkataramanan, Ramji},
booktitle = {Proceedings of The 24th International Conference on Artificial Intelligence and Statistics},
editor = {Banerjee, Arindam and Fukumizu, Kenji},
issn = {2640-3498},
location = {Virtual, San Diego, CA, United States},
pages = {397--405},
publisher = {ML Research Press},
title = {{ Approximate message passing with spectral initialization for generalized linear models}},
volume = {130},
year = {2021},
}
@inproceedings{10053,
abstract = {This paper characterizes the latency of the simplified successive-cancellation (SSC) decoding scheme for polar codes under hardware resource constraints. In particular, when the number of processing elements P that can perform SSC decoding operations in parallel is limited, as is the case in practice, the latency of SSC decoding is O(N1−1 μ+NPlog2log2NP), where N is the block length of the code and μ is the scaling exponent of polar codes for the channel. Three direct consequences of this bound are presented. First, in a fully-parallel implementation where P=N2 , the latency of SSC decoding is O(N1−1/μ) , which is sublinear in the block length. This recovers a result from an earlier work. Second, in a fully-serial implementation where P=1 , the latency of SSC decoding scales as O(Nlog2log2N) . The multiplicative constant is also calculated: we show that the latency of SSC decoding when P=1 is given by (2+o(1))Nlog2log2N . Third, in a semi-parallel implementation, the smallest P that gives the same latency as that of the fully-parallel implementation is P=N1/μ . The tightness of our bound on SSC decoding latency and the applicability of the foregoing results is validated through extensive simulations.},
author = {Hashemi, Seyyed Ali and Mondelli, Marco and Fazeli, Arman and Vardy, Alexander and Cioffi, John and Goldsmith, Andrea},
booktitle = {2021 IEEE International Symposium on Information Theory},
isbn = {978-1-5386-8210-4},
issn = {2157-8095},
location = {Melbourne, Australia},
pages = {2369--2374},
publisher = {Institute of Electrical and Electronics Engineers},
title = {{Parallelism versus latency in simplified successive-cancellation decoding of polar codes}},
doi = {10.1109/ISIT45174.2021.9518153},
year = {2021},
}
@article{10364,
abstract = {This paper characterizes the latency of the simplified successive-cancellation (SSC) decoding scheme for polar codes under hardware resource constraints. In particular, when the number of processing elements P that can perform SSC decoding operations in parallel is limited, as is the case in practice, the latency of SSC decoding is O(N1-1/μ + N/P log2 log2 N/P), where N is the block length of the code and μ is the scaling exponent of the channel. Three direct consequences of this bound are presented. First, in a fully-parallel implementation where P = N/2, the latency of SSC decoding is O(N1-1/μ), which is sublinear in the block length. This recovers a result from our earlier work. Second, in a fully-serial implementation where P = 1, the latency of SSC decoding scales as O(N log2 log2 N). The multiplicative constant is also calculated: we show that the latency of SSC decoding when P = 1 is given by (2 + o(1))N log2 log2 N. Third, in a semi-parallel implementation, the smallest P that gives the same latency as that of the fully-parallel implementation is P = N1/μ. The tightness of our bound on SSC decoding latency and the applicability of the foregoing results is validated through extensive simulations.},
author = {Hashemi, Seyyed Ali and Mondelli, Marco and Fazeli, Arman and Vardy, Alexander and Cioffi, John and Goldsmith, Andrea},
issn = {1558-2248},
journal = {IEEE Transactions on Wireless Communications},
publisher = {Institute of Electrical and Electronics Engineers},
title = {{Parallelism versus latency in simplified successive-cancellation decoding of polar codes}},
doi = {10.1109/TWC.2021.3125626},
year = {2021},
}
@inproceedings{10597,
abstract = {We thank Emmanuel Abbe and Min Ye for providing us the implementation of RPA decoding. D. Fathollahi and M. Mondelli are partially supported by the 2019 Lopez-Loreta Prize. N. Farsad is supported by Discovery Grant from the Natural Sciences and Engineering Research Council of Canada (NSERC) and Canada Foundation for Innovation (CFI), John R. Evans Leader Fund. S. A. Hashemi is supported by a Postdoctoral Fellowship from NSERC.},
author = {Fathollahi, Dorsa and Farsad, Nariman and Hashemi, Seyyed Ali and Mondelli, Marco},
booktitle = {2021 IEEE International Symposium on Information Theory},
isbn = {978-1-5386-8210-4},
location = {Virtual, Melbourne, Australia},
pages = {1082--1087},
publisher = {Institute of Electrical and Electronics Engineers},
title = {{Sparse multi-decoder recursive projection aggregation for Reed-Muller codes}},
doi = {10.1109/isit45174.2021.9517887},
year = {2021},
}
@inproceedings{10599,
abstract = {A two-part successive syndrome-check decoding of polar codes is proposed with the first part successively refining the received codeword and the second part checking its syndrome. A new formulation of the successive-cancellation (SC) decoding algorithm is presented that allows for successively refining the received codeword by comparing the log-likelihood ratio value of a frozen bit with its predefined value. The syndrome of the refined received codeword is then checked for possible errors. In case there are no errors, the decoding process is terminated. Otherwise, the decoder continues to refine the received codeword. The proposed method is extended to the case of SC list (SCL) decoding by terminating the decoding process when the syndrome of the best candidate in the list indicates no errors. Simulation results show that the proposed method reduces the time-complexity of SC and SCL decoders and their fast variants, especially at high signal-to-noise ratios.},
author = {Hashemi, Seyyed Ali and Mondelli, Marco and Cioffi, John and Goldsmith, Andrea},
booktitle = {Proceedings of the Asilomar Conference on Signals, Systems, and Computers},
location = {Virtual, Pacific Grove, CA, United States},
publisher = {IEEE Signal Processing Society},
title = {{Successive syndrome-check decoding of polar codes}},
year = {2021},
}
@inproceedings{10595,
abstract = {A recent line of work has analyzed the theoretical properties of deep neural networks via the Neural Tangent Kernel (NTK). In particular, the smallest eigenvalue of the NTK has been related to the memorization capacity, the global convergence of gradient descent algorithms and the generalization of deep nets. However, existing results either provide bounds in the two-layer setting or assume that the spectrum of the NTK matrices is bounded away from 0 for multi-layer networks. In this paper, we provide tight bounds on the smallest eigenvalue of NTK matrices for deep ReLU nets, both in the limiting case of infinite widths and for finite widths. In the finite-width setting, the network architectures we consider are fairly general: we require the existence of a wide layer with roughly order of $N$ neurons, $N$ being the number of data samples; and the scaling of the remaining layer widths is arbitrary (up to logarithmic factors). To obtain our results, we analyze various quantities of independent interest: we give lower bounds on the smallest singular value of hidden feature matrices, and upper bounds on the Lipschitz constant of input-output feature maps.},
author = {Nguyen, Quynh and Mondelli, Marco and Montufar, Guido F},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
editor = {Meila, Marina and Zhang, Tong},
location = {Virtual},
pages = {8119--8129},
publisher = {ML Research Press},
title = {{Tight bounds on the smallest eigenvalue of the neural tangent kernel for deep ReLU networks}},
volume = {139},
year = {2021},
}
@inproceedings{10594,
abstract = {The question of how and why the phenomenon of mode connectivity occurs in training deep neural networks has gained remarkable attention in the research community. From a theoretical perspective, two possible explanations have been proposed: (i) the loss function has connected sublevel sets, and (ii) the solutions found by stochastic gradient descent are dropout stable. While these explanations provide insights into the phenomenon, their assumptions are not always satisfied in practice. In particular, the first approach requires the network to have one layer with order of N neurons (N being the number of training samples), while the second one requires the loss to be almost invariant after removing half of the neurons at each layer (up to some rescaling of the remaining ones). In this work, we improve both conditions by exploiting the quality of the features at every intermediate layer together with a milder over-parameterization condition. More specifically, we show that: (i) under generic assumptions on the features of intermediate layers, it suffices that the last two hidden layers have order of N−−√ neurons, and (ii) if subsets of features at each layer are linearly separable, then no over-parameterization is needed to show the connectivity. Our experiments confirm that the proposed condition ensures the connectivity of solutions found by stochastic gradient descent, even in settings where the previous requirements do not hold.},
author = {Nguyen, Quynh and Bréchet, Pierre and Mondelli, Marco},
booktitle = {35th Conference on Neural Information Processing Systems},
location = {Virtual},
publisher = {NeurIPS},
title = {{When are solutions connected in deep networks?}},
year = {2021},
}
@inproceedings{10593,
abstract = {We study the problem of estimating a rank-$1$ signal in the presence of rotationally invariant noise-a class of perturbations more general than Gaussian noise. Principal Component Analysis (PCA) provides a natural estimator, and sharp results on its performance have been obtained in the high-dimensional regime. Recently, an Approximate Message Passing (AMP) algorithm has been proposed as an alternative estimator with the potential to improve the accuracy of PCA. However, the existing analysis of AMP requires an initialization that is both correlated with the signal and independent of the noise, which is often unrealistic in practice. In this work, we combine the two methods, and propose to initialize AMP with PCA. Our main result is a rigorous asymptotic characterization of the performance of this estimator. Both the AMP algorithm and its analysis differ from those previously derived in the Gaussian setting: at every iteration, our AMP algorithm requires a specific term to account for PCA initialization, while in the Gaussian case, PCA initialization affects only the first iteration of AMP. The proof is based on a two-phase artificial AMP that first approximates the PCA estimator and then mimics the true AMP. Our numerical simulations show an excellent agreement between AMP results and theoretical predictions, and suggest an interesting open direction on achieving Bayes-optimal performance.},
author = {Mondelli, Marco and Venkataramanan, Ramji},
booktitle = {35th Conference on Neural Information Processing Systems},
location = {Virtual},
publisher = {NeurIPS},
title = {{PCA initialization for approximate message passing in rotationally invariant models}},
year = {2021},
}
@article{9047,
abstract = {This work analyzes the latency of the simplified successive cancellation (SSC) decoding scheme for polar codes proposed by Alamdar-Yazdi and Kschischang. It is shown that, unlike conventional successive cancellation decoding, where latency is linear in the block length, the latency of SSC decoding is sublinear. More specifically, the latency of SSC decoding is O(N1−1/μ) , where N is the block length and μ is the scaling exponent of the channel, which captures the speed of convergence of the rate to capacity. Numerical results demonstrate the tightness of the bound and show that most of the latency reduction arises from the parallel decoding of subcodes of rate 0 or 1.},
author = {Mondelli, Marco and Hashemi, Seyyed Ali and Cioffi, John M. and Goldsmith, Andrea},
issn = {15582248},
journal = {IEEE Transactions on Wireless Communications},
number = {1},
pages = {18--27},
publisher = {IEEE},
title = {{Sublinear latency for simplified successive cancellation decoding of polar codes}},
doi = {10.1109/TWC.2020.3022922},
volume = {20},
year = {2021},
}
@article{6748,
abstract = {Fitting a function by using linear combinations of a large number N of `simple' components is one of the most fruitful ideas in statistical learning. This idea lies at the core of a variety of methods, from two-layer neural networks to kernel regression, to boosting. In general, the resulting risk minimization problem is non-convex and is solved by gradient descent or its variants. Unfortunately, little is known about global convergence properties of these approaches.
Here we consider the problem of learning a concave function f on a compact convex domain Ω⊆ℝd, using linear combinations of `bump-like' components (neurons). The parameters to be fitted are the centers of N bumps, and the resulting empirical risk minimization problem is highly non-convex. We prove that, in the limit in which the number of neurons diverges, the evolution of gradient descent converges to a Wasserstein gradient flow in the space of probability distributions over Ω. Further, when the bump width δ tends to 0, this gradient flow has a limit which is a viscous porous medium equation. Remarkably, the cost function optimized by this gradient flow exhibits a special property known as displacement convexity, which implies exponential convergence rates for N→∞, δ→0. Surprisingly, this asymptotic theory appears to capture well the behavior for moderate values of δ,N. Explaining this phenomenon, and understanding the dependence on δ,N in a quantitative manner remains an outstanding challenge.},
author = {Javanmard, Adel and Mondelli, Marco and Montanari, Andrea},
journal = {Annals of Statistics},
number = {6},
pages = {3619--3642},
publisher = {Project Euclid},
title = {{Analysis of a two-layer neural network via displacement convexity}},
doi = {10.1214/20-AOS1945},
volume = {48},
year = {2020},
}
@article{9002,
abstract = { We prove that, for the binary erasure channel (BEC), the polar-coding paradigm gives rise to codes that not only approach the Shannon limit but do so under the best possible scaling of their block length as a function of the gap to capacity. This result exhibits the first known family of binary codes that attain both optimal scaling and quasi-linear complexity of encoding and decoding. Our proof is based on the construction and analysis of binary polar codes with large kernels. When communicating reliably at rates within ε>0 of capacity, the code length n often scales as O(1/εμ), where the constant μ is called the scaling exponent. It is known that the optimal scaling exponent is μ=2, and it is achieved by random linear codes. The scaling exponent of conventional polar codes (based on the 2×2 kernel) on the BEC is μ=3.63. This falls far short of the optimal scaling guaranteed by random codes. Our main contribution is a rigorous proof of the following result: for the BEC, there exist ℓ×ℓ binary kernels, such that polar codes constructed from these kernels achieve scaling exponent μ(ℓ) that tends to the optimal value of 2 as ℓ grows. We furthermore characterize precisely how large ℓ needs to be as a function of the gap between μ(ℓ) and 2. The resulting binary codes maintain the recursive structure of conventional polar codes, and thereby achieve construction complexity O(n) and encoding/decoding complexity O(nlogn).},
author = {Fazeli, Arman and Hassani, Hamed and Mondelli, Marco and Vardy, Alexander},
issn = {1557-9654},
journal = {IEEE Transactions on Information Theory},
number = {9},
pages = {5693--5710},
publisher = {IEEE},
title = {{Binary linear codes with optimal scaling: Polar codes with large kernels}},
doi = {10.1109/TIT.2020.3038806},
volume = {67},
year = {2020},
}
@inproceedings{9198,
abstract = {The optimization of multilayer neural networks typically leads to a solution
with zero training error, yet the landscape can exhibit spurious local minima
and the minima can be disconnected. In this paper, we shed light on this
phenomenon: we show that the combination of stochastic gradient descent (SGD)
and over-parameterization makes the landscape of multilayer neural networks
approximately connected and thus more favorable to optimization. More
specifically, we prove that SGD solutions are connected via a piecewise linear
path, and the increase in loss along this path vanishes as the number of
neurons grows large. This result is a consequence of the fact that the
parameters found by SGD are increasingly dropout stable as the network becomes
wider. We show that, if we remove part of the neurons (and suitably rescale the
remaining ones), the change in loss is independent of the total number of
neurons, and it depends only on how many neurons are left. Our results exhibit
a mild dependence on the input dimension: they are dimension-free for two-layer
networks and depend linearly on the dimension for multilayer networks. We
validate our theoretical findings with numerical experiments for different
architectures and classification tasks.},
author = {Shevchenko, Alexander and Mondelli, Marco},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {8773--8784},
publisher = {Proceedings of Machine Learning Research},
title = {{Landscape connectivity and dropout stability of SGD solutions for over-parameterized neural networks}},
volume = {119},
year = {2020},
}
@inproceedings{9221,
abstract = {Recent works have shown that gradient descent can find a global minimum for over-parameterized neural networks where the widths of all the hidden layers scale polynomially with N (N being the number of training samples). In this paper, we prove that, for deep networks, a single layer of width N following the input layer suffices to ensure a similar guarantee. In particular, all the remaining layers are allowed to have constant widths, and form a pyramidal topology. We show an application of our result to the widely used LeCun’s initialization and obtain an over-parameterization requirement for the single wide layer of order N2.
},
author = {Nguyen, Quynh and Mondelli, Marco},
booktitle = {34th Conference on Neural Information Processing Systems},
location = {Vancouver, Canada},
pages = {11961–11972},
publisher = {Curran Associates},
title = {{Global convergence of deep networks with one wide layer followed by pyramidal topology}},
volume = {33},
year = {2020},
}
@inproceedings{8536,
abstract = {This work analyzes the latency of the simplified successive cancellation (SSC) decoding scheme for polar codes proposed by Alamdar-Yazdi and Kschischang. It is shown that, unlike conventional successive cancellation decoding, where latency is linear in the block length, the latency of SSC decoding is sublinear. More specifically, the latency of SSC decoding is O(N 1−1/µ ), where N is the block length and µ is the scaling exponent of the channel, which captures the speed of convergence of the rate to capacity. Numerical results demonstrate the tightness of the bound and show that most of the latency reduction arises from the parallel decoding of subcodes of rate 0 and 1.},
author = {Mondelli, Marco and Hashemi, Seyyed Ali and Cioffi, John and Goldsmith, Andrea},
booktitle = {IEEE International Symposium on Information Theory - Proceedings},
isbn = {9781728164328},
issn = {21578095},
location = {Los Angeles, CA, United States},
publisher = {IEEE},
title = {{Simplified successive cancellation decoding of polar codes has sublinear latency}},
doi = {10.1109/ISIT44484.2020.9174141},
volume = {2020-June},
year = {2020},
}
@article{6750,
abstract = {Polar codes have gained extensive attention during the past few years and recently they have been selected for the next generation of wireless communications standards (5G). Successive-cancellation-based (SC-based) decoders, such as SC list (SCL) and SC flip (SCF), provide a reasonable error performance for polar codes at the cost of low decoding speed. Fast SC-based decoders, such as Fast-SSC, Fast-SSCL, and Fast-SSCF, identify the special constituent codes in a polar code graph off-line, produce a list of operations, store the list in memory, and feed the list to the decoder to decode the constituent codes in order efficiently, thus increasing the decoding speed. However, the list of operations is dependent on the code rate and as the rate changes, a new list is produced, making fast SC-based decoders not rate-flexible. In this paper, we propose a completely rate-flexible fast SC-based decoder by creating the list of operations directly in hardware, with low implementation complexity. We further propose a hardware architecture implementing the proposed method and show that the area occupation of the rate-flexible fast SC-based decoder in this paper is only 38% of the total area of the memory-based base-line decoder when 5G code rates are supported. },
author = {Hashemi, Seyyed Ali and Condo, Carlo and Mondelli, Marco and Gross, Warren J},
issn = {1053587X},
journal = {IEEE Transactions on Signal Processing},
number = {22},
publisher = {IEEE},
title = {{Rate-flexible fast polar decoders}},
doi = {10.1109/TSP.2019.2944738},
volume = {67},
year = {2019},
}
@article{7007,
abstract = {We consider the primitive relay channel, where the source sends a message to the relay and to the destination, and the relay helps the communication by transmitting an additional message to the destination via a separate channel. Two well-known coding techniques have been introduced for this setting: decode-and-forward and compress-and-forward. In decode-and-forward, the relay completely decodes the message and sends some information to the destination; in compress-and-forward, the relay does not decode, and it sends a compressed version of the received signal to the destination using Wyner–Ziv coding. In this paper, we present a novel coding paradigm that provides an improved achievable rate for the primitive relay channel. The idea is to combine compress-and-forward and decode-and-forward via a chaining construction. We transmit over pairs of blocks: in the first block, we use compress-and-forward; and, in the second block, we use decode-and-forward. More specifically, in the first block, the relay does not decode, it compresses the received signal via Wyner–Ziv, and it sends only part of the compression to the destination. In the second block, the relay completely decodes the message, it sends some information to the destination, and it also sends the remaining part of the compression coming from the first block. By doing so, we are able to strictly outperform both compress-and-forward and decode-and-forward. Note that the proposed coding scheme can be implemented with polar codes. As such, it has the typical attractive properties of polar coding schemes, namely, quasi-linear encoding and decoding complexity, and error probability that decays at super-polynomial speed. As a running example, we take into account the special case of the erasure relay channel, and we provide a comparison between the rates achievable by our proposed scheme and the existing upper and lower bounds.},
author = {Mondelli, Marco and Hassani, S. Hamed and Urbanke, Rüdiger},
issn = {1999-4893},
journal = {Algorithms},
number = {10},
publisher = {MDPI},
title = {{A new coding paradigm for the primitive relay channel}},
doi = {10.3390/a12100218},
volume = {12},
year = {2019},
}