@inproceedings{15011,
  abstract = {Pruning large language models (LLMs) from the BERT family has emerged as a standard compression benchmark, and several pruning methods have been proposed for this task. The recent “Sparsity May Cry” (SMC) benchmark called into question the validity of all existing methods, exhibiting a more complex setup in which many known pruning methods appear to fail. We revisit the question of accurate BERT pruning during fine-tuning on downstream datasets and propose a set of general guidelines for successful pruning, even on the challenging SMC benchmark. First, we perform a cost-versus-benefit analysis of pruning model components, such as the embeddings and the classification head; second, we provide a simple yet general way of scaling the training, sparsification, and learning-rate schedules relative to the desired target sparsity; finally, we investigate the importance of proper parametrization for knowledge distillation in the context of LLMs. Our simple insights lead to state-of-the-art results, both on classic BERT-pruning benchmarks and on the SMC benchmark, showing that even classic gradual magnitude pruning (GMP) can yield competitive results with the right approach.},
  author = {Kurtic, Eldar and Hoefler, Torsten and Alistarh, Dan-Adrian},
  booktitle = {Proceedings of Machine Learning Research},
  issn = {2640-3498},
  location = {Hong Kong, China},
  pages = {542--553},
  publisher = {ML Research Press},
  title = {{How to prune your language model: Recovering accuracy on the "Sparsity May Cry" benchmark}},
  volume = {234},
  year = {2024},
}

@inproceedings{13053,
  abstract = {Deep neural networks (DNNs) often have to be compressed, via pruning and/or quantization, before they can be deployed in practical settings. In this work we propose a new compression-aware minimizer, dubbed CrAM, that modifies the optimization step in a principled way in order to produce models whose local loss behavior is stable under compression operations such as pruning. Thus, dense models trained via CrAM should be compressible post-training, in a single step, without significant accuracy loss. Experimental results on standard benchmarks, such as residual networks for ImageNet classification and BERT models for language modelling, show that CrAM produces dense models that can be more accurate than standard SGD/Adam-based baselines yet remain stable under weight pruning: specifically, we can prune models in one shot to 70-80% sparsity with almost no accuracy loss, and to 90% with reasonable (∼1%) accuracy loss, which is competitive with gradual compression methods. Additionally, CrAM can produce sparse models which perform well for transfer learning, and it also works for the semi-structured 2:4 pruning patterns supported by GPU hardware. The code for reproducing the results is available at this https URL.},
  author = {Peste, Elena-Alexandra and Vladu, Adrian and Kurtic, Eldar and Lampert, Christoph and Alistarh, Dan-Adrian},
  booktitle = {11th International Conference on Learning Representations},
  location = {Kigali, Rwanda},
  title = {{CrAM: A Compression-Aware Minimizer}},
  year = {2023},
}
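For readers skimming the two pruning entries above, here is a minimal NumPy sketch of the two operations they revolve around: one-shot magnitude pruning (the kind of compression operator CrAM trains against) and the cubic sparsity schedule commonly used for gradual magnitude pruning (the GMP referenced in the first entry; the cubic form is the standard one from Zhu & Gupta, 2017). This is an illustrative sketch, not code from either paper; the function names and default parameters are assumptions.

```python
import numpy as np

def magnitude_prune(w, sparsity):
    """One-shot magnitude pruning: zero out the `sparsity` fraction of
    weights with smallest absolute value (e.g., sparsity=0.7 for 70%).
    Illustrative sketch, not the papers' implementation."""
    k = int(round(sparsity * w.size))
    if k == 0:
        return w.copy()
    if k >= w.size:
        return np.zeros_like(w)
    flat = w.ravel().copy()
    drop = np.argpartition(np.abs(flat), k)[:k]  # indices of the k smallest magnitudes
    flat[drop] = 0.0
    return flat.reshape(w.shape)

def gmp_target_sparsity(step, total_steps, s_init=0.0, s_final=0.9):
    """Cubic schedule often used for gradual magnitude pruning:
    sparsity ramps from s_init to s_final over training."""
    t = min(step / total_steps, 1.0)
    return s_final + (s_init - s_final) * (1.0 - t) ** 3
```

Roughly speaking, GMP re-applies magnitude_prune at the schedule's current sparsity throughout fine-tuning, while CrAM applies such an operator inside each optimization step so that the dense weights stay robust to it.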
@inproceedings{14460,
  abstract = {We provide an efficient implementation of the backpropagation algorithm, specialized to the case where the weights of the neural network being trained are sparse. Our algorithm is general, as it applies to arbitrary (unstructured) sparsity and common layer types (e.g., convolutional or linear). We provide a fast vectorized implementation on commodity CPUs and show that it can yield speedups in end-to-end runtime experiments, both in transfer learning using already-sparsified networks and in training sparse networks from scratch. Thus, our results provide the first support for sparse training on commodity hardware.},
  author = {Nikdan, Mahdi and Pegolotti, Tommaso and Iofinova, Eugenia B and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle = {Proceedings of the 40th International Conference on Machine Learning},
  issn = {2640-3498},
  location = {Honolulu, HI, United States},
  pages = {26215--26227},
  publisher = {ML Research Press},
  title = {{SparseProp: Efficient sparse backpropagation for faster training of neural networks at the edge}},
  volume = {202},
  year = {2023},
}

@inproceedings{11463,
  abstract = {Efficiently approximating local curvature information of the loss function is a key tool for the optimization and compression of deep neural networks. Yet most existing methods for approximating second-order information have high computational or storage costs, which limits their practicality. In this work, we investigate matrix-free, linear-time approaches for estimating inverse-Hessian vector products (IHVPs) for the case where the Hessian can be approximated as a sum of rank-one matrices, as in the classic approximation of the Hessian by the empirical Fisher matrix. We propose two new algorithms: the first is tailored towards network compression and can compute the IHVP for dimension d, if the Hessian is given as a sum of m rank-one matrices, using O(dm^2) precomputation, O(dm) cost for computing the IHVP, and query cost O(m) for any single element of the inverse Hessian. The second algorithm targets an optimization setting, where we wish to compute the product between the inverse Hessian, estimated over a sliding window of optimization steps, and a given gradient direction, as required for preconditioned SGD. We give an algorithm with cost O(dm + m^2) for computing the IHVP and O(dm + m^3) for adding or removing any gradient from the sliding window. These two algorithms yield state-of-the-art results for network pruning and optimization with lower computational overhead relative to existing second-order methods. Implementations are available at [9] and [17].},
  author = {Frantar, Elias and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle = {35th Conference on Neural Information Processing Systems},
  isbn = {9781713845393},
  issn = {1049-5258},
  location = {Virtual, Online},
  pages = {14873--14886},
  publisher = {Curran Associates},
  title = {{M-FAC: Efficient matrix-free approximations of second-order information}},
  volume = {34},
  year = {2021},
}
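To make the complexities quoted in the M-FAC entry above concrete, here is a minimal NumPy sketch of one standard way to compute IHVPs when the Hessian is approximated as H = λI + (1/m) Σ_i g_i g_iᵀ: apply the Sherman-Morrison formula once per rank-one term. The precomputation costs O(dm^2) and each subsequent IHVP costs O(dm), consistent with the first algorithm's stated costs; this is an illustrative reconstruction under these assumptions, not the paper's optimized implementation, and the damping λ and function names are hypothetical.

```python
import numpy as np

def ihvp_precompute(grads, lam):
    """Sherman-Morrison precomputation for H = lam*I + (1/m) * sum_i g_i g_i^T.
    grads: (m, d) array of gradients. Cost: O(d * m^2).
    Returns Q with Q[k] = H_{k-1}^{-1} g_k (inverse over the first k-1
    rank-one terms plus lam*I) and the matching scalars c[k]."""
    m, _ = grads.shape
    Q = np.zeros_like(grads, dtype=float)
    c = np.zeros(m)
    for k in range(m):
        u = grads[k] / lam                  # H_0^{-1} g_k, with H_0 = lam*I
        for j in range(k):                  # fold in rank-one terms 1..k-1
            u -= Q[j] * (grads[j] @ u) / c[j]
        Q[k] = u
        c[k] = m + grads[k] @ u
    return Q, c

def ihvp_apply(v, grads, Q, c, lam):
    """Compute H^{-1} v in O(d * m) using the precomputed quantities."""
    x = v / lam
    for k in range(len(c)):
        x = x - Q[k] * (grads[k] @ x) / c[k]
    return x
```

As a sanity check, for small d one can form H explicitly and compare np.linalg.solve(H, v) against ihvp_apply(v, grads, Q, c, lam).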
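The SparseProp entry above concerns the backward pass when the weights themselves are sparse; the key observation is that both the input gradient and the weight gradient only touch the stored nonzeros. The following SciPy sketch illustrates this restriction of backpropagation to the sparsity support for a single linear layer; it is an assumption-laden toy version, not the paper's vectorized CPU kernels, and all names are illustrative.

```python
import numpy as np
from scipy.sparse import random as sparse_random

def sparse_linear_backward(W, x, grad_out):
    """Backward pass of y = W @ x for a CSR weight matrix W.
    grad_x = W^T @ grad_out is a sparse mat-vec, and grad_W is only
    materialized on W's support: grad_W[i, j] = grad_out[i] * x[j]."""
    grad_x = W.T @ grad_out
    grad_W = W.copy()
    for i in range(W.shape[0]):              # iterate rows of the CSR matrix
        lo, hi = W.indptr[i], W.indptr[i + 1]
        grad_W.data[lo:hi] = grad_out[i] * x[W.indices[lo:hi]]
    return grad_x, grad_W

# Example: a 90%-sparse 64x128 layer
W = sparse_random(64, 128, density=0.1, format="csr")
x = np.random.randn(128)
grad_out = np.random.randn(64)
grad_x, grad_W = sparse_linear_backward(W, x, grad_out)
```

Because grad_W shares W's sparsity pattern, both the storage and the arithmetic of the backward pass scale with the number of nonzeros rather than with the dense layer size, which is the source of the speedups the entry describes.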