@inproceedings{15011, abstract = {Pruning large language models (LLMs) from the BERT family has emerged as a standard compression benchmark, and several pruning methods have been proposed for this task. The recent “Sparsity May Cry” (SMC) benchmark put into question the validity of all existing methods, exhibiting a more complex setup where many known pruning methods appear to fail. We revisit the question of accurate BERT-pruning during fine-tuning on downstream datasets, and propose a set of general guidelines for successful pruning, even on the challenging SMC benchmark. First, we perform a cost-vs-benefits analysis of pruning model components, such as the embeddings and the classification head; second, we provide a simple-yet-general way of scaling training, sparsification and learning rate schedules relative to the desired target sparsity; finally, we investigate the importance of proper parametrization for Knowledge Distillation in the context of LLMs. Our simple insights lead to state-of-the-art results, both on classic BERT-pruning benchmarks, as well as on the SMC benchmark, showing that even classic gradual magnitude pruning (GMP) can yield competitive results, with the right approach.}, author = {Kurtic, Eldar and Hoefler, Torsten and Alistarh, Dan-Adrian}, booktitle = {Proceedings of Machine Learning Research}, issn = {2640-3498}, location = {Hong Kong, China}, pages = {542--553}, publisher = {ML Research Press}, title = {{How to prune your language model: Recovering accuracy on the "Sparsity May Cry" benchmark}}, volume = {234}, year = {2024}, } @inproceedings{12735, abstract = {Asynchronous programming has gained significant popularity over the last decade: support for this programming pattern is available in many popular languages via libraries and native language implementations, typically in the form of coroutines or the async/await construct. Instead of programming via shared memory, this concept assumes implicit synchronization through message passing. The key data structure enabling such communication is the rendezvous channel. Roughly, a rendezvous channel is a blocking queue of size zero, so both send(e) and receive() operations wait for each other, performing a rendezvous when they meet. To optimize the message passing pattern, channels are usually equipped with a fixed-size buffer, so sends do not suspend and put elements into the buffer until its capacity is exceeded. This primitive is known as a buffered channel. This paper presents a fast and scalable algorithm for both rendezvous and buffered channels. Similarly to modern queues, our solution is based on an infinite array with two positional counters for send(e) and receive() operations, leveraging the unconditional Fetch-And-Add instruction to update them. Yet, the algorithm requires non-trivial modifications of this classic pattern, in order to support the full channel semantics, such as buffering and cancellation of waiting requests. We compare the performance of our solution to that of the Kotlin implementation, as well as against other academic proposals, showing up to 9.8× speedup.
To showcase its expressiveness and performance, we also integrated the proposed algorithm into the standard Kotlin Coroutines library, replacing the previous channel implementations.}, author = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman}, booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, isbn = {9798400700156}, location = {Montreal, QC, Canada}, pages = {107--118}, publisher = {Association for Computing Machinery}, title = {{Fast and scalable channels in Kotlin Coroutines}}, doi = {10.1145/3572848.3577481}, year = {2023}, } @misc{12736, abstract = {Although a wide variety of handcrafted concurrent data structures have been proposed, there is considerable interest in universal approaches (Universal Constructions or UCs) for building concurrent data structures. UCs (semi-)automatically convert a sequential data structure into a concurrent one. The simplest approach uses locks [3, 6] that protect a sequential data structure and allow only one process to access it at a time. However, the resulting data structure is blocking. Most work on UCs instead focuses on obtaining non-blocking progress guarantees such as obstruction-freedom, lock-freedom or wait-freedom. Many non-blocking UCs have appeared. Key examples include the seminal wait-free UC [2] by Herlihy, a NUMA-aware UC [10] by Yi et al., and an efficient UC for large objects [1] by Fatourou et al.}, author = {Aksenov, Vitaly and Brown, Trevor A and Fedorov, Alexander and Kokorin, Ilya}, booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, isbn = {9798400700156}, location = {Montreal, QC, Canada}, pages = {438--440}, publisher = {Association for Computing Machinery}, title = {{Unexpected scaling in path copying trees}}, doi = {10.1145/3572848.3577512}, year = {2023}, } @inproceedings{13053, abstract = {Deep neural networks (DNNs) often have to be compressed, via pruning and/or quantization, before they can be deployed in practical settings. In this work we propose a new compression-aware minimizer dubbed CrAM that modifies the optimization step in a principled way, in order to produce models whose local loss behavior is stable under compression operations such as pruning. Thus, dense models trained via CrAM should be compressible post-training, in a single step, without significant accuracy loss. Experimental results on standard benchmarks, such as residual networks for ImageNet classification and BERT models for language modelling, show that CrAM produces dense models that can be more accurate than the standard SGD/Adam-based baselines, but which are stable under weight pruning: specifically, we can prune models in one-shot to 70-80% sparsity with almost no accuracy loss, and to 90% with reasonable (∼1%) accuracy loss, which is competitive with gradual compression methods. Additionally, CrAM can produce sparse models which perform well for transfer learning, and it also works for semi-structured 2:4 pruning patterns supported by GPU hardware. The code for reproducing the results is available at this https URL.}, author = {Peste, Elena-Alexandra and Vladu, Adrian and Kurtic, Eldar and Lampert, Christoph and Alistarh, Dan-Adrian}, booktitle = {11th International Conference on Learning Representations}, location = {Kigali, Rwanda}, title = {{CrAM: A Compression-Aware Minimizer}}, year = {2023}, } @article{13179, abstract = {Writing concurrent code that is both correct and efficient is notoriously difficult.
Thus, programmers often prefer to use synchronization abstractions, which render code simpler and easier to reason about. Despite a wealth of work on this topic, there is still a gap between the rich semantics provided by synchronization abstractions in modern programming languages—specifically, fair FIFO ordering of synchronization requests and support for abortable operations—and frameworks for implementing such semantics correctly and efficiently. Supporting such semantics is critical given the rising popularity of constructs for asynchronous programming, such as coroutines, which abort frequently and are cheaper to suspend and resume compared to native threads. This paper introduces a new framework called CancellableQueueSynchronizer (CQS), which enables simple yet efficient implementations of a wide range of fair and abortable synchronization primitives: mutexes, semaphores, barriers, count-down latches, and blocking pools. Our main contribution is algorithmic, as implementing both fairness and abortability efficiently at this level of generality is non-trivial. Importantly, all our algorithms, including the CQS framework and the primitives built on top of it, come with formal proofs in the Iris framework for Coq for many of their properties. These proofs are modular, so it is easy to show correctness for new primitives implemented on top of CQS. From a practical perspective, our implementation of CQS for native threads on the JVM improves throughput by up to two orders of magnitude over Java’s AbstractQueuedSynchronizer, the only practical abstraction offering similar semantics. Further, we successfully integrated CQS as a core component of the popular Kotlin Coroutines library, validating the framework’s practical impact and expressiveness in a real-world environment. In sum, CancellableQueueSynchronizer is the first framework to combine expressiveness with formal guarantees and solid practical performance. Our approach should be extensible to other languages and families of synchronization primitives.}, author = {Koval, Nikita and Khalanskiy, Dmitry and Alistarh, Dan-Adrian}, issn = {2475-1421}, journal = {Proceedings of the ACM on Programming Languages}, publisher = {Association for Computing Machinery}, title = {{CQS: A formally-verified framework for fair and abortable synchronization}}, doi = {10.1145/3591230}, volume = {7}, year = {2023}, } @inproceedings{13262, abstract = {Determining the degree of inherent parallelism in classical sequential algorithms and leveraging it for fast parallel execution is a key topic in parallel computing, and detailed analyses are known for a wide range of classical algorithms. In this paper, we perform the first such analysis for the fundamental Union-Find problem, in which we are given a graph as a sequence of edges, and must maintain its connectivity structure under edge additions. We prove that classic sequential algorithms for this problem are well-parallelizable under reasonable assumptions, addressing a conjecture by [Blelloch, 2017]. More precisely, we show via a new potential argument that, under uniform random edge ordering, parallel union-find operations are unlikely to interfere: T concurrent threads processing the graph in parallel will encounter memory contention O(T^2 · log |V| · log |E|) times in expectation, where |E| and |V| are the number of edges and nodes in the graph, respectively.
We leverage this result to design a new parallel Union-Find algorithm that is both internally deterministic (i.e., its results are guaranteed to match those of a sequential execution) and work-efficient and scalable, as long as the number of threads T is O(|E|^{1/3 - ε}), for an arbitrarily small constant ε > 0, which holds for most large real-world graphs. We present lower bounds which show that our analysis is close to optimal, and experimental results suggesting that the performance cost of internal determinism is limited.}, author = {Fedorov, Alexander and Hashemi, Diba and Nadiradze, Giorgi and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 35th ACM Symposium on Parallelism in Algorithms and Architectures}, isbn = {9781450395458}, location = {Orlando, FL, United States}, pages = {261--271}, publisher = {Association for Computing Machinery}, title = {{Provably-efficient and internally-deterministic parallel Union-Find}}, doi = {10.1145/3558481.3591082}, year = {2023}, } @article{12566, abstract = {Approximate agreement is one of the few variants of consensus that can be solved in a wait-free manner in asynchronous systems where processes communicate by reading and writing to shared memory. In this work, we consider a natural generalisation of approximate agreement on arbitrary undirected connected graphs. Each process is given a node of the graph as input and, if non-faulty, must output a node such that – all the outputs are within distance 1 of one another, and – each output value lies on a shortest path between two input values. From prior work, it is known that there is no wait-free algorithm among n ≥ 3 processes for this problem on any cycle of length c ≥ 4, by reduction from 2-set agreement (Castañeda et al., 2018). In this work, we investigate the solvability of this task on general graphs. We give a new, direct proof of the impossibility of approximate agreement on cycles of length c ≥ 4, via a generalisation of Sperner's Lemma to convex polygons. We also extend the reduction from 2-set agreement to a larger class of graphs, showing that approximate agreement on these graphs is unsolvable. On the positive side, we present a wait-free algorithm for a different class of graphs, which properly contains the class of chordal graphs.}, author = {Alistarh, Dan-Adrian and Ellen, Faith and Rybicki, Joel}, issn = {0304-3975}, journal = {Theoretical Computer Science}, number = {2}, publisher = {Elsevier}, title = {{Wait-free approximate agreement on graphs}}, doi = {10.1016/j.tcs.2023.113733}, volume = {948}, year = {2023}, } @phdthesis{13074, abstract = {Deep learning has become an integral part of a large number of important applications, and many of the recent breakthroughs have been enabled by the ability to train very large models, capable of capturing complex patterns and relationships from the data. At the same time, the massive sizes of modern deep learning models have made their deployment to smaller devices more challenging; this is particularly important, as in many applications the users rely on accurate deep learning predictions, but they only have access to devices with limited memory and compute power. One solution to this problem is to prune neural networks, by setting as many of their parameters as possible to zero, to obtain accurate sparse models with a lower memory footprint.
Despite great research progress in obtaining sparse models that preserve accuracy while satisfying memory and computational constraints, many challenges remain in efficiently training sparse models and in understanding their generalization properties. The focus of this thesis is to investigate how the training process of sparse models can be made more efficient, and to understand the differences between sparse and dense models in terms of how well they can generalize to changes in the data distribution. We first study a method for co-training sparse and dense models, at a lower cost compared to regular training. With our method we can obtain very accurate sparse networks, and dense models that can recover the baseline accuracy. Furthermore, we are able to more easily analyze the differences, at the prediction level, between the sparse-dense model pairs. Next, we investigate the generalization properties of sparse neural networks in more detail, by studying how well different sparse models trained on a larger task can adapt to smaller, more specialized tasks, in a transfer learning scenario. Our analysis across multiple pruning methods and sparsity levels reveals that sparse models provide features that can transfer similarly to or better than the dense baseline. However, the choice of the pruning method plays an important role, and can influence the results when the features are fixed (linear finetuning), or when they are allowed to adapt to the new task (full finetuning). Using sparse models with fixed masks for finetuning on new tasks has an important practical advantage, as it enables training neural networks on smaller devices. However, one drawback of current pruning methods is that the entire training cycle has to be repeated to obtain the initial sparse model, for every sparsity target; as a consequence, the overall training process is costly and multiple models need to be stored. In the last part of the thesis we propose a method that can train accurate dense models that are compressible in a single step, to multiple sparsity levels, without additional finetuning. Our method results in sparse models that can be competitive with existing pruning methods, and which can also successfully generalize to new tasks.}, author = {Peste, Elena-Alexandra}, issn = {2663-337X}, pages = {147}, publisher = {Institute of Science and Technology Austria}, title = {{Efficiency and generalization of sparse neural networks}}, doi = {10.15479/at:ista:13074}, year = {2023}, } @article{12330, abstract = {The design and implementation of efficient concurrent data structures have seen significant attention. However, most of this work has focused on concurrent data structures providing good worst-case guarantees, although, in real workloads, objects are often accessed at different rates. Efficient distribution-adaptive data structures, such as splay-trees, are known in the sequential case; however, they are often hard to translate efficiently to the concurrent case. We investigate distribution-adaptive concurrent data structures, and propose a new design called the splay-list. At a high level, the splay-list is similar to a standard skip-list, with the key distinction that the height of each element adapts dynamically to its access rate: popular elements “move up,” whereas rarely-accessed elements decrease in height.
We show that the splay-list provides order-optimal amortized complexity bounds for a subset of operations, while being amenable to efficient concurrent implementation. Experiments show that the splay-list can leverage distribution-adaptivity for performance, and can outperform the only previously-known distribution-adaptive concurrent design in certain workloads.}, author = {Aksenov, Vitalii and Alistarh, Dan-Adrian and Drozdova, Alexandra and Mohtashami, Amirkeivan}, issn = {1432-0452}, journal = {Distributed Computing}, pages = {395--418}, publisher = {Springer Nature}, title = {{The splay-list: A distribution-adaptive concurrent skip-list}}, doi = {10.1007/s00446-022-00441-x}, volume = {36}, year = {2023}, } @inproceedings{14461, abstract = {Communication-reduction techniques are a popular way to improve scalability in data-parallel training of deep neural networks (DNNs). The recent emergence of large language models such as GPT has created the need for new approaches to exploit data-parallelism. Among these, fully-sharded data parallel (FSDP) training is highly popular, yet it still encounters scalability bottlenecks. One reason is that applying compression techniques to FSDP is challenging: as the vast majority of the communication involves the model’s weights, direct compression alters convergence and leads to accuracy loss. We present QSDP, a variant of FSDP which supports both gradient and weight quantization with theoretical guarantees, is simple to implement and has essentially no overheads. To derive QSDP we prove that a natural modification of SGD achieves convergence even when we only maintain quantized weights, and thus the domain over which we train consists of quantized points and is, therefore, highly non-convex. We validate this approach by training GPT-family models with up to 1.3 billion parameters on a multi-node cluster. Experiments show that QSDP preserves model accuracy, while completely removing the communication bottlenecks of FSDP, providing end-to-end speedups of up to 2.2x.}, author = {Markov, Ilia and Vladu, Adrian and Guo, Qi and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 40th International Conference on Machine Learning}, issn = {2640-3498}, location = {Honolulu, Hawaii, HI, United States}, pages = {24020--24044}, publisher = {ML Research Press}, title = {{Quantized distributed training of large models with convergence guarantees}}, volume = {202}, year = {2023}, } @inproceedings{14459, abstract = {Autoencoders are a popular model in many branches of machine learning and lossy data compression. However, their fundamental limits, the performance of gradient methods and the features learnt during optimization remain poorly understood, even in the two-layer setting. In fact, earlier work has considered either linear autoencoders or specific training regimes (leading to vanishing or diverging compression rates). Our paper addresses this gap by focusing on non-linear two-layer autoencoders trained in the challenging proportional regime in which the input dimension scales linearly with the size of the representation. Our results characterize the minimizers of the population risk, and show that such minimizers are achieved by gradient methods; their structure is also unveiled, thus leading to a concise description of the features obtained via training. For the special case of a sign activation function, our analysis establishes the fundamental limits for the lossy compression of Gaussian sources via (shallow) autoencoders. 
Finally, while the results are proved for Gaussian data, numerical simulations on standard datasets display the universality of the theoretical predictions.}, author = {Shevchenko, Aleksandr and Kögler, Kevin and Hassani, Hamed and Mondelli, Marco}, booktitle = {Proceedings of the 40th International Conference on Machine Learning}, issn = {2640-3498}, location = {Honolulu, Hawaii, HI, United States}, pages = {31151--31209}, publisher = {ML Research Press}, title = {{Fundamental limits of two-layer autoencoders, and achieving them with gradient methods}}, volume = {202}, year = {2023}, } @inproceedings{14460, abstract = {We provide an efficient implementation of the backpropagation algorithm, specialized to the case where the weights of the neural network being trained are sparse. Our algorithm is general, as it applies to arbitrary (unstructured) sparsity and common layer types (e.g., convolutional or linear). We provide a fast vectorized implementation on commodity CPUs, and show that it can yield speedups in end-to-end runtime experiments, both in transfer learning using already-sparsified networks, and in training sparse networks from scratch. Thus, our results provide the first support for sparse training on commodity hardware.}, author = {Nikdan, Mahdi and Pegolotti, Tommaso and Iofinova, Eugenia B and Kurtic, Eldar and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 40th International Conference on Machine Learning}, issn = {2640-3498}, location = {Honolulu, Hawaii, HI, United States}, pages = {26215--26227}, publisher = {ML Research Press}, title = {{SparseProp: Efficient sparse backpropagation for faster training of neural networks at the edge}}, volume = {202}, year = {2023}, } @inproceedings{14458, abstract = {We show for the first time that large-scale generative pretrained transformer (GPT) family models can be pruned to at least 50% sparsity in one-shot, without any retraining, at minimal loss of accuracy. This is achieved via a new pruning method called SparseGPT, specifically designed to work efficiently and accurately on massive GPT-family models. We can execute SparseGPT on the largest available open-source models, OPT-175B and BLOOM-176B, in under 4.5 hours, and can reach 60% unstructured sparsity with negligible increase in perplexity: remarkably, more than 100 billion weights from these models can be ignored at inference time. SparseGPT generalizes to semi-structured (2:4 and 4:8) patterns, and is compatible with weight quantization approaches. The code is available at: https://github.com/IST-DASLab/sparsegpt.}, author = {Frantar, Elias and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 40th International Conference on Machine Learning}, issn = {2640-3498}, location = {Honolulu, Hawaii, HI, United States}, pages = {10323--10337}, publisher = {ML Research Press}, title = {{SparseGPT: Massive language models can be accurately pruned in one-shot}}, volume = {202}, year = {2023}, } @article{14364, abstract = {We introduce extension-based proofs, a class of impossibility proofs that includes valency arguments. They are modelled as an interaction between a prover and a protocol. Using proofs based on combinatorial topology, it has been shown that it is impossible to deterministically solve k-set agreement among n > k processes or approximate agreement on a cycle of length 4 among n ≥ 3 processes in a wait-free manner in asynchronous models where processes communicate using objects that can be constructed from shared registers.
However, it was unknown whether proofs based on simpler techniques were possible. We show that these impossibility results cannot be obtained by extension-based proofs in the iterated snapshot model and, hence, extension-based proofs are limited in power.}, author = {Alistarh, Dan-Adrian and Aspnes, James and Ellen, Faith and Gelashvili, Rati and Zhu, Leqi}, issn = {1095-7111}, journal = {SIAM Journal on Computing}, number = {4}, pages = {913--944}, publisher = {Society for Industrial and Applied Mathematics}, title = {{Why extension-based proofs fail}}, doi = {10.1137/20M1375851}, volume = {52}, year = {2023}, } @inproceedings{14771, abstract = {Pruning—that is, setting a significant subset of the parameters of a neural network to zero—is one of the most popular methods of model compression. Yet, several recent works have raised the issue that pruning may induce or exacerbate bias in the output of the compressed model. Despite existing evidence for this phenomenon, the relationship between neural network pruning and induced bias is not well-understood. In this work, we systematically investigate and characterize this phenomenon in Convolutional Neural Networks for computer vision. First, we show that it is in fact possible to obtain highly-sparse models, e.g. with less than 10% remaining weights, which do not decrease in accuracy nor substantially increase in bias when compared to dense models. At the same time, we also find that, at higher sparsities, pruned models exhibit higher uncertainty in their outputs, as well as increased correlations, which we directly link to increased bias. We propose easy-to-use criteria which, based only on the uncompressed model, establish whether bias will increase with pruning, and identify the samples most susceptible to biased predictions post-compression. Our code can be found at https://github.com/IST-DASLab/pruned-vision-model-bias.}, author = {Iofinova, Eugenia B and Peste, Elena-Alexandra and Alistarh, Dan-Adrian}, booktitle = {2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition}, issn = {2575-7075}, location = {Vancouver, BC, Canada}, pages = {24364--24373}, publisher = {IEEE}, title = {{Bias in pruned vision models: In-depth analysis and countermeasures}}, doi = {10.1109/cvpr52729.2023.02334}, year = {2023}, } @article{14815, abstract = {In the last few years, various communication compression techniques have emerged as an indispensable tool helping to alleviate the communication bottleneck in distributed learning. However, despite the fact that biased compressors often show superior performance in practice when compared to the much more studied and understood unbiased compressors, very little is known about them. In this work we study three classes of biased compression operators, two of which are new, and their performance when applied to (stochastic) gradient descent and distributed (stochastic) gradient descent. We show for the first time that biased compressors can lead to linear convergence rates both in the single-node and distributed settings. We prove that the distributed compressed SGD method, employed with an error feedback mechanism, enjoys the ergodic rate O(δL exp[−μK/(δL)] + (C+δD)/(Kμ)), where δ≥1 is a compression parameter which grows when more compression is applied, L and μ are the smoothness and strong convexity constants, C captures stochastic gradient noise (C=0 if full gradients are computed on each node) and D captures the variance of the gradients at the optimum (D=0 for over-parameterized models).
Further, via a theoretical study of several synthetic and empirical distributions of communicated gradients, we shed light on why and by how much biased compressors outperform their unbiased variants. Finally, we propose several new biased compressors with promising theoretical guarantees and practical performance.}, author = {Beznosikov, Aleksandr and Horvath, Samuel and Richtarik, Peter and Safaryan, Mher}, issn = {1533-7928}, journal = {Journal of Machine Learning Research}, pages = {1--50}, publisher = {Journal of Machine Learning Research}, title = {{On biased compression for distributed learning}}, volume = {24}, year = {2023}, } @inproceedings{14260, abstract = {This paper presents Lincheck, a new practical and user-friendly framework for testing concurrent algorithms on the Java Virtual Machine (JVM). Lincheck provides a simple and declarative way to write concurrent tests: instead of describing how to perform the test, users specify what to test by declaring all the operations to examine; the framework automatically handles the rest. As a result, tests written with Lincheck are concise and easy to understand. The framework automatically generates a set of concurrent scenarios, examines them using stress-testing or bounded model checking, and verifies that the results of each invocation are correct. Notably, if an error is detected via model checking, Lincheck provides an easy-to-follow trace to reproduce it, significantly simplifying the bug investigation. To the best of our knowledge, Lincheck is the first production-ready tool on the JVM that offers such a simple way of writing concurrent tests, without requiring special skills or expertise. We successfully integrated Lincheck in the development process of several large projects, such as Kotlin Coroutines, and identified new bugs in popular concurrency libraries, such as a race in Java’s standard ConcurrentLinkedDeque and a liveness bug in Java’s AbstractQueuedSynchronizer framework, which is used in most of the synchronization primitives. We believe that Lincheck can significantly improve the quality and productivity of concurrent algorithms research and development and become the state-of-the-art tool for checking their correctness.}, author = {Koval, Nikita and Fedorov, Alexander and Sokolova, Maria and Tsitelov, Dmitry and Alistarh, Dan-Adrian}, booktitle = {35th International Conference on Computer Aided Verification}, isbn = {9783031377051}, issn = {1611-3349}, location = {Paris, France}, pages = {156--169}, publisher = {Springer Nature}, title = {{Lincheck: A practical framework for testing concurrent data structures on JVM}}, doi = {10.1007/978-3-031-37706-8_8}, volume = {13964}, year = {2023}, } @misc{14995, abstract = {Lincheck is a new practical and user-friendly framework for testing concurrent data structures on the Java Virtual Machine (JVM). It provides a simple and declarative way to write concurrent tests. Instead of describing how to perform the test, users specify what to test by declaring all the operations to examine; the framework automatically handles the rest. As a result, tests written with Lincheck are concise and easy to understand. The artifact presents a collection of Lincheck tests that discover new bugs in popular libraries and implementations from the concurrency literature -- they are listed in Table 1, Section 3. To evaluate the performance of Lincheck analysis, the collection of tests also includes those which check correct data structures and, thus, always succeed.
Similarly to Table 2, Section 3, the experiments demonstrate that tests complete in reasonable time. Finally, Lincheck provides user-friendly output with an easy-to-follow trace to reproduce a detected error, significantly simplifying further investigation.}, author = {Koval, Nikita and Fedorov, Alexander and Sokolova, Maria and Tsitelov, Dmitry and Alistarh, Dan-Adrian}, publisher = {Zenodo}, title = {{Lincheck: A practical framework for testing concurrent data structures on JVM}}, doi = {10.5281/ZENODO.7877757}, year = {2023}, } @inproceedings{11184, abstract = {Let G be a graph on n nodes. In the stochastic population protocol model, a collection of n indistinguishable, resource-limited nodes collectively solve tasks via pairwise interactions. In each interaction, two randomly chosen neighbors first read each other’s states, and then update their local states. A rich line of research has established tight upper and lower bounds on the complexity of fundamental tasks, such as majority and leader election, in this model, when G is a clique. Specifically, in the clique, these tasks can be solved fast, i.e., in n polylog n pairwise interactions, with high probability, using at most polylog n states per node. In this work, we consider the more general setting where G is an arbitrary regular graph, and present a technique for simulating protocols designed for fully-connected networks in any connected regular graph. Our main result is a simulation that is efficient on many interesting graph families: roughly, the simulation overhead is polylogarithmic in the number of nodes, and quadratic in the conductance of the graph. As a sample application, we show that, in any regular graph with conductance φ, both leader election and exact majority can be solved in φ^{-2} ⋅ n polylog n pairwise interactions, with high probability, using at most φ^{-2} ⋅ polylog n states per node. This shows that there are fast and space-efficient population protocols for leader election and exact majority on graphs with good expansion properties. We believe our results will prove generally useful, as they allow efficient technology transfer between the well-mixed (clique) case, and the under-explored spatial setting.}, author = {Alistarh, Dan-Adrian and Gelashvili, Rati and Rybicki, Joel}, booktitle = {25th International Conference on Principles of Distributed Systems}, editor = {Bramas, Quentin and Gramoli, Vincent and Milani, Alessia}, isbn = {9783959772198}, issn = {1868-8969}, location = {Strasbourg, France}, publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik}, title = {{Fast graphical population protocols}}, doi = {10.4230/LIPIcs.OPODIS.2021.14}, volume = {217}, year = {2022}, } @inproceedings{11183, abstract = {Subgraph detection has recently been one of the most studied problems in the CONGEST model of distributed computing. In this work, we study the distributed complexity of problems closely related to subgraph detection, mainly focusing on induced subgraph detection. The main line of this work presents lower bounds and parameterized algorithms w.r.t. structural parameters of the input graph: - On general graphs, we give unconditional lower bounds for induced detection of cycles and patterns of treewidth 2 in CONGEST. Moreover, by adapting reductions from centralized parameterized complexity, we prove lower bounds in CONGEST for detecting patterns with a 4-clique, and for induced path detection conditional on the hardness of triangle detection in the congested clique.
- On graphs of bounded degeneracy, we show that induced paths can be detected fast in CONGEST using techniques from parameterized algorithms, while detecting cycles and patterns of treewidth 2 is hard. - On graphs of bounded vertex cover number, we show that induced subgraph detection is easy in CONGEST for any pattern graph. More specifically, we adapt a centralized parameterized algorithm for a more general maximum common induced subgraph detection problem to the distributed setting. In addition to these induced subgraph detection results, we study various related problems in the CONGEST and congested clique models, including multicolored versions of subgraph-detection-like problems.}, author = {Nikabadi, Amir and Korhonen, Janne}, booktitle = {25th International Conference on Principles of Distributed Systems}, editor = {Bramas, Quentin and Gramoli, Vincent and Milani, Alessia}, isbn = {9783959772198}, issn = {1868-8969}, location = {Strasbourg, France}, publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik}, title = {{Beyond distributed subgraph detection: Induced subgraphs, multicolored problems and graph parameters}}, doi = {10.4230/LIPIcs.OPODIS.2021.15}, volume = {217}, year = {2022}, } @article{11420, abstract = {Understanding the properties of neural networks trained via stochastic gradient descent (SGD) is at the heart of the theory of deep learning. In this work, we take a mean-field view, and consider a two-layer ReLU network trained via noisy-SGD for a univariate regularized regression problem. Our main result is that SGD with vanishingly small noise injected in the gradients is biased towards a simple solution: at convergence, the ReLU network implements a piecewise linear map of the inputs, and the number of “knot” points -- i.e., points where the tangent of the ReLU network estimator changes -- between two consecutive training inputs is at most three. In particular, as the number of neurons of the network grows, the SGD dynamics is captured by the solution of a gradient flow and, at convergence, the distribution of the weights approaches the unique minimizer of a related free energy, which has a Gibbs form. Our key technical contribution consists in the analysis of the estimator resulting from this minimizer: we show that its second derivative vanishes everywhere, except at some specific locations which represent the “knot” points. We also provide empirical evidence that knots at locations distinct from the data points might occur, as predicted by our theory.}, author = {Shevchenko, Aleksandr and Kungurtsev, Vyacheslav and Mondelli, Marco}, issn = {1533-7928}, journal = {Journal of Machine Learning Research}, number = {130}, pages = {1--55}, publisher = {Journal of Machine Learning Research}, title = {{Mean-field analysis of piecewise linear solutions for wide ReLU networks}}, volume = {23}, year = {2022}, } @inproceedings{12182, abstract = {Online algorithms make decisions based on past inputs, with the goal of being competitive against an algorithm that also sees future inputs. In this work, we introduce time-local online algorithms; these are online algorithms in which the output at any given time is a function of only the T latest inputs. Our main observation is that time-local online algorithms are closely connected to local distributed graph algorithms: distributed algorithms make decisions based on the local information in the spatial dimension, while time-local online algorithms make decisions based on the local information in the temporal dimension.
We formalize this connection, and show how we can directly use the tools developed to study distributed approximability of graph optimization problems to prove upper and lower bounds on the competitive ratio achieved with time-local online algorithms. Moreover, we show how to use computational techniques to synthesize optimal time-local algorithms.}, author = {Pacut, Maciej and Parham, Mahmoud and Rybicki, Joel and Schmid, Stefan and Suomela, Jukka and Tereshchenko, Aleksandr}, booktitle = {36th International Symposium on Distributed Computing}, issn = {1868-8969}, location = {Augusta, GA, United States}, publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik}, title = {{Brief announcement: Temporal locality in online algorithms}}, doi = {10.4230/LIPIcs.DISC.2022.52}, volume = {246}, year = {2022}, } @inproceedings{12780, abstract = {The ability to scale out training workloads has been one of the key performance enablers of deep learning. The main scaling approach is data-parallel GPU-based training, which has been boosted by hardware and software support for highly efficient point-to-point communication, and in particular via hardware bandwidth over-provisioning. Overprovisioning comes at a cost: there is an order-of-magnitude price difference between "cloud-grade" servers with such support and their popular "consumer-grade" counterparts, although single server-grade and consumer-grade GPUs can have similar computational envelopes. In this paper, we show that the costly hardware overprovisioning approach can be supplanted via algorithmic and system design, and propose a framework called CGX, which provides efficient software support for compressed communication in ML applications, both for multi-GPU single-node training and for larger-scale multi-node training. CGX is based on two technical advances: At the system level, it relies on a re-developed communication stack for ML frameworks, which provides flexible, highly-efficient support for compressed communication. At the application level, it provides seamless, parameter-free integration with popular frameworks, so that end-users do not have to modify training recipes, nor significant amounts of training code. This is complemented by a layer-wise adaptive compression technique which dynamically balances compression gains with accuracy preservation. CGX integrates with popular ML frameworks, providing up to 3X speedups for multi-GPU nodes based on commodity hardware, and order-of-magnitude improvements in the multi-node setting, with negligible impact on accuracy.}, author = {Markov, Ilia and Ramezanikebrya, Hamidreza and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 23rd ACM/IFIP International Middleware Conference}, isbn = {9781450393409}, location = {Quebec, QC, Canada}, pages = {241--254}, publisher = {Association for Computing Machinery}, title = {{CGX: Adaptive system support for communication-efficient deep learning}}, doi = {10.1145/3528535.3565248}, year = {2022}, } @inproceedings{11844, abstract = {In the stochastic population protocol model, we are given a connected graph with n nodes, and in every time step, a scheduler samples an edge of the graph uniformly at random and the nodes connected by this edge interact. A fundamental task in this model is stable leader election, in which all nodes start in an identical state and the aim is to reach a configuration in which (1) exactly one node is elected as leader and (2) this node remains as the unique leader no matter what sequence of interactions follows.
On cliques, the complexity of this problem has recently been settled: time-optimal protocols stabilize in Θ(n log n) expected steps using Θ(log log n) states, whereas protocols that use O(1) states require Θ(n^2) expected steps. In this work, we investigate the complexity of stable leader election on general graphs. We provide the first non-trivial time lower bounds for leader election on general graphs, showing that, when moving beyond cliques, the complexity landscape of leader election becomes very diverse: the time required to elect a leader can range from O(1) to Θ(n^3) expected steps. On the upper bound side, we first observe that there exists a protocol that is time-optimal on many graph families, but uses polynomially-many states. In contrast, we give a near-time-optimal protocol that uses only O(log^2 n) states and is at most a factor log n slower. Finally, we show that the constant-state protocol of Beauquier et al. [OPODIS 2013] is at most a factor n log n slower than the fast polynomial-state protocol. Moreover, among constant-state protocols, this protocol has near-optimal average case complexity on dense random graphs.}, author = {Alistarh, Dan-Adrian and Rybicki, Joel and Voitovych, Sasha}, booktitle = {Proceedings of the Annual ACM Symposium on Principles of Distributed Computing}, isbn = {9781450392624}, location = {Salerno, Italy}, pages = {246--256}, publisher = {Association for Computing Machinery}, title = {{Near-optimal leader election in population protocols on graphs}}, doi = {10.1145/3519270.3538435}, year = {2022}, } @inproceedings{11181, abstract = {To maximize the performance of concurrent data structures, researchers have often turned to highly complex fine-grained techniques, resulting in efficient and elegant algorithms, which can, however, often be difficult to understand and prove correct. While simpler techniques exist, such as transactional memory, they can have limited performance or portability relative to their fine-grained counterparts. Approaches at both ends of this complexity-performance spectrum have been extensively explored, but relatively less is known about the middle ground: approaches that are willing to sacrifice some performance for simplicity, while remaining competitive with state-of-the-art handcrafted designs. In this paper, we explore this middle ground, and present PathCAS, a primitive that combines ideas from multi-word CAS (KCAS) and transactional memory approaches, while carefully avoiding overhead. We show how PathCAS can be used to implement efficient search data structures relatively simply, using an internal binary search tree as an example, then extending this to an AVL tree. Our best implementations outperform many handcrafted search trees: in search-heavy workloads, they rival the BCCO tree [5], the fastest known concurrent binary tree in terms of search performance [3].
Our results suggest that PathCAS can yield concurrent data structures that are relatively easy to build and prove correct, while offering surprisingly high performance.}, author = {Brown, Trevor A and Sigouin, William and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, isbn = {9781450392044}, location = {Seoul, Republic of Korea}, pages = {385--399}, publisher = {Association for Computing Machinery}, title = {{PathCAS: An efficient middle ground for concurrent search data structures}}, doi = {10.1145/3503221.3508410}, year = {2022}, } @inproceedings{11180, abstract = {Designing and implementing efficient parallel priority schedulers is an active research area. An intriguing proposed design is the Multi-Queue: given n threads and m ≥ n distinct priority queues, task insertions are performed uniformly at random, while, to delete, a thread picks two queues uniformly at random, and removes the observed task of higher priority. This approach scales well, and has probabilistic rank guarantees: roughly, the rank of each task removed, relative to remaining tasks in all other queues, is O(m) in expectation. Yet, the performance of this pattern is below that of well-engineered schedulers, which eschew theoretical guarantees for practical efficiency. We investigate whether it is possible to design and implement a Multi-Queue-based task scheduler that is both highly efficient and backed by analytical guarantees. We propose a new variant called the Stealing Multi-Queue (SMQ), a cache-efficient variant of the Multi-Queue, which leverages both queue affinity---each thread has a local queue, from which tasks are usually removed; but, with some probability, threads also attempt to steal higher-priority tasks from the other queues---and task batching, that is, the processing of several tasks in a single insert / remove step. These ideas are well-known for task scheduling without priorities; our theoretical contribution is showing that, despite relaxations, this design can still provide rank guarantees, which in turn implies bounds on total work performed. We provide a general SMQ implementation which can surpass state-of-the-art schedulers such as OBIM and PMOD in terms of performance on popular graph-processing benchmarks. Notably, the performance improvement comes mainly from the superior rank guarantees provided by our scheduler, confirming that analytically-reasoned approaches can still provide performance improvements for priority task scheduling.}, author = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, isbn = {9781450392044}, location = {Seoul, Republic of Korea}, pages = {353--367}, publisher = {Association for Computing Machinery}, title = {{Multi-queues can be state-of-the-art priority schedulers}}, doi = {10.1145/3503221.3508432}, year = {2022}, } @misc{13076, abstract = {This artifact provides the source code for replicating the experiments presented in the paper. The implementation of the designed priority schedulers can be found in Galois-2.2.1/include/Galois/WorkList/: StealingMultiQueue.h is the StealingMultiQueue. MQOptimized/ contains MQ Optimized variants. We provide images that contain all the dependencies and datasets. Images can be pulled from the npostnikova/mq-based-schedulers repository, or downloaded from Zenodo.
See the readme for more details.}, author = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian}, publisher = {Zenodo}, title = {{Multi-queues can be state-of-the-art priority schedulers}}, doi = {10.5281/ZENODO.5733408}, year = {2022}, } @inproceedings{11707, abstract = {In this work we introduce the graph-theoretic notion of mendability: for each locally checkable graph problem we can define its mending radius, which captures the idea of how far one needs to modify a partial solution in order to “patch a hole.” We explore how mendability is connected to the existence of efficient algorithms, especially in distributed, parallel, and fault-tolerant settings. It is easy to see that O(1)-mendable problems are also solvable in O(log∗ n) rounds in the LOCAL model of distributed computing. One of the surprises is that in paths and cycles, a converse also holds in the following sense: if a problem Π can be solved in O(log∗ n) rounds, there is always a restriction Π′⊆Π that is still efficiently solvable but that is also O(1)-mendable. We also explore the structure of the landscape of mendability. For example, we show that in trees, the mending radius of any locally checkable problem is O(1), Θ(log n), or Θ(n), while in general graphs the structure is much more diverse.}, author = {Balliu, Alkida and Hirvonen, Juho and Melnyk, Darya and Olivetti, Dennis and Rybicki, Joel and Suomela, Jukka}, booktitle = {International Colloquium on Structural Information and Communication Complexity}, editor = {Parter, Merav}, isbn = {9783031099922}, issn = {1611-3349}, location = {Paderborn, Germany}, pages = {1--20}, publisher = {Springer Nature}, title = {{Local mending}}, doi = {10.1007/978-3-031-09993-9_1}, volume = {13298}, year = {2022}, } @inproceedings{12299, abstract = {Transfer learning is a classic paradigm by which models pretrained on large “upstream” datasets are adapted to yield good results on “downstream” specialized datasets. Generally, more accurate models on the “upstream” dataset tend to provide better transfer accuracy “downstream”. In this work, we perform an in-depth investigation of this phenomenon in the context of convolutional neural networks (CNNs) trained on the ImageNet dataset, which have been pruned, that is, compressed by sparsifying their connections. We consider transfer using unstructured pruned models obtained by applying several state-of-the-art pruning methods, including magnitude-based, second-order, regrowth, lottery-ticket, and regularization approaches, in the context of twelve standard transfer tasks. In a nutshell, our study shows that sparse models can match or even outperform the transfer performance of dense models, even at high sparsities, and, while doing so, can lead to significant inference and even training speedups. At the same time, we observe and analyze significant differences in the behaviour of different pruning methods.
The code is available at: https://github.com/IST-DASLab/sparse-imagenet-transfer.}, author = {Iofinova, Eugenia B and Peste, Elena-Alexandra and Kurtz, Mark and Alistarh, Dan-Adrian}, booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition}, issn = {2575-7075}, location = {New Orleans, LA, United States}, pages = {12256--12266}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{How well do sparse ImageNet models transfer?}}, doi = {10.1109/cvpr52688.2022.01195}, year = {2022}, } @article{10180, abstract = {The growing energy and performance costs of deep learning have driven the community to reduce the size of neural networks by selectively pruning components. Similarly to their biological counterparts, sparse networks generalize just as well as, and sometimes even better than, the original dense networks. Sparsity promises to reduce the memory footprint of regular networks to fit mobile devices, as well as shorten training time for ever-growing networks. In this paper, we survey prior work on sparsity in deep learning and provide an extensive tutorial of sparsification for both inference and training. We describe approaches to remove and add elements of neural networks, different training strategies to achieve model sparsity, and mechanisms to exploit sparsity in practice. Our work distills ideas from more than 300 research papers and provides guidance to practitioners who wish to utilize sparsity today, as well as to researchers whose goal is to push the frontier forward. We include the necessary background on mathematical methods in sparsification, describe phenomena such as early structure adaptation, the intricate relations between sparsity and the training process, and show techniques for achieving acceleration on real hardware. We also define a metric of pruned parameter efficiency that could serve as a baseline for comparison of different sparse networks. We close by speculating on how sparsity can improve future workloads and outline major open problems in the field.}, author = {Hoefler, Torsten and Alistarh, Dan-Adrian and Ben-Nun, Tal and Dryden, Nikoli and Peste, Elena-Alexandra}, issn = {1533-7928}, journal = {Journal of Machine Learning Research}, number = {241}, pages = {1--124}, publisher = {Journal of Machine Learning Research}, title = {{Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks}}, volume = {22}, year = {2021}, } @inproceedings{10218, abstract = {Let G be a graph on n nodes. In the stochastic population protocol model, a collection of n indistinguishable, resource-limited nodes collectively solve tasks via pairwise interactions. In each interaction, two randomly chosen neighbors first read each other’s states, and then update their local states. A rich line of research has established tight upper and lower bounds on the complexity of fundamental tasks, such as majority and leader election, in this model, when G is a clique. Specifically, in the clique, these tasks can be solved fast, i.e., in n polylog n pairwise interactions, with high probability, using at most polylog n states per node. In this work, we consider the more general setting where G is an arbitrary graph, and present a technique for simulating protocols designed for fully-connected networks in any connected regular graph.
Our main result is a simulation that is efficient on many interesting graph families: roughly, the simulation overhead is polylogarithmic in the number of nodes, and quadratic in the conductance of the graph. As an example, this implies that, in any regular graph with conductance φ, both leader election and exact majority can be solved in φ^{-2} ⋅ n polylog n pairwise interactions, with high probability, using at most φ^{-2} ⋅ polylog n states per node. This shows that there are fast and space-efficient population protocols for leader election and exact majority on graphs with good expansion properties.}, author = {Alistarh, Dan-Adrian and Gelashvili, Rati and Rybicki, Joel}, booktitle = {35th International Symposium on Distributed Computing}, isbn = {9-783-9597-7210-5}, issn = {1868-8969}, location = {Freiburg, Germany}, publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik}, title = {{Brief announcement: Fast graphical population protocols}}, doi = {10.4230/LIPIcs.DISC.2021.43}, volume = {209}, year = {2021}, } @inproceedings{10217, abstract = {This paper gives tight logarithmic lower bounds on the solo step complexity of leader election in an asynchronous shared-memory model with single-writer multi-reader (SWMR) registers, for both deterministic and randomized obstruction-free algorithms. The approach extends to lower bounds for deterministic and randomized obstruction-free algorithms using multi-writer registers under bounded write concurrency, showing a trade-off between the solo step complexity of a leader election algorithm, and the worst-case number of stalls incurred by a processor in an execution.}, author = {Alistarh, Dan-Adrian and Gelashvili, Rati and Nadiradze, Giorgi}, booktitle = {35th International Symposium on Distributed Computing}, isbn = {9-783-9597-7210-5}, issn = {1868-8969}, location = {Freiburg, Germany}, publisher = {Schloss Dagstuhl - Leibniz Zentrum für Informatik}, title = {{Lower bounds for shared-memory leader election under bounded write contention}}, doi = {10.4230/LIPIcs.DISC.2021.4}, volume = {209}, year = {2021}, } @inproceedings{10216, abstract = {This paper reports a new concurrent graph data structure that supports both edge and vertex updates, as well as queries: breadth-first search, single-source shortest-path, and betweenness centrality. The operations are provably linearizable and non-blocking.}, author = {Chatterjee, Bapi and Peri, Sathya and Sa, Muktikanta}, booktitle = {35th International Symposium on Distributed Computing}, isbn = {9-783-9597-7210-5}, issn = {1868-8969}, location = {Freiburg, Germany}, publisher = {Schloss Dagstuhl - Leibniz Zentrum für Informatik}, title = {{Brief announcement: Non-blocking dynamic unbounded graphs with worst-case amortized bounds}}, doi = {10.4230/LIPIcs.DISC.2021.52}, volume = {209}, year = {2021}, } @inproceedings{10219, abstract = {We show that any algorithm that solves the sinkless orientation problem in the supported LOCAL model requires Ω(log n) rounds, and this is tight.
The supported LOCAL model is at least as strong as the usual LOCAL model, and as a corollary this also gives a new, short, and elementary proof that the round complexity of the sinkless orientation problem in the deterministic LOCAL model is Ω(log n).}, author = {Korhonen, Janne and Paz, Ami and Rybicki, Joel and Schmid, Stefan and Suomela, Jukka}, booktitle = {35th International Symposium on Distributed Computing}, isbn = {978-3-95977-210-5}, issn = {1868-8969}, location = {Freiburg, Germany}, publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik}, title = {{Brief announcement: Sinkless orientation is hard also in the supported LOCAL model}}, doi = {10.4230/LIPIcs.DISC.2021.58}, volume = {209}, year = {2021}, } @inproceedings{10853, abstract = {Dynamic connectivity is a fundamental algorithmic graph problem, motivated by a wide range of applications to social and communication networks and used as a building block in various other algorithms, such as the bi-connectivity and the dynamic minimal spanning tree problems. In brief, we wish to maintain the connected components of the graph under dynamic edge insertions and deletions. In the sequential case, the problem has been well-studied from both theoretical and practical perspectives. However, much less is known about efficient concurrent solutions to this problem. This is the gap we address in this paper. We start from one of the classic data structures used to solve this problem, the Euler Tour Tree. Our first contribution is a non-blocking single-writer implementation of it. We leverage this data structure to obtain the first truly concurrent generalization of dynamic connectivity, which preserves the time complexity of its sequential counterpart, but is also scalable in practice. To achieve this, we rely on three main techniques. The first is to ensure that connectivity queries, which usually dominate real-world workloads, are non-blocking. The second non-trivial technique expands the above idea by making all queries that do not change the connectivity structure non-blocking. The third ingredient is applying fine-grained locking for updating the connected components, which allows operations on disjoint components to occur in parallel. We evaluate the resulting algorithm on various workloads, executing on both real and synthetic graphs. The results show the efficiency of each of the proposed optimizations; the most efficient variant improves the performance of a coarse-grained implementation on realistic scenarios by up to 6× on average and by up to 30× when connectivity queries dominate.}, author = {Fedorov, Alexander and Koval, Nikita and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 33rd ACM Symposium on Parallelism in Algorithms and Architectures}, isbn = {9781450380706}, location = {Virtual, Online}, pages = {208--220}, publisher = {Association for Computing Machinery}, title = {{A scalable concurrent algorithm for dynamic connectivity}}, doi = {10.1145/3409964.3461810}, year = {2021}, } @inproceedings{11436, abstract = {Asynchronous distributed algorithms are a popular way to reduce synchronization costs in large-scale optimization, and in particular for neural network training. However, for nonsmooth and nonconvex objectives, few convergence guarantees exist beyond cases where closed-form proximal operator solutions are available. As training most popular deep neural networks corresponds to optimizing nonsmooth and nonconvex objectives, there is a pressing need for such convergence guarantees.
In this paper, we analyze for the first time the convergence of stochastic asynchronous optimization for this general class of objectives. In particular, we focus on stochastic subgradient methods allowing for block variable partitioning, where the shared model is asynchronously updated by concurrent processes. To this end, we use a probabilistic model which captures key features of real asynchronous scheduling between concurrent processes. Under this model, we establish convergence with probability one to an invariant set for stochastic subgradient methods with momentum. From a practical perspective, one issue with the family of algorithms that we consider is that they are not efficiently supported by machine learning frameworks, which mostly focus on distributed data-parallel strategies. To address this, we propose a new implementation strategy for shared-memory-based training of deep neural networks for a partitioned but shared model in single- and multi-GPU settings. Based on this implementation, we achieve on average a 1.2× speed-up in comparison to state-of-the-art training methods for popular image classification tasks, without compromising accuracy.}, author = {Kungurtsev, Vyacheslav and Egan, Malcolm and Chatterjee, Bapi and Alistarh, Dan-Adrian}, booktitle = {35th AAAI Conference on Artificial Intelligence, AAAI 2021}, isbn = {9781713835974}, issn = {2374-3468}, location = {Virtual, Online}, number = {9B}, pages = {8209--8216}, publisher = {AAAI Press}, title = {{Asynchronous optimization methods for efficient training of deep neural networks with guarantees}}, volume = {35}, year = {2021}, } @inproceedings{11452, abstract = {We study efficient distributed algorithms for the fundamental problem of principal component analysis and leading eigenvector computation on the sphere, when the data are randomly distributed among a set of computational nodes. We propose a new quantized variant of Riemannian gradient descent to solve this problem, and prove that the algorithm converges with high probability under a set of necessary spherical-convexity properties. We give bounds on the number of bits transmitted by the algorithm under common initialization schemes, and investigate the dependency on the problem dimension in each case.}, author = {Alimisis, Foivos and Davies, Peter and Vandereycken, Bart and Alistarh, Dan-Adrian}, booktitle = {Advances in Neural Information Processing Systems - 35th Conference on Neural Information Processing Systems}, isbn = {9781713845393}, issn = {1049-5258}, location = {Virtual, Online}, pages = {2823--2834}, publisher = {Neural Information Processing Systems Foundation}, title = {{Distributed principal component analysis with limited communication}}, volume = {4}, year = {2021}, } @inproceedings{11463, abstract = {Efficiently approximating local curvature information of the loss function is a key tool for optimization and compression of deep neural networks. Yet, most existing methods to approximate second-order information have high computational or storage costs, which limits their practicality. In this work, we investigate matrix-free, linear-time approaches for estimating Inverse-Hessian Vector Products (IHVPs) for the case when the Hessian can be approximated as a sum of rank-one matrices, as in the classic approximation of the Hessian by the empirical Fisher matrix.
We propose two new algorithms: the first is tailored towards network compression and can compute the IHVP for dimension d, if the Hessian is given as a sum of m rank-one matrices, using O(dm^2) precomputation, O(dm) cost for computing the IHVP, and query cost O(m) for any single element of the inverse Hessian. The second algorithm targets an optimization setting, where we wish to compute the product between the inverse Hessian, estimated over a sliding window of optimization steps, and a given gradient direction, as required for preconditioned SGD. We give an algorithm with cost O(dm + m^2) for computing the IHVP and O(dm + m^3) for adding or removing any gradient from the sliding window. These two algorithms yield state-of-the-art results for network pruning and optimization with lower computational overhead relative to existing second-order methods. Implementations are available at [9] and [17].}, author = {Frantar, Elias and Kurtic, Eldar and Alistarh, Dan-Adrian}, booktitle = {35th Conference on Neural Information Processing Systems}, isbn = {9781713845393}, issn = {1049-5258}, location = {Virtual, Online}, pages = {14873--14886}, publisher = {Curran Associates}, title = {{M-FAC: Efficient matrix-free approximations of second-order information}}, volume = {34}, year = {2021}, } @inproceedings{11464, abstract = {We consider a standard distributed optimisation setting where N machines, each holding a d-dimensional function f_i, aim to jointly minimise the sum of the functions ∑_{i=1}^N f_i(x). This problem arises naturally in large-scale distributed optimisation, where a standard solution is to apply variants of (stochastic) gradient descent. We focus on the communication complexity of this problem: our main result provides the first fully unconditional bounds on the total number of bits which need to be sent and received by the N machines to solve this problem under point-to-point communication, within a given error-tolerance. Specifically, we show that Ω(Nd log d / Nε) total bits need to be communicated between the machines to find an additive ε-approximation to the minimum of ∑_{i=1}^N f_i(x). The result holds for both deterministic and randomised algorithms, and, importantly, requires no assumptions on the algorithm structure. The lower bound is tight under certain restrictions on parameter values, and is matched within constant factors for quadratic objectives by a new variant of quantised gradient descent, which we describe and analyse. Our results bring over tools from communication complexity to distributed optimisation, which has potential for further applications.}, author = {Alistarh, Dan-Adrian and Korhonen, Janne}, booktitle = {35th Conference on Neural Information Processing Systems}, isbn = {9781713845393}, issn = {1049-5258}, location = {Virtual, Online}, pages = {7254--7266}, publisher = {Curran Associates}, title = {{Towards tight communication lower bounds for distributed optimisation}}, volume = {34}, year = {2021}, } @inproceedings{9543, abstract = {We consider the problem of distributed mean estimation (DME), in which n machines are each given a local d-dimensional vector x_v ∈ R^d, and must cooperate to estimate the mean of their inputs μ = (1/n) ∑_{v=1}^n x_v, while minimizing total communication cost. DME is a fundamental construct in distributed machine learning, and there has been considerable work on variants of this problem, especially in the context of distributed variance reduction for stochastic gradients in parallel SGD.
Previous work typically assumes an upper bound on the norm of the input vectors, and achieves an error bound in terms of this norm. However, in many real applications, the input vectors are concentrated around the correct output μ, but μ itself has large norm. In such cases, previous output error bounds perform poorly. In this paper, we show that output error bounds need not depend on input norm. We provide a method of quantization which allows distributed mean estimation to be performed with solution quality dependent only on the distance between inputs, not on input norm, and show an analogous result for distributed variance reduction. The technique is based on a new connection with lattice theory. We also provide lower bounds showing that the communication-to-error trade-off of our algorithms is asymptotically optimal. As the lattices achieving optimal bounds under the l2-norm can be computationally impractical, we also present an extension which leverages easy-to-use cubic lattices, and is loose only up to a logarithmic factor in d. We show experimentally that our method yields practical improvements for common applications, relative to prior approaches.}, author = {Davies, Peter and Gurunanthan, Vijaykrishna and Moshrefi, Niusha and Ashkboos, Saleh and Alistarh, Dan-Adrian}, booktitle = {9th International Conference on Learning Representations}, location = {Virtual}, title = {{New bounds for distributed mean estimation and variance reduction}}, year = {2021}, } @inproceedings{9620, abstract = {In this note, we introduce a distributed twist on the classic coupon collector problem: a set of m collectors wish to each obtain a set of n coupons; for this, they can each sample coupons uniformly at random, but can also meet in pairwise interactions, during which they can exchange coupons. By doing so, they hope to reduce the number of coupons that must be sampled by each collector in order to obtain a full set. This extension is natural when considering real-world manifestations of the coupon collector phenomenon, and has been remarked upon and studied empirically (Hayes and Hannigan 2006, Ahmad et al. 2014, Delmarcelle 2019). We provide the first theoretical analysis for such a scenario. We find that “coupon collecting with friends” can indeed significantly reduce the number of coupons each collector must sample, and that it raises interesting connections to the more traditional variants of the problem. While our analysis is in most cases asymptotically tight, there are several open questions raised, regarding finer-grained analysis of both “coupon collecting with friends,” and of a long-studied variant of the original problem in which a collector requires multiple full sets of coupons.}, author = {Alistarh, Dan-Adrian and Davies, Peter}, booktitle = {Structural Information and Communication Complexity}, isbn = {9783030795269}, issn = {1611-3349}, location = {Wrocław, Poland}, pages = {3--12}, publisher = {Springer Nature}, title = {{Collecting coupons is faster with friends}}, doi = {10.1007/978-3-030-79527-6_1}, volume = {12810}, year = {2021}, } @inproceedings{9823, abstract = {Approximate agreement is one of the few variants of consensus that can be solved in a wait-free manner in asynchronous systems where processes communicate by reading and writing to shared memory. In this work, we consider a natural generalisation of approximate agreement on arbitrary undirected connected graphs.
Each process is given a vertex of the graph as input and, if non-faulty, must output a vertex such that all the outputs are within distance 1 of one another, and each output value lies on a shortest path between two input values. From prior work, it is known that there is no wait-free algorithm among n ≥ 3 processes for this problem on any cycle of length c ≥ 4, by reduction from 2-set agreement (Castañeda et al. 2018). In this work, we investigate the solvability and complexity of this task on general graphs. We give a new, direct proof of the impossibility of approximate agreement on cycles of length c ≥ 4, via a generalisation of Sperner’s Lemma to convex polygons. We also extend the reduction from 2-set agreement to a larger class of graphs, showing that approximate agreement on these graphs is unsolvable. On the positive side, we present a wait-free algorithm for a class of graphs that properly contains the class of chordal graphs.}, author = {Alistarh, Dan-Adrian and Ellen, Faith and Rybicki, Joel}, booktitle = {Structural Information and Communication Complexity}, isbn = {9783030795269}, issn = {1611-3349}, location = {Wrocław, Poland}, pages = {87--105}, publisher = {Springer Nature}, title = {{Wait-free approximate agreement on graphs}}, doi = {10.1007/978-3-030-79527-6_6}, volume = {12810}, year = {2021}, } @inproceedings{11458, abstract = {The increasing computational requirements of deep neural networks (DNNs) have led to significant interest in obtaining DNN models that are sparse, yet accurate. Recent work has investigated the even harder case of sparse training, where the DNN weights are, for as much as possible, already sparse to reduce computational costs during training. Existing sparse training methods are often empirical and can have lower accuracy relative to the dense baseline. In this paper, we present a general approach called Alternating Compressed/DeCompressed (AC/DC) training of DNNs, demonstrate convergence for a variant of the algorithm, and show that AC/DC outperforms existing sparse training methods in accuracy at similar computational budgets; at high sparsity levels, AC/DC even outperforms existing methods that rely on accurate pre-trained dense models. An important property of AC/DC is that it allows co-training of dense and sparse models, yielding accurate sparse–dense model pairs at the end of the training process. This is useful in practice, where compressed variants may be desirable for deployment in resource-constrained settings without re-doing the entire training flow, and also provides us with insights into the accuracy gap between dense and compressed models. The code is available at: https://github.com/IST-DASLab/ACDC.}, author = {Peste, Elena-Alexandra and Iofinova, Eugenia B and Vladu, Adrian and Alistarh, Dan-Adrian}, booktitle = {35th Conference on Neural Information Processing Systems}, isbn = {9781713845393}, issn = {1049-5258}, location = {Virtual, Online}, pages = {8557--8570}, publisher = {Curran Associates}, title = {{AC/DC: Alternating Compressed/DeCompressed training of deep neural networks}}, volume = {34}, year = {2021}, } @inproceedings{13147, abstract = {We investigate fast and communication-efficient algorithms for the classic problem of minimizing a sum of strongly convex and smooth functions that are distributed among n different nodes, which can communicate using a limited number of bits.
Most previous communication-efficient approaches for this problem are limited to first-order optimization, and therefore have \emph{linear} dependence on the condition number in their communication complexity. We show that this dependence is not inherent: communication-efficient methods can in fact have sublinear dependence on the condition number. For this, we design and analyze the first communication-efficient distributed variants of preconditioned gradient descent for Generalized Linear Models, and for Newton’s method. Our results rely on a new technique for quantizing both the preconditioner and the descent direction at each step of the algorithms, while controlling their convergence rate. We also validate our findings experimentally, showing faster convergence and reduced communication relative to previous methods.}, author = {Alimisis, Foivos and Davies, Peter and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 38th International Conference on Machine Learning}, isbn = {9781713845065}, issn = {2640-3498}, location = {Virtual}, pages = {196--206}, publisher = {ML Research Press}, title = {{Communication-efficient distributed optimization with quantized preconditioners}}, volume = {139}, year = {2021}, } @article{8723, abstract = {Deep learning at scale is dominated by communication time. Distributing samples across nodes usually yields the best performance, but poses scaling challenges due to global information dissemination and load imbalance across uneven sample lengths. State-of-the-art decentralized optimizers mitigate the problem, but require more iterations to achieve the same accuracy as their globally-communicating counterparts. We present Wait-Avoiding Group Model Averaging (WAGMA) SGD, a wait-avoiding stochastic optimizer that reduces global communication via subgroup weight exchange. The key insight is a combination of algorithmic changes to the averaging scheme and the use of a group allreduce operation. We prove the convergence of WAGMA-SGD, and empirically show that it retains convergence rates similar to Allreduce-SGD. For evaluation, we train ResNet-50 on ImageNet; Transformer for machine translation; and deep reinforcement learning for navigation at scale. Compared with state-of-the-art decentralized SGD variants, WAGMA-SGD significantly improves training throughput (e.g., 2.1× on 1,024 GPUs for reinforcement learning), and achieves the fastest time-to-solution (e.g., the highest score using the shortest training time for Transformer).}, author = {Li, Shigang and Ben-Nun, Tal and Nadiradze, Giorgi and Di Girolamo, Salvatore and Dryden, Nikoli and Alistarh, Dan-Adrian and Hoefler, Torsten}, issn = {1045-9219}, journal = {IEEE Transactions on Parallel and Distributed Systems}, number = {7}, publisher = {IEEE}, title = {{Breaking (global) barriers in parallel stochastic optimization with wait-avoiding group averaging}}, doi = {10.1109/TPDS.2020.3040606}, volume = {32}, year = {2021}, } @article{9827, abstract = {Nearest neighbour search (NNS) is a fundamental problem in many application domains dealing with multidimensional data. In a concurrent setting, where dynamic modifications are allowed, a linearizable implementation of the NNS is highly desirable. This paper introduces the LockFree-kD-tree (LFkD-tree): a lock-free concurrent kD-tree, which implements an abstract data type (ADT) that provides the operations Add, Remove, Contains, and NNS. Our implementation is linearizable.
The operations in the LFkD-tree use single-word read and compare-and-swap (CAS) atomic primitives, which are readily supported on available multi-core processors. We experimentally evaluate the LFkD-tree using several benchmarks comprising real-world and synthetic datasets. The experiments show that the presented design is scalable and achieves significant speed-up compared to the implementations of an existing sequential kD-tree and a recently proposed multidimensional indexing structure, PH-tree.}, author = {Chatterjee, Bapi and Walulya, Ivan and Tsigas, Philippas}, issn = {0304-3975}, journal = {Theoretical Computer Science}, keywords = {Concurrent data structure, kD-tree, Nearest neighbor search, Similarity search, Lock-free, Linearizability}, pages = {27--48}, publisher = {Elsevier}, title = {{Concurrent linearizable nearest neighbour search in LockFree-kD-tree}}, doi = {10.1016/j.tcs.2021.06.041}, volume = {886}, year = {2021}, } @inproceedings{9951, abstract = {There has recently been a surge of interest in the computational and complexity properties of the population model, which assumes n anonymous, computationally-bounded nodes, interacting at random, with the goal of jointly computing global predicates. Significant work has gone towards investigating majority or consensus dynamics in this model: that is, assuming that every node is initially in one of two states X or Y, determine which state had higher initial count. In this paper, we consider a natural generalization of majority/consensus, which we call comparison: in its simplest formulation, we are given two baseline states, X and Y, present in any initial configuration in fixed, but possibly small counts. One of these states has higher count than the other: we will assume |X_0| > C |Y_0| for some constant C > 1. The challenge is to design a protocol by which nodes can quickly and reliably decide which of the baseline states X_0 and Y_0 has the higher initial count. We begin by analyzing a simple and general dynamics solving the above comparison problem, which uses O(log n) states per node, and converges in O(log n) (parallel) time, with high probability, to a state where the whole population votes on opinions X or Y at rates proportional to the initial concentrations of |X_0| vs. |Y_0|. We then describe how this procedure can be bootstrapped to solve comparison, i.e. have every node in the population reach the “correct” decision, with probability 1 - o(1), at the cost of O(log log n) additional states. Further, we prove that this dynamics is self-stabilizing, in the sense that it converges to the correct decision from arbitrary initial states, and leak-robust, in the sense that it can withstand spurious faulty reactions, which are known to occur in practical implementations of population protocols.
Our analysis is based on a new martingale concentration result relating the discrete-time evolution of a population protocol to its expected (steady-state) analysis, which should be a useful tool when analyzing opinion dynamics and epidemic dissemination in the population model.}, author = {Alistarh, Dan-Adrian and Töpfer, Martin and Uznański, Przemysław}, booktitle = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing}, isbn = {9781450385480}, location = {Virtual, Italy}, pages = {55--65}, publisher = {Association for Computing Machinery}, title = {{Comparison dynamics in population protocols}}, doi = {10.1145/3465084.3467915}, year = {2021}, } @inproceedings{9935, abstract = {We present a deterministic O(log log log n)-round low-space Massively Parallel Computation (MPC) algorithm for the classical problem of (Δ+1)-coloring on n-vertex graphs. In this model, every machine has sublinear local space of size n^φ for any arbitrary constant φ ∈ (0,1). Our algorithm works under the relaxed setting where each machine is allowed to perform exponential local computations, while respecting the n^φ space and bandwidth limitations. Our key technical contribution is a novel derandomization of the ingenious (Δ+1)-coloring local algorithm by Chang-Li-Pettie (STOC 2018, SIAM J. Comput. 2020). The Chang-Li-Pettie algorithm runs in T_local = poly(log log n) rounds, which sets the state-of-the-art randomized round complexity for the problem in the LOCAL model. Our derandomization employs a combination of tools, notably pseudorandom generators (PRG) and bounded-independence hash functions. The achieved round complexity of O(log log log n) rounds matches the bound of log(T_local), which currently serves as an upper bound barrier for all known randomized algorithms for locally-checkable problems in this model. Furthermore, no deterministic sublogarithmic low-space MPC algorithms for the (Δ+1)-coloring problem have been known before.}, author = {Czumaj, Artur and Davies, Peter and Parter, Merav}, booktitle = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing}, isbn = {9781450385480}, location = {Virtual, Italy}, pages = {469--479}, publisher = {Association for Computing Machinery}, title = {{Improved deterministic (Δ+1) coloring in low-space MPC}}, doi = {10.1145/3465084.3467937}, year = {2021}, } @inproceedings{9933, abstract = {In this paper, we study the power and limitations of component-stable algorithms in the low-space model of Massively Parallel Computation (MPC). Recently, Ghaffari, Kuhn, and Uitto (FOCS 2019) introduced the class of component-stable low-space MPC algorithms, which are, informally, defined as algorithms for which the outputs reported by the nodes in different connected components are required to be independent. This very natural notion was introduced to capture most (if not all) of the known efficient MPC algorithms to date, and it was the first general class of MPC algorithms for which one can show non-trivial conditional lower bounds. In this paper we enhance the framework of component-stable algorithms and investigate its effect on the complexity of randomized and deterministic low-space MPC. Our key contributions include: 1) We revise and formalize the lifting approach of Ghaffari, Kuhn and Uitto. This requires a very delicate amendment of the notion of component stability, which allows us to fill in gaps in the earlier arguments.
2) We also extend the framework to obtain conditional lower bounds for deterministic algorithms and fine-grained lower bounds that depend on the maximum degree Δ. 3) We demonstrate a collection of natural graph problems for which non-component-stable algorithms break the conditional lower bound obtained for component-stable algorithms. This implies that, for both deterministic and randomized algorithms, component-stable algorithms are conditionally weaker than the non-component-stable ones. Altogether, our results imply that component-stability might limit the computational power of the low-space MPC model, paving the way for improved upper bounds that escape the conditional lower bound setting of Ghaffari, Kuhn, and Uitto.}, author = {Czumaj, Artur and Davies, Peter and Parter, Merav}, booktitle = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing}, isbn = {9781450385480}, location = {Virtual, Italy}, pages = {481--491}, publisher = {Association for Computing Machinery}, title = {{Component stability in low-space massively parallel computation}}, doi = {10.1145/3465084.3467903}, year = {2021}, } @inproceedings{10432, abstract = {One key element behind the recent progress of machine learning has been the ability to train machine learning models in large-scale distributed shared-memory and message-passing environments. Most of these models are trained using variants of stochastic gradient descent (SGD)-based optimization, although most methods involve some type of consistency relaxation relative to sequential SGD, to mitigate its large communication or synchronization costs at scale. In this paper, we introduce a general consistency condition covering communication-reduced and asynchronous distributed SGD implementations. Our framework, called elastic consistency, decouples the system-specific aspects of the implementation from the SGD convergence requirements, giving a general way to obtain convergence bounds for a wide variety of distributed SGD methods used in practice. Elastic consistency can be used to re-derive or improve several previous convergence bounds in message-passing and shared-memory settings, but also to analyze new models and distribution schemes. As a direct application, we propose and analyze a new synchronization-avoiding scheduling scheme for distributed SGD, and show that it can be used to efficiently train deep convolutional models for image classification.}, author = {Nadiradze, Giorgi and Markov, Ilia and Chatterjee, Bapi and Kungurtsev, Vyacheslav and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence}, location = {Virtual}, number = {10}, pages = {9037--9045}, publisher = {AAAI Press}, title = {{Elastic consistency: A practical consistency model for distributed stochastic gradient descent}}, volume = {35}, year = {2021}, }
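
As a quick illustration of the algebra behind M-FAC (entry 11463 above): when the Hessian is approximated by a damped empirical Fisher, H = λI + (1/m) Σ_i g_i g_iᵀ, the Woodbury identity reduces an inverse-Hessian vector product to a single m × m solve. The numpy sketch below only verifies this identity with our own choice of damping; it is a naive dense check of the underlying algebra, not the paper's optimized O(dm)-cost algorithm.

# Sanity check of the sum-of-rank-one IHVP algebra (cf. entry 11463).
# Naive dense sketch; the damping lam and all sizes are illustrative choices.
import numpy as np

rng = np.random.default_rng(0)
d, m, lam = 200, 16, 0.1            # dimension, number of gradients, damping
G = rng.standard_normal((m, d))     # rows g_i: per-sample gradients
v = rng.standard_normal(d)

# H = lam*I + (1/m) * G^T G  (damped empirical Fisher approximation)
H = lam * np.eye(d) + G.T @ G / m

# Woodbury: H^{-1} v = v/lam - (1/lam) G^T (m*lam*I + G G^T)^{-1} G v
small = m * lam * np.eye(m) + G @ G.T     # only an m x m system
ihvp = v / lam - G.T @ np.linalg.solve(small, G @ v) / lam

assert np.allclose(ihvp, np.linalg.solve(H, v), atol=1e-8)
print("Woodbury IHVP matches the dense solve")

The point of the identity is that the d × d inverse never needs to be formed, which is what makes per-element queries and sliding-window updates cheap in the paper's setting.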
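Similarly, the "coupon collecting with friends" setting of entry 9620 can be made concrete with a small Monte Carlo sketch comparing the average number of samples per collector with and without pairwise exchanges. Both the pairing schedule and the exchange rule below are simplifying assumptions for illustration, not the protocol analyzed in the paper.

# Monte Carlo sketch of coupon collecting with and without friends (entry 9620).
import random
from collections import Counter

def collect(n_coupons: int, m_collectors: int, with_friends: bool, seed: int = 0) -> float:
    """Average number of samples per collector until everyone holds a full set."""
    rng = random.Random(seed)
    hands = [Counter() for _ in range(m_collectors)]  # multiset held by each collector
    samples = 0
    while any(len(h) < n_coupons for h in hands):
        # every collector still missing coupons draws one uniformly random coupon
        for h in hands:
            if len(h) < n_coupons:
                h[rng.randrange(n_coupons)] += 1
                samples += 1
        if with_friends and m_collectors >= 2:
            # a random pair meets and hands over spare duplicates the other is missing
            a, b = rng.sample(range(m_collectors), 2)
            for src, dst in ((hands[a], hands[b]), (hands[b], hands[a])):
                for c in [c for c, k in src.items() if k > 1 and c not in dst]:
                    src[c] -= 1
                    dst[c] += 1
    return samples / m_collectors

if __name__ == "__main__":
    n, m = 50, 4
    print("solo   :", collect(n, m, with_friends=False))
    print("friends:", collect(n, m, with_friends=True))

Running it with, e.g., n = 50 and m = 4 should show noticeably fewer samples per collector in the friends variant, matching the qualitative message of the paper.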
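Finally, the control flow behind AC/DC training (entry 11458), which alternates dense "decompressed" phases with compressed phases where a top-magnitude mask is applied and pruned weights are frozen at zero, can be sketched on a toy least-squares problem. The phase length, sparsity level, and step size below are illustrative assumptions, not the paper's training recipe.

# Control-flow sketch of Alternating Compressed/DeCompressed training
# (cf. entry 11458) on a toy least-squares problem. All hyperparameters
# are illustrative choices.
import numpy as np

rng = np.random.default_rng(1)
d, sparsity, lr, phase = 100, 0.9, 0.01, 50
X = rng.standard_normal((500, d))
w_true = np.where(rng.random(d) < 0.1, rng.standard_normal(d), 0.0)
y = X @ w_true

w = np.zeros(d)
mask = np.ones(d, dtype=bool)
for step in range(1000):
    if step % (2 * phase) == 0:
        mask[:] = True                       # decompressed phase: train densely
    elif step % (2 * phase) == phase:
        k = int((1 - sparsity) * d)          # compressed phase: keep top-k weights
        keep = np.argsort(np.abs(w))[-k:]
        mask[:] = False
        mask[keep] = True
        w[~mask] = 0.0
    grad = X.T @ (X @ w - y) / len(y)
    w -= lr * grad
    w[~mask] = 0.0                           # pruned weights stay frozen at zero

print("final loss:", float(np.mean((X @ w - y) ** 2)),
      "| nonzeros:", int(np.count_nonzero(w)))

Because this schedule ends in a compressed phase, the final weights are the sparse member of the sparse–dense pair that the abstract describes; stopping after a dense phase instead would yield the co-trained dense model.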