@inproceedings{13262, abstract = {Determining the degree of inherent parallelism in classical sequential algorithms and leveraging it for fast parallel execution is a key topic in parallel computing, and detailed analyses are known for a wide range of classical algorithms. In this paper, we perform the first such analysis for the fundamental Union-Find problem, in which we are given a graph as a sequence of edges, and must maintain its connectivity structure under edge additions. We prove that classic sequential algorithms for this problem are well-parallelizable under reasonable assumptions, addressing a conjecture by [Blelloch, 2017]. More precisely, we show via a new potential argument that, under uniform random edge ordering, parallel union-find operations are unlikely to interfere: T concurrent threads processing the graph in parallel will encounter memory contention O(T^2 · log |V| · log |E|) times in expectation, where |E| and |V| are the number of edges and nodes in the graph, respectively. We leverage this result to design a new parallel Union-Find algorithm that is both internally deterministic, i.e., its results are guaranteed to match those of a sequential execution, and work-efficient and scalable, as long as the number of threads T is O(|E|^(1/3 - ε)), for an arbitrarily small constant ε > 0, which holds for most large real-world graphs. We present lower bounds which show that our analysis is close to optimal, and experimental results suggesting that the performance cost of internal determinism is limited.}, author = {Fedorov, Alexander and Hashemi, Diba and Nadiradze, Giorgi and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 35th ACM Symposium on Parallelism in Algorithms and Architectures}, isbn = {9781450395458}, location = {Orlando, FL, United States}, pages = {261--271}, publisher = {Association for Computing Machinery}, title = {{Provably-efficient and internally-deterministic parallel Union-Find}}, doi = {10.1145/3558481.3591082}, year = {2023}, } @inproceedings{11180, abstract = {Designing and implementing efficient parallel priority schedulers is an active research area. An intriguing proposed design is the Multi-Queue: given n threads and m ≥ n distinct priority queues, task insertions are performed uniformly at random, while, to delete, a thread picks two queues uniformly at random, and removes the observed task of higher priority. This approach scales well, and has probabilistic rank guarantees: roughly, the rank of each task removed, relative to remaining tasks in all other queues, is O(m) in expectation. Yet, the performance of this pattern is below that of well-engineered schedulers, which eschew theoretical guarantees for practical efficiency. We investigate whether it is possible to design and implement a Multi-Queue-based task scheduler that is both highly efficient and backed by analytical guarantees. We propose the Stealing Multi-Queue (SMQ), a cache-efficient variant of the Multi-Queue, which leverages both queue affinity---each thread has a local queue, from which tasks are usually removed, but, with some probability, threads also attempt to steal higher-priority tasks from the other queues---and task batching, that is, the processing of several tasks in a single insert/remove step. These ideas are well-known for task scheduling without priorities; our theoretical contribution is showing that, despite these relaxations, this design can still provide rank guarantees, which in turn imply bounds on the total work performed. 
We provide a general SMQ implementation that can surpass state-of-the-art schedulers such as OBIM and PMOD in performance on popular graph-processing benchmarks. Notably, the performance improvement comes mainly from the superior rank guarantees provided by our scheduler, confirming that analytically reasoned approaches can still provide performance improvements for priority task scheduling.}, author = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, isbn = {9781450392044}, location = {Seoul, Republic of Korea}, pages = {353--367}, publisher = {Association for Computing Machinery}, title = {{Multi-queues can be state-of-the-art priority schedulers}}, doi = {10.1145/3503221.3508432}, year = {2022}, } @misc{13076, abstract = {The source code for replicating the experiments presented in the paper. The implementations of the designed priority schedulers can be found in Galois-2.2.1/include/Galois/WorkList/: StealingMultiQueue.h is the StealingMultiQueue, and MQOptimized/ contains the MQ Optimized variants. We provide images that contain all the dependencies and datasets. Images can be pulled from the npostnikova/mq-based-schedulers repository, or downloaded from Zenodo. See the README for more details.}, author = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian}, publisher = {Zenodo}, title = {{Multi-queues can be state-of-the-art priority schedulers}}, doi = {10.5281/ZENODO.5733408}, year = {2022}, } @inproceedings{10217, abstract = {This paper gives tight logarithmic lower bounds on the solo step complexity of leader election in an asynchronous shared-memory model with single-writer multi-reader (SWMR) registers, for both deterministic and randomized obstruction-free algorithms. The approach extends to lower bounds for deterministic and randomized obstruction-free algorithms using multi-writer registers under bounded write concurrency, showing a trade-off between the solo step complexity of a leader election algorithm and the worst-case number of stalls incurred by a processor in an execution.}, author = {Alistarh, Dan-Adrian and Gelashvili, Rati and Nadiradze, Giorgi}, booktitle = {35th International Symposium on Distributed Computing}, isbn = {978-3-95977-210-5}, issn = {1868-8969}, location = {Freiburg, Germany}, publisher = {Schloss Dagstuhl - Leibniz Zentrum für Informatik}, title = {{Lower bounds for shared-memory leader election under bounded write contention}}, doi = {10.4230/LIPIcs.DISC.2021.4}, volume = {209}, year = {2021}, } @article{8723, abstract = {Deep learning at scale is dominated by communication time. Distributing samples across nodes usually yields the best performance, but poses scaling challenges due to global information dissemination and load imbalance across uneven sample lengths. State-of-the-art decentralized optimizers mitigate the problem, but require more iterations to achieve the same accuracy as their globally-communicating counterparts. We present Wait-Avoiding Group Model Averaging (WAGMA) SGD, a wait-avoiding stochastic optimizer that reduces global communication via subgroup weight exchange. The key insight is a combination of algorithmic changes to the averaging scheme and the use of a group allreduce operation. We prove the convergence of WAGMA-SGD, and empirically show that it retains convergence rates similar to Allreduce-SGD. 
For evaluation, we train ResNet-50 on ImageNet; Transformer for machine translation; and deep reinforcement learning for navigation at scale. Compared with state-of-the-art decentralized SGD variants, WAGMA-SGD significantly improves training throughput (e.g., 2.1× on 1,024 GPUs for reinforcement learning), and achieves the fastest time-to-solution (e.g., the highest score using the shortest training time for Transformer).}, author = {Li, Shigang and Ben-Nun, Tal and Nadiradze, Giorgi and Di Girolamo, Salvatore and Dryden, Nikoli and Alistarh, Dan-Adrian and Hoefler, Torsten}, issn = {1045-9219}, journal = {IEEE Transactions on Parallel and Distributed Systems}, number = {7}, publisher = {IEEE}, title = {{Breaking (global) barriers in parallel stochastic optimization with wait-avoiding group averaging}}, doi = {10.1109/TPDS.2020.3040606}, volume = {32}, year = {2021}, } @inproceedings{10432, abstract = {One key element behind the recent progress of machine learning has been the ability to train machine learning models in large-scale distributed shared-memory and message-passing environments. Most of these models are trained using variants of stochastic gradient descent (SGD)-based optimization, but most methods involve some type of consistency relaxation relative to sequential SGD, to mitigate its large communication or synchronization costs at scale. In this paper, we introduce a general consistency condition covering communication-reduced and asynchronous distributed SGD implementations. Our framework, called elastic consistency, decouples the system-specific aspects of the implementation from the SGD convergence requirements, giving a general way to obtain convergence bounds for a wide variety of distributed SGD methods used in practice. Elastic consistency can be used to re-derive or improve several previous convergence bounds in message-passing and shared-memory settings, but also to analyze new models and distribution schemes. As a direct application, we propose and analyze a new synchronization-avoiding scheduling scheme for distributed SGD, and show that it can be used to efficiently train deep convolutional models for image classification.}, author = {Nadiradze, Giorgi and Markov, Ilia and Chatterjee, Bapi and Kungurtsev, Vyacheslav and Alistarh, Dan-Adrian}, booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence}, location = {Virtual}, number = {10}, pages = {9037--9045}, title = {{Elastic consistency: A practical consistency model for distributed stochastic gradient descent}}, volume = {35}, year = {2021}, } @phdthesis{10429, abstract = {The scalability of concurrent data structures and distributed algorithms strongly depends on reducing the contention for shared resources and the costs of synchronization and communication. We show how such cost reductions can be attained by relaxing the strict consistency conditions required by sequential implementations. In the first part of the thesis, we consider relaxation in the context of concurrent data structures. Specifically, in data structures such as priority queues, imposing strong semantics renders scalability impossible, since a correct implementation of the remove operation should return only the element with the highest priority. Intuitively, attempting to invoke remove operations concurrently creates a race condition. 
This bottleneck can be circumvented by relaxing the semantics of the affected data structure, thus allowing the removal of elements that are no longer required to have the highest priority. We show that randomized implementations of relaxed data structures provide provable guarantees on the priority of the removed elements, even under concurrency. Additionally, we show that, in some cases, relaxed data structures can be used to scale classical algorithms that are usually implemented with exact ones. In the second part, we study parallel variants of the stochastic gradient descent (SGD) algorithm, which distribute computation among multiple processors, thus reducing the running time. Unfortunately, in order for standard parallel SGD to succeed, each processor has to maintain a local copy of the model parameters that is identical to the local copies of the other processors; the overheads of this perfect consistency, in terms of communication and synchronization, can negate the speedup gained by distributing the computation. We show that the consistency conditions required by SGD can be relaxed, allowing the algorithm to be more flexible in terms of tolerating quantized communication, asynchrony, or even crash faults, while its convergence remains asymptotically the same.}, author = {Nadiradze, Giorgi}, issn = {2663-337X}, pages = {132}, publisher = {Institute of Science and Technology Austria}, title = {{On achieving scalability through relaxation}}, doi = {10.15479/at:ista:10429}, year = {2021}, } @inproceedings{10435, abstract = {Decentralized optimization is emerging as a viable alternative for scalable distributed machine learning, but also introduces new challenges in terms of synchronization costs. To this end, several communication-reduction techniques, such as non-blocking communication, quantization, and local steps, have been explored in the decentralized setting. Due to the complexity of analyzing optimization in such a relaxed setting, this line of work often assumes \emph{global} communication rounds, which require additional synchronization. In this paper, we consider decentralized optimization in the simpler, but harder to analyze, \emph{asynchronous gossip} model, in which communication occurs in discrete, randomly chosen pairings among nodes. Perhaps surprisingly, we show that a variant of SGD called \emph{SwarmSGD} still converges in this setting, even if \emph{non-blocking communication}, \emph{quantization}, and \emph{local steps} are all applied \emph{in conjunction}, and even if the node data distributions and underlying graph topology are both \emph{heterogeneous}. Our analysis is based on a new connection with multi-dimensional load-balancing processes. 
We implement this algorithm and deploy it in a supercomputing environment, showing that it can outperform previous decentralized methods in terms of end-to-end training time, and that it can even rival carefully tuned large-batch SGD for certain tasks.}, author = {Nadiradze, Giorgi and Sabour, Amirmojtaba and Davies, Peter and Li, Shigang and Alistarh, Dan-Adrian}, booktitle = {35th Conference on Neural Information Processing Systems}, location = {Sydney, Australia}, publisher = {Neural Information Processing Systems Foundation}, title = {{Asynchronous decentralized SGD with quantized and local updates}}, year = {2021}, } @article{8286, abstract = {We consider the following dynamic load-balancing process: given an underlying graph G with n nodes, in each step t ≥ 0, one unit of load is created and placed at a randomly chosen graph node. In the same step, the chosen node picks a random neighbor, and the two nodes balance their loads by averaging them. We are interested in the expected gap between the minimum and maximum loads at nodes as the process progresses, and its dependence on n and on the graph structure. Variants of the above graphical balanced allocation process have been studied previously by Peres, Talwar, and Wieder [Peres et al., 2015], and by Sauerwald and Sun [Sauerwald and Sun, 2015]. These authors left open the question of characterizing the gap in the case of cycle graphs in the dynamic case, where weights are created during the algorithm’s execution. For this case, the only known upper bound is 𝒪(n log n), following from a majorization argument due to [Peres et al., 2015], which analyzes a related graphical allocation process. In this paper, we provide an upper bound of 𝒪(√n log n) on the expected gap of the above process for cycles of length n. We introduce a new potential analysis technique, which enables us to bound the difference in load between k-hop neighbors on the cycle, for any k ≤ n/2. We complement this with a "gap covering" argument, which bounds the maximum value of the gap by bounding its value across all possible subsets of a certain structure, and recursively bounding the gaps within each subset. We provide analytical and experimental evidence that our upper bound on the gap is tight up to a logarithmic factor.}, author = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Sabour, Amirmojtaba}, issn = {1432-0541}, journal = {Algorithmica}, location = {Virtual, Online; Germany}, publisher = {Springer Nature}, title = {{Dynamic averaging load balancing on cycles}}, doi = {10.1007/s00453-021-00905-9}, year = {2021}, } @inproceedings{15077, abstract = {We consider the following dynamic load-balancing process: given an underlying graph G with n nodes, in each step t ≥ 0, one unit of load is created and placed at a randomly chosen graph node. In the same step, the chosen node picks a random neighbor, and the two nodes balance their loads by averaging them. We are interested in the expected gap between the minimum and maximum loads at nodes as the process progresses, and its dependence on n and on the graph structure. Variants of the above graphical balanced allocation process have been studied previously by Peres, Talwar, and Wieder [Peres et al., 2015], and by Sauerwald and Sun [Sauerwald and Sun, 2015]. These authors left open the question of characterizing the gap in the case of cycle graphs in the dynamic case, where weights are created during the algorithm’s execution. 
For this case, the only known upper bound is 𝒪(n log n), following from a majorization argument due to [Peres et al., 2015], which analyzes a related graphical allocation process. In this paper, we provide an upper bound of 𝒪(√n log n) on the expected gap of the above process for cycles of length n. We introduce a new potential analysis technique, which enables us to bound the difference in load between k-hop neighbors on the cycle, for any k ≤ n/2. We complement this with a "gap covering" argument, which bounds the maximum value of the gap by bounding its value across all possible subsets of a certain structure, and recursively bounding the gaps within each subset. We provide analytical and experimental evidence that our upper bound on the gap is tight up to a logarithmic factor.}, author = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Sabour, Amirmojtaba}, booktitle = {47th International Colloquium on Automata, Languages, and Programming}, location = {Saarbrücken, Germany, Virtual}, publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik}, title = {{Dynamic averaging load balancing on cycles}}, doi = {10.4230/LIPIcs.ICALP.2020.7}, volume = {168}, year = {2020}, } @inproceedings{6673, abstract = {Several classic problems in graph processing and computational geometry are solved via incremental algorithms, which split computation into a series of small tasks acting on shared state, which gets updated progressively. While the sequential variant of such algorithms usually specifies a fixed (but sometimes random) order in which the tasks should be performed, a standard approach to parallelizing such algorithms is to relax this constraint to allow for out-of-order parallel execution. This is the case for parallel implementations of Dijkstra's single-source shortest-paths (SSSP) algorithm, and for parallel Delaunay mesh triangulation. While many software frameworks parallelize incremental computation in this way, it is still not well understood whether this relaxed ordering approach can provide any complexity guarantees. In this paper, we address this problem, and analyze the efficiency guarantees provided by a range of incremental algorithms when parallelized via relaxed schedulers. We show that, for algorithms such as Delaunay mesh triangulation and sorting by insertion, schedulers with a maximum relaxation factor of k, in terms of the maximum priority inversion allowed, will introduce a maximum amount of wasted work of O(log n · poly(k)), where n is the number of tasks to be executed. For SSSP, we show that the additional work is O(poly(k) · dmax / wmin), where dmax is the maximum distance between two nodes, and wmin is the minimum such distance. In practical settings where n >> k, this suggests that the overheads of relaxation will be outweighed by the improved scalability of the relaxed scheduler. 
On the negative side, we provide lower bounds showing that certain algorithms will inherently incur a non-trivial amount of wasted work due to scheduler relaxation, even for relatively benign relaxed schedulers.}, author = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Koval, Nikita}, booktitle = {31st ACM Symposium on Parallelism in Algorithms and Architectures}, isbn = {9781450361842}, location = {Phoenix, AZ, United States}, pages = {145--154}, publisher = {ACM Press}, title = {{Efficiency guarantees for parallel incremental algorithms under relaxed schedulers}}, doi = {10.1145/3323165.3323201}, year = {2019}, } @inproceedings{791, abstract = {Consider the following random process: we are given n queues, into which elements of increasing labels are inserted uniformly at random. To remove an element, we pick two queues at random, and remove the element of lower label (higher priority) among the two. The cost of a removal is the rank of the label removed, among labels still present in any of the queues, that is, the distance from the optimal choice at each step. Variants of this strategy are prevalent in state-of-the-art concurrent priority queue implementations. Nonetheless, it is not known whether such implementations provide any rank guarantees, even in a sequential model. We answer this question, showing that this strategy provides surprisingly strong guarantees: although the single-choice process, where we always insert into and remove from a single randomly chosen queue, has degrading cost, going to infinity as we increase the number of steps, in the two-choice process, the expected rank of a removed element is O(n), while the expected worst-case cost is O(n log n). These bounds are tight, and hold irrespective of the number of steps for which we run the process. The argument is based on a new technical connection between "heavily loaded" balls-into-bins processes and priority scheduling. Our analytic results inspire a new concurrent priority queue implementation, which improves upon the state of the art in terms of practical performance.}, author = {Alistarh, Dan-Adrian and Kopinsky, Justin and Li, Jerry and Nadiradze, Giorgi}, booktitle = {Proceedings of the ACM Symposium on Principles of Distributed Computing}, isbn = {9781450349925}, location = {Washington, DC, United States}, pages = {283--292}, publisher = {ACM}, title = {{The power of choice in priority scheduling}}, doi = {10.1145/3087801.3087810}, volume = {Part F129314}, year = {2017}, }
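As a concrete illustration of the two-choice process analyzed in the last entry above [Alistarh et al., PODC 2017], the following is a minimal Python simulation sketch. It is our own hypothetical code, not taken from the paper or its implementation: all function and variable names are ours, and for simplicity it pre-fills all elements and then performs removals, rather than interleaving insertions and removals as the process description allows.

    import heapq
    import random

    def two_choice_simulation(n_queues=16, n_ops=2000, seed=0):
        # Sequential sketch of the two-choice process: labels in increasing
        # order are inserted into uniformly random queues; each removal
        # samples two distinct queues and pops the smaller (i.e.,
        # higher-priority) label among their tops.
        rng = random.Random(seed)
        queues = [[] for _ in range(n_queues)]  # each list is a min-heap
        for label in range(n_ops):  # pre-fill with increasing labels
            heapq.heappush(queues[rng.randrange(n_queues)], label)
        total_rank, removals = 0, 0
        for _ in range(n_ops):
            a, b = rng.sample(range(n_queues), 2)  # two distinct queues
            candidates = [q for q in (queues[a], queues[b]) if q]
            if not candidates:  # both sampled queues happened to be empty
                continue
            best = min(candidates, key=lambda q: q[0])
            label = heapq.heappop(best)
            # Cost of this removal: how many smaller (better) labels were
            # still present in any queue, i.e., the distance from optimal.
            rank = sum(1 for q in queues for x in q if x < label)
            total_rank += rank
            removals += 1
        return total_rank / removals

    if __name__ == "__main__":
        print("average removal rank:", two_choice_simulation())

Under the paper's O(n) expected-rank bound, rerunning this sketch with a growing n_queues should show the average removal rank growing roughly linearly in the number of queues, rather than with the number of steps.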