@inproceedings{7635,
abstract = {Concurrent programming can be notoriously complex and error-prone. Programming bugs can arise from a variety of sources, such as operation re-reordering, or incomplete understanding of the memory model. A variety of formal and model checking methods have been developed to address this fundamental difficulty. While technically interesting, existing academic methods are still hard to apply to the large codebases typical of industrial deployments, which limits their practical impact.},
author = {Koval, Nikita and Sokolova, Mariia and Fedorov, Alexander and Alistarh, Dan-Adrian and Tsitelov, Dmitry},
booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP},
isbn = {9781450368186},
location = {San Diego, CA, United States},
pages = {423--424},
publisher = {ACM},
title = {{Testing concurrency on the JVM with Lincheck}},
doi = {10.1145/3332466.3374503},
year = {2020},
}
@inproceedings{7605,
abstract = {Union-Find (or Disjoint-Set Union) is one of the fundamental problems in computer science; it has been well-studied from both theoretical and practical perspectives in the sequential case. Recently, there has been mounting interest in analyzing this problem in the concurrent scenario, and several asymptotically-efficient algorithms have been proposed. Yet, to date, there is very little known about the practical performance of concurrent Union-Find. This work addresses this gap. We evaluate and analyze the performance of several concurrent Union-Find algorithms and optimization strategies across a wide range of platforms (Intel, AMD, and ARM) and workloads (social, random, and road networks, as well as integrations into more complex algorithms). We first observe that, due to the limited computational cost, the number of induced cache misses is the critical determining factor for the performance of existing algorithms. We introduce new techniques to reduce this cost by storing node priorities implicitly and by using plain reads and writes in a way that does not affect the correctness of the algorithms. Finally, we show that Union-Find implementations are an interesting application for Transactional Memory (TM): one of the fastest algorithm variants we discovered is a sequential one that uses coarse-grained locking with the lock elision optimization to reduce synchronization cost and increase scalability. },
author = {Alistarh, Dan-Adrian and Fedorov, Alexander and Koval, Nikita},
isbn = {9783959771337},
issn = {18688969},
location = {Neuchatal, Switzerland},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
title = {{In search of the fastest concurrent union-find algorithm}},
doi = {10.4230/LIPIcs.OPODIS.2019.15},
volume = {153},
year = {2020},
}
@inproceedings{7636,
abstract = {Balanced search trees typically use key comparisons to guide their operations, and achieve logarithmic running time. By relying on numerical properties of the keys, interpolation search achieves lower search complexity and better performance. Although interpolation-based data structures were investigated in the past, their non-blocking concurrent variants have received very little attention so far.
In this paper, we propose the first non-blocking implementation of the classic interpolation search tree (IST) data structure. For arbitrary key distributions, the data structure ensures worst-case O(log n + p) amortized time for search, insertion and deletion traversals. When the input key distributions are smooth, lookups run in expected O(log log n + p) time, and insertion and deletion run in expected amortized O(log log n + p) time, where p is a bound on the number of threads. To improve the scalability of concurrent insertion and deletion, we propose a novel parallel rebuilding technique, which should be of independent interest.
We evaluate whether the theoretical improvements translate to practice by implementing the concurrent interpolation search tree, and benchmarking it on uniform and nonuniform key distributions, for dataset sizes in the millions to billions of keys. Relative to the state-of-the-art concurrent data structures, the concurrent interpolation search tree achieves performance improvements of up to 15% under high update rates, and of up to 50% under moderate update rates. Further, ISTs exhibit up to 2X less cache-misses, and consume 1.2 -- 2.6X less memory compared to the next best alternative on typical dataset sizes. We find that the results are surprisingly robust to distributional skew, which suggests that our data structure can be a promising alternative to classic concurrent search structures.},
author = {Brown, Trevor A and Prokopec, Aleksandar and Alistarh, Dan-Adrian},
booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP},
isbn = {9781450368186},
location = {San Diego, CA, United States},
pages = {276--291},
publisher = {ACM},
title = {{Non-blocking interpolation search trees with doubly-logarithmic running time}},
doi = {10.1145/3332466.3374542},
year = {2020},
}
@inproceedings{6673,
abstract = {Several classic problems in graph processing and computational geometry are solved via incremental algorithms, which split computation into a series of small tasks acting on shared state, which gets updated progressively. While the sequential variant of such algorithms usually specifies a fixed (but sometimes random) order in which the tasks should be performed, a standard approach to parallelizing such algorithms is to relax this constraint to allow for out-of-order parallel execution. This is the case for parallel implementations of Dijkstra's single-source shortest-paths (SSSP) algorithm, and for parallel Delaunay mesh triangulation. While many software frameworks parallelize incremental computation in this way, it is still not well understood whether this relaxed ordering approach can still provide any complexity guarantees. In this paper, we address this problem, and analyze the efficiency guarantees provided by a range of incremental algorithms when parallelized via relaxed schedulers. We show that, for algorithms such as Delaunay mesh triangulation and sorting by insertion, schedulers with a maximum relaxation factor of k in terms of the maximum priority inversion allowed will introduce a maximum amount of wasted work of O(łog n poly(k)), where n is the number of tasks to be executed. For SSSP, we show that the additional work is O(poly(k), dmax / wmin), where dmax is the maximum distance between two nodes, and wmin is the minimum such distance. In practical settings where n >> k, this suggests that the overheads of relaxation will be outweighed by the improved scalability of the relaxed scheduler. On the negative side, we provide lower bounds showing that certain algorithms will inherently incur a non-trivial amount of wasted work due to scheduler relaxation, even for relatively benign relaxed schedulers.},
author = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Koval, Nikita},
booktitle = {31st ACM Symposium on Parallelism in Algorithms and Architectures},
isbn = {9781450361842},
location = {Phoenix, AZ, United States},
pages = {145--154},
publisher = {ACM Press},
title = {{Efficiency guarantees for parallel incremental algorithms under relaxed schedulers}},
doi = {10.1145/3323165.3323201},
year = {2019},
}
@inproceedings{7122,
abstract = {Data-rich applications in machine-learning and control have motivated an intense research on large-scale optimization. Novel algorithms have been proposed and shown to have optimal convergence rates in terms of iteration counts. However, their practical performance is severely degraded by the cost of exchanging high-dimensional gradient vectors between computing nodes. Several gradient compression heuristics have recently been proposed to reduce communications, but few theoretical results exist that quantify how they impact algorithm convergence. This paper establishes and strengthens the convergence guarantees for gradient descent under a family of gradient compression techniques. For convex optimization problems, we derive admissible step sizes and quantify both the number of iterations and the number of bits that need to be exchanged to reach a target accuracy. Finally, we validate the performance of different gradient compression techniques in simulations. The numerical results highlight the properties of different gradient compression algorithms and confirm that fast convergence with limited information exchange is possible.},
author = {Khirirat, Sarit and Johansson, Mikael and Alistarh, Dan-Adrian},
booktitle = {2018 IEEE Conference on Decision and Control},
isbn = {9781538613955},
issn = {0743-1546},
location = {Miami Beach, FL, United States},
publisher = {IEEE},
title = {{Gradient compression for communication-limited convex optimization}},
doi = {10.1109/cdc.2018.8619625},
year = {2019},
}
@inproceedings{7228,
abstract = {Traditional concurrent programming involves manipulating shared mutable state. Alternatives to this programming style are communicating sequential processes (CSP) and actor models, which share data via explicit communication. These models have been known for almost half a century, and have recently had started to gain significant traction among modern programming languages. The common abstraction for communication between several processes is the channel. Although channels are similar to producer-consumer data structures, they have different semantics and support additional operations, such as the select expression. Despite their growing popularity, most known implementations of channels use lock-based data structures and can be rather inefficient.
In this paper, we present the first efficient lock-free algorithm for implementing a communication channel for CSP programming. We provide implementations and experimental results in the Kotlin and Go programming languages. Our new algorithm outperforms existing implementations on many workloads, while providing non-blocking progress guarantee. Our design can serve as an example of how to construct general communication data structures for CSP and actor models. },
author = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman},
booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
isbn = {9783030293994},
issn = {16113349},
location = {Göttingen, Germany},
pages = {317--333},
publisher = {Springer Nature},
title = {{Scalable FIFO channels for programming via communicating sequential processes}},
doi = {10.1007/978-3-030-29400-7_23},
volume = {11725},
year = {2019},
}
@inproceedings{7437,
abstract = {Most of today's distributed machine learning systems assume reliable networks: whenever two machines exchange information (e.g., gradients or models), the network should guarantee the delivery of the message. At the same time, recent work exhibits the impressive tolerance of machine learning algorithms to errors or noise arising from relaxed communication or synchronization. In this paper, we connect these two trends, and consider the following question: Can we design machine learning systems that are tolerant to network unreliability during training? With this motivation, we focus on a theoretical problem of independent interest-given a standard distributed parameter server architecture, if every communication between the worker and the server has a non-zero probability p of being dropped, does there exist an algorithm that still converges, and at what speed? The technical contribution of this paper is a novel theoretical analysis proving that distributed learning over unreliable network can achieve comparable convergence rate to centralized or distributed learning over reliable networks. Further, we prove that the influence of the packet drop rate diminishes with the growth of the number of parameter servers. We map this theoretical result onto a real-world scenario, training deep neural networks over an unreliable network layer, and conduct network simulation to validate the system improvement by allowing the networks to be unreliable.},
author = {Yu, Chen and Tang, Hanlin and Renggli, Cedric and Kassing, Simon and Singla, Ankit and Alistarh, Dan-Adrian and Zhang, Ce and Liu, Ji},
booktitle = {36th International Conference on Machine Learning, ICML 2019},
isbn = {9781510886988},
location = {Long Beach, CA, United States},
pages = {12481--12512},
publisher = {IMLS},
title = {{Distributed learning over unreliable networks}},
volume = {2019-June},
year = {2019},
}
@misc{6485,
abstract = {Traditional concurrent programming involves manipulating shared mutable state. Alternatives to this programming style are communicating sequential processes (CSP) [1] and actor [2] models, which share data via explicit communication. Rendezvous channelis the common abstraction for communication between several processes, where senders and receivers perform a rendezvous handshake as a part of their protocol (senders wait for receivers and vice versa). Additionally to this, channels support the select expression. In this work, we present the first efficient lock-free channel algorithm, and compare it against Go [3] and Kotlin [4] baseline implementations.},
author = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman},
booktitle = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
isbn = {9781450362252},
location = {Washington, NY, United States},
pages = {417--418},
publisher = {ACM Press},
title = {{Lock-free channels for programming via communicating sequential processes}},
doi = {10.1145/3293883.3297000},
year = {2019},
}
@inproceedings{7201,
abstract = {Applying machine learning techniques to the quickly growing data in science and industry requires highly-scalable algorithms. Large datasets are most commonly processed "data parallel" distributed across many nodes. Each node's contribution to the overall gradient is summed using a global allreduce. This allreduce is the single communication and thus scalability bottleneck for most machine learning workloads. We observe that frequently, many gradient values are (close to) zero, leading to sparse of sparsifyable communications. To exploit this insight, we analyze, design, and implement a set of communication-efficient protocols for sparse input data, in conjunction with efficient machine learning algorithms which can leverage these primitives. Our communication protocols generalize standard collective operations, by allowing processes to contribute arbitrary sparse input data vectors. Our generic communication library, SparCML1, extends MPI to support additional features, such as non-blocking (asynchronous) operations and low-precision data representations. As such, SparCML and its techniques will form the basis of future highly-scalable machine learning frameworks.},
author = {Renggli, Cedric and Ashkboos, Saleh and Aghagolzadeh, Mehdi and Alistarh, Dan-Adrian and Hoefler, Torsten},
booktitle = {International Conference for High Performance Computing, Networking, Storage and Analysis, SC},
isbn = {9781450362290},
issn = {21674337},
location = {Denver, CO, Unites States},
publisher = {ACM},
title = {{SparCML: High-performance sparse communication for machine learning}},
doi = {10.1145/3295500.3356222},
year = {2019},
}
@inproceedings{6676,
abstract = {It is impossible to deterministically solve wait-free consensus in an asynchronous system. The classic proof uses a valency argument, which constructs an infinite execution by repeatedly extending a finite execution. We introduce extension-based proofs, a class of impossibility proofs that are modelled as an interaction between a prover and a protocol and that include valency arguments.
Using proofs based on combinatorial topology, it has been shown that it is impossible to deterministically solve k-set agreement among n > k ≥ 2 processes in a wait-free manner. However, it was unknown whether proofs based on simpler techniques were possible. We show that this impossibility result cannot be obtained by an extension-based proof and, hence, extension-based proofs are limited in power.},
author = {Alistarh, Dan-Adrian and Aspnes, James and Ellen, Faith and Gelashvili, Rati and Zhu, Leqi},
booktitle = {Proceedings of the 51st Annual ACM SIGACT Symposium on Theory of Computing},
isbn = {9781450367059},
location = {Phoenix, AZ, United States},
pages = {986--996},
publisher = {ACM Press},
title = {{Why extension-based proofs fail}},
doi = {10.1145/3313276.3316407},
year = {2019},
}
@inproceedings{7542,
abstract = {We present a novel class of convolutional neural networks (CNNs) for set functions,i.e., data indexed with the powerset of a finite set. The convolutions are derivedas linear, shift-equivariant functions for various notions of shifts on set functions.The framework is fundamentally different from graph convolutions based on theLaplacian, as it provides not one but several basic shifts, one for each element inthe ground set. Prototypical experiments with several set function classificationtasks on synthetic datasets and on datasets derived from real-world hypergraphsdemonstrate the potential of our new powerset CNNs.},
author = {Wendler, Chris and Alistarh, Dan-Adrian and Püschel, Markus},
location = {Vancouver, Canada},
pages = {927--938},
publisher = {Neural Information Processing Systems Foundation},
title = {{Powerset convolutional neural networks}},
volume = {32},
year = {2019},
}
@inproceedings{5961,
abstract = {The area of machine learning has made considerable progress over the past decade, enabled by the widespread availability of large datasets, as well as by improved algorithms and models. Given the large computational demands of machine learning workloads, parallelism, implemented either through single-node concurrency or through multi-node distribution, has been a third key ingredient to advances in machine learning.
The goal of this tutorial is to provide the audience with an overview of standard distribution techniques in machine learning, with an eye towards the intriguing trade-offs between synchronization and communication costs of distributed machine learning algorithms, on the one hand, and their convergence, on the other.The tutorial will focus on parallelization strategies for the fundamental stochastic gradient descent (SGD) algorithm, which is a key tool when training machine learning models, from classical instances such as linear regression, to state-of-the-art neural network architectures.
The tutorial will describe the guarantees provided by this algorithm in the sequential case, and then move on to cover both shared-memory and message-passing parallelization strategies, together with the guarantees they provide, and corresponding trade-offs. The presentation will conclude with a broad overview of ongoing research in distributed and concurrent machine learning. The tutorial will assume no prior knowledge beyond familiarity with basic concepts in algebra and analysis.
},
author = {Alistarh, Dan-Adrian},
booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
isbn = {9781450357951},
location = {Egham, United Kingdom},
pages = {487--488},
publisher = {ACM Press},
title = {{A brief tutorial on distributed and concurrent machine learning}},
doi = {10.1145/3212734.3212798},
year = {2018},
}
@inproceedings{5966,
abstract = {The transactional conflict problem arises in transactional systems whenever two or more concurrent transactions clash on a data item. While the standard solution to such conflicts is to immediately abort one of the transactions, some practical systems consider the alternative of delaying conflict resolution for a short interval, which may allow one of the transactions to commit. The challenge in the transactional conflict problem is to choose the optimal length of this delay interval so as to minimize the overall running time penalty for the conflicting transactions. In this paper, we propose a family of optimal online algorithms for the transactional conflict problem. Specifically, we consider variants of this problem which arise in different implementations of transactional systems, namely "requestor wins'' and "requestor aborts'' implementations: in the former, the recipient of a coherence request is aborted, whereas in the latter, it is the requestor which has to abort. Both strategies are implemented by real systems. We show that the requestor aborts case can be reduced to a classic instance of the ski rental problem, while the requestor wins case leads to a new version of this classical problem, for which we derive optimal deterministic and randomized algorithms. Moreover, we prove that, under a simplified adversarial model, our algorithms are constant-competitive with the offline optimum in terms of throughput. We validate our algorithmic results empirically through a hardware simulation of hardware transactional memory (HTM), showing that our algorithms can lead to non-trivial performance improvements for classic concurrent data structures.},
author = {Alistarh, Dan-Adrian and Haider, Syed Kamran and Kübler, Raphael and Nadiradze, Giorgi},
booktitle = {Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures - SPAA '18},
isbn = {9781450357999},
location = {Vienna, Austria},
pages = {383--392},
publisher = {ACM Press},
title = {{The transactional conflict problem}},
doi = {10.1145/3210377.3210406},
year = {2018},
}
@inproceedings{6558,
abstract = {This paper studies the problem of distributed stochastic optimization in an adversarial setting where, out of m machines which allegedly compute stochastic gradients every iteration, an α-fraction are Byzantine, and may behave adversarially. Our main result is a variant of stochastic gradient descent (SGD) which finds ε-approximate minimizers of convex functions in T=O~(1/ε²m+α²/ε²) iterations. In contrast, traditional mini-batch SGD needs T=O(1/ε²m) iterations, but cannot tolerate Byzantine failures. Further, we provide a lower bound showing that, up to logarithmic factors, our algorithm is information-theoretically optimal both in terms of sample complexity and time complexity.},
author = {Alistarh, Dan-Adrian and Allen-Zhu, Zeyuan and Li, Jerry},
booktitle = {Advances in Neural Information Processing Systems},
editor = {Bengio, S. and Wallach, H. and Larochelle, H. and Grauman, K. and Cesa-Bianchi, N. and Garnett, R.},
location = {Montreal, Canada},
pages = {4613--4623},
publisher = {Neural Information Processing Systems Foundation},
title = {{Byzantine Stochastic Gradient Descent}},
volume = {Volume 2018},
year = {2018},
}
@inproceedings{6589,
abstract = {Distributed training of massive machine learning models, in particular deep neural networks, via Stochastic Gradient Descent (SGD) is becoming commonplace. Several families of communication-reduction methods, such as quantization, large-batch methods, and gradient sparsification, have been proposed. To date, gradient sparsification methods--where each node sorts gradients by magnitude, and only communicates a subset of the components, accumulating the rest locally--are known to yield some of the largest practical gains. Such methods can reduce the amount of communication per step by up to \emph{three orders of magnitude}, while preserving model accuracy. Yet, this family of methods currently has no theoretical justification. This is the question we address in this paper. We prove that, under analytic assumptions, sparsifying gradients by magnitude with local error correction provides convergence guarantees, for both convex and non-convex smooth objectives, for data-parallel SGD. The main insight is that sparsification methods implicitly maintain bounds on the maximum impact of stale updates, thanks to selection by magnitude. Our analysis and empirical validation also reveal that these methods do require analytical conditions to converge well, justifying existing heuristics.},
author = {Alistarh, Dan-Adrian and Hoefler, Torsten and Johansson, Mikael and Konstantinov, Nikola H and Khirirat, Sarit and Renggli, Cedric},
booktitle = {Advances in Neural Information Processing Systems 31},
location = {Montreal, Canada},
pages = {5973--5983},
publisher = {Neural information processing systems},
title = {{The convergence of sparsified gradient methods}},
volume = {Volume 2018},
year = {2018},
}
@article{536,
abstract = {We consider the problem of consensus in the challenging classic model. In this model, the adversary is adaptive; it can choose which processors crash at any point during the course of the algorithm. Further, communication is via asynchronous message passing: there is no known upper bound on the time to send a message from one processor to another, and all messages and coin flips are seen by the adversary. We describe a new randomized consensus protocol with expected message complexity O(n2log2n) when fewer than n / 2 processes may fail by crashing. This is an almost-linear improvement over the best previously known protocol, and within logarithmic factors of a known Ω(n2) message lower bound. The protocol further ensures that no process sends more than O(nlog3n) messages in expectation, which is again within logarithmic factors of optimal. We also present a generalization of the algorithm to an arbitrary number of failures t, which uses expected O(nt+t2log2t) total messages. Our approach is to build a message-efficient, resilient mechanism for aggregating individual processor votes, implementing the message-passing equivalent of a weak shared coin. Roughly, in our protocol, a processor first announces its votes to small groups, then propagates them to increasingly larger groups as it generates more and more votes. To bound the number of messages that an individual process might have to send or receive, the protocol progressively increases the weight of generated votes. The main technical challenge is bounding the impact of votes that are still “in flight” (generated, but not fully propagated) on the final outcome of the shared coin, especially since such votes might have different weights. We achieve this by leveraging the structure of the algorithm, and a technical argument based on martingale concentration bounds. Overall, we show that it is possible to build an efficient message-passing implementation of a shared coin, and in the process (almost-optimally) solve the classic consensus problem in the asynchronous message-passing model.},
author = {Alistarh, Dan-Adrian and Aspnes, James and King, Valerie and Saia, Jared},
issn = {01782770},
journal = {Distributed Computing},
number = {6},
pages = {489--501},
publisher = {Springer},
title = {{Communication-efficient randomized consensus}},
doi = {10.1007/s00446-017-0315-1},
volume = {31},
year = {2018},
}
@inproceedings{5962,
abstract = {Stochastic Gradient Descent (SGD) is a fundamental algorithm in machine learning, representing the optimization backbone for training several classic models, from regression to neural networks. Given the recent practical focus on distributed machine learning, significant work has been dedicated to the convergence properties of this algorithm under the inconsistent and noisy updates arising from execution in a distributed environment. However, surprisingly, the convergence properties of this classic algorithm in the standard shared-memory model are still not well-understood. In this work, we address this gap, and provide new convergence bounds for lock-free concurrent stochastic gradient descent, executing in the classic asynchronous shared memory model, against a strong adaptive adversary. Our results give improved upper and lower bounds on the "price of asynchrony'' when executing the fundamental SGD algorithm in a concurrent setting. They show that this classic optimization tool can converge faster and with a wider range of parameters than previously known under asynchronous iterations. At the same time, we exhibit a fundamental trade-off between the maximum delay in the system and the rate at which SGD can converge, which governs the set of parameters under which this algorithm can still work efficiently.},
author = {Alistarh, Dan-Adrian and De Sa, Christopher and Konstantinov, Nikola H},
booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
isbn = {9781450357951},
location = {Egham, United Kingdom},
pages = {169--178},
publisher = {ACM Press},
title = {{The convergence of stochastic gradient descent in asynchronous shared memory}},
doi = {10.1145/3212734.3212763},
year = {2018},
}
@inproceedings{7812,
abstract = {Deep neural networks (DNNs) continue to make significant advances, solving tasks from image classification to translation or reinforcement learning. One aspect of the field receiving considerable attention is efficiently executing deep models in resource-constrained environments, such as mobile or embedded devices. This paper focuses on this problem, and proposes two new compression methods, which jointly leverage weight quantization and distillation of larger teacher networks into smaller student networks. The first method we propose is called quantized distillation and leverages distillation during the training process, by incorporating distillation loss, expressed with respect to the teacher, into the training of a student network whose weights are quantized to a limited set of levels. The second method, differentiable quantization, optimizes the location of quantization points through stochastic gradient descent, to better fit the behavior of the teacher model. We validate both methods through experiments on convolutional and recurrent architectures. We show that quantized shallow students can reach similar accuracy levels to full-precision teacher models, while providing order of magnitude compression, and inference speedup that is linear in the depth reduction. In sum, our results enable DNNs for resource-constrained environments to leverage architecture and accuracy advances developed on more powerful devices.},
author = {Polino, Antonio and Pascanu, Razvan and Alistarh, Dan-Adrian},
booktitle = {6th International Conference on Learning Representations},
location = {Vancouver, Canada},
title = {{Model compression via distillation and quantization}},
year = {2018},
}
@inproceedings{5963,
abstract = {There has been significant progress in understanding the parallelism inherent to iterative sequential algorithms: for many classic algorithms, the depth of the dependence structure is now well understood, and scheduling techniques have been developed to exploit this shallow dependence structure for efficient parallel implementations. A related, applied research strand has studied methods by which certain iterative task-based algorithms can be efficiently parallelized via relaxed concurrent priority schedulers. These allow for high concurrency when inserting and removing tasks, at the cost of executing superfluous work due to the relaxed semantics of the scheduler. In this work, we take a step towards unifying these two research directions, by showing that there exists a family of relaxed priority schedulers that can efficiently and deterministically execute classic iterative algorithms such as greedy maximal independent set (MIS) and matching. Our primary result shows that, given a randomized scheduler with an expected relaxation factor of k in terms of the maximum allowed priority inversions on a task, and any graph on n vertices, the scheduler is able to execute greedy MIS with only an additive factor of \poly(k) expected additional iterations compared to an exact (but not scalable) scheduler. This counter-intuitive result demonstrates that the overhead of relaxation when computing MIS is not dependent on the input size or structure of the input graph. Experimental results show that this overhead can be clearly offset by the gain in performance due to the highly scalable scheduler. In sum, we present an efficient method to deterministically parallelize iterative sequential algorithms, with provable runtime guarantees in terms of the number of executed tasks to completion.},
author = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Nadiradze, Giorgi},
booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
isbn = {9781450357951},
location = {Egham, United Kingdom},
pages = {377--386},
publisher = {ACM Press},
title = {{Relaxed schedulers can efficiently parallelize iterative algorithms}},
doi = {10.1145/3212734.3212756},
year = {2018},
}
@inproceedings{6031,
abstract = {We introduce Clover, a new library for efficient computation using low-precision data, providing mathematical routines required by fundamental methods in optimization and sparse recovery. Our library faithfully implements variants of stochastic quantization that guarantee convergence at low precision, and supports data formats from 4-bit quantized to 32-bit IEEE-754 on current Intel processors. In particular, we show that 4-bit can be implemented efficiently using Intel AVX despite the lack of native support for this data format. Experimental results with dot product, matrix-vector multiplication (MVM), gradient descent (GD), and iterative hard thresholding (IHT) demonstrate that the attainable speedups are in many cases close to linear with respect to the reduction of precision due to reduced data movement. Finally, for GD and IHT, we show examples of absolute speedup achieved by 4-bit versus 32-bit, by iterating until a given target error is achieved.},
author = {Stojanov, Alen and Smith, Tyler Michael and Alistarh, Dan-Adrian and Puschel, Markus},
booktitle = {2018 IEEE International Workshop on Signal Processing Systems},
location = {Cape Town, South Africa},
publisher = {IEEE},
title = {{Fast quantized arithmetic on x86: Trading compute for data movement}},
doi = {10.1109/SiPS.2018.8598402},
volume = {2018-October},
year = {2018},
}