@inproceedings{6673,
abstract = {Several classic problems in graph processing and computational geometry are solved via incremental algorithms, which split computation into a series of small tasks acting on shared state, which gets updated progressively. While the sequential variant of such algorithms usually specifies a fixed (but sometimes random) order in which the tasks should be performed, a standard approach to parallelizing such algorithms is to relax this constraint to allow for out-of-order parallel execution. This is the case for parallel implementations of Dijkstra's single-source shortest-paths (SSSP) algorithm, and for parallel Delaunay mesh triangulation. While many software frameworks parallelize incremental computation in this way, it is still not well understood whether this relaxed ordering approach can still provide any complexity guarantees. In this paper, we address this problem, and analyze the efficiency guarantees provided by a range of incremental algorithms when parallelized via relaxed schedulers. We show that, for algorithms such as Delaunay mesh triangulation and sorting by insertion, schedulers with a maximum relaxation factor of k in terms of the maximum priority inversion allowed will introduce a maximum amount of wasted work of O(łog n poly(k)), where n is the number of tasks to be executed. For SSSP, we show that the additional work is O(poly(k), dmax / wmin), where dmax is the maximum distance between two nodes, and wmin is the minimum such distance. In practical settings where n >> k, this suggests that the overheads of relaxation will be outweighed by the improved scalability of the relaxed scheduler. On the negative side, we provide lower bounds showing that certain algorithms will inherently incur a non-trivial amount of wasted work due to scheduler relaxation, even for relatively benign relaxed schedulers.},
author = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Koval, Nikita},
booktitle = {31st ACM Symposium on Parallelism in Algorithms and Architectures},
isbn = {9781450361842},
location = {Phoenix, AZ, United States},
pages = {145--154},
publisher = {ACM Press},
title = {{Efficiency guarantees for parallel incremental algorithms under relaxed schedulers}},
doi = {10.1145/3323165.3323201},
year = {2019},
}
@inproceedings{7122,
abstract = {Data-rich applications in machine-learning and control have motivated an intense research on large-scale optimization. Novel algorithms have been proposed and shown to have optimal convergence rates in terms of iteration counts. However, their practical performance is severely degraded by the cost of exchanging high-dimensional gradient vectors between computing nodes. Several gradient compression heuristics have recently been proposed to reduce communications, but few theoretical results exist that quantify how they impact algorithm convergence. This paper establishes and strengthens the convergence guarantees for gradient descent under a family of gradient compression techniques. For convex optimization problems, we derive admissible step sizes and quantify both the number of iterations and the number of bits that need to be exchanged to reach a target accuracy. Finally, we validate the performance of different gradient compression techniques in simulations. The numerical results highlight the properties of different gradient compression algorithms and confirm that fast convergence with limited information exchange is possible.},
author = {Khirirat, Sarit and Johansson, Mikael and Alistarh, Dan-Adrian},
booktitle = {2018 IEEE Conference on Decision and Control},
isbn = {9781538613955},
issn = {0743-1546},
location = {Miami Beach, FL, United States},
publisher = {IEEE},
title = {{Gradient compression for communication-limited convex optimization}},
doi = {10.1109/cdc.2018.8619625},
year = {2019},
}
@inproceedings{7228,
abstract = {Traditional concurrent programming involves manipulating shared mutable state. Alternatives to this programming style are communicating sequential processes (CSP) and actor models, which share data via explicit communication. These models have been known for almost half a century, and have recently had started to gain significant traction among modern programming languages. The common abstraction for communication between several processes is the channel. Although channels are similar to producer-consumer data structures, they have different semantics and support additional operations, such as the select expression. Despite their growing popularity, most known implementations of channels use lock-based data structures and can be rather inefficient.
In this paper, we present the first efficient lock-free algorithm for implementing a communication channel for CSP programming. We provide implementations and experimental results in the Kotlin and Go programming languages. Our new algorithm outperforms existing implementations on many workloads, while providing non-blocking progress guarantee. Our design can serve as an example of how to construct general communication data structures for CSP and actor models. },
author = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman},
booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
isbn = {9783030293994},
issn = {16113349},
location = {Göttingen, Germany},
pages = {317--333},
publisher = {Springer Nature},
title = {{Scalable FIFO channels for programming via communicating sequential processes}},
doi = {10.1007/978-3-030-29400-7_23},
volume = {11725},
year = {2019},
}
@misc{6485,
abstract = {Traditional concurrent programming involves manipulating shared mutable state. Alternatives to this programming style are communicating sequential processes (CSP) [1] and actor [2] models, which share data via explicit communication. Rendezvous channelis the common abstraction for communication between several processes, where senders and receivers perform a rendezvous handshake as a part of their protocol (senders wait for receivers and vice versa). Additionally to this, channels support the select expression. In this work, we present the first efficient lock-free channel algorithm, and compare it against Go [3] and Kotlin [4] baseline implementations.},
author = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman},
booktitle = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
isbn = {9781450362252},
location = {Washington, NY, United States},
pages = {417--418},
publisher = {ACM Press},
title = {{Lock-free channels for programming via communicating sequential processes}},
doi = {10.1145/3293883.3297000},
year = {2019},
}
@inproceedings{6676,
abstract = {It is impossible to deterministically solve wait-free consensus in an asynchronous system. The classic proof uses a valency argument, which constructs an infinite execution by repeatedly extending a finite execution. We introduce extension-based proofs, a class of impossibility proofs that are modelled as an interaction between a prover and a protocol and that include valency arguments.
Using proofs based on combinatorial topology, it has been shown that it is impossible to deterministically solve k-set agreement among n > k ≥ 2 processes in a wait-free manner. However, it was unknown whether proofs based on simpler techniques were possible. We show that this impossibility result cannot be obtained by an extension-based proof and, hence, extension-based proofs are limited in power.},
author = {Alistarh, Dan-Adrian and Aspnes, James and Ellen, Faith and Gelashvili, Rati and Zhu, Leqi},
booktitle = {Proceedings of the 51st Annual ACM SIGACT Symposium on Theory of Computing - STOC 2019},
isbn = {9781450367059},
location = {Phoenix, AZ, United States},
pages = {986--996},
publisher = {ACM Press},
title = {{Why extension-based proofs fail}},
doi = {10.1145/3313276.3316407},
year = {2019},
}
@inproceedings{7201,
abstract = {Applying machine learning techniques to the quickly growing data in science and industry requires highly-scalable algorithms. Large datasets are most commonly processed "data parallel" distributed across many nodes. Each node's contribution to the overall gradient is summed using a global allreduce. This allreduce is the single communication and thus scalability bottleneck for most machine learning workloads. We observe that frequently, many gradient values are (close to) zero, leading to sparse of sparsifyable communications. To exploit this insight, we analyze, design, and implement a set of communication-efficient protocols for sparse input data, in conjunction with efficient machine learning algorithms which can leverage these primitives. Our communication protocols generalize standard collective operations, by allowing processes to contribute arbitrary sparse input data vectors. Our generic communication library, SparCML1, extends MPI to support additional features, such as non-blocking (asynchronous) operations and low-precision data representations. As such, SparCML and its techniques will form the basis of future highly-scalable machine learning frameworks.},
author = {Renggli, Cedric and Ashkboos, Saleh and Aghagolzadeh, Mehdi and Alistarh, Dan-Adrian and Hoefler, Torsten},
booktitle = {International Conference for High Performance Computing, Networking, Storage and Analysis, SC},
isbn = {9781450362290},
issn = {21674337},
location = {Denver, CO, Unites States},
publisher = {ACM},
title = {{SparCML: High-performance sparse communication for machine learning}},
doi = {10.1145/3295500.3356222},
year = {2019},
}
@inproceedings{5961,
abstract = {The area of machine learning has made considerable progress over the past decade, enabled by the widespread availability of large datasets, as well as by improved algorithms and models. Given the large computational demands of machine learning workloads, parallelism, implemented either through single-node concurrency or through multi-node distribution, has been a third key ingredient to advances in machine learning.
The goal of this tutorial is to provide the audience with an overview of standard distribution techniques in machine learning, with an eye towards the intriguing trade-offs between synchronization and communication costs of distributed machine learning algorithms, on the one hand, and their convergence, on the other.The tutorial will focus on parallelization strategies for the fundamental stochastic gradient descent (SGD) algorithm, which is a key tool when training machine learning models, from classical instances such as linear regression, to state-of-the-art neural network architectures.
The tutorial will describe the guarantees provided by this algorithm in the sequential case, and then move on to cover both shared-memory and message-passing parallelization strategies, together with the guarantees they provide, and corresponding trade-offs. The presentation will conclude with a broad overview of ongoing research in distributed and concurrent machine learning. The tutorial will assume no prior knowledge beyond familiarity with basic concepts in algebra and analysis.
},
author = {Alistarh, Dan-Adrian},
booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
isbn = {9781450357951},
location = {Egham, United Kingdom},
pages = {487--488},
publisher = {ACM Press},
title = {{A brief tutorial on distributed and concurrent machine learning}},
doi = {10.1145/3212734.3212798},
year = {2018},
}
@inproceedings{5966,
abstract = {The transactional conflict problem arises in transactional systems whenever two or more concurrent transactions clash on a data item. While the standard solution to such conflicts is to immediately abort one of the transactions, some practical systems consider the alternative of delaying conflict resolution for a short interval, which may allow one of the transactions to commit. The challenge in the transactional conflict problem is to choose the optimal length of this delay interval so as to minimize the overall running time penalty for the conflicting transactions. In this paper, we propose a family of optimal online algorithms for the transactional conflict problem. Specifically, we consider variants of this problem which arise in different implementations of transactional systems, namely "requestor wins'' and "requestor aborts'' implementations: in the former, the recipient of a coherence request is aborted, whereas in the latter, it is the requestor which has to abort. Both strategies are implemented by real systems. We show that the requestor aborts case can be reduced to a classic instance of the ski rental problem, while the requestor wins case leads to a new version of this classical problem, for which we derive optimal deterministic and randomized algorithms. Moreover, we prove that, under a simplified adversarial model, our algorithms are constant-competitive with the offline optimum in terms of throughput. We validate our algorithmic results empirically through a hardware simulation of hardware transactional memory (HTM), showing that our algorithms can lead to non-trivial performance improvements for classic concurrent data structures.},
author = {Alistarh, Dan-Adrian and Haider, Syed Kamran and Kübler, Raphael and Nadiradze, Giorgi},
booktitle = {Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures - SPAA '18},
isbn = {9781450357999},
location = {Vienna, Austria},
pages = {383--392},
publisher = {ACM Press},
title = {{The transactional conflict problem}},
doi = {10.1145/3210377.3210406},
year = {2018},
}
@inproceedings{6558,
abstract = {This paper studies the problem of distributed stochastic optimization in an adversarial setting where, out of m machines which allegedly compute stochastic gradients every iteration, an α-fraction are Byzantine, and may behave adversarially. Our main result is a variant of stochastic gradient descent (SGD) which finds ε-approximate minimizers of convex functions in T=O~(1/ε²m+α²/ε²) iterations. In contrast, traditional mini-batch SGD needs T=O(1/ε²m) iterations, but cannot tolerate Byzantine failures. Further, we provide a lower bound showing that, up to logarithmic factors, our algorithm is information-theoretically optimal both in terms of sample complexity and time complexity.},
author = {Alistarh, Dan-Adrian and Allen-Zhu, Zeyuan and Li, Jerry},
booktitle = {Advances in Neural Information Processing Systems},
editor = {Bengio, S. and Wallach, H. and Larochelle, H. and Grauman, K. and Cesa-Bianchi, N. and Garnett, R.},
location = {Montreal, Canada},
pages = {4613--4623},
publisher = {Neural Information Processing Systems Foundation},
title = {{Byzantine Stochastic Gradient Descent}},
volume = {Volume 2018},
year = {2018},
}
@inproceedings{6589,
abstract = {Distributed training of massive machine learning models, in particular deep neural networks, via Stochastic Gradient Descent (SGD) is becoming commonplace. Several families of communication-reduction methods, such as quantization, large-batch methods, and gradient sparsification, have been proposed. To date, gradient sparsification methods--where each node sorts gradients by magnitude, and only communicates a subset of the components, accumulating the rest locally--are known to yield some of the largest practical gains. Such methods can reduce the amount of communication per step by up to \emph{three orders of magnitude}, while preserving model accuracy. Yet, this family of methods currently has no theoretical justification. This is the question we address in this paper. We prove that, under analytic assumptions, sparsifying gradients by magnitude with local error correction provides convergence guarantees, for both convex and non-convex smooth objectives, for data-parallel SGD. The main insight is that sparsification methods implicitly maintain bounds on the maximum impact of stale updates, thanks to selection by magnitude. Our analysis and empirical validation also reveal that these methods do require analytical conditions to converge well, justifying existing heuristics.},
author = {Alistarh, Dan-Adrian and Hoefler, Torsten and Johansson, Mikael and Konstantinov, Nikola H and Khirirat, Sarit and Renggli, Cedric},
booktitle = {Advances in Neural Information Processing Systems 31},
location = {Montreal, Canada},
pages = {5973--5983},
publisher = {Neural information processing systems},
title = {{The convergence of sparsified gradient methods}},
volume = {Volume 2018},
year = {2018},
}
@article{536,
abstract = {We consider the problem of consensus in the challenging classic model. In this model, the adversary is adaptive; it can choose which processors crash at any point during the course of the algorithm. Further, communication is via asynchronous message passing: there is no known upper bound on the time to send a message from one processor to another, and all messages and coin flips are seen by the adversary. We describe a new randomized consensus protocol with expected message complexity O(n2log2n) when fewer than n / 2 processes may fail by crashing. This is an almost-linear improvement over the best previously known protocol, and within logarithmic factors of a known Ω(n2) message lower bound. The protocol further ensures that no process sends more than O(nlog3n) messages in expectation, which is again within logarithmic factors of optimal. We also present a generalization of the algorithm to an arbitrary number of failures t, which uses expected O(nt+t2log2t) total messages. Our approach is to build a message-efficient, resilient mechanism for aggregating individual processor votes, implementing the message-passing equivalent of a weak shared coin. Roughly, in our protocol, a processor first announces its votes to small groups, then propagates them to increasingly larger groups as it generates more and more votes. To bound the number of messages that an individual process might have to send or receive, the protocol progressively increases the weight of generated votes. The main technical challenge is bounding the impact of votes that are still “in flight” (generated, but not fully propagated) on the final outcome of the shared coin, especially since such votes might have different weights. We achieve this by leveraging the structure of the algorithm, and a technical argument based on martingale concentration bounds. Overall, we show that it is possible to build an efficient message-passing implementation of a shared coin, and in the process (almost-optimally) solve the classic consensus problem in the asynchronous message-passing model.},
author = {Alistarh, Dan-Adrian and Aspnes, James and King, Valerie and Saia, Jared},
issn = {01782770},
journal = {Distributed Computing},
number = {6},
pages = {489--501},
publisher = {Springer},
title = {{Communication-efficient randomized consensus}},
doi = {10.1007/s00446-017-0315-1},
volume = {31},
year = {2018},
}
@inproceedings{5962,
abstract = {Stochastic Gradient Descent (SGD) is a fundamental algorithm in machine learning, representing the optimization backbone for training several classic models, from regression to neural networks. Given the recent practical focus on distributed machine learning, significant work has been dedicated to the convergence properties of this algorithm under the inconsistent and noisy updates arising from execution in a distributed environment. However, surprisingly, the convergence properties of this classic algorithm in the standard shared-memory model are still not well-understood. In this work, we address this gap, and provide new convergence bounds for lock-free concurrent stochastic gradient descent, executing in the classic asynchronous shared memory model, against a strong adaptive adversary. Our results give improved upper and lower bounds on the "price of asynchrony'' when executing the fundamental SGD algorithm in a concurrent setting. They show that this classic optimization tool can converge faster and with a wider range of parameters than previously known under asynchronous iterations. At the same time, we exhibit a fundamental trade-off between the maximum delay in the system and the rate at which SGD can converge, which governs the set of parameters under which this algorithm can still work efficiently.},
author = {Alistarh, Dan-Adrian and De Sa, Christopher and Konstantinov, Nikola H},
booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
isbn = {9781450357951},
location = {Egham, United Kingdom},
pages = {169--178},
publisher = {ACM Press},
title = {{The convergence of stochastic gradient descent in asynchronous shared memory}},
doi = {10.1145/3212734.3212763},
year = {2018},
}
@inproceedings{5963,
abstract = {There has been significant progress in understanding the parallelism inherent to iterative sequential algorithms: for many classic algorithms, the depth of the dependence structure is now well understood, and scheduling techniques have been developed to exploit this shallow dependence structure for efficient parallel implementations. A related, applied research strand has studied methods by which certain iterative task-based algorithms can be efficiently parallelized via relaxed concurrent priority schedulers. These allow for high concurrency when inserting and removing tasks, at the cost of executing superfluous work due to the relaxed semantics of the scheduler. In this work, we take a step towards unifying these two research directions, by showing that there exists a family of relaxed priority schedulers that can efficiently and deterministically execute classic iterative algorithms such as greedy maximal independent set (MIS) and matching. Our primary result shows that, given a randomized scheduler with an expected relaxation factor of k in terms of the maximum allowed priority inversions on a task, and any graph on n vertices, the scheduler is able to execute greedy MIS with only an additive factor of \poly(k) expected additional iterations compared to an exact (but not scalable) scheduler. This counter-intuitive result demonstrates that the overhead of relaxation when computing MIS is not dependent on the input size or structure of the input graph. Experimental results show that this overhead can be clearly offset by the gain in performance due to the highly scalable scheduler. In sum, we present an efficient method to deterministically parallelize iterative sequential algorithms, with provable runtime guarantees in terms of the number of executed tasks to completion.},
author = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Nadiradze, Giorgi},
booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
isbn = {9781450357951},
location = {Egham, United Kingdom},
pages = {377--386},
publisher = {ACM Press},
title = {{Relaxed schedulers can efficiently parallelize iterative algorithms}},
doi = {10.1145/3212734.3212756},
year = {2018},
}
@inproceedings{6031,
abstract = {We introduce Clover, a new library for efficient computation using low-precision data, providing mathematical routines required by fundamental methods in optimization and sparse recovery. Our library faithfully implements variants of stochastic quantization that guarantee convergence at low precision, and supports data formats from 4-bit quantized to 32-bit IEEE-754 on current Intel processors. In particular, we show that 4-bit can be implemented efficiently using Intel AVX despite the lack of native support for this data format. Experimental results with dot product, matrix-vector multiplication (MVM), gradient descent (GD), and iterative hard thresholding (IHT) demonstrate that the attainable speedups are in many cases close to linear with respect to the reduction of precision due to reduced data movement. Finally, for GD and IHT, we show examples of absolute speedup achieved by 4-bit versus 32-bit, by iterating until a given target error is achieved.},
author = {Stojanov, Alen and Smith, Tyler Michael and Alistarh, Dan-Adrian and Puschel, Markus},
booktitle = {2018 IEEE International Workshop on Signal Processing Systems},
location = {Cape Town, South Africa},
publisher = {IEEE},
title = {{Fast quantized arithmetic on x86: Trading compute for data movement}},
doi = {10.1109/SiPS.2018.8598402},
volume = {2018-October},
year = {2018},
}
@inproceedings{7116,
abstract = {Training deep learning models has received tremendous research interest recently. In particular, there has been intensive research on reducing the communication cost of training when using multiple computational devices, through reducing the precision of the underlying data representation. Naturally, such methods induce system trade-offs—lowering communication precision could de-crease communication overheads and improve scalability; but, on the other hand, it can also reduce the accuracy of training. In this paper, we study this trade-off space, and ask:Can low-precision communication consistently improve the end-to-end performance of training modern neural networks, with no accuracy loss?From the performance point of view, the answer to this question may appear deceptively easy: compressing communication through low precision should help when the ratio between communication and computation is high. However, this answer is less straightforward when we try to generalize this principle across various neural network architectures (e.g., AlexNet vs. ResNet),number of GPUs (e.g., 2 vs. 8 GPUs), machine configurations(e.g., EC2 instances vs. NVIDIA DGX-1), communication primitives (e.g., MPI vs. NCCL), and even different GPU architectures(e.g., Kepler vs. Pascal). Currently, it is not clear how a realistic realization of all these factors maps to the speed up provided by low-precision communication. In this paper, we conduct an empirical study to answer this question and report the insights.},
author = {Grubic, Demjan and Tam, Leo and Alistarh, Dan-Adrian and Zhang, Ce},
booktitle = {Proceedings of the 21st International Conference on Extending Database Technology},
isbn = {9783893180783},
issn = {2367-2005},
location = {Vienna, Austria},
pages = {145--156},
publisher = {OpenProceedings},
title = {{Synchronous multi-GPU training for deep learning with low-precision communications: An empirical study}},
doi = {10.5441/002/EDBT.2018.14},
year = {2018},
}
@inproceedings{7123,
abstract = {Population protocols are a popular model of distributed computing, in which n agents with limited local state interact randomly, and cooperate to collectively compute global predicates. Inspired by recent developments in DNA programming, an extensive series of papers, across different communities, has examined the computability and complexity characteristics of this model. Majority, or consensus, is a central task in this model, in which agents need to collectively reach a decision as to which one of two states A or B had a higher initial count. Two metrics are important: the time that a protocol requires to stabilize to an output decision, and the state space size that each agent requires to do so. It is known that majority requires Ω(log log n) states per agent to allow for fast (poly-logarithmic time) stabilization, and that O(log2 n) states are sufficient. Thus, there is an exponential gap between the space upper and lower bounds for this problem. This paper addresses this question.
On the negative side, we provide a new lower bound of Ω(log n) states for any protocol which stabilizes in O(n1–c) expected time, for any constant c > 0. This result is conditional on monotonicity and output assumptions, satisfied by all known protocols. Technically, it represents a departure from previous lower bounds, in that it does not rely on the existence of dense configurations. Instead, we introduce a new generalized surgery technique to prove the existence of incorrect executions for any algorithm which would contradict the lower bound. Subsequently, our lower bound also applies to general initial configurations, including ones with a leader. On the positive side, we give a new algorithm for majority which uses O(log n) states, and stabilizes in O(log2 n) expected time. Central to the algorithm is a new leaderless phase clock technique, which allows agents to synchronize in phases of Θ(n log n) consecutive interactions using O(log n) states per agent, exploiting a new connection between population protocols and power-of-two-choices load balancing mechanisms. We also employ our phase clock to build a leader election algorithm with a state space of size O(log n), which stabilizes in O(log2 n) expected time.},
author = {Alistarh, Dan-Adrian and Aspnes, James and Gelashvili, Rati},
booktitle = {Proceedings of the 29th Annual ACM-SIAM Symposium on Discrete Algorithms},
isbn = {9781611975031},
location = {New Orleans, LA, United States},
pages = {2221--2239},
publisher = {ACM},
title = {{Space-optimal majority in population protocols}},
doi = {10.1137/1.9781611975031.144},
year = {2018},
}
@inproceedings{5964,
abstract = {A standard design pattern found in many concurrent data structures, such as hash tables or ordered containers, is an alternation of parallelizable sections that incur no data conflicts and critical sections that must run sequentially and are protected with locks. A lock can be viewed as a queue that arbitrates the order in which the critical sections are executed, and a natural question is whether we can use stochastic analysis to predict the resulting throughput. As a preliminary evidence to the affirmative, we describe a simple model that can be used to predict the throughput of coarse-grained lock-based algorithms. We show that our model works well for CLH lock, and we expect it to work for other popular lock designs such as TTAS, MCS, etc.},
author = {Aksenov, Vitaly and Alistarh, Dan-Adrian and Kuznetsov, Petr},
booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
isbn = {9781450357951},
location = {Egham, United Kingdom},
pages = {411--413},
publisher = {ACM Press},
title = {{Brief Announcement: Performance prediction for coarse-grained locking}},
doi = {10.1145/3212734.3212785},
year = {2018},
}
@article{6001,
abstract = {The concurrent memory reclamation problem is that of devising a way for a deallocating thread to verify that no other concurrent threads hold references to a memory block being deallocated. To date, in the absence of automatic garbage collection, there is no satisfactory solution to this problem; existing tracking methods like hazard pointers, reference counters, or epoch-based techniques like RCU are either prohibitively expensive or require significant programming expertise to the extent that implementing them efficiently can be worthy of a publication. None of the existing techniques are automatic or even semi-automated.
In this article, we take a new approach to concurrent memory reclamation. Instead of manually tracking access to memory locations as done in techniques like hazard pointers, or restricting shared accesses to specific epoch boundaries as in RCU, our algorithm, called ThreadScan, leverages operating system signaling to automatically detect which memory locations are being accessed by concurrent threads.
Initial empirical evidence shows that ThreadScan scales surprisingly well and requires negligible programming effort beyond the standard use of Malloc and Free.},
author = {Alistarh, Dan-Adrian and Leiserson, William and Matveev, Alexander and Shavit, Nir},
issn = {2329-4949},
journal = {ACM Transactions on Parallel Computing},
number = {4},
publisher = {Association for Computing Machinery},
title = {{ThreadScan: Automatic and scalable memory reclamation}},
doi = {10.1145/3201897},
volume = {4},
year = {2018},
}
@inproceedings{5965,
abstract = {Relaxed concurrent data structures have become increasingly popular, due to their scalability in graph processing and machine learning applications (\citeNguyen13, gonzalez2012powergraph ). Despite considerable interest, there exist families of natural, high performing randomized relaxed concurrent data structures, such as the popular MultiQueue~\citeMQ pattern for implementing relaxed priority queue data structures, for which no guarantees are known in the concurrent setting~\citeAKLN17. Our main contribution is in showing for the first time that, under a set of analytic assumptions, a family of relaxed concurrent data structures, including variants of MultiQueues, but also a new approximate counting algorithm we call the MultiCounter, provides strong probabilistic guarantees on the degree of relaxation with respect to the sequential specification, in arbitrary concurrent executions. We formalize these guarantees via a new correctness condition called distributional linearizability, tailored to concurrent implementations with randomized relaxations. Our result is based on a new analysis of an asynchronous variant of the classic power-of-two-choices load balancing algorithm, in which placement choices can be based on inconsistent, outdated information (this result may be of independent interest). We validate our results empirically, showing that the MultiCounter algorithm can implement scalable relaxed timestamps.},
author = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Li, Jerry Z. and Nadiradze, Giorgi},
booktitle = {Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures - SPAA '18},
isbn = {9781450357999},
location = {Vienna, Austria},
pages = {133--142},
publisher = {ACM Press},
title = {{Distributionally linearizable data structures}},
doi = {10.1145/3210377.3210411},
year = {2018},
}
@inproceedings{787,
abstract = {Population protocols are a popular model of distributed computing, in which randomly-interacting agents with little computational power cooperate to jointly perform computational tasks. Inspired by developments in molecular computation, and in particular DNA computing, recent algorithmic work has focused on the complexity of solving simple yet fundamental tasks in the population model, such as leader election (which requires convergence to a single agent in a special "leader" state), and majority (in which agents must converge to a decision as to which of two possible initial states had higher initial count). Known results point towards an inherent trade-off between the time complexity of such algorithms, and the space complexity, i.e. size of the memory available to each agent. In this paper, we explore this trade-off and provide new upper and lower bounds for majority and leader election. First, we prove a unified lower bound, which relates the space available per node with the time complexity achievable by a protocol: for instance, our result implies that any protocol solving either of these tasks for n agents using O(log log n) states must take (n=polylogn) expected time. This is the first result to characterize time complexity for protocols which employ super-constant number of states per node, and proves that fast, poly-logarithmic running times require protocols to have relatively large space costs. On the positive side, we give algorithms showing that fast, poly-logarithmic convergence time can be achieved using O(log2 n) space per node, in the case of both tasks. Overall, our results highlight a time complexity separation between O(log log n) and (log2 n) state space size for both majority and leader election in population protocols, and introduce new techniques, which should be applicable more broadly.},
author = {Alistarh, Dan-Adrian and Aspnes, James and Eisenstat, David and Rivest, Ronald and Gelashvili, Rati},
pages = {2560 -- 2579},
publisher = {SIAM},
title = {{Time-space trade-offs in population protocols}},
doi = {doi.org/10.1137/1.9781611974782.169},
year = {2017},
}