@article{9541,
abstract = {The Massively Parallel Computation (MPC) model is an emerging model that distills core aspects of distributed and parallel computation, developed as a tool to solve combinatorial (typically graph) problems in systems of many machines with limited space. Recent work has focused on the regime in which machines have sublinear (in n, the number of nodes in the input graph) space, with randomized algorithms presented for the fundamental problems of Maximal Matching and Maximal Independent Set. However, there have been no prior corresponding deterministic algorithms. A major challenge underlying the sublinear space setting is that the local space of each machine might be too small to store all edges incident to a single node. This poses a considerable obstacle compared to classical models in which each node is assumed to know and have easy access to its incident edges. To overcome this barrier, we introduce a new graph sparsification technique that deterministically computes a low-degree subgraph, with the additional property that solving the problem on this subgraph provides significant progress towards solving the problem for the original input graph. Using this framework to derandomize the well-known algorithm of Luby [SICOMP’86], we obtain O(log Δ + log log n)-round deterministic MPC algorithms for solving the problems of Maximal Matching and Maximal Independent Set with O(nɛ) space on each machine for any constant ɛ > 0. These algorithms also run in O(log Δ) rounds in the closely related model of CONGESTED CLIQUE, improving upon the state-of-the-art bound of O(log 2Δ) rounds by Censor-Hillel et al. [DISC’17].},
author = {Czumaj, Artur and Davies, Peter and Parter, Merav},
issn = {1549-6333},
journal = {ACM Transactions on Algorithms},
number = {2},
publisher = {ACM},
title = {{Graph sparsification for derandomizing massively parallel computation with low space}},
doi = {10.1145/3451992},
volume = {17},
year = {2021},
}
@article{9571,
abstract = {As the size and complexity of models and datasets grow, so does the need for communication-efficient variants of stochastic gradient descent that can be deployed to perform parallel model training. One popular communication-compression method for data-parallel SGD is QSGD (Alistarh et al., 2017), which quantizes and encodes gradients to reduce communication costs. The baseline variant of QSGD provides strong theoretical guarantees, however, for practical purposes, the authors proposed a heuristic variant which we call QSGDinf, which demonstrated impressive empirical gains for distributed training of large neural networks. In this paper, we build on this work to propose a new gradient quantization scheme, and show that it has both stronger theoretical guarantees than QSGD, and matches and exceeds the empirical performance of the QSGDinf heuristic and of other compression methods.},
author = {Ramezani-Kebrya, Ali and Faghri, Fartash and Markov, Ilya and Aksenov, Vitalii and Alistarh, Dan-Adrian and Roy, Daniel M.},
issn = {1533-7928},
journal = {Journal of Machine Learning Research},
number = {114},
pages = {1--43},
publisher = {Journal of Machine Learning Research},
title = {{NUQSGD: Provably communication-efficient data-parallel SGD via nonuniform quantization}},
volume = {22},
year = {2021},
}
@inproceedings{9678,
abstract = {We introduce a new graph problem, the token dropping game, and we show how to solve it efficiently in a distributed setting. We use the token dropping game as a tool to design an efficient distributed algorithm for stable orientations and more generally for locally optimal semi-matchings. The prior work by Czygrinow et al. (DISC 2012) finds a stable orientation in O(Δ^5) rounds in graphs of maximum degree Δ, while we improve it to O(Δ^4) and also prove a lower bound of Ω(Δ). For the more general problem of locally optimal semi-matchings, the prior upper bound is O(S^5) and our new algorithm runs in O(C · S^4) rounds, which is an improvement for C = o(S); here C and S are the maximum degrees of customers and servers, respectively.},
author = {Brandt, Sebastian and Keller, Barbara and Rybicki, Joel and Suomela, Jukka and Uitto, Jara},
booktitle = {Annual ACM Symposium on Parallelism in Algorithms and Architectures},
isbn = {9781450380706},
location = {Virtual Event, United States},
pages = {129--139},
title = {{Efficient load-balancing through distributed token dropping}},
doi = {10.1145/3409964.3461785},
year = {2021},
}
@inproceedings{9620,
abstract = {In this note, we introduce a distributed twist on the classic coupon collector problem: a set of m collectors wish to each obtain a set of n coupons; for this, they can each sample coupons uniformly at random, but can also meet in pairwise interactions, during which they can exchange coupons. By doing so, they hope to reduce the number of coupons that must be sampled by each collector in order to obtain a full set. This extension is natural when considering real-world manifestations of the coupon collector phenomenon, and has been remarked upon and studied empirically (Hayes and Hannigan 2006, Ahmad et al. 2014, Delmarcelle 2019).
We provide the first theoretical analysis for such a scenario. We find that “coupon collecting with friends” can indeed significantly reduce the number of coupons each collector must sample, and raises interesting connections to the more traditional variants of the problem. While our analysis is in most cases asymptotically tight, there are several open questions raised, regarding finer-grained analysis of both “coupon collecting with friends,” and of a long-studied variant of the original problem in which a collector requires multiple full sets of coupons.},
author = {Alistarh, Dan-Adrian and Davies, Peter},
booktitle = {Structural Information and Communication Complexity},
isbn = {9783030795269},
issn = {1611-3349},
location = {Wrocław, Poland},
pages = {3--12},
publisher = {Springer Nature},
title = {{Collecting coupons is faster with friends}},
doi = {10.1007/978-3-030-79527-6_1},
volume = {12810},
year = {2021},
}
@inproceedings{9543,
abstract = {We consider the problem ofdistributed mean estimation (DME), in which n machines are each given a local d-dimensional vector xv∈Rd, and must cooperate to estimate the mean of their inputs μ=1n∑nv=1xv, while minimizing total communication cost. DME is a fundamental construct in distributed machine learning, and there has been considerable work on variants of this problem, especially in the context of distributed variance reduction for stochastic gradients in parallel SGD. Previous work typically assumes an upper bound on the norm of the input vectors, and achieves an error bound in terms of this norm. However, in many real applications, the input vectors are concentrated around the correct output μ, but μ itself has large norm. In such cases, previous output error bounds perform poorly. In this paper, we show that output error bounds need not depend on input norm. We provide a method of quantization which allows distributed mean estimation to be performed with solution quality dependent only on the distance between inputs, not on input norm, and show an analogous result for distributed variance reduction. The technique is based on a new connection with lattice theory. We also provide lower bounds showing that the communication to error trade-off of our algorithms is asymptotically optimal. As the lattices achieving optimal bounds under l2-norm can be computationally impractical, we also present an extension which leverages easy-to-use cubic lattices, and is loose only up to a logarithmic factor ind. We show experimentally that our method yields practical improvements for common applications, relative to prior approaches.},
author = {Davies, Peter and Gurunanthan, Vijaykrishna and Moshrefi, Niusha and Ashkboos, Saleh and Alistarh, Dan-Adrian},
booktitle = {9th International Conference on Learning Representations},
location = {Virtual},
title = {{New bounds for distributed mean estimation and variance reduction}},
year = {2021},
}
@article{9827,
abstract = {The Nearest neighbour search (NNS) is a fundamental problem in many application domains dealing with multidimensional data. In a concurrent setting, where dynamic modifications are allowed, a linearizable implementation of the NNS is highly desirable. This paper introduces the LockFree-kD-tree (LFkD-tree): a lock-free concurrent kD-tree, which implements an abstract data type (ADT) that provides the operations Add, Remove, Contains, and NNS. Our implementation is linearizable. The operations in the LFkD-tree use single-word read and compare-and-swap (CAS) atomic primitives, which are readily supported on available multi-core processors. We experimentally evaluate the LFkD-tree using several benchmarks comprising real-world and synthetic datasets. The experiments show that the presented design is scalable and achieves significant speed-up compared to the implementations of an existing sequential kD-tree and a recently proposed multidimensional indexing structure, PH-tree.},
author = {Chatterjee, Bapi and Walulya, Ivan and Tsigas, Philippas},
issn = {0304-3975},
journal = {Theoretical Computer Science},
keywords = {Concurrent data structure, kD-tree, Nearest neighbor search, Similarity search, Lock-free, Linearizability},
publisher = {Elsevier},
title = {{Concurrent linearizable nearest neighbour search in LockFree-kD-tree}},
doi = {10.1016/j.tcs.2021.06.041},
year = {2021},
}
@inproceedings{9823,
abstract = {Approximate agreement is one of the few variants of consensus that can be solved in a wait-free manner in asynchronous systems where processes communicate by reading and writing to shared memory. In this work, we consider a natural generalisation of approximate agreement on arbitrary undirected connected graphs. Each process is given a vertex of the graph as input and, if non-faulty, must output a vertex such that
all the outputs are within distance 1 of one another, and
each output value lies on a shortest path between two input values.
From prior work, it is known that there is no wait-free algorithm among 𝑛≥3 processes for this problem on any cycle of length 𝑐≥4 , by reduction from 2-set agreement (Castañeda et al. 2018).
In this work, we investigate the solvability and complexity of this task on general graphs. We give a new, direct proof of the impossibility of approximate agreement on cycles of length 𝑐≥4 , via a generalisation of Sperner’s Lemma to convex polygons. We also extend the reduction from 2-set agreement to a larger class of graphs, showing that approximate agreement on these graphs is unsolvable. On the positive side, we present a wait-free algorithm for a class of graphs that properly contains the class of chordal graphs.},
author = {Alistarh, Dan-Adrian and Ellen, Faith and Rybicki, Joel},
booktitle = {Structural Information and Communication Complexity},
isbn = {9783030795269},
issn = {1611-3349},
location = {Wrocław, Poland},
pages = {87--105},
publisher = {Springer Nature},
title = {{Wait-free approximate agreement on graphs}},
doi = {10.1007/978-3-030-79527-6_6},
volume = {12810},
year = {2021},
}
@article{8723,
abstract = {Deep learning at scale is dominated by communication time. Distributing samples across nodes usually yields the best performance, but poses scaling challenges due to global information dissemination and load imbalance across uneven sample lengths. State-of-the-art decentralized optimizers mitigate the problem, but require more iterations to achieve the same accuracy as their globally-communicating counterparts. We present Wait-Avoiding Group Model Averaging (WAGMA) SGD, a wait-avoiding stochastic optimizer that reduces global communication via subgroup weight exchange. The key insight is a combination of algorithmic changes to the averaging scheme and the use of a group allreduce operation. We prove the convergence of WAGMA-SGD, and empirically show that it retains convergence rates similar to Allreduce-SGD. For evaluation, we train ResNet-50 on ImageNet; Transformer for machine translation; and deep reinforcement learning for navigation at scale. Compared with state-of-the-art decentralized SGD variants, WAGMA-SGD significantly improves training throughput (e.g., 2.1× on 1,024 GPUs for reinforcement learning), and achieves the fastest time-to-solution (e.g., the highest score using the shortest training time for Transformer).},
author = {Li, Shigang and Ben-Nun, Tal and Nadiradze, Giorgi and Di Girolamo, Salvatore and Dryden, Nikoli and Alistarh, Dan-Adrian and Hoefler, Torsten},
issn = {1045-9219},
journal = {IEEE Transactions on Parallel and Distributed Systems},
number = {7},
publisher = {IEEE},
title = {{Breaking (global) barriers in parallel stochastic optimization with wait-avoiding group averaging}},
doi = {10.1109/TPDS.2020.3040606},
volume = {32},
year = {2021},
}
@inproceedings{9933,
abstract = {In this paper, we study the power and limitations of component-stable algorithms in the low-space model of Massively Parallel Computation (MPC). Recently Ghaffari, Kuhn and Uitto (FOCS 2019) introduced the class of component-stable low-space MPC algorithms, which are, informally, defined as algorithms for which the outputs reported by the nodes in different connected components are required to be independent. This very natural notion was introduced to capture most (if not all) of the known efficient MPC algorithms to date, and it was the first general class of MPC algorithms for which one can show non-trivial conditional lower bounds. In this paper we enhance the framework of component-stable algorithms and investigate its effect on the complexity of randomized and deterministic low-space MPC. Our key contributions include: 1) We revise and formalize the lifting approach of Ghaffari, Kuhn and Uitto. This requires a very delicate amendment of the notion of component stability, which allows us to fill in gaps in the earlier arguments. 2) We also extend the framework to obtain conditional lower bounds for deterministic algorithms and fine-grained lower bounds that depend on the maximum degree Δ. 3) We demonstrate a collection of natural graph problems for which non-component-stable algorithms break the conditional lower bound obtained for component-stable algorithms. This implies that, for both deterministic and randomized algorithms, component-stable algorithms are conditionally weaker than the non-component-stable ones.
Altogether our results imply that component-stability might limit the computational power of the low-space MPC model, paving the way for improved upper bounds that escape the conditional lower bound setting of Ghaffari, Kuhn, and Uitto.},
author = {Czumaj, Artur and Davies, Peter and Parter, Merav},
booktitle = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
isbn = {9781450385480},
location = {Virtual, Italy},
pages = {481--491},
publisher = {Association for Computing Machinery},
title = {{Component stability in low-space massively parallel computation}},
doi = {10.1145/3465084.3467903},
year = {2021},
}
@inproceedings{9935,
abstract = {We present a deterministic O(log log log n)-round low-space Massively Parallel Computation (MPC) algorithm for the classical problem of (Δ+1)-coloring on n-vertex graphs. In this model, every machine has sublinear local space of size n^φ for any arbitrary constant φ \in (0,1). Our algorithm works under the relaxed setting where each machine is allowed to perform exponential local computations, while respecting the n^φ space and bandwidth limitations.
Our key technical contribution is a novel derandomization of the ingenious (Δ+1)-coloring local algorithm by Chang-Li-Pettie (STOC 2018, SIAM J. Comput. 2020). The Chang-Li-Pettie algorithm runs in T_local =poly(loglog n) rounds, which sets the state-of-the-art randomized round complexity for the problem in the local model. Our derandomization employs a combination of tools, notably pseudorandom generators (PRG) and bounded-independence hash functions.
The achieved round complexity of O(logloglog n) rounds matches the bound of log(T_local ), which currently serves an upper bound barrier for all known randomized algorithms for locally-checkable problems in this model. Furthermore, no deterministic sublogarithmic low-space MPC algorithms for the (Δ+1)-coloring problem have been known before.},
author = {Czumaj, Artur and Davies, Peter and Parter, Merav},
booktitle = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
isbn = {978-1-4503-8548-0},
location = {Virtual, Italy},
pages = {469--479},
publisher = {Association for Computing Machinery},
title = {{Improved deterministic (Δ+1) coloring in low-space MPC}},
doi = {10.1145/3465084.3467937},
year = {2021},
}
@inproceedings{9951,
abstract = {There has recently been a surge of interest in the computational and complexity properties of the population model, which assumes n anonymous, computationally-bounded nodes, interacting at random, with the goal of jointly computing global predicates. Significant work has gone towards investigating majority or consensus dynamics in this model: that is, assuming that every node is initially in one of two states X or Y, determine which state had higher initial count.
In this paper, we consider a natural generalization of majority/consensus, which we call comparison : in its simplest formulation, we are given two baseline states, X and Y, present in any initial configuration in fixed, but possibly small counts. One of these states has higher count than the other: we will assume |X_0| > C |Y_0| for some constant C > 1. The challenge is to design a protocol by which nodes can quickly and reliably decide on which of the baseline states X_0 and Y_0 has higher initial count. We begin by analyzing a simple and general dynamics solving the above comparison problem, which uses O( log n ) states per node, and converges in O(log n) (parallel) time, with high probability, to a state where the whole population votes on opinions X or Y at rates proportional to the initial concentrations of |X_0| vs. |Y_0|. We then describe how this procedure can be bootstrapped to solve comparison, i.e. have every node in the population reach the "correct'' decision, with probability 1 - o(1), at the cost of O (log log n) additional states. Further, we prove that this dynamics is self-stabilizing, in the sense that it converges to the correct decision from arbitrary initial states, and leak-robust, in the sense that it can withstand spurious faulty reactions, which are known to occur in practical implementations of population protocols. Our analysis is based on a new martingale concentration result relating the discrete-time evolution of a population protocol to its expected (steady-state) analysis, which should be a useful tool when analyzing opinion dynamics and epidemic dissemination in the population model.},
author = {Alistarh, Dan-Adrian and Töpfer, Martin and Uznański, Przemysław},
booktitle = {Proceedings of the 2021 ACM Symposium on Principles of Distributed Computing},
isbn = {9781450385480},
location = {Virtual, Italy},
pages = {55--65},
publisher = {Association for Computing Machinery},
title = {{Comparison dynamics in population protocols}},
doi = {10.1145/3465084.3467915},
year = {2021},
}
@inproceedings{8191,
abstract = {There has been a significant amount of research on hardware and software support for efficient concurrent data structures; yet, the question of how to build correct, simple, and scalable data structures has not yet been definitively settled. In this paper, we revisit this question from a minimalist perspective, and ask: what is the smallest amount of synchronization required for correct and efficient concurrent search data structures, and how could this minimal synchronization support be provided in hardware?
To address these questions, we introduce memory tagging, a simple hardware mechanism which enables the programmer to "tag" a dynamic set of memory locations, at cache-line granularity, and later validate whether the memory has been concurrently modified, with the possibility of updating one of the underlying locations atomically if validation succeeds. We provide several examples showing that this mechanism can enable fast and arguably simple concurrent data structure designs, such as lists, binary search trees, balanced search trees, range queries, and Software Transactional Memory (STM) implementations. We provide an implementation of memory tags in the Graphite multi-core simulator, showing that the mechanism can be implemented entirely at the level of L1 cache, and that it can enable non-trivial speedups versus existing implementations of the above data structures.},
author = {Alistarh, Dan-Adrian and Brown, Trevor A and Singhal, Nandini},
booktitle = {Annual ACM Symposium on Parallelism in Algorithms and Architectures},
isbn = {9781450369350},
location = {Virtual Event, United States},
number = {7},
pages = {37--49},
publisher = {ACM},
title = {{Memory tagging: Minimalist synchronization for scalable concurrent data structures}},
doi = {10.1145/3350755.3400213},
year = {2020},
}
@article{8268,
abstract = {Modern scientific instruments produce vast amounts of data, which can overwhelm the processing ability of computer systems. Lossy compression of data is an intriguing solution, but comes with its own drawbacks, such as potential signal loss, and the need for careful optimization of the compression ratio. In this work, we focus on a setting where this problem is especially acute: compressive sensing frameworks for interferometry and medical imaging. We ask the following question: can the precision of the data representation be lowered for all inputs, with recovery guarantees and practical performance Our first contribution is a theoretical analysis of the normalized Iterative Hard Thresholding (IHT) algorithm when all input data, meaning both the measurement matrix and the observation vector are quantized aggressively. We present a variant of low precision normalized IHT that, under mild conditions, can still provide recovery guarantees. The second contribution is the application of our quantization framework to radio astronomy and magnetic resonance imaging. We show that lowering the precision of the data can significantly accelerate image recovery. We evaluate our approach on telescope data and samples of brain images using CPU and FPGA implementations achieving up to a 9x speedup with negligible loss of recovery quality.},
author = {Gürel, Nezihe Merve and Kara, Kaan and Stojanov, Alen and Smith, Tyler and Lemmin, Thomas and Alistarh, Dan-Adrian and Püschel, Markus and Zhang, Ce},
issn = {1941-0476},
journal = {IEEE Transactions on Signal Processing},
pages = {4268--4282},
publisher = {IEEE},
title = {{Compressive sensing using iterative hard thresholding with low precision data representation: Theory and applications}},
doi = {10.1109/TSP.2020.3010355},
volume = {68},
year = {2020},
}
@inproceedings{8286,
abstract = {We consider the following dynamic load-balancing process: given an underlying graph G with n nodes, in each step t≥ 0, one unit of load is created, and placed at a randomly chosen graph node. In the same step, the chosen node picks a random neighbor, and the two nodes balance their loads by averaging them. We are interested in the expected gap between the minimum and maximum loads at nodes as the process progresses, and its dependence on n and on the graph structure. Variants of the above graphical balanced allocation process have been studied previously by Peres, Talwar, and Wieder [Peres et al., 2015], and by Sauerwald and Sun [Sauerwald and Sun, 2015]. These authors left as open the question of characterizing the gap in the case of cycle graphs in the dynamic case, where weights are created during the algorithm’s execution. For this case, the only known upper bound is of 𝒪(n log n), following from a majorization argument due to [Peres et al., 2015], which analyzes a related graphical allocation process. In this paper, we provide an upper bound of 𝒪 (√n log n) on the expected gap of the above process for cycles of length n. We introduce a new potential analysis technique, which enables us to bound the difference in load between k-hop neighbors on the cycle, for any k ≤ n/2. We complement this with a "gap covering" argument, which bounds the maximum value of the gap by bounding its value across all possible subsets of a certain structure, and recursively bounding the gaps within each subset. We provide analytical and experimental evidence that our upper bound on the gap is tight up to a logarithmic factor. },
author = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Sabour, Amirmojtaba},
booktitle = {47th International Colloquium on Automata, Languages, and Programming},
isbn = {9783959771382},
issn = {1868-8969},
location = {Virtual, Online; Germany},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
title = {{Dynamic averaging load balancing on cycles}},
doi = {10.4230/LIPIcs.ICALP.2020.7},
volume = {168},
year = {2020},
}
@inproceedings{8383,
abstract = {We introduce extension-based proofs, a class of impossibility proofs that includes valency arguments. They are modelled as an interaction between a prover and a protocol. Using proofs based on combinatorial topology, it has been shown that it is impossible to deterministically solve k-set agreement among n > k ≥ 2 processes in a wait-free manner. However, it was unknown whether proofs based on simpler techniques were possible. We explain why this impossibility result cannot be obtained by an extension-based proof and, hence, extension-based proofs are limited in power.},
author = {Alistarh, Dan-Adrian and Aspnes, James and Ellen, Faith and Gelashvili, Rati and Zhu, Leqi},
booktitle = {Proceedings of the 39th Symposium on Principles of Distributed Computing},
isbn = {9781450375825},
location = {Virtual, Italy},
pages = {54--56},
publisher = {ACM},
title = {{Brief Announcement: Why Extension-Based Proofs Fail}},
doi = {10.1145/3382734.3405743},
year = {2020},
}
@inproceedings{8722,
abstract = {Load imbalance pervasively exists in distributed deep learning training systems, either caused by the inherent imbalance in learned tasks or by the system itself. Traditional synchronous Stochastic Gradient Descent (SGD)
achieves good accuracy for a wide variety of tasks, but relies on global synchronization to accumulate the gradients at every training step. In this paper, we propose eager-SGD, which relaxes the global synchronization for
decentralized accumulation. To implement eager-SGD, we propose to use two partial collectives: solo and majority. With solo allreduce, the faster processes contribute their gradients eagerly without waiting for the slower processes, whereas with majority allreduce, at least half of the participants must contribute gradients before continuing, all without using a central parameter server. We theoretically prove the convergence of the algorithms and describe the partial collectives in detail. Experimental results on load-imbalanced environments (CIFAR-10, ImageNet, and UCF101 datasets) show
that eager-SGD achieves 1.27x speedup over the state-of-the-art synchronous SGD, without losing accuracy.},
author = {Li, Shigang and Ben-Nun, Tal and Di Girolamo, Salvatore and Alistarh, Dan-Adrian and Hoefler, Torsten},
booktitle = {Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
location = {San Diego, CA, United States},
pages = {45--61},
publisher = {Association for Computing Machinery},
title = {{Taming unbalanced training workloads in deep learning with partial collective operations}},
doi = {10.1145/3332466.3374528},
year = {2020},
}
@article{7224,
abstract = {Habitat loss is one of the key drivers of the ongoing decline of biodiversity. However, ecologists still argue about how fragmentation of habitat (independent of habitat loss) affects species richness. The recently proposed habitat amount hypothesis posits that species richness only depends on the total amount of habitat in a local landscape. In contrast, empirical studies report contrasting patterns: some find positive and others negative effects of fragmentation per se on species richness. To explain this apparent disparity, we devise a stochastic, spatially explicit model of competitive species communities in heterogeneous habitats. The model shows that habitat loss and fragmentation have complex effects on species diversity in competitive communities. When the total amount of habitat is large, fragmentation per se tends to increase species diversity, but if the total amount of habitat is small, the situation is reversed: fragmentation per se decreases species diversity.},
author = {Rybicki, Joel and Abrego, Nerea and Ovaskainen, Otso},
issn = {1461-023X},
journal = {Ecology Letters},
number = {3},
pages = {506--517},
publisher = {Wiley},
title = {{Habitat fragmentation and species diversity in competitive communities}},
doi = {10.1111/ele.13450},
volume = {23},
year = {2020},
}
@inproceedings{7272,
abstract = {Many systems rely on optimistic concurrent search trees for multi-core scalability. In principle, optimistic trees have a simple performance story: searches are read-only and so run in parallel, with writes to shared memory occurring only when modifying the data structure. However, this paper shows that in practice, obtaining the full performance benefits of optimistic search trees is not so simple.
We focus on optimistic binary search trees (BSTs) and perform a detailed performance analysis of 10 state-of-the-art BSTs on large scale x86-64 hardware, using both microbenchmarks and an in-memory database system. We find and explain significant unexpected performance differences between BSTs with similar tree structure and search implementations, which we trace to subtle performance-degrading interactions of BSTs with systems software and hardware subsystems. We further derive a prescriptive approach to avoid this performance degradation, as well as algorithmic insights on optimistic BST design. Our work underlines the gap between the theory and practice of multi-core performance, and calls for further research to help bridge this gap.},
author = {Arbel-Raviv, Maya and Brown, Trevor A and Morrison, Adam},
booktitle = {Proceedings of the 2018 USENIX Annual Technical Conference},
isbn = {9781939133021},
location = {Boston, MA, United States},
pages = {295--306},
publisher = {USENIX Association},
title = {{Getting to the root of concurrent binary search tree performance}},
year = {2020},
}
@inproceedings{7605,
abstract = {Union-Find (or Disjoint-Set Union) is one of the fundamental problems in computer science; it has been well-studied from both theoretical and practical perspectives in the sequential case. Recently, there has been mounting interest in analyzing this problem in the concurrent scenario, and several asymptotically-efficient algorithms have been proposed. Yet, to date, there is very little known about the practical performance of concurrent Union-Find. This work addresses this gap. We evaluate and analyze the performance of several concurrent Union-Find algorithms and optimization strategies across a wide range of platforms (Intel, AMD, and ARM) and workloads (social, random, and road networks, as well as integrations into more complex algorithms). We first observe that, due to the limited computational cost, the number of induced cache misses is the critical determining factor for the performance of existing algorithms. We introduce new techniques to reduce this cost by storing node priorities implicitly and by using plain reads and writes in a way that does not affect the correctness of the algorithms. Finally, we show that Union-Find implementations are an interesting application for Transactional Memory (TM): one of the fastest algorithm variants we discovered is a sequential one that uses coarse-grained locking with the lock elision optimization to reduce synchronization cost and increase scalability. },
author = {Alistarh, Dan-Adrian and Fedorov, Alexander and Koval, Nikita},
booktitle = {23rd International Conference on Principles of Distributed Systems},
isbn = {9783959771337},
issn = {1868-8969},
location = {Neuchâtel, Switzerland},
pages = {15:1--15:16},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
title = {{In search of the fastest concurrent union-find algorithm}},
doi = {10.4230/LIPIcs.OPODIS.2019.15},
volume = {153},
year = {2020},
}
@inproceedings{7635,
abstract = {Concurrent programming can be notoriously complex and error-prone. Programming bugs can arise from a variety of sources, such as operation re-reordering, or incomplete understanding of the memory model. A variety of formal and model checking methods have been developed to address this fundamental difficulty. While technically interesting, existing academic methods are still hard to apply to the large codebases typical of industrial deployments, which limits their practical impact.},
author = {Koval, Nikita and Sokolova, Mariia and Fedorov, Alexander and Alistarh, Dan-Adrian and Tsitelov, Dmitry},
booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP},
isbn = {9781450368186},
location = {San Diego, CA, United States},
pages = {423--424},
publisher = {ACM},
title = {{Testing concurrency on the JVM with Lincheck}},
doi = {10.1145/3332466.3374503},
year = {2020},
}