@inproceedings{12736,
  abstract = {Although a wide variety of handcrafted concurrent data structures have been proposed, there is considerable interest in universal approaches (Universal Constructions or UCs) for building concurrent data structures. UCs (semi-)automatically convert a sequential data structure into a concurrent one. The simplest approach uses locks [3, 6] that protect a sequential data structure and allow only one process to access it at a time. However, the resulting data structure is blocking. Most work on UCs instead focuses on obtaining non-blocking progress guarantees such as obstruction-freedom, lock-freedom or wait-freedom. Many non-blocking UCs have appeared. Key examples include the seminal wait-free UC [2] by Herlihy, a NUMA-aware UC [10] by Yi et al., and an efficient UC for large objects [1] by Fatourou et al.},
  author = {Aksenov, Vitaly and Brown, Trevor A and Fedorov, Alexander and Kokorin, Ilya},
  booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn = {9798400700156},
  location = {Montreal, QC, Canada},
  pages = {438--440},
  publisher = {Association for Computing Machinery},
  title = {{Unexpected scaling in path copying trees}},
  doi = {10.1145/3572848.3577512},
  year = {2023},
}

@inproceedings{11181,
  abstract = {To maximize the performance of concurrent data structures, researchers have often turned to highly complex fine-grained techniques, resulting in efficient and elegant algorithms, which, however, can often be difficult to understand and prove correct. While simpler techniques exist, such as transactional memory, they can have limited performance or portability relative to their fine-grained counterparts. Approaches at both ends of this complexity-performance spectrum have been extensively explored, but relatively less is known about the middle ground: approaches that are willing to sacrifice some performance for simplicity, while remaining competitive with state-of-the-art handcrafted designs. In this paper, we explore this middle ground, and present PathCAS, a primitive that combines ideas from multi-word CAS (KCAS) and transactional memory approaches, while carefully avoiding overhead. We show how PathCAS can be used to implement efficient search data structures relatively simply, using an internal binary search tree as an example, then extending this to an AVL tree. Our best implementations outperform many handcrafted search trees: in search-heavy workloads, they rival the BCCO tree [5], the fastest known concurrent binary tree in terms of search performance [3]. Our results suggest that PathCAS can yield concurrent data structures that are relatively easy to build and prove correct, while offering surprisingly high performance.},
  author = {Brown, Trevor A and Sigouin, William and Alistarh, Dan-Adrian},
  booktitle = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn = {9781450392044},
  location = {Seoul, Republic of Korea},
  pages = {385--399},
  publisher = {Association for Computing Machinery},
  title = {{PathCAS: An efficient middle ground for concurrent search data structures}},
  doi = {10.1145/3503221.3508410},
  year = {2022},
}

@inproceedings{7272,
  abstract = {Many systems rely on optimistic concurrent search trees for multi-core scalability. In principle, optimistic trees have a simple performance story: searches are read-only and so run in parallel, with writes to shared memory occurring only when modifying the data structure.
However, this paper shows that in practice, obtaining the full performance benefits of optimistic search trees is not so simple. We focus on optimistic binary search trees (BSTs) and perform a detailed performance analysis of 10 state-of-the-art BSTs on large scale x86-64 hardware, using both microbenchmarks and an in-memory database system. We find and explain significant unexpected performance differences between BSTs with similar tree structure and search implementations, which we trace to subtle performance-degrading interactions of BSTs with systems software and hardware subsystems. We further derive a prescriptive approach to avoid this performance degradation, as well as algorithmic insights on optimistic BST design. Our work underlines the gap between the theory and practice of multi-core performance, and calls for further research to help bridge this gap.},
  author = {Arbel-Raviv, Maya and Brown, Trevor A and Morrison, Adam},
  booktitle = {Proceedings of the 2018 USENIX Annual Technical Conference},
  isbn = {9781939133021},
  location = {Boston, MA, United States},
  pages = {295--306},
  publisher = {USENIX Association},
  title = {{Getting to the root of concurrent binary search tree performance}},
  year = {2018},
}

@inproceedings{7636,
  abstract = {Balanced search trees typically use key comparisons to guide their operations, and achieve logarithmic running time. By relying on numerical properties of the keys, interpolation search achieves lower search complexity and better performance. Although interpolation-based data structures were investigated in the past, their non-blocking concurrent variants have received very little attention so far. In this paper, we propose the first non-blocking implementation of the classic interpolation search tree (IST) data structure. For arbitrary key distributions, the data structure ensures worst-case O(log n + p) amortized time for search, insertion and deletion traversals. When the input key distributions are smooth, lookups run in expected O(log log n + p) time, and insertion and deletion run in expected amortized O(log log n + p) time, where p is a bound on the number of threads. To improve the scalability of concurrent insertion and deletion, we propose a novel parallel rebuilding technique, which should be of independent interest. We evaluate whether the theoretical improvements translate to practice by implementing the concurrent interpolation search tree, and benchmarking it on uniform and nonuniform key distributions, for dataset sizes in the millions to billions of keys. Relative to the state-of-the-art concurrent data structures, the concurrent interpolation search tree achieves performance improvements of up to 15% under high update rates, and of up to 50% under moderate update rates. Further, ISTs exhibit up to 2X fewer cache misses, and consume 1.2--2.6X less memory compared to the next best alternative on typical dataset sizes.
We find that the results are surprisingly robust to distributional skew, which suggests that our data structure can be a promising alternative to classic concurrent search structures.},
  author = {Brown, Trevor A and Prokopec, Aleksandar and Alistarh, Dan-Adrian},
  booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn = {9781450368186},
  location = {San Diego, CA, United States},
  pages = {276--291},
  publisher = {Association for Computing Machinery},
  title = {{Non-blocking interpolation search trees with doubly-logarithmic running time}},
  doi = {10.1145/3332466.3374542},
  year = {2020},
}

@inproceedings{8191,
  abstract = {There has been a significant amount of research on hardware and software support for efficient concurrent data structures; yet, the question of how to build correct, simple, and scalable data structures has not yet been definitively settled. In this paper, we revisit this question from a minimalist perspective, and ask: what is the smallest amount of synchronization required for correct and efficient concurrent search data structures, and how could this minimal synchronization support be provided in hardware? To address these questions, we introduce memory tagging, a simple hardware mechanism which enables the programmer to "tag" a dynamic set of memory locations, at cache-line granularity, and later validate whether the memory has been concurrently modified, with the possibility of updating one of the underlying locations atomically if validation succeeds. We provide several examples showing that this mechanism can enable fast and arguably simple concurrent data structure designs, such as lists, binary search trees, balanced search trees, range queries, and Software Transactional Memory (STM) implementations. We provide an implementation of memory tags in the Graphite multi-core simulator, showing that the mechanism can be implemented entirely at the level of L1 cache, and that it can enable non-trivial speedups versus existing implementations of the above data structures.},
  author = {Alistarh, Dan-Adrian and Brown, Trevor A and Singhal, Nandini},
  booktitle = {Annual ACM Symposium on Parallelism in Algorithms and Architectures},
  isbn = {9781450369350},
  location = {Virtual Event, United States},
  number = {7},
  pages = {37--49},
  publisher = {Association for Computing Machinery},
  title = {{Memory tagging: Minimalist synchronization for scalable concurrent data structures}},
  doi = {10.1145/3350755.3400213},
  year = {2020},
}

@inproceedings{397,
  abstract = {Concurrent sets with range query operations are highly desirable in applications such as in-memory databases. However, few set implementations offer range queries. Known techniques for augmenting data structures with range queries (or operations that can be used to build range queries) have numerous problems that limit their usefulness. For example, they impose high overhead or rely heavily on garbage collection. In this work, we show how to augment data structures with highly efficient range queries, without relying on garbage collection. We identify a property of epoch-based memory reclamation algorithms that makes them ideal for implementing range queries, and produce three algorithms, which use locks, transactional memory and lock-free techniques, respectively. Our algorithms are applicable to more data structures than previous work, and are shown to be highly efficient on a large scale Intel system.
},
  author = {Arbel-Raviv, Maya and Brown, Trevor A},
  booktitle = {Proceedings of the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn = {9781450349826},
  location = {Vienna, Austria},
  number = {1},
  pages = {14--27},
  publisher = {Association for Computing Machinery},
  title = {{Harnessing epoch-based reclamation for efficient range queries}},
  doi = {10.1145/3178487.3178489},
  volume = {53},
  year = {2018},
}

@inproceedings{85,
  abstract = {Concurrent accesses to shared data structures must be synchronized to avoid data races. Coarse-grained synchronization, which locks the entire data structure, is easy to implement but does not scale. Fine-grained synchronization can scale well, but can be hard to reason about. Hand-over-hand locking, in which operations are pipelined as they traverse the data structure, combines fine-grained synchronization with ease of use. However, the traditional implementation suffers from inherent overheads. This paper introduces snapshot-based synchronization (SBS), a novel hand-over-hand locking mechanism. SBS decouples the synchronization state from the data, significantly improving cache utilization. Further, it relies on guarantees provided by pipelining to minimize synchronization that requires cross-thread communication. Snapshot-based synchronization thus scales much better than traditional hand-over-hand locking, while maintaining the same ease of use.},
  author = {Gilad, Eran and Brown, Trevor A and Oskin, Mark and Etsion, Yoav},
  booktitle = {Euro-Par 2018: Parallel Processing},
  issn = {03029743},
  location = {Turin, Italy},
  pages = {465--479},
  publisher = {Springer},
  title = {{Snapshot-based synchronization: A fast replacement for hand-over-hand locking}},
  doi = {10.1007/978-3-319-96983-1_33},
  volume = {11014},
  year = {2018},
}

@inproceedings{5963,
  abstract = {There has been significant progress in understanding the parallelism inherent to iterative sequential algorithms: for many classic algorithms, the depth of the dependence structure is now well understood, and scheduling techniques have been developed to exploit this shallow dependence structure for efficient parallel implementations. A related, applied research strand has studied methods by which certain iterative task-based algorithms can be efficiently parallelized via relaxed concurrent priority schedulers. These allow for high concurrency when inserting and removing tasks, at the cost of executing superfluous work due to the relaxed semantics of the scheduler. In this work, we take a step towards unifying these two research directions, by showing that there exists a family of relaxed priority schedulers that can efficiently and deterministically execute classic iterative algorithms such as greedy maximal independent set (MIS) and matching. Our primary result shows that, given a randomized scheduler with an expected relaxation factor of k in terms of the maximum allowed priority inversions on a task, and any graph on n vertices, the scheduler is able to execute greedy MIS with only an additive factor of poly(k) expected additional iterations compared to an exact (but not scalable) scheduler. This counter-intuitive result demonstrates that the overhead of relaxation when computing MIS is not dependent on the input size or structure of the input graph. Experimental results show that this overhead can be clearly offset by the gain in performance due to the highly scalable scheduler.
In sum, we present an efficient method to deterministically parallelize iterative sequential algorithms, with provable runtime guarantees in terms of the number of executed tasks to completion.},
  author = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Nadiradze, Giorgi},
  booktitle = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing - PODC '18},
  isbn = {9781450357951},
  location = {Egham, United Kingdom},
  pages = {377--386},
  publisher = {Association for Computing Machinery},
  title = {{Relaxed schedulers can efficiently parallelize iterative algorithms}},
  doi = {10.1145/3212734.3212756},
  year = {2018},
}

@inproceedings{5965,
  abstract = {Relaxed concurrent data structures have become increasingly popular, due to their scalability in graph processing and machine learning applications (\cite{Nguyen13, gonzalez2012powergraph}). Despite considerable interest, there exist families of natural, high-performing randomized relaxed concurrent data structures, such as the popular MultiQueue~\cite{MQ} pattern for implementing relaxed priority queue data structures, for which no guarantees are known in the concurrent setting~\cite{AKLN17}. Our main contribution is in showing for the first time that, under a set of analytic assumptions, a family of relaxed concurrent data structures, including variants of MultiQueues, but also a new approximate counting algorithm we call the MultiCounter, provides strong probabilistic guarantees on the degree of relaxation with respect to the sequential specification, in arbitrary concurrent executions. We formalize these guarantees via a new correctness condition called distributional linearizability, tailored to concurrent implementations with randomized relaxations. Our result is based on a new analysis of an asynchronous variant of the classic power-of-two-choices load balancing algorithm, in which placement choices can be based on inconsistent, outdated information (this result may be of independent interest). We validate our results empirically, showing that the MultiCounter algorithm can implement scalable relaxed timestamps.},
  author = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Li, Jerry Z. and Nadiradze, Giorgi},
  booktitle = {Proceedings of the 30th ACM Symposium on Parallelism in Algorithms and Architectures - SPAA '18},
  isbn = {9781450357999},
  location = {Vienna, Austria},
  pages = {133--142},
  publisher = {Association for Computing Machinery},
  title = {{Distributionally linearizable data structures}},
  doi = {10.1145/3210377.3210411},
  year = {2018},
}