% Journal-article records, listed newest first within this section.
% Conventions used below:
%   * names are written "von Last, First", separated by " and ";
%   * page ranges use "--" without surrounding spaces; for journals that
%     publish by article number, the article number is stored in "pages";
%   * titles are single-braced, with only case-sensitive words protected;
%   * "legacy-id" is a non-standard field (ignored by styles) that keeps a
%     value previously stored in "doi" although it is not a DOI
%     (a DOI starts with "10."). TODO: look up and add the real DOI.

@article{2042,
  author    = {Kupczok, Anne and Bollback, Jonathan P},
  title     = {Motif depletion in bacteriophages infecting hosts with {CRISPR} systems},
  journal   = {BMC Genomics},
  volume    = {15},
  number    = {1},
  pages     = {663},
  year      = {2014},
  publisher = {BioMed Central},
  doi       = {10.1186/1471-2164-15-663},
  abstract  = {Background: CRISPR is a microbial immune system likely to be involved in host-parasite coevolution. It functions using target sequences encoded by the bacterial genome, which interfere with invading nucleic acids using a homology-dependent system. The system also requires protospacer associated motifs (PAMs), short motifs close to the target sequence that are required for interference in CRISPR types I and II. Here, we investigate whether PAMs are depleted in phage genomes due to selection pressure to escape recognition. Results: To this end, we analyzed two data sets. Phages infecting all bacterial hosts were analyzed first, followed by a detailed analysis of phages infecting the genus Streptococcus, where PAMs are best understood. We use two different measures of motif underrepresentation that control for codon bias and the frequency of submotifs. We compare phages infecting species with a particular CRISPR type to those infecting species without that type. Since only known PAMs were investigated, the analysis is restricted to CRISPR types I-C and I-E and in Streptococcus to types I-C and II. We found evidence for PAM depletion in Streptococcus phages infecting hosts with CRISPR type I-C, in Vibrio phages infecting hosts with CRISPR type I-E and in Streptococcus thermophilus phages infecting hosts with type II-A, known as CRISPR3. Conclusions: The observed motif depletion in phages with hosts having CRISPR can be attributed to selection rather than to mutational bias, as mutational bias should affect the phages of all hosts. This observation implies that the CRISPR system has been efficient in the groups discussed here.},
}

@article{2412,
  author    = {Kupczok, Anne and Bollback, Jonathan P},
  title     = {Probabilistic models for {CRISPR} spacer content evolution},
  journal   = {BMC Evolutionary Biology},
  volume    = {13},
  number    = {1},
  pages     = {54},
  year      = {2013},
  publisher = {BioMed Central},
  doi       = {10.1186/1471-2148-13-54},
  abstract  = {Background: The CRISPR/Cas system is known to act as an adaptive and heritable immune system in Eubacteria and Archaea. Immunity is encoded in an array of spacer sequences. Each spacer can provide specific immunity to invasive elements that carry the same or a similar sequence. Even in closely related strains, spacer content is very dynamic and evolves quickly. Standard models of nucleotide evolution cannot be applied to quantify its rate of change since processes other than single nucleotide changes determine its evolution. Methods: We present probabilistic models that are specific for spacer content evolution. They account for the different processes of insertion and deletion. Insertions can be constrained to occur on one end only or are allowed to occur throughout the array. One deletion event can affect one spacer or a whole fragment of adjacent spacers. Parameters of the underlying models are estimated for a pair of arrays by maximum likelihood using explicit ancestor enumeration. Results: Simulations show that parameters are well estimated on average under the models presented here. There is a bias in the rate estimation when including fragment deletions. The models also estimate times between pairs of strains. But with increasing time, spacer overlap goes to zero, and thus there is an upper bound on the distance that can be estimated. Spacer content similarities are displayed in a distance based phylogeny using the estimated times. We use the presented models to analyze different Yersinia pestis data sets and find that the results among them are largely congruent. The models also capture the variation in diversity of spacers among the data sets. A comparison of spacer-based phylogenies and Cas gene phylogenies shows that they resolve very different time scales for this data set. Conclusions: The simulations and data analyses show that the presented models are useful for quantifying spacer content evolution and for displaying spacer content similarities of closely related strains in a phylogeny. This allows for comparisons of different CRISPR arrays or for comparisons between CRISPR arrays and nucleotide substitution rates.},
}

@article{2410,
  author    = {Fernandes Redondo, Rodrigo A and Kupczok, Anne and Stift, Gertraud and Bollback, Jonathan P},
  title     = {Complete genome sequence of the novel phage {MG-B1} infecting {Bacillus} weihenstephanensis},
  journal   = {Genome Announcements},
  volume    = {1},
  number    = {3},
  year      = {2013},
  publisher = {American Society for Microbiology},
  doi       = {10.1128/genomeA.00216-13},
  abstract  = {Here, we describe a novel virulent bacteriophage that infects Bacillus weihenstephanensis, isolated from soil in Austria. It is the first phage to be discovered that infects this species. Here, we present the complete genome sequence of this podovirus.},
}

@article{2411,
  author    = {Ebersberger, Ingo and De Matos Simoes, Ricardo and Kupczok, Anne and Gube, Matthias and Kothe, Erika and Voigt, Kerstin and von Haeseler, Arndt},
  title     = {A consistent phylogenetic backbone for the fungi},
  journal   = {Molecular Biology and Evolution},
  volume    = {29},
  number    = {5},
  pages     = {1319--1334},
  year      = {2012},
  publisher = {Oxford University Press},
  doi       = {10.1093/molbev/msr285},
  abstract  = {The kingdom of fungi provides model organisms for biotechnology, cell biology, genetics, and life sciences in general. Only when their phylogenetic relationships are stably resolved, can individual results from fungal research be integrated into a holistic picture of biology. However, and despite recent progress, many deep relationships within the fungi remain unclear. Here, we present the first phylogenomic study of an entire eukaryotic kingdom that uses a consistency criterion to strengthen phylogenetic conclusions. We reason that branches (splits) recovered with independent data and different tree reconstruction methods are likely to reflect true evolutionary relationships. Two complementary phylogenomic data sets based on 99 fungal genomes and 109 fungal expressed sequence tag (EST) sets analyzed with four different tree reconstruction methods shed light from different angles on the fungal tree of life. Eleven additional data sets address specifically the phylogenetic position of Blastocladiomycota, Ustilaginomycotina, and Dothideomycetes, respectively. The combined evidence from the resulting trees supports the deep-level stability of the fungal groups toward a comprehensive natural system of the fungi. In addition, our analysis reveals methodologically interesting aspects. Enrichment for EST encoded data---a common practice in phylogenomic analyses---introduces a strong bias toward slowly evolving and functionally correlated genes. Consequently, the generalization of phylogenomic data sets as collections of randomly selected genes cannot be taken for granted. A thorough characterization of the data to assess possible influences on the tree reconstruction should therefore become a standard in phylogenomic analyses.},
}

@article{3370,
  author    = {Kupczok, Anne},
  title     = {Consequences of different null models on the tree shape bias of supertree methods},
  journal   = {Systematic Biology},
  volume    = {60},
  number    = {2},
  pages     = {218--225},
  year      = {2011},
  publisher = {Oxford University Press},
  doi       = {10.1093/sysbio/syq086},
  abstract  = {Supertree methods are widely applied and give rise to new conclusions about phylogenies (e.g., Bininda-Emonds et al. 2007). Although several desiderata for supertree methods exist (Wilkinson, Thorley, et al. 2004), only few of them have been studied in greater detail, examples include shape bias (Wilkinson et al. 2005) or pareto properties (Wilkinson et al. 2007). Here I look more closely at two matrix representation methods, matrix representation with compatibility (MRC) and matrix representation with parsimony (MRP). Different null models of random data are studied and the resulting tree shapes are investigated. Thereby I consider unrooted trees and a bias in tree shape is determined by a tree balance measure. The measure for unrooted trees is a modification of a tree balance measure for rooted trees. I observe that depending on the underlying null model of random data, the methods may resolve conflict in favor of more balanced tree shapes. The analyses refer only to trees with the same taxon set, also known as the consensus setting (e.g., Wilkinson et al. 2007), but I will be able to draw conclusions on how to deal with missing data.},
}

@article{3387,
  author    = {Kupczok, Anne},
  title     = {Split based computation of majority rule supertrees},
  journal   = {BMC Evolutionary Biology},
  volume    = {11},
  pages     = {205},
  year      = {2011},
  publisher = {BioMed Central},
  doi       = {10.1186/1471-2148-11-205},
  abstract  = {Background: Supertree methods combine overlapping input trees into a larger supertree. Here, I consider split-based supertree methods that first extract the split information of the input trees and subsequently combine this split information into a phylogeny. Well known split-based supertree methods are matrix representation with parsimony and matrix representation with compatibility. Combining input trees on the same taxon set, as in the consensus setting, is a well-studied task and it is thus desirable to generalize consensus methods to supertree methods. Results: Here, three variants of majority-rule (MR) supertrees that generalize majority-rule consensus trees are investigated. I provide simple formulas for computing the respective score for bifurcating input- and supertrees. These score computations, together with a heuristic tree search minimizing the scores, were implemented in the python program PluMiST (Plus- and Minus SuperTrees) available from http://www.cibiv.at/software/plumist. The different MR methods were tested by simulation and on real data sets. The search heuristic was successful in combining compatible input trees. When combining incompatible input trees, especially one variant, MR(-) supertrees, performed well. Conclusions: The presented framework allows for an efficient score computation of three majority-rule supertree variants and input trees. I combined the score computation with a heuristic search over the supertree space. The implementation was tested by simulation and on real data sets and showed promising results. Especially the MR(-) variant seems to be a reasonable score for supertree reconstruction. Generalizing these computations to multifurcating trees is an open problem, which may be tackled using this framework.},
}

@article{2409,
  author    = {Kupczok, Anne and Schmidt, Heiko and von Haeseler, Arndt},
  title     = {Accuracy of phylogeny reconstruction methods combining overlapping gene data sets},
  journal   = {Algorithms for Molecular Biology},
  volume    = {5},
  number    = {1},
  pages     = {37},
  year      = {2010},
  publisher = {BioMed Central},
  doi       = {10.1186/1748-7188-5-37},
  abstract  = {Background: The availability of many gene alignments with overlapping taxon sets raises the question of which strategy is the best to infer species phylogenies from multiple gene information. Methods and programs abound that use the gene alignment in different ways to reconstruct the species tree. In particular, different methods combine the original data at different points along the way from the underlying sequences to the final tree. Accordingly, they are classified into superalignment, supertree and medium-level approaches. Here, we present a simulation study to compare different methods from each of these three approaches. Results: We observe that superalignment methods usually outperform the other approaches over a wide range of parameters including sparse data and gene-specific evolutionary parameters. In the presence of high incongruency among gene trees, however, other combination methods show better performance than the superalignment approach. Surprisingly, some supertree and medium-level methods exhibit, on average, worse results than a single gene phylogeny with complete taxon information. Conclusions: For some methods, using the reconstructed gene tree as an estimation of the species tree is superior to the combination of incomplete information. Superalignment usually performs best since it is less susceptible to stochastic error. Supertree methods can outperform superalignment in the presence of gene-tree conflict.},
}

@article{3768,
  author    = {Kupczok, Anne and von Haeseler, Arndt},
  title     = {Comment on `{A} congruence index for testing topological similarity between trees'},
  journal   = {Bioinformatics},
  volume    = {25},
  number    = {1},
  pages     = {147--149},
  year      = {2009},
  publisher = {Oxford University Press},
  legacy-id = {4199},
}

@article{3769,
  author    = {Kupczok, Anne and von Haeseler, Arndt and Klaere, Steffen},
  title     = {An Exact Algorithm for the Geodesic Distance between Phylogenetic Trees},
  journal   = {Journal of Computational Biology},
  volume    = {15},
  number    = {6},
  pages     = {577--591},
  year      = {2008},
  publisher = {Mary Ann Liebert},
  legacy-id = {4200},
  abstract  = {The geometrical representation of the space of phylogenetic trees implies a metric on the space of weighted trees. This metric, the geodesic distance, is the length of the shortest path through that space. We present an exact algorithm to compute this metric. For biologically reasonable trees, the implementation allows fast computations of the geodesic distance, although the running time of the algorithm is worst-case exponential. The algorithm was applied to pairs of 118 gene trees of the metazoa. The results show that a special path in tree space, the cone path, which can be computed in linear time, is a good approximation of the geodesic distance. The program GeoMeTree is a python implementation of the geodesic distance and its approximations and is available from www.cibiv.at/software/geometree.},
}

@article{3767,
  author    = {Kupczok, Anne and Dittrich, Peter},
  title     = {Determinants of simulated {RNA} evolution},
  journal   = {Journal of Theoretical Biology},
  volume    = {238},
  number    = {3},
  pages     = {726--735},
  year      = {2006},
  publisher = {Elsevier},
  doi       = {10.1016/j.jtbi.2005.06.019},
  abstract  = {Models of RNA secondary structure folding are widely used to study evolution in theory and simulation. However, systematic studies of the parameters involved are rare. In this paper, we study by simulation how RNA evolution is influenced by three different factors, namely the mutation rate, scaling of the fitness function, and distance measure. We found that for low mutation rates the qualitative evolutionary behavior is robust with respect to the scaling of the fitness function. For efficient mutation rates, which are close to the error threshold, scaling and distance measure have a strong influence on the evolutionary behavior. A global distance measure that takes sequence information additively into account lowers the error threshold. When using a local sequence-structure alignment for the distance, we observed a smoother evolution of the fitness over time. Finally, in addition to the well known error threshold, we identify another threshold of the mutation rate, called divergence threshold, where the qualitative transient behavior changes from a localized to an exploratory search.},
}