@inproceedings{12881, author = {Martius, Georg S and Olbrich, Eckehard}, booktitle = {Proceedings of the 13th European Conference on Artificial Life}, isbn = {9780262330275}, location = {York, United Kingdom}, pages = {78}, publisher = {MIT Press}, title = {{Quantifying self-organizing behavior of autonomous robots}}, doi = {10.7551/978-0-262-33027-5-ch018}, year = {2015}, }
@phdthesis{1401, abstract = {The human ability to recognize objects in complex scenes has driven research in the computer vision field over the past couple of decades. This thesis focuses on the object recognition task in images: given an image, we want the computer system to be able to predict the class of the object that appears in it. A recent successful attempt to bridge the semantic understanding of an image as perceived by humans and by computers uses attribute-based models. Attributes are semantic properties of objects shared across different categories, which humans and computers can decide on. To explore attribute-based models, we take a statistical machine learning approach and address two key learning challenges of the object recognition task: learning augmented attributes as a mid-level discriminative feature representation, and learning with attributes as privileged information. Our main contributions are parametric and non-parametric models and algorithms to solve these frameworks. In the parametric approach, we explore an autoencoder model combined with the large margin nearest neighbor principle for mid-level feature learning, and linear support vector machines for learning with privileged information. In the non-parametric approach, we propose a supervised Indian Buffet Process for automatic augmentation of semantic attributes, and explore the Gaussian process classification framework for learning with privileged information. A thorough experimental analysis shows the effectiveness of the proposed models in both the parametric and non-parametric views.}, author = {Sharmanska, Viktoriia}, issn = {2663-337X}, pages = {144}, publisher = {Institute of Science and Technology Austria}, title = {{Learning with attributes for object recognition: Parametric and non-parametric views}}, doi = {10.15479/at:ista:1401}, year = {2015}, }
@article{1655, abstract = {Quantifying robot behaviors that were generated autonomously from task-independent objective functions is an important prerequisite for objective comparisons of algorithms and of animal movements. The temporal sequence of such a behavior can be considered a time series, and hence complexity measures developed for time series are natural candidates for its quantification. The predictive information and the excess entropy are such complexity measures. They measure the amount of information the past contains about the future and thus quantify the nonrandom structure in the temporal sequence. However, when using these measures for systems with continuous states, one has to deal with the fact that their values will depend on the resolution with which the system's states are observed. For deterministic systems both measures will diverge with increasing resolution. We therefore propose a new decomposition of the excess entropy into resolution-dependent and resolution-independent parts and discuss how they depend on the dimensionality of the dynamics, the correlations, and the noise level.
For the practical estimation we propose to use estimates based on the correlation integral instead of the direct estimation of the mutual information based on nearest-neighbor statistics, because the latter allows less control of the scale dependencies. Using our algorithm we are able to show how autonomous learning generates behavior of increasing complexity with increasing learning duration.}, author = {Martius, Georg S and Olbrich, Eckehard}, journal = {Entropy}, number = {10}, pages = {7266 -- 7297}, publisher = {MDPI}, title = {{Quantifying emergent behavior of autonomous robots}}, doi = {10.3390/e17107266}, volume = {17}, year = {2015}, }
@inbook{1829, abstract = {Hitting and batting tasks, such as tennis forehands, ping-pong strokes, or baseball batting, depend on predictions of where the ball can be intercepted and how it can properly be returned to the opponent. These predictions become more accurate over time, hence the behaviors need to be continuously modified. As a result, movement templates with a learned global shape need to be adapted during execution so that the racket reaches a target position and velocity that will return the ball over to the other side of the net or court. This requires altering learned movements to hit a varying target with the necessary velocity at a specific instant in time. Such a task cannot be incorporated straightforwardly into most movement representations suitable for learning. For example, the standard formulation of the dynamical-system-based motor primitives (introduced by Ijspeert et al (2002b)) does not satisfy this property, despite the flexibility that has allowed learning tasks ranging from locomotion to kendama. In order to fulfill this requirement, we reformulate the Ijspeert framework to incorporate the possibility of specifying a desired hitting point and a desired hitting velocity while maintaining all advantages of the original formulation. We show that the proposed movement template formulation works well in two scenarios, i.e., for hitting a ball on a string with a table tennis racket at a specified velocity and for returning balls launched by a ball gun successfully over the net using forehand movements.}, author = {Muelling, Katharina and Kroemer, Oliver and Lampert, Christoph and Schölkopf, Bernhard}, booktitle = {Learning Motor Skills}, editor = {Kober, Jens and Peters, Jan}, pages = {69 -- 82}, publisher = {Springer}, title = {{Movement templates for learning of hitting and batting}}, doi = {10.1007/978-3-319-03194-1_3}, volume = {97}, year = {2014}, }
@inproceedings{2033, abstract = {The learning with privileged information setting has recently attracted a lot of attention within the machine learning community, as it allows the integration of additional knowledge into the training process of a classifier, even when this comes in the form of a data modality that is not available at test time. Here, we show that privileged information can naturally be treated as noise in the latent function of a Gaussian process classifier (GPC). That is, in contrast to the standard GPC setting, the latent function is not just a nuisance but a feature: it becomes a natural measure of confidence about the training data by modulating the slope of the GPC probit likelihood function. Extensive experiments on public datasets show that the proposed GPC method using privileged noise, called GPC+, improves over a standard GPC without privileged knowledge, and also over the current state-of-the-art SVM-based method, SVM+.
Moreover, we show that advanced neural networks and deep learning methods can be compressed as privileged information.}, author = {Hernandez Lobato, Daniel and Sharmanska, Viktoriia and Kersting, Kristian and Lampert, Christoph and Quadrianto, Novi}, booktitle = {Advances in Neural Information Processing Systems}, location = {Montreal, Canada}, number = {January}, pages = {837 -- 845}, publisher = {Neural Information Processing Systems}, title = {{Mind the nuisance: Gaussian process classification using privileged noise}}, volume = {1}, year = {2014}, }
@inproceedings{2057, abstract = {In the past few years, a lot of attention has been devoted to multimedia indexing by fusing multimodal information. Two kinds of fusion schemes are generally considered: early fusion and late fusion. We focus on late classifier fusion, where one combines the scores of each modality at the decision level. To tackle this problem, we investigate a recent, elegant, and well-founded quadratic program named MinCq that comes from PAC-Bayesian machine learning theory. MinCq looks for the weighted combination, over a set of real-valued functions seen as voters, that leads to the lowest misclassification rate while maximizing the voters’ diversity. We propose an extension of MinCq tailored to multimedia indexing. Our method is based on an order-preserving pairwise loss adapted to ranking that allows us to improve the Mean Average Precision measure while taking into account the diversity of the voters that we want to fuse. We provide evidence that this method is naturally adapted to late fusion procedures and confirm the good behavior of our approach on the challenging PASCAL VOC’07 benchmark.}, author = {Morvant, Emilie and Habrard, Amaury and Ayache, Stéphane}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, location = {Joensuu, Finland}, pages = {153 -- 162}, publisher = {Springer}, title = {{Majority vote of diverse classifiers for late fusion}}, doi = {10.1007/978-3-662-44415-3_16}, volume = {8621}, year = {2014}, }
@inproceedings{2171, abstract = {We present LS-CRF, a new method for training cyclic Conditional Random Fields (CRFs) from large datasets that is inspired by classical closed-form expressions for the maximum likelihood parameters of a generative graphical model with tree topology. Training a CRF with LS-CRF requires only solving a set of independent regression problems, each of which can be solved efficiently in closed form or by an iterative solver. This makes LS-CRF orders of magnitude faster than classical CRF training based on probabilistic inference, and at the same time more flexible and easier to implement than other approximate techniques, such as pseudolikelihood or piecewise training. We apply LS-CRF to the task of semantic image segmentation, showing that it achieves accuracy on par with other training techniques at higher speed, thereby allowing efficient CRF training from very large training sets.
For example, training a linearly parameterized pairwise CRF on 150,000 images requires less than one hour on a modern workstation.}, author = {Kolesnikov, Alexander and Guillaumin, Matthieu and Ferrari, Vittorio and Lampert, Christoph}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, editor = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne}, location = {Zurich, Switzerland}, number = {PART 3}, pages = {550 -- 565}, publisher = {Springer}, title = {{Closed-form approximate CRF training for scalable image segmentation}}, doi = {10.1007/978-3-319-10578-9_36}, volume = {8691}, year = {2014}, }
@inproceedings{2173, abstract = {In this work we introduce a new approach to co-classification, i.e., the task of jointly classifying multiple, otherwise independent, data samples. The method we present, named CoConut, is based on the idea of adding a regularizer in the label space to encode certain priors on the resulting labelings. A regularizer that encourages labelings that are smooth across the test set, for instance, can be seen as a test-time variant of the cluster assumption, which has proven useful at training time in semi-supervised learning. A regularizer that introduces a preference for certain class proportions can be regarded as a prior distribution on the class labels. CoConut can build on existing classifiers without making any assumptions on how they were obtained and without the need to re-train them. The use of a regularizer adds a new level of flexibility: it allows the integration of potentially new information at test time, even in modalities other than what the classifiers were trained on. We evaluate our framework on six datasets, reporting a clear gain in classification accuracy compared to the standard classification setup that predicts labels for each test sample separately.}, author = {Khamis, Sameh and Lampert, Christoph}, booktitle = {Proceedings of the British Machine Vision Conference 2014}, location = {Nottingham, UK}, publisher = {BMVA Press}, title = {{CoConut: Co-classification with output space regularization}}, year = {2014}, }
@inproceedings{2172, abstract = {Fisher kernels and deep learning were two developments with significant impact on large-scale object categorization in recent years. Both approaches were shown to achieve state-of-the-art results on large-scale object categorization datasets, such as ImageNet. Conceptually, however, they are perceived as very different, and it is not uncommon for heated debates to spring up when advocates of both paradigms meet at conferences or workshops. In this work, we emphasize the similarities between both architectures rather than their differences, and we argue that such a unified view allows us to transfer ideas from one domain to the other. As a concrete example we introduce a method for learning a support vector machine classifier with a Fisher kernel at the same time as a task-specific data representation. We reinterpret this setting as a multi-layer feed-forward network. Its final layer is the classifier, parameterized by a weight vector, and the two previous layers compute Fisher vectors, parameterized by the coefficients of a Gaussian mixture model.
We introduce a gradient-descent-based learning algorithm that, in contrast to other feature learning techniques, is not just derived from intuition or biological analogy, but has a theoretical justification in the framework of statistical learning theory. Our experiments show that the new training procedure leads to significant improvements in classification accuracy while preserving the modularity and geometric interpretability of a support vector machine setup.}, author = {Sydorov, Vladyslav and Sakurada, Mayu and Lampert, Christoph}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, location = {Columbus, USA}, pages = {1402 -- 1409}, publisher = {IEEE}, title = {{Deep Fisher Kernels – End to end learning of the Fisher Kernel GMM parameters}}, doi = {10.1109/CVPR.2014.182}, year = {2014}, }
@article{2180, abstract = {Weighted majority votes allow one to combine the output of several classifiers or voters. MinCq is a recent algorithm for optimizing the weight of each voter based on the minimization of a theoretical bound on the risk of the vote, with elegant PAC-Bayesian generalization guarantees. However, while it has demonstrated good performance when combining weak classifiers, MinCq cannot make use of the useful a priori knowledge that one may have when using a mixture of weak and strong voters. In this paper, we propose P-MinCq, an extension of MinCq that can incorporate such knowledge in the form of a constraint over the distribution of the weights, along with general proofs of convergence that hold in the sample compression setting for data-dependent voters. The approach is applied to a vote of k-NN classifiers with a specific modeling of the voters' performance. P-MinCq significantly outperforms the classic k-NN classifier, a symmetric NN, and MinCq using the same voters. We show that it is also competitive with LMNN, a popular metric learning algorithm, and that combining both approaches further reduces the error.}, author = {Bellet, Aurélien and Habrard, Amaury and Morvant, Emilie and Sebban, Marc}, journal = {Machine Learning}, number = {1-2}, pages = {129 -- 154}, publisher = {Springer}, title = {{Learning a priori constrained weighted majority votes}}, doi = {10.1007/s10994-014-5462-z}, volume = {97}, year = {2014}, }
@inproceedings{2189, abstract = {In machine learning, we speak of domain adaptation when the test (target) data and the training (source) data are generated according to different distributions. We must therefore develop classification algorithms capable of adapting to a new distribution for which no label information is available. We attack this problem from the PAC-Bayesian perspective, which focuses on learning models defined as majority votes over a set of functions. In this context, we introduce PV-MinCq, an adaptive version of the (non-adaptive) algorithm MinCq. PV-MinCq follows this principle: we transfer the source labels to nearby target points and then apply MinCq to the ``self-labeled'' target sample (a step justified by a theoretical bound). More precisely, we define a non-iterative self-labeling that focuses on the regions where the source and target marginal distributions are most similar. We then study the influence of our self-labeling in order to derive a hyperparameter validation procedure.
Finally, our approach shows promising empirical results.}, author = {Morvant, Emilie}, location = {Saint-Etienne, France}, pages = {49 -- 58}, publisher = {Elsevier}, title = {{Adaptation de domaine de vote de majorité par auto-étiquetage non itératif}}, volume = {1}, year = {2014}, }
@inproceedings{2160, abstract = {Transfer learning has received a lot of attention in the machine learning community in recent years, and several effective algorithms have been developed. However, relatively little is known about their theoretical properties, especially in the setting of lifelong learning, where the goal is to transfer information to tasks for which no data have been observed so far. In this work we study lifelong learning from a theoretical perspective. Our main result is a PAC-Bayesian generalization bound that offers a unified view on existing paradigms for transfer learning, such as the transfer of parameters or the transfer of low-dimensional representations. We also use the bound to derive two principled lifelong learning algorithms, and we show that these yield results comparable with existing methods.}, author = {Pentina, Anastasia and Lampert, Christoph}, location = {Beijing, China}, pages = {991 -- 999}, publisher = {ML Research Press}, title = {{A PAC-Bayesian bound for lifelong learning}}, volume = {32}, year = {2014}, }
@inproceedings{2294, abstract = {In this work we propose a system for automatic classification of Drosophila embryos into developmental stages. While the system is designed to solve an actual problem in biological research, we believe that the principle underlying it is interesting not only for biologists, but also for researchers in computer vision. The main idea is to combine two orthogonal sources of information: one is a classifier trained on strongly invariant features, which makes it applicable to images of very different conditions, but also leads to rather noisy predictions. The other is a label propagation step based on a more powerful similarity measure that, however, is only consistent within specific subsets of the data at a time. In our biological setup, the information sources are the shape and the staining patterns of embryo images. We show experimentally that while neither of the methods can be used by itself to achieve satisfactory results, their combination achieves prediction quality comparable to human performance.}, author = {Kazmar, Tomas and Kvon, Evgeny and Stark, Alexander and Lampert, Christoph}, location = {Sydney, Australia}, publisher = {IEEE}, title = {{Drosophila embryo stage annotation using label propagation}}, doi = {10.1109/ICCV.2013.139}, year = {2013}, }
@inproceedings{2293, abstract = {Many computer vision problems have an asymmetric distribution of information between training and test time. In this work, we study the case where we are given additional information about the training data, which however will not be available at test time. This situation is called learning using privileged information (LUPI). We introduce two maximum-margin techniques that are able to make use of this additional source of information, and we show that the framework is applicable to several scenarios that have been studied in computer vision before.
Experiments with attributes, bounding boxes, image tags, and rationales as additional information in object classification show promising results.}, author = {Sharmanska, Viktoriia and Quadrianto, Novi and Lampert, Christoph}, location = {Sydney, Australia}, pages = {825 -- 832}, publisher = {IEEE}, title = {{Learning to rank using privileged information}}, doi = {10.1109/ICCV.2013.107}, year = {2013}, }
@article{2516, abstract = {We study the problem of object recognition for categories for which we have no training examples, a task also called zero-data or zero-shot learning. This situation has hardly been studied in computer vision research, even though it occurs frequently: the world contains tens of thousands of different object classes, and image collections have been formed and suitably annotated for only a few of them. To tackle the problem we introduce attribute-based classification: objects are identified based on a high-level description that is phrased in terms of semantic attributes, such as the object's color or shape. Because the identification of each such property transcends the specific learning task at hand, the attribute classifiers can be pre-learned independently, e.g., from existing image datasets unrelated to the current task. Afterwards, new classes can be detected based on their attribute representation, without the need for a new training phase. In this paper we also introduce a new dataset, Animals with Attributes, of over 30,000 images of 50 animal classes, annotated with 85 semantic attributes. Extensive experiments on this and two more datasets show that attribute-based classification is indeed able to categorize images without access to any training images of the target classes.}, author = {Lampert, Christoph and Nickisch, Hannes and Harmeling, Stefan}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {3}, pages = {453 -- 465}, publisher = {IEEE}, title = {{Attribute-based classification for zero-shot learning of object categories}}, doi = {10.1109/TPAMI.2013.140}, volume = {36}, year = {2013}, }
@inproceedings{2520, abstract = {We propose a probabilistic model to infer supervised latent variables in the Hamming space from observed data. Our model allows simultaneous inference of the number of binary latent variables and their values. The latent variables preserve the neighbourhood structure of the data in the sense that objects in the same semantic concept have similar latent values, and objects in different concepts have dissimilar latent values. We formulate the supervised infinite latent variable problem based on the intuitive principle of pulling objects together if they are of the same type, and pushing them apart if they are not. We then combine this principle with a flexible Indian Buffet Process prior on the latent variables. We show that the inferred supervised latent variables can be directly used to perform a nearest neighbour search for the purpose of retrieval.
We introduce a new application of dynamically extending hash codes, and show how to effectively couple the structure of the hash codes with the continuously growing structure of the neighbourhood-preserving infinite latent feature space.}, author = {Quadrianto, Novi and Sharmanska, Viktoriia and Knowles, David and Ghahramani, Zoubin}, booktitle = {Proceedings of the 29th Conference on Uncertainty in Artificial Intelligence}, isbn = {9780974903996}, location = {Bellevue, WA, United States}, pages = {527 -- 536}, publisher = {AUAI Press}, title = {{The supervised IBP: Neighbourhood preserving infinite latent feature models}}, year = {2013}, }
@inproceedings{2901, abstract = {We introduce the M-modes problem for graphical models: predicting the M label configurations of highest probability that are at the same time local maxima of the probability landscape. M-modes have multiple possible applications: because they are intrinsically diverse, they provide a principled alternative to non-maximum suppression techniques for structured prediction, they can act as codebook vectors for quantizing the configuration space, or they can form component centers for mixture model approximation. We present two algorithms for solving the M-modes problem. The first algorithm solves the problem in polynomial time when the underlying graphical model is a simple chain. The second algorithm solves the problem for junction chains. On synthetic and real datasets, we demonstrate how M-modes can improve prediction performance. We also use the generated modes as a tool to understand the topography of the probability distribution of configurations, for example in relation to the training set size and the amount of noise in the data.}, author = {Chen, Chao and Kolmogorov, Vladimir and Yan, Zhu and Metaxas, Dimitris and Lampert, Christoph}, location = {Scottsdale, AZ, United States}, pages = {161 -- 169}, publisher = {JMLR}, title = {{Computing the M most probable modes of a graphical model}}, volume = {31}, year = {2013}, }
@inproceedings{2948, abstract = {Many visual datasets are traditionally used to analyze the performance of different learning techniques. The evaluation is usually done within each dataset; it is therefore questionable whether such results are a reliable indicator of true generalization ability. We propose here an algorithm to exploit existing data resources when learning on a new multiclass problem. Our main idea is to identify an image representation that decomposes orthogonally into two subspaces: a part specific to each dataset, and a part generic to, and therefore shared between, all the considered source sets. This allows us to use the generic representation as unbiased reference knowledge for a novel classification task. By casting the method in the multi-view setting, we also make it possible to use different features for different databases. We call the algorithm MUST, Multitask Unaligned Shared knowledge Transfer.
Through extensive experiments on five public datasets, we show that MUST consistently improves cross-dataset generalization performance.}, author = {Tommasi, Tatiana and Quadrianto, Novi and Caputo, Barbara and Lampert, Christoph}, location = {Daejeon, Korea}, pages = {1 -- 15}, publisher = {Springer}, title = {{Beyond dataset bias: Multi-task unaligned shared knowledge transfer}}, doi = {10.1007/978-3-642-37331-2_1}, volume = {7724}, year = {2013}, }
@misc{3321, author = {Quadrianto, Novi and Lampert, Christoph}, booktitle = {Encyclopedia of Systems Biology}, editor = {Dubitzky, Werner and Wolkenhauer, Olaf and Cho, Kwang and Yokota, Hiroki}, pages = {1069 -- 1069}, publisher = {Springer}, title = {{Kernel based learning}}, doi = {10.1007/978-1-4419-9863-7_604}, volume = {3}, year = {2013}, }
@inproceedings{2825, abstract = {We study the problem of maximum marginal prediction (MMP) in probabilistic graphical models, a task that occurs, for example, as the Bayes optimal decision rule under a Hamming loss. MMP is typically performed as a two-stage procedure: one estimates each variable's marginal probability and then forms a prediction from the states of maximal probability. In this work we propose a simple yet effective technique for accelerating MMP when inference is sampling-based: instead of the above two-stage procedure, we directly estimate the posterior probability of each decision variable. This allows us to identify the point in time when we are sufficiently certain about any individual decision. Whenever this is the case, we dynamically prune the variables we are confident about from the underlying factor graph. Consequently, at any time, only samples of variables whose decision is still uncertain need to be created. Experiments in two prototypical scenarios, multi-label classification and image inpainting, show that adaptive sampling can drastically accelerate MMP without sacrificing prediction accuracy.}, author = {Lampert, Christoph}, location = {Lake Tahoe, NV, United States}, pages = {82 -- 90}, publisher = {Neural Information Processing Systems}, title = {{Dynamic pruning of factor graphs for maximum marginal prediction}}, volume = {1}, year = {2012}, }