@inproceedings{6590, abstract = {Modern machine learning methods often require more data for training than a single expert can provide. Therefore, it has become a standard procedure to collect data from external sources, e.g. via crowdsourcing. Unfortunately, the quality of these sources is not always guaranteed. As additional complications, the data might be stored in a distributed way, or might even have to remain private. In this work, we address the question of how to learn robustly in such scenarios. Studying the problem through the lens of statistical learning theory, we derive a procedure that allows for learning from all available sources, yet automatically suppresses irrelevant or corrupted data. We show by extensive experiments that our method provides significant improvements over alternative approaches from robust statistics and distributed optimization. }, author = {Konstantinov, Nikola H and Lampert, Christoph}, booktitle = {Proceedings of the 36th International Conference on Machine Learning}, location = {Long Beach, CA, USA}, pages = {3488--3498}, publisher = {ML Research Press}, title = {{Robust learning from untrusted sources}}, volume = {97}, year = {2019}, } @inproceedings{6482, abstract = {Computer vision systems for automatic image categorization have become accurate and reliable enough that they can run continuously for days or even years as components of real-world commercial applications. A major open problem in this context, however, is quality control. Good classification performance can only be expected if systems run under the specific conditions, in particular data distributions, that they were trained for. Surprisingly, none of the currently used deep network architectures have a built-in functionality that could detect if a network operates on data from a distribution it was not trained for, such that potentially a warning to the human users could be triggered. 
In this work, we describe KS(conf), a procedure for detecting such outside of specifications (out-of-specs) operation, based on statistical testing of the network outputs. We show by extensive experiments using the ImageNet, AwA2 and DAVIS datasets on a variety of ConvNets architectures that KS(conf) reliably detects out-of-specs situations. It furthermore has a number of properties that make it a promising candidate for practical deployment: it is easy to implement, adds almost no overhead to the system, works with all networks, including pretrained ones, and requires no a priori knowledge of how the data distribution could change. }, author = {Sun, Rémy and Lampert, Christoph}, isbn = {9783030129385}, issn = {1611-3349}, location = {Stuttgart, Germany}, pages = {244--259}, publisher = {Springer Nature}, title = {{KS(conf): A light-weight test if a ConvNet operates outside of Its specifications}}, doi = {10.1007/978-3-030-12939-2_18}, volume = {11269}, year = {2019}, } @phdthesis{68, abstract = {The most common assumption made in statistical learning theory is the assumption of the independent and identically distributed (i.i.d.) data. While being very convenient mathematically, it is often very clearly violated in practice. This disparity between the machine learning theory and applications underlies a growing demand in the development of algorithms that learn from dependent data and theory that can provide generalization guarantees similar to the independent situations. This thesis is dedicated to two variants of dependencies that can arise in practice. One is a dependence on the level of samples in a single learning task. Another dependency type arises in the multi-task setting when the tasks are dependent on each other even though the data for them can be i.i.d. In both cases we model the data (samples or tasks) as stochastic processes and introduce new algorithms for both settings that take into account and exploit the resulting dependencies. 
We prove the theoretical guarantees on the performance of the introduced algorithms under different evaluation criteria and, in addition, we complement the theoretical study by the empirical one, where we evaluate some of the algorithms on two real world datasets to highlight their practical applicability.}, author = {Zimin, Alexander}, issn = {2663-337X}, pages = {92}, publisher = {Institute of Science and Technology Austria}, title = {{Learning from dependent data}}, doi = {10.15479/AT:ISTA:TH1048}, year = {2018}, } @phdthesis{197, abstract = {Modern computer vision systems heavily rely on statistical machine learning models, which typically require large amounts of labeled data to be learned reliably. Moreover, very recently computer vision research widely adopted techniques for representation learning, which further increase the demand for labeled data. However, for many important practical problems there is relatively small amount of labeled data available, so it is problematic to leverage full potential of the representation learning methods. One way to overcome this obstacle is to invest substantial resources into producing large labelled datasets. Unfortunately, this can be prohibitively expensive in practice. In this thesis we focus on the alternative way of tackling the aforementioned issue. We concentrate on methods, which make use of weakly-labeled or even unlabeled data. Specifically, the first half of the thesis is dedicated to the semantic image segmentation task. We develop a technique, which achieves competitive segmentation performance and only requires annotations in a form of global image-level labels instead of dense segmentation masks. Subsequently, we present a new methodology, which further improves segmentation performance by leveraging tiny additional feedback from a human annotator. By using our methods practitioners can greatly reduce the amount of data annotation effort, which is required to learn modern image segmentation models. 
In the second half of the thesis we focus on methods for learning from unlabeled visual data. We study a family of autoregressive models for modeling structure of natural images and discuss potential applications of these models. Moreover, we conduct in-depth study of one of these applications, where we develop the state-of-the-art model for the probabilistic image colorization task.}, author = {Kolesnikov, Alexander}, issn = {2663-337X}, pages = {113}, publisher = {Institute of Science and Technology Austria}, title = {{Weakly-Supervised Segmentation and Unsupervised Modeling of Natural Images}}, doi = {10.15479/AT:ISTA:th_1021}, year = {2018}, } @article{563, abstract = {In continuous populations with local migration, nearby pairs of individuals have on average more similar genotypes than geographically well separated pairs. A barrier to gene flow distorts this classical pattern of isolation by distance. Genetic similarity is decreased for sample pairs on different sides of the barrier and increased for pairs on the same side near the barrier. Here, we introduce an inference scheme that utilizes this signal to detect and estimate the strength of a linear barrier to gene flow in two-dimensions. We use a diffusion approximation to model the effects of a barrier on the geographical spread of ancestry backwards in time. This approach allows us to calculate the chance of recent coalescence and probability of identity by descent. We introduce an inference scheme that fits these theoretical results to the geographical covariance structure of biallelic genetic markers. It can estimate the strength of the barrier as well as several demographic parameters. We investigate the power of our inference scheme to detect barriers by applying it to a wide range of simulated data. 
We also showcase an example application to an Antirrhinum majus (snapdragon) flower color hybrid zone, where we do not detect any signal of a strong genome wide barrier to gene flow.}, author = {Ringbauer, Harald and Kolesnikov, Alexander and Field, David and Barton, Nicholas H}, journal = {Genetics}, number = {3}, pages = {1231--1245}, publisher = {Genetics Society of America}, title = {{Estimating barriers to gene flow from distorted isolation-by-distance patterns}}, doi = {10.1534/genetics.117.300638}, volume = {208}, year = {2018}, } @article{321, abstract = {The twelve papers in this special section focus on learning systems with shared information for computer vision and multimedia communication analysis. In the real world, a realistic setting for computer vision or multimedia recognition problems is that we have some classes containing lots of training data and many classes containing a small amount of training data. Therefore, how to use frequent classes to help learning rare classes for which it is harder to collect the training data is an open question. Learning with shared information is an emerging topic in machine learning, computer vision and multimedia analysis. There are different levels of components that can be shared during concept modeling and machine learning stages, such as sharing generic object parts, sharing attributes, sharing transformations, sharing regularization parameters and sharing training examples, etc. Regarding the specific methods, multi-task learning, transfer learning and deep learning can be seen as using different strategies to share information. 
These learning with shared information methods are very effective in solving real-world large-scale problems.}, author = {Darrell, Trevor and Lampert, Christoph and Sebe, Nico and Wu, Ying and Yan, Yan}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {5}, pages = {1029--1031}, publisher = {IEEE}, title = {{Guest editors' introduction to the special section on learning with Shared information for computer vision and multimedia analysis}}, doi = {10.1109/TPAMI.2018.2804998}, volume = {40}, year = {2018}, } @inproceedings{10882, abstract = {We introduce Intelligent Annotation Dialogs for bounding box annotation. We train an agent to automatically choose a sequence of actions for a human annotator to produce a bounding box in a minimal amount of time. Specifically, we consider two actions: box verification [34], where the annotator verifies a box generated by an object detector, and manual box drawing. We explore two kinds of agents, one based on predicting the probability that a box will be positively verified, and the other based on reinforcement learning. 
We demonstrate that (1) our agents are able to learn efficient annotation strategies in several scenarios, automatically adapting to the image difficulty, the desired quality of the boxes, and the detector strength; (2) in all scenarios the resulting annotation dialogs speed up annotation compared to manual box drawing alone and box verification alone, while also outperforming any fixed combination of verification and drawing in most scenarios; (3) in a realistic scenario where the detector is iteratively re-trained, our agents evolve a series of strategies that reflect the shifting trade-off between verification and drawing as the detector grows stronger.}, author = {Uijlings, Jasper and Konyushkova, Ksenia and Lampert, Christoph and Ferrari, Vittorio}, booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition}, isbn = {9781538664209}, issn = {2575-7075}, location = {Salt Lake City, UT, United States}, pages = {9175--9184}, publisher = {IEEE}, title = {{Learning intelligent dialogs for bounding box annotation}}, doi = {10.1109/cvpr.2018.00956}, year = {2018}, } @inproceedings{6012, abstract = {We present an approach to identify concise equations from data using a shallow neural network approach. In contrast to ordinary black-box regression, this approach allows understanding functional relations and generalizing them from observed data to unseen parts of the parameter space. We show how to extend the class of learnable equations for a recently proposed equation learning network to include divisions, and we improve the learning and model selection strategy to be useful for challenging real-world data. For systems governed by analytical expressions, our method can in many cases identify the true underlying equation and extrapolate to unseen domains. 
We demonstrate its effectiveness by experiments on a cart-pendulum system, where only 2 random rollouts are required to learn the forward dynamics and successfully achieve the swing-up task.}, author = {Sahoo, Subham and Lampert, Christoph and Martius, Georg S}, booktitle = {Proceedings of the 35th International Conference on Machine Learning}, location = {Stockholm, Sweden}, pages = {4442--4450}, publisher = {ML Research Press}, title = {{Learning equations for extrapolation and control}}, volume = {80}, year = {2018}, } @inproceedings{6011, abstract = {We establish a data-dependent notion of algorithmic stability for Stochastic Gradient Descent (SGD), and employ it to develop novel generalization bounds. This is in contrast to previous distribution-free algorithmic stability results for SGD which depend on the worst-case constants. By virtue of the data-dependent argument, our bounds provide new insights into learning with SGD on convex and non-convex problems. In the convex case, we show that the bound on the generalization error depends on the risk at the initialization point. In the non-convex case, we prove that the expected curvature of the objective function around the initialization point has crucial influence on the generalization error. In both cases, our results suggest a simple data-driven strategy to stabilize SGD by pre-screening its initialization. As a corollary, our results allow us to show optimistic generalization bounds that exhibit fast convergence rates for SGD subject to a vanishing empirical risk and low noise of stochastic gradient. 
}, author = {Kuzborskij, Ilja and Lampert, Christoph}, booktitle = {Proceedings of the 35th International Conference on Machine Learning}, location = {Stockholm, Sweden}, pages = {2815--2824}, publisher = {ML Research Press}, title = {{Data-dependent stability of stochastic gradient descent}}, volume = {80}, year = {2018}, } @inproceedings{6589, abstract = {Distributed training of massive machine learning models, in particular deep neural networks, via Stochastic Gradient Descent (SGD) is becoming commonplace. Several families of communication-reduction methods, such as quantization, large-batch methods, and gradient sparsification, have been proposed. To date, gradient sparsification methods--where each node sorts gradients by magnitude, and only communicates a subset of the components, accumulating the rest locally--are known to yield some of the largest practical gains. Such methods can reduce the amount of communication per step by up to \emph{three orders of magnitude}, while preserving model accuracy. Yet, this family of methods currently has no theoretical justification. This is the question we address in this paper. We prove that, under analytic assumptions, sparsifying gradients by magnitude with local error correction provides convergence guarantees, for both convex and non-convex smooth objectives, for data-parallel SGD. The main insight is that sparsification methods implicitly maintain bounds on the maximum impact of stale updates, thanks to selection by magnitude. 
Our analysis and empirical validation also reveal that these methods do require analytical conditions to converge well, justifying existing heuristics.}, author = {Alistarh, Dan-Adrian and Hoefler, Torsten and Johansson, Mikael and Konstantinov, Nikola H and Khirirat, Sarit and Renggli, Cedric}, booktitle = {Advances in Neural Information Processing Systems 31}, location = {Montreal, Canada}, pages = {5973--5983}, publisher = {Neural Information Processing Systems Foundation}, title = {{The convergence of sparsified gradient methods}}, volume = {2018}, year = {2018}, } @misc{5584, abstract = {This package contains data for the publication "Nonlinear decoding of a complex movie from the mammalian retina" by Deny S. et al, PLOS Comput Biol (2018). The data consists of (i) 91 spike sorted, isolated rat retinal ganglion cells that pass stability and quality criteria, recorded on the multi-electrode array, in response to the presentation of the complex movie with many randomly moving dark discs. The responses are represented as 648000 x 91 binary matrix, where the first index indicates the timebin of duration 12.5 ms, and the second index the neural identity. The matrix entry is 0/1 if the neuron didn't/did spike in the particular time bin. (ii) README file and a graphical illustration of the structure of the experiment, specifying how the 648000 timebins are split into epochs where 1, 2, 4, or 10 discs were displayed, and which stimulus segments are exact repeats or unique ball trajectories. (iii) a 648000 x 400 matrix of luminance traces for each of the 20 x 20 positions ("sites") in the movie frame, with time that is locked to the recorded raster. The luminance traces are produced as described in the manuscript by filtering the raw disc movie with a small gaussian spatial kernel. 
}, author = {Deny, Stephane and Marre, Olivier and Botella-Soler, Vicente and Martius, Georg S and Tkacik, Gasper}, keywords = {retina, decoding, regression, neural networks, complex stimulus}, publisher = {Institute of Science and Technology Austria}, title = {{Nonlinear decoding of a complex movie from the mammalian retina}}, doi = {10.15479/AT:ISTA:98}, year = {2018}, } @inproceedings{652, abstract = {We present an approach that enables robots to self-organize their sensorimotor behavior from scratch without providing specific information about neither the robot nor its environment. This is achieved by a simple neural control law that increases the consistency between external sensor dynamics and internal neural dynamics of the utterly simple controller. In this way, the embodiment and the agent-environment coupling are the only source of individual development. We show how an anthropomorphic tendon driven arm-shoulder system develops different behaviors depending on that coupling. For instance: Given a bottle half-filled with water, the arm starts to shake it, driven by the physical response of the water. When attaching a brush, the arm can be manipulated into wiping a table, and when connected to a revolvable wheel it finds out how to rotate it. Thus, the robot may be said to discover the affordances of the world. When allowing two (simulated) humanoid robots to interact physically, they engage into a joint behavior development leading to, for instance, spontaneous cooperation. More social effects are observed if the robots can visually perceive each other. Although, as an observer, it is tempting to attribute an apparent intentionality, there is nothing of the kind put in. 
As a conclusion, we argue that emergent behavior may be much less rooted in explicit intentions, internal motivations, or specific reward systems than is commonly believed.}, author = {Der, Ralf and Martius, Georg S}, isbn = {978-150905069-7}, location = {Cergy-Pontoise, France}, publisher = {IEEE}, title = {{Dynamical self consistency leads to behavioral development and emergent social interactions in robots}}, doi = {10.1109/DEVLRN.2016.7846789}, year = {2017}, } @article{658, abstract = {With the accelerated development of robot technologies, control becomes one of the central themes of research. In traditional approaches, the controller, by its internal functionality, finds appropriate actions on the basis of specific objectives for the task at hand. While very successful in many applications, self-organized control schemes seem to be favored in large complex systems with unknown dynamics or which are difficult to model. Reasons are the expected scalability, robustness, and resilience of self-organizing systems. The paper presents a self-learning neurocontroller based on extrinsic differential plasticity introduced recently, applying it to an anthropomorphic musculoskeletal robot arm with attached objects of unknown physical dynamics. The central finding of the paper is the following effect: by the mere feedback through the internal dynamics of the object, the robot is learning to relate each of the objects with a very specific sensorimotor pattern. Specifically, an attached pendulum pilots the arm into a circular motion, a half-filled bottle produces axis oriented shaking behavior, a wheel is getting rotated, and wiping patterns emerge automatically in a table-plus-brush setting. By these object-specific dynamical patterns, the robot may be said to recognize the object's identity, or in other words, it discovers dynamical affordances of objects. 
Furthermore, when including hand coordinates obtained from a camera, a dedicated hand-eye coordination self-organizes spontaneously. These phenomena are discussed from a specific dynamical system perspective. Central is the dedicated working regime at the border to instability with its potentially infinite reservoir of (limit cycle) attractors "waiting" to be excited. Besides converging toward one of these attractors, variate behavior is also arising from a self-induced attractor morphing driven by the learning rule. We claim that experimental investigations with this anthropomorphic, self-learning robot not only generate interesting and potentially useful behaviors, but may also help to better understand what subjective human muscle feelings are, how they can be rooted in sensorimotor patterns, and how these concepts may feed back on robotics.}, author = {Der, Ralf and Martius, Georg S}, issn = {16625218}, journal = {Frontiers in Neurorobotics}, number = {MAR}, publisher = {Frontiers Research Foundation}, title = {{Self organized behavior generation for musculoskeletal robots}}, doi = {10.3389/fnbot.2017.00008}, volume = {11}, year = {2017}, } @inproceedings{6841, abstract = {In classical machine learning, regression is treated as a black box process of identifying a suitable function from a hypothesis set without attempting to gain insight into the mechanism connecting inputs and outputs. In the natural sciences, however, finding an interpretable function for a phenomenon is the prime goal as it allows to understand and generalize results. This paper proposes a novel type of function learning network, called equation learner (EQL), that can learn analytical expressions and is able to extrapolate to unseen domains. It is implemented as an end-to-end differentiable feed-forward network and allows for efficient gradient based training. Due to sparsity regularization concise interpretable expressions can be obtained. 
Often the true underlying source expression is identified.}, author = {Martius, Georg S and Lampert, Christoph}, booktitle = {5th International Conference on Learning Representations, ICLR 2017 - Workshop Track Proceedings}, location = {Toulon, France}, publisher = {International Conference on Learning Representations}, title = {{Extrapolation and learning equations}}, year = {2017}, } @inproceedings{750, abstract = {Modern communication technologies allow first responders to contact thousands of potential volunteers simultaneously for support during a crisis or disaster event. However, such volunteer efforts must be well coordinated and monitored, in order to offer an effective relief to the professionals. In this paper we extend earlier work on optimally assigning volunteers to selected landmark locations. In particular, we emphasize the aspect that obtaining good assignments requires not only advanced computational tools, but also a realistic measure of distance between volunteers and landmarks. Specifically, we propose the use of the Open Street Map (OSM) driving distance instead of the previously used flight distance. We find the OSM driving distance to be better aligned with the interests of volunteers and first responders. 
Furthermore, we show that relying on the flying distance leads to a substantial underestimation of the number of required volunteers, causing negative side effects in case of an actual crisis situation.}, author = {Pielorz, Jasmin and Prandtstetter, Matthias and Straub, Markus and Lampert, Christoph}, booktitle = {2017 IEEE International Conference on Big Data}, isbn = {978-153862714-3}, location = {Boston, MA, United States}, pages = {3760--3763}, publisher = {IEEE}, title = {{Optimal geospatial volunteer allocation needs realistic distances}}, doi = {10.1109/BigData.2017.8258375}, year = {2017}, } @inproceedings{1000, abstract = {We study probabilistic models of natural images and extend the autoregressive family of PixelCNN models by incorporating latent variables. Subsequently, we describe two new generative image models that exploit different image transformations as latent variables: a quantized grayscale view of the image or a multi-resolution image pyramid. The proposed models tackle two known shortcomings of existing PixelCNN models: 1) their tendency to focus on low-level image details, while largely ignoring high-level image information, such as object shapes, and 2) their computationally costly procedure for image sampling. We experimentally demonstrate benefits of our LatentPixelCNN models, in particular showing that they produce much more realistically looking image samples than previous state-of-the-art probabilistic models. 
}, author = {Kolesnikov, Alexander and Lampert, Christoph}, booktitle = {34th International Conference on Machine Learning}, isbn = {978-151085514-4}, location = {Sydney, Australia}, pages = {1905--1914}, publisher = {JMLR}, title = {{PixelCNN models with auxiliary variables for natural image modeling}}, volume = {70}, year = {2017}, } @inproceedings{998, abstract = {A major open problem on the road to artificial intelligence is the development of incrementally learning systems that learn about more and more concepts over time from a stream of data. In this work, we introduce a new training strategy, iCaRL, that allows learning in such a class-incremental way: only the training data for a small number of classes has to be present at the same time and new classes can be added progressively. iCaRL learns strong classifiers and a data representation simultaneously. This distinguishes it from earlier works that were fundamentally limited to fixed data representations and therefore incompatible with deep learning architectures. We show by experiments on CIFAR-100 and ImageNet ILSVRC 2012 data that iCaRL can learn many classes incrementally over a long period of time where other strategies quickly fail. }, author = {Rebuffi, Sylvestre Alvise and Kolesnikov, Alexander and Sperl, Georg and Lampert, Christoph}, isbn = {978-153860457-1}, location = {Honolulu, HI, United States}, pages = {5533--5542}, publisher = {IEEE}, title = {{iCaRL: Incremental classifier and representation learning}}, doi = {10.1109/CVPR.2017.587}, volume = {2017}, year = {2017}, } @inproceedings{911, abstract = {We develop a probabilistic technique for colorizing grayscale natural images. In light of the intrinsic uncertainty of this task, the proposed probabilistic framework has numerous desirable properties. 
In particular, our model is able to produce multiple plausible and vivid colorizations for a given grayscale image and is one of the first colorization models to provide a proper stochastic sampling scheme. Moreover, our training procedure is supported by a rigorous theoretical framework that does not require any ad hoc heuristics and allows for efficient modeling and learning of the joint pixel color distribution. We demonstrate strong quantitative and qualitative experimental results on the CIFAR-10 dataset and the challenging ILSVRC 2012 dataset.}, author = {Royer, Amélie and Kolesnikov, Alexander and Lampert, Christoph}, location = {London, United Kingdom}, pages = {85.1--85.12}, publisher = {BMVA Press}, title = {{Probabilistic image colorization}}, doi = {10.5244/c.31.85}, year = {2017}, } @inproceedings{1108, abstract = {In this work we study the learnability of stochastic processes with respect to the conditional risk, i.e. the existence of a learning algorithm that improves its next-step performance with the amount of observed data. We introduce a notion of pairwise discrepancy between conditional distributions at different time steps and show how certain properties of these discrepancies can be used to construct a successful learning algorithm. Our main results are two theorems that establish criteria for learnability for many classes of stochastic processes, including all special cases studied previously in the literature.}, author = {Zimin, Alexander and Lampert, Christoph}, location = {Fort Lauderdale, FL, United States}, pages = {213--222}, publisher = {ML Research Press}, title = {{Learning theory for conditional risk minimization}}, volume = {54}, year = {2017}, } @inproceedings{999, abstract = {In multi-task learning, a learner is given a collection of prediction tasks and needs to solve all of them. 
In contrast to previous work, which required that annotated training data must be available for all tasks, we consider a new setting, in which for some tasks, potentially most of them, only unlabeled training data is provided. Consequently, to solve all tasks, information must be transferred between tasks with labels and tasks without labels. Focusing on an instance-based transfer method we analyze two variants of this setting: when the set of labeled tasks is fixed, and when it can be actively selected by the learner. We state and prove a generalization bound that covers both scenarios and derive from it an algorithm for making the choice of labeled tasks (in the active case) and for transferring information between the tasks in a principled way. We also illustrate the effectiveness of the algorithm on synthetic and real data. }, author = {Pentina, Anastasia and Lampert, Christoph}, isbn = {9781510855144}, location = {Sydney, Australia}, pages = {2807--2816}, publisher = {ML Research Press}, title = {{Multi-task learning with labeled and unlabeled tasks}}, volume = {70}, year = {2017}, } @inproceedings{1098, abstract = {Better understanding of the potential benefits of information transfer and representation learning is an important step towards the goal of building intelligent systems that are able to persist in the world and learn over time. In this work, we consider a setting where the learner encounters a stream of tasks but is able to retain only limited information from each encountered task, such as a learned predictor. In contrast to most previous works analyzing this scenario, we do not make any distributional assumptions on the task generating process. Instead, we formulate a complexity measure that captures the diversity of the observed tasks. We provide a lifelong learning algorithm with error guarantees for every observed task (rather than on average). 
We show sample complexity reductions in comparison to solving every task in isolation in terms of our task complexity measure. Further, our algorithmic framework can naturally be viewed as learning a representation from encountered tasks with a neural network.}, author = {Pentina, Anastasia and Urner, Ruth}, location = {Barcelona, Spain}, pages = {3619--3627}, publisher = {Neural Information Processing Systems}, title = {{Lifelong learning with weighted majority votes}}, volume = {29}, year = {2016}, } @inproceedings{1102, abstract = {Weakly-supervised object localization methods tend to fail for object classes that consistently co-occur with the same background elements, e.g. trains on tracks. We propose a method to overcome these failures by adding a very small amount of model-specific additional annotation. The main idea is to cluster a deep network's mid-level representations and assign object or distractor labels to each cluster. Experiments show substantially improved localization results on the challenging ILSVRC2014 dataset for bounding box detection and the PASCAL VOC2012 dataset for semantic segmentation.}, author = {Kolesnikov, Alexander and Lampert, Christoph}, booktitle = {Proceedings of the British Machine Vision Conference 2016}, location = {York, United Kingdom}, pages = {92.1--92.12}, publisher = {BMVA Press}, title = {{Improving weakly-supervised object localization by micro-annotation}}, doi = {10.5244/C.30.92}, volume = {2016-September}, year = {2016}, } @inproceedings{1214, abstract = {With the accelerated development of robot technologies, optimal control becomes one of the central themes of research. In traditional approaches, the controller, by its internal functionality, finds appropriate actions on the basis of the history of sensor values, guided by the goals, intentions, objectives, learning schemes, and so forth. 
While very successful with classical robots, these methods run into severe difficulties when applied to soft robots, a new field of robotics with large interest for human-robot interaction. We claim that a novel controller paradigm opens new perspective for this field. This paper applies a recently developed neuro controller with differential extrinsic synaptic plasticity to a muscle-tendon driven arm-shoulder system from the Myorobotics toolkit. In the experiments, we observe a vast variety of self-organized behavior patterns: when left alone, the arm realizes pseudo-random sequences of different poses. By applying physical forces, the system can be entrained into definite motion patterns like wiping a table. Most interestingly, after attaching an object, the controller gets in a functional resonance with the object's internal dynamics, starting to shake spontaneously bottles half-filled with water or sensitively driving an attached pendulum into a circular mode. When attached to the crank of a wheel the neural system independently develops to rotate it. In this way, the robot discovers affordances of objects its body is interacting with.}, author = {Martius, Georg S and Hostettler, Rafael and Knoll, Alois and Der, Ralf}, location = {Daejeon, Korea}, publisher = {IEEE}, title = {{Compliant control for soft robots: Emergent behavior of a tendon driven anthropomorphic arm}}, doi = {10.1109/IROS.2016.7759138}, volume = {2016-November}, year = {2016}, } @inproceedings{1369, abstract = {We introduce a new loss function for the weakly-supervised training of semantic image segmentation models based on three guiding principles: to seed with weak localization cues, to expand objects based on the information about which classes can occur in an image, and to constrain the segmentations to coincide with object boundaries. 
We show experimentally that training a deep convolutional neural network using the proposed loss function leads to substantially better segmentations than previous state-of-the-art methods on the challenging PASCAL VOC 2012 dataset. We furthermore give insight into the working mechanism of our method by a detailed experimental study that illustrates how the segmentation quality is affected by each term of the proposed loss function as well as their combinations.}, author = {Kolesnikov, Alexander and Lampert, Christoph}, location = {Amsterdam, The Netherlands}, pages = {695 -- 711}, publisher = {Springer}, title = {{Seed, expand and constrain: Three principles for weakly-supervised image segmentation}}, doi = {10.1007/978-3-319-46493-0_42}, volume = {9908}, year = {2016}, } @inproceedings{1707, abstract = {Volunteer supporters play an important role in modern crisis and disaster management. In the times of mobile Internet devices, help from thousands of volunteers can be requested within a short time span, thus relieving professional helpers from minor chores or geographically spread-out tasks. However, the simultaneous availability of many volunteers also poses new problems. In particular, the volunteer efforts must be well coordinated, or otherwise situations might emerge in which too many idle volunteers at one location become more of a burden than a relief to the professionals. In this work, we study the task of optimally assigning volunteers to selected locations, e.g. in order to perform regular measurements, to report on damage, or to distribute information or resources to the population in a crisis situation. We formulate the assignment tasks as an optimization problem and propose an effective and efficient solution procedure. 
Experiments on real data of the Team Österreich, consisting of over 36,000 Austrian volunteers, show the effectiveness and efficiency of our approach.}, author = {Pielorz, Jasmin and Lampert, Christoph}, location = {Rennes, France}, publisher = {IEEE}, title = {{Optimal geospatial allocation of volunteers for crisis management}}, doi = {10.1109/ICT-DM.2015.7402041}, year = {2016}, } @inproceedings{8094, abstract = {With the accelerated development of robot technologies, optimal control becomes one of the central themes of research. In traditional approaches, the controller, by its internal functionality, finds appropriate actions on the basis of the history of sensor values, guided by the goals, intentions, objectives, learning schemes, and so forth. The idea is that the controller controls the world---the body plus its environment---as reliably as possible. This paper focuses on new lines of self-organization for developmental robotics. We apply the recently developed differential extrinsic synaptic plasticity to a muscle-tendon driven arm-shoulder system from the Myorobotics toolkit. In the experiments, we observe a vast variety of self-organized behavior patterns: when left alone, the arm realizes pseudo-random sequences of different poses. By applying physical forces, the system can be entrained into definite motion patterns like wiping a table. Most interestingly, after attaching an object, the controller gets in a functional resonance with the object's internal dynamics, starting to shake spontaneously bottles half-filled with water or sensitively driving an attached pendulum into a circular mode. When attached to the crank of a wheel the neural system independently discovers how to rotate it. 
In this way, the robot discovers affordances of objects its body is interacting with.}, author = {Martius, Georg S and Hostettler, Rafael and Knoll, Alois and Der, Ralf}, booktitle = {Proceedings of the Artificial Life Conference 2016}, isbn = {9780262339360}, location = {Cancun, Mexico}, pages = {142--143}, publisher = {MIT Press}, title = {{Self-organized control of an tendon driven arm by differential extrinsic plasticity}}, doi = {10.7551/978-0-262-33936-0-ch029}, volume = {28}, year = {2016}, } @phdthesis{1126, abstract = {Traditionally machine learning has been focusing on the problem of solving a single task in isolation. While being quite well understood, this approach disregards an important aspect of human learning: when facing a new problem, humans are able to exploit knowledge acquired from previously learned tasks. Intuitively, access to several problems simultaneously or sequentially could also be advantageous for a machine learning system, especially if these tasks are closely related. Indeed, results of many empirical studies have provided justification for this intuition. However, theoretical justifications of this idea are rather limited. The focus of this thesis is to expand the understanding of potential benefits of information transfer between several related learning problems. We provide theoretical analysis for three scenarios of multi-task learning - multiple kernel learning, sequential learning and active task selection. We also provide a PAC-Bayesian perspective on lifelong learning and investigate how the task generation process influences the generalization guarantees in this scenario. 
In addition, we show how some of the obtained theoretical results can be used to derive principled multi-task and lifelong learning algorithms and illustrate their performance on various synthetic and real-world datasets.}, author = {Pentina, Anastasia}, issn = {2663-337X}, pages = {127}, publisher = {Institute of Science and Technology Austria}, title = {{Theoretical foundations of multi-task lifelong learning}}, doi = {10.15479/AT:ISTA:TH_776}, year = {2016}, } @inproceedings{1425, abstract = {In this work we aim at extending the theoretical foundations of lifelong learning. Previous work analyzing this scenario is based on the assumption that learning tasks are sampled i.i.d. from a task environment or limited to strongly constrained data distributions. Instead, we study two scenarios when lifelong learning is possible, even though the observed tasks do not form an i.i.d. sample: first, when they are sampled from the same environment, but possibly with dependencies, and second, when the task environment is allowed to change over time in a consistent way. In the first case we prove a PAC-Bayesian theorem that can be seen as a direct generalization of the analogous previous result for the i.i.d. case. For the second scenario we propose to learn an inductive bias in form of a transfer procedure. We present a generalization bound and show on a toy example how it can be used to identify a beneficial transfer algorithm.}, author = {Pentina, Anastasia and Lampert, Christoph}, location = {Montreal, Canada}, pages = {1540 -- 1548}, publisher = {Neural Information Processing Systems}, title = {{Lifelong learning with non-i.i.d. tasks}}, volume = {2015}, year = {2015}, } @article{1533, abstract = {This paper addresses the problem of semantic segmentation, where the possible class labels are from a predefined set. We exploit top-down guidance, i.e., the coarse localization of the objects and their class labels provided by object detectors. 
For each detected bounding box, figure-ground segmentation is performed and the final result is achieved by merging the figure-ground segmentations. The main idea of the proposed approach, which is presented in our preliminary work, is to reformulate the figure-ground segmentation problem as sparse reconstruction pursuing the object mask in a nonparametric manner. The latent segmentation mask should be coherent subject to sparse error caused by intra-category diversity; thus, the object mask is inferred by making use of sparse representations over the training set. To handle local spatial deformations, local patch-level masks are also considered and inferred by sparse representations over the spatially nearby patches. The sparse reconstruction coefficients and the latent mask are alternately optimized by applying the Lasso algorithm and the accelerated proximal gradient method. The proposed formulation results in a convex optimization problem; thus, the global optimal solution is achieved. In this paper, we provide theoretical analysis of the convergence and optimality. We also give an extended numerical analysis of the proposed algorithm and a comprehensive comparison with the related semantic segmentation methods on the challenging PASCAL visual object class object segmentation datasets and the Weizmann horse dataset. The experimental results demonstrate that the proposed algorithm achieves a competitive performance when compared with the state of the arts.}, author = {Xia, Wei and Domokos, Csaba and Xiong, Junjun and Cheong, Loongfah and Yan, Shuicheng}, journal = {IEEE Transactions on Circuits and Systems for Video Technology}, number = {8}, pages = {1295 -- 1308}, publisher = {IEEE}, title = {{Segmentation over detection via optimal sparse reconstructions}}, doi = {10.1109/TCSVT.2014.2379972}, volume = {25}, year = {2015}, } @article{1570, abstract = {Grounding autonomous behavior in the nervous system is a fundamental challenge for neuroscience. 
In particular, self-organized behavioral development provides more questions than answers. Are there special functional units for curiosity, motivation, and creativity? This paper argues that these features can be grounded in synaptic plasticity itself, without requiring any higher-level constructs. We propose differential extrinsic plasticity (DEP) as a new synaptic rule for self-learning systems and apply it to a number of complex robotic systems as a test case. Without specifying any purpose or goal, seemingly purposeful and adaptive rhythmic behavior is developed, displaying a certain level of sensorimotor intelligence. These surprising results require no system-specific modifications of the DEP rule. They rather arise from the underlying mechanism of spontaneous symmetry breaking, which is due to the tight brain body environment coupling. The new synaptic rule is biologically plausible and would be an interesting target for neurobiological investigation. We also argue that this neuronal mechanism may have been a catalyst in natural evolution.}, author = {Der, Ralf and Martius, Georg S}, journal = {PNAS}, number = {45}, pages = {E6224 -- E6232}, publisher = {National Academy of Sciences}, title = {{Novel plasticity rule can explain the development of sensorimotor intelligence}}, doi = {10.1073/pnas.1508400112}, volume = {112}, year = {2015}, } @inproceedings{1706, abstract = {We consider a problem of learning kernels for use in SVM classification in the multi-task and lifelong scenarios and provide generalization bounds on the error of a large margin classifier. Our results show that, under mild conditions on the family of kernels used for learning, solving several related tasks simultaneously is beneficial over single task learning. 
In particular, as the number of observed tasks grows, assuming that in the considered family of kernels there exists one that yields low approximation error on all tasks, the overhead associated with learning such a kernel vanishes and the complexity converges to that of learning when this good kernel is given to the learner.}, author = {Pentina, Anastasia and Ben David, Shai}, location = {Banff, AB, Canada}, pages = {194 -- 208}, publisher = {Springer}, title = {{Multi-task and lifelong learning of kernels}}, doi = {10.1007/978-3-319-24486-0_13}, volume = {9355}, year = {2015}, } @inproceedings{1859, abstract = {Structural support vector machines (SSVMs) are amongst the best performing models for structured computer vision tasks, such as semantic image segmentation or human pose estimation. Training SSVMs, however, is computationally costly, because it requires repeated calls to a structured prediction subroutine (called \emph{max-oracle}), which has to solve an optimization problem itself, e.g. a graph cut. In this work, we introduce a new algorithm for SSVM training that is more efficient than earlier techniques when the max-oracle is computationally expensive, as it is frequently the case in computer vision tasks. The main idea is to (i) combine the recent stochastic Block-Coordinate Frank-Wolfe algorithm with efficient hyperplane caching, and (ii) use an automatic selection rule for deciding whether to call the exact max-oracle or to rely on an approximate one based on the cached hyperplanes. We show experimentally that this strategy leads to faster convergence to the optimum with respect to the number of required oracle calls, and that this translates into faster convergence with respect to the total runtime when the max-oracle is slow compared to the other steps of the algorithm. 
}, author = {Shah, Neel and Kolmogorov, Vladimir and Lampert, Christoph}, location = {Boston, MA, USA}, pages = {2737 -- 2745}, publisher = {IEEE}, title = {{A multi-plane block-coordinate Frank-Wolfe algorithm for training structural SVMs with a costly max-oracle}}, doi = {10.1109/CVPR.2015.7298890}, year = {2015}, } @inproceedings{1860, abstract = {Classifiers for object categorization are usually evaluated by their accuracy on a set of i.i.d. test examples. This provides us with an estimate of the expected error when applying the classifiers to a single new image. In real application, however, classifiers are rarely only used for a single image and then discarded. Instead, they are applied sequentially to many images, and these are typically not i.i.d. samples from a fixed data distribution, but they carry dependencies and their class distribution varies over time. In this work, we argue that the phenomenon of correlated data at prediction time is not a nuisance, but a blessing in disguise. We describe a probabilistic method for adapting classifiers at prediction time without having to retrain them. We also introduce a framework for creating realistically distributed image sequences, which offers a way to benchmark classifier adaptation methods, such as the one we propose. Experiments on the ILSVRC2010 and ILSVRC2012 datasets show that adapting object classification systems at prediction time can significantly reduce their error rate, even with no additional human feedback.}, author = {Royer, Amélie and Lampert, Christoph}, location = {Boston, MA, United States}, pages = {1401 -- 1409}, publisher = {IEEE}, title = {{Classifier adaptation at prediction time}}, doi = {10.1109/CVPR.2015.7298746}, year = {2015}, } @inproceedings{1858, abstract = {We study the problem of predicting the future, though only in the probabilistic sense of estimating a future state of a time-varying probability distribution. 
This is not only an interesting academic problem, but solving this extrapolation problem also has many practical applications, e.g. for training classifiers that have to operate under time-varying conditions. Our main contribution is a method for predicting the next step of the time-varying distribution from a given sequence of sample sets from earlier time steps. For this we rely on two recent machine learning techniques: embedding probability distributions into a reproducing kernel Hilbert space, and learning operators by vector-valued regression. We illustrate the working principles and the practical usefulness of our method by experiments on synthetic and real data. We also highlight an exemplary application: training a classifier in a domain adaptation setting without having access to examples from the test time distribution at training time.}, author = {Lampert, Christoph}, location = {Boston, MA, United States}, pages = {942 -- 950}, publisher = {IEEE}, title = {{Predicting the future behavior of a time-varying probability distribution}}, doi = {10.1109/CVPR.2015.7298696}, year = {2015}, } @inproceedings{1857, abstract = {Sharing information between multiple tasks enables algorithms to achieve good generalization performance even from small amounts of training data. However, in a realistic scenario of multi-task learning not all tasks are equally related to each other, hence it could be advantageous to transfer information only between the most related tasks. In this work we propose an approach that processes multiple tasks in a sequence with sharing between subsequent tasks instead of solving all tasks jointly. Subsequently, we address the question of curriculum learning of tasks, i.e. finding the best order of tasks to be learned. Our approach is based on a generalization bound criterion for choosing the task order that optimizes the average expected classification performance over all tasks. 
Our experimental results show that learning multiple related tasks sequentially can be more effective than learning them jointly, the order in which tasks are being solved affects the overall performance, and that our model is able to automatically discover the favourable order of tasks. }, author = {Pentina, Anastasia and Sharmanska, Viktoriia and Lampert, Christoph}, location = {Boston, MA, United States}, pages = {5492 -- 5500}, publisher = {IEEE}, title = {{Curriculum learning of multiple tasks}}, doi = {10.1109/CVPR.2015.7299188}, year = {2015}, } @inproceedings{12881, author = {Martius, Georg S and Olbrich, Eckehard}, booktitle = {Proceedings of the 13th European Conference on Artificial Life}, isbn = {9780262330275}, location = {York, United Kingdom}, pages = {78}, publisher = {MIT Press}, title = {{Quantifying self-organizing behavior of autonomous robots}}, doi = {10.7551/978-0-262-33027-5-ch018}, year = {2015}, } @phdthesis{1401, abstract = {The human ability to recognize objects in complex scenes has driven research in the computer vision field over a couple of decades. This thesis focuses on the object recognition task in images. That is, given the image, we want the computer system to be able to predict the class of the object that appears in the image. A recent successful attempt to bridge semantic understanding of the image perceived by humans and by computers uses attribute-based models. Attributes are semantic properties of the objects shared across different categories, which humans and computers can decide on. To explore the attribute-based models we take a statistical machine learning approach, and address two key learning challenges in view of object recognition task: learning augmented attributes as mid-level discriminative feature representation, and learning with attributes as privileged information. Our main contributions are parametric and non-parametric models and algorithms to solve these frameworks. 
In the parametric approach, we explore an autoencoder model combined with the large margin nearest neighbor principle for mid-level feature learning, and linear support vector machines for learning with privileged information. In the non-parametric approach, we propose a supervised Indian Buffet Process for automatic augmentation of semantic attributes, and explore the Gaussian Processes classification framework for learning with privileged information. A thorough experimental analysis shows the effectiveness of the proposed models in both parametric and non-parametric views.}, author = {Sharmanska, Viktoriia}, issn = {2663-337X}, pages = {144}, publisher = {Institute of Science and Technology Austria}, title = {{Learning with attributes for object recognition: Parametric and non-parametrics views}}, doi = {10.15479/at:ista:1401}, year = {2015}, } @article{1655, abstract = {Quantifying behaviors of robots which were generated autonomously from task-independent objective functions is an important prerequisite for objective comparisons of algorithms and movements of animals. The temporal sequence of such a behavior can be considered as a time series and hence complexity measures developed for time series are natural candidates for its quantification. The predictive information and the excess entropy are such complexity measures. They measure the amount of information the past contains about the future and thus quantify the nonrandom structure in the temporal sequence. However, when using these measures for systems with continuous states one has to deal with the fact that their values will depend on the resolution with which the systems states are observed. For deterministic systems both measures will diverge with increasing resolution. We therefore propose a new decomposition of the excess entropy in resolution dependent and resolution independent parts and discuss how they depend on the dimensionality of the dynamics, correlations and the noise level. 
For the practical estimation we propose to use estimates based on the correlation integral instead of the direct estimation of the mutual information based on next neighbor statistics because the latter allows less control of the scale dependencies. Using our algorithm we are able to show how autonomous learning generates behavior of increasing complexity with increasing learning duration.}, author = {Martius, Georg S and Olbrich, Eckehard}, journal = {Entropy}, number = {10}, pages = {7266 -- 7297}, publisher = {MDPI}, title = {{Quantifying emergent behavior of autonomous robots}}, doi = {10.3390/e17107266}, volume = {17}, year = {2015}, } @inbook{1829, abstract = {Hitting and batting tasks, such as tennis forehands, ping-pong strokes, or baseball batting, depend on predictions where the ball can be intercepted and how it can properly be returned to the opponent. These predictions get more accurate over time, hence the behaviors need to be continuously modified. As a result, movement templates with a learned global shape need to be adapted during the execution so that the racket reaches a target position and velocity that will return the ball over to the other side of the net or court. It requires altering learned movements to hit a varying target with the necessary velocity at a specific instant in time. Such a task cannot be incorporated straightforwardly in most movement representations suitable for learning. For example, the standard formulation of the dynamical system based motor primitives (introduced by Ijspeert et al (2002b)) does not satisfy this property despite their flexibility which has allowed learning tasks ranging from locomotion to kendama. 
In order to fulfill this requirement, we reformulate the Ijspeert framework to incorporate the possibility of specifying a desired hitting point and a desired hitting velocity while maintaining all advantages of the original formulation. We show that the proposed movement template formulation works well in two scenarios, i.e., for hitting a ball on a string with a table tennis racket at a specified velocity and for returning balls launched by a ball gun successfully over the net using forehand movements.}, author = {Muelling, Katharina and Kroemer, Oliver and Lampert, Christoph and Schölkopf, Bernhard}, booktitle = {Learning Motor Skills}, editor = {Kober, Jens and Peters, Jan}, pages = {69 -- 82}, publisher = {Springer}, title = {{Movement templates for learning of hitting and batting}}, doi = {10.1007/978-3-319-03194-1_3}, volume = {97}, year = {2014}, } @inproceedings{2033, abstract = {The learning with privileged information setting has recently attracted a lot of attention within the machine learning community, as it allows the integration of additional knowledge into the training process of a classifier, even when this comes in the form of a data modality that is not available at test time. Here, we show that privileged information can naturally be treated as noise in the latent function of a Gaussian process classifier (GPC). That is, in contrast to the standard GPC setting, the latent function is not just a nuisance but a feature: it becomes a natural measure of confidence about the training data by modulating the slope of the GPC probit likelihood function. Extensive experiments on public datasets show that the proposed GPC method using privileged noise, called GPC+, improves over a standard GPC without privileged knowledge, and also over the current state-of-the-art SVM-based method, SVM+. 
Moreover, we show that advanced neural networks and deep learning methods can be compressed as privileged information.}, author = {Hernandez Lobato, Daniel and Sharmanska, Viktoriia and Kersting, Kristian and Lampert, Christoph and Quadrianto, Novi}, booktitle = {Advances in Neural Information Processing Systems}, location = {Montreal, Canada}, number = {January}, pages = {837--845}, publisher = {Neural Information Processing Systems}, title = {{Mind the nuisance: Gaussian process classification using privileged noise}}, volume = {1}, year = {2014}, } @inproceedings{2057, abstract = {In the past few years, a lot of attention has been devoted to multimedia indexing by fusing multimodal informations. Two kinds of fusion schemes are generally considered: The early fusion and the late fusion. We focus on late classifier fusion, where one combines the scores of each modality at the decision level. To tackle this problem, we investigate a recent and elegant well-founded quadratic program named MinCq coming from the machine learning PAC-Bayesian theory. MinCq looks for the weighted combination, over a set of real-valued functions seen as voters, leading to the lowest misclassification rate, while maximizing the voters’ diversity. We propose an extension of MinCq tailored to multimedia indexing. Our method is based on an order-preserving pairwise loss adapted to ranking that allows us to improve Mean Averaged Precision measure while taking into account the diversity of the voters that we want to fuse. 
We provide evidence that this method is naturally adapted to late fusion procedures and confirm the good behavior of our approach on the challenging PASCAL VOC’07 benchmark.}, author = {Morvant, Emilie and Habrard, Amaury and Ayache, Stéphane}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, location = {Joensuu, Finland}, pages = {153 -- 162}, publisher = {Springer}, title = {{Majority vote of diverse classifiers for late fusion}}, doi = {10.1007/978-3-662-44415-3_16}, volume = {8621}, year = {2014}, } @inproceedings{2171, abstract = {We present LS-CRF, a new method for training cyclic Conditional Random Fields (CRFs) from large datasets that is inspired by classical closed-form expressions for the maximum likelihood parameters of a generative graphical model with tree topology. Training a CRF with LS-CRF requires only solving a set of independent regression problems, each of which can be solved efficiently in closed form or by an iterative solver. This makes LS-CRF orders of magnitude faster than classical CRF training based on probabilistic inference, and at the same time more flexible and easier to implement than other approximate techniques, such as pseudolikelihood or piecewise training. We apply LS-CRF to the task of semantic image segmentation, showing that it achieves on par accuracy to other training techniques at higher speed, thereby allowing efficient CRF training from very large training sets. 
For example, training a linearly parameterized pairwise CRF on 150,000 images requires less than one hour on a modern workstation.}, author = {Kolesnikov, Alexander and Guillaumin, Matthieu and Ferrari, Vittorio and Lampert, Christoph}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, editor = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne}, location = {Zurich, Switzerland}, number = {PART 3}, pages = {550 -- 565}, publisher = {Springer}, title = {{Closed-form approximate CRF training for scalable image segmentation}}, doi = {10.1007/978-3-319-10578-9_36}, volume = {8691}, year = {2014}, }

% Conference paper (BMVC 2014). The record carries no page numbers.
@inproceedings{2173,
  abstract  = {In this work we introduce a new approach to co-classification, i.e. the task of jointly classifying multiple, otherwise independent, data samples. The method we present, named CoConut, is based on the idea of adding a regularizer in the label space to encode certain priors on the resulting labelings. A regularizer that encourages labelings that are smooth across the test set, for instance, can be seen as a test-time variant of the cluster assumption, which has been proven useful at training time in semi-supervised learning. A regularizer that introduces a preference for certain class proportions can be regarded as a prior distribution on the class labels. CoConut can build on existing classifiers without making any assumptions on how they were obtained and without the need to re-train them. The use of a regularizer adds a new level of flexibility. It allows the integration of potentially new information at test time, even in other modalities than what the classifiers were trained on. We evaluate our framework on six datasets, reporting a clear performance gain in classification accuracy compared to the standard classification setup that predicts labels for each test sample separately.},
  author    = {Khamis, Sameh and Lampert, Christoph},
  booktitle = {Proceedings of the British Machine Vision Conference 2014},
  location  = {Nottingham, UK},
  publisher = {BMVA Press},
  title     = {{CoConut: Co-classification with output space regularization}},
  year      = {2014},
}

% Conference paper (CVPR 2014). Page range normalised to the M--N form.
@inproceedings{2172,
  abstract  = {Fisher Kernels and Deep Learning were two developments with significant impact on large-scale object categorization in the last years. Both approaches were shown to achieve state-of-the-art results on large-scale object categorization datasets, such as ImageNet. Conceptually, however, they are perceived as very different and it is not uncommon for heated debates to spring up when advocates of both paradigms meet at conferences or workshops. In this work, we emphasize the similarities between both architectures rather than their differences and we argue that such a unified view allows us to transfer ideas from one domain to the other. As a concrete example we introduce a method for learning a support vector machine classifier with Fisher kernel at the same time as a task-specific data representation. We reinterpret the setting as a multi-layer feed forward network. Its final layer is the classifier, parameterized by a weight vector, and the two previous layers compute Fisher vectors, parameterized by the coefficients of a Gaussian mixture model. We introduce a gradient descent based learning algorithm that, in contrast to other feature learning techniques, is not just derived from intuition or biological analogy, but has a theoretical justification in the framework of statistical learning theory. Our experiments show that the new training procedure leads to significant improvements in classification accuracy while preserving the modularity and geometric interpretability of a support vector machine setup.},
  author    = {Sydorov, Vladyslav and Sakurada, Mayu and Lampert, Christoph},
  booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
  location  = {Columbus, USA},
  pages     = {1402--1409},
  publisher = {IEEE},
  title     = {{Deep Fisher Kernels – End to end learning of the Fisher Kernel GMM parameters}},
  doi       = {10.1109/CVPR.2014.182},
  year      = {2014},
}

% Journal article. Issue and page ranges use the double-hyphen range form.
@article{2180,
  abstract  = {Weighted majority votes allow one to combine the output of several classifiers or voters. MinCq is a recent algorithm for optimizing the weight of each voter based on the minimization of a theoretical bound over the risk of the vote with elegant PAC-Bayesian generalization guarantees. However, while it has demonstrated good performance when combining weak classifiers, MinCq cannot make use of the useful a priori knowledge that one may have when using a mixture of weak and strong voters. In this paper, we propose P-MinCq, an extension of MinCq that can incorporate such knowledge in the form of a constraint over the distribution of the weights, along with general proofs of convergence that stand in the sample compression setting for data-dependent voters. The approach is applied to a vote of k-NN classifiers with a specific modeling of the voters' performance. P-MinCq significantly outperforms the classic k-NN classifier, a symmetric NN and MinCq using the same voters. We show that it is also competitive with LMNN, a popular metric learning algorithm, and that combining both approaches further reduces the error.},
  author    = {Bellet, Aurélien and Habrard, Amaury and Morvant, Emilie and Sebban, Marc},
  journal   = {Machine Learning},
  number    = {1--2},
  pages     = {129--154},
  publisher = {Springer},
  title     = {{Learning a priori constrained weighted majority votes}},
  doi       = {10.1007/s10994-014-5462-z},
  volume    = {97},
  year      = {2014},
}

% NOTE(review): record 2189 is typed as inproceedings but has no booktitle,
% which is a required field for that type. The venue is not stated anywhere
% in this record, so it is left unset here -- confirm the venue and add it.
@inproceedings{2189,
  abstract  = {En apprentissage automatique, nous parlons d'adaptation de domaine lorsque les données de test (cibles) et d'apprentissage (sources) sont générées selon différentes distributions. Nous devons donc développer des algorithmes de classification capables de s'adapter à une nouvelle distribution, pour laquelle aucune information sur les étiquettes n'est disponible. Nous attaquons cette problématique sous l'angle de l'approche PAC-Bayésienne qui se focalise sur l'apprentissage de modèles définis comme des votes de majorité sur un ensemble de fonctions. Dans ce contexte, nous introduisons PV-MinCq une version adaptative de l'algorithme (non adaptatif) MinCq. PV-MinCq suit le principe suivant. Nous transférons les étiquettes sources aux points cibles proches pour ensuite appliquer MinCq sur l'échantillon cible ``auto-étiqueté'' (justifié par une borne théorique). Plus précisément, nous définissons un auto-étiquetage non itératif qui se focalise dans les régions où les distributions marginales source et cible sont les plus similaires. Dans un second temps, nous étudions l'influence de notre auto-étiquetage pour en déduire une procédure de validation des hyperparamètres. Finalement, notre approche montre des résultats empiriques prometteurs.},
  author    = {Morvant, Emilie},
  location  = {Saint-Etienne, France},
  pages     = {49--58},
  publisher = {Elsevier},
  title     = {{Adaptation de domaine de vote de majorité par auto-étiquetage non itératif}},
  volume    = {1},
  year      = {2014},
}

% NOTE(review): booktitle was missing (required for inproceedings). It is
% inferred from publisher ML Research Press, volume 32 and the Beijing 2014
% location, following the naming used for the 36th conference elsewhere in
% this file -- TODO confirm.
@inproceedings{2160,
  abstract  = {Transfer learning has received a lot of attention in the machine learning community over the last years, and several effective algorithms have been developed. However, relatively little is known about their theoretical properties, especially in the setting of lifelong learning, where the goal is to transfer information to tasks for which no data have been observed so far. In this work we study lifelong learning from a theoretical perspective. Our main result is a PAC-Bayesian generalization bound that offers a unified view on existing paradigms for transfer learning, such as the transfer of parameters or the transfer of low-dimensional representations. We also use the bound to derive two principled lifelong learning algorithms, and we show that these yield results comparable with existing methods.},
  author    = {Pentina, Anastasia and Lampert, Christoph},
  booktitle = {Proceedings of the 31st International Conference on Machine Learning},
  location  = {Beijing, China},
  pages     = {991--999},
  publisher = {ML Research Press},
  title     = {{A PAC-Bayesian bound for Lifelong Learning}},
  volume    = {32},
  year      = {2014},
}

% NOTE(review): booktitle was missing (required for inproceedings). It is
% inferred from the DOI prefix 10.1109/ICCV.2013 -- TODO confirm the exact
% proceedings title. Hyphenation artefacts in the abstract were repaired.
@inproceedings{2294,
  abstract  = {In this work we propose a system for automatic classification of Drosophila embryos into developmental stages. While the system is designed to solve an actual problem in biological research, we believe that the principle underlying it is interesting not only for biologists, but also for researchers in computer vision. The main idea is to combine two orthogonal sources of information: one is a classifier trained on strongly invariant features, which makes it applicable to images of very different conditions, but also leads to rather noisy predictions. The other is a label propagation step based on a more powerful similarity measure that however is only consistent within specific subsets of the data at a time. In our biological setup, the information sources are the shape and the staining patterns of embryo images. We show experimentally that while neither of the methods can be used by itself to achieve satisfactory results, their combination achieves prediction quality comparable to human performance.},
  author    = {Kazmar, Tomas and Kvon, Evgeny and Stark, Alexander and Lampert, Christoph},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
  location  = {Sydney, Australia},
  publisher = {IEEE},
  title     = {{Drosophila Embryo Stage Annotation using Label Propagation}},
  doi       = {10.1109/ICCV.2013.139},
  year      = {2013},
}

% NOTE(review): booktitle was missing (required for inproceedings). It is
% inferred from the DOI prefix 10.1109/ICCV.2013 -- TODO confirm the exact
% proceedings title.
@inproceedings{2293,
  abstract  = {Many computer vision problems have an asymmetric distribution of information between training and test time. In this work, we study the case where we are given additional information about the training data, which however will not be available at test time. This situation is called learning using privileged information (LUPI). We introduce two maximum-margin techniques that are able to make use of this additional source of information, and we show that the framework is applicable to several scenarios that have been studied in computer vision before. Experiments with attributes, bounding boxes, image tags and rationales as additional information in object classification show promising results.},
  author    = {Sharmanska, Viktoriia and Quadrianto, Novi and Lampert, Christoph},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
  location  = {Sydney, Australia},
  pages     = {825--832},
  publisher = {IEEE},
  title     = {{Learning to rank using privileged information}},
  doi       = {10.1109/ICCV.2013.107},
  year      = {2013},
}

% Journal article. Page range normalised to the M--N form.
@article{2516,
  abstract  = {We study the problem of object recognition for categories for which we have no training examples, a task also called zero-data or zero-shot learning. This situation has hardly been studied in computer vision research, even though it occurs frequently: the world contains tens of thousands of different object classes and for only few of them image collections have been formed and suitably annotated. To tackle the problem we introduce attribute-based classification: objects are identified based on a high-level description that is phrased in terms of semantic attributes, such as the object's color or shape. Because the identification of each such property transcends the specific learning task at hand, the attribute classifiers can be pre-learned independently, e.g. from existing image datasets unrelated to the current task. Afterwards, new classes can be detected based on their attribute representation, without the need for a new training phase. In this paper we also introduce a new dataset, Animals with Attributes, of over 30,000 images of 50 animal classes, annotated with 85 semantic attributes. Extensive experiments on this and two more datasets show that attribute-based classification indeed is able to categorize images without access to any training images of the target classes.},
  author    = {Lampert, Christoph and Nickisch, Hannes and Harmeling, Stefan},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  number    = {3},
  pages     = {453--465},
  publisher = {IEEE},
  title     = {{Attribute-based classification for zero-shot learning of object categories}},
  doi       = {10.1109/TPAMI.2013.140},
  volume    = {36},
  year      = {2013},
}