@inproceedings{6569,
abstract = {Knowledge distillation, i.e. one classifier being trained on the outputs of another classifier, is an empirically very successful technique for knowledge transfer between classifiers. It has even been observed that classifiers learn much faster and more reliably if trained with the outputs of another classifier as soft labels, instead of from ground truth data. So far, however, there is no satisfactory theoretical explanation of this phenomenon. In this work, we provide the first insights into the working mechanisms of distillation by studying the special case of linear and deep linear classifiers. Specifically, we prove a generalization bound that establishes fast convergence of the expected risk of a distillation-trained linear classifier. From the bound and its proof we extract three key factors that determine the success of distillation: data geometry – geometric properties of the data distribution, in particular class separation, has an immediate influence on the convergence speed of the risk; optimization bias – gradient descent optimization finds a very favorable minimum of the distillation objective; and strong monotonicity – the expected risk of the student classifier always decreases when the size of the training set grows.},
author = {Bui Thi Mai, Phuong and Lampert, Christoph},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
location = {Long Beach, CA, United States},
pages = {5142--5151},
publisher = {PMLR},
title = {{Towards understanding knowledge distillation}},
volume = {97},
year = {2019},
}
@inproceedings{6590,
abstract = {Modern machine learning methods often require more data for training than a single expert can provide. Therefore, it has become a standard procedure to collect data from external sources, e.g. via crowdsourcing. Unfortunately, the quality of these sources is not always guaranteed. As additional complications, the data might be stored in a distributed way, or might even have to remain private. In this work, we address the question of how to learn robustly in such scenarios. Studying the problem through the lens of statistical learning theory, we derive a procedure that allows for learning from all available sources, yet automatically suppresses irrelevant or corrupted data. We show by extensive experiments that our method provides significant improvements over alternative approaches from robust statistics and distributed optimization.},
author = {Konstantinov, Nikola H and Lampert, Christoph},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
location = {Long Beach, CA, United States},
pages = {3488--3498},
publisher = {PMLR},
title = {{Robust learning from untrusted sources}},
volume = {97},
year = {2019},
}
@inproceedings{6482,
abstract = {Computer vision systems for automatic image categorization have become accurate and reliable enough that they can run continuously for days or even years as components of real-world commercial applications. A major open problem in this context, however, is quality control. Good classification performance can only be expected if systems run under the specific conditions, in particular data distributions, that they were trained for. Surprisingly, none of the currently used deep network architectures have a built-in functionality that could detect if a network operates on data from a distribution it was not trained for, such that potentially a warning to the human users could be triggered. In this work, we describe KS(conf), a procedure for detecting such outside of specifications (out-of-specs) operation, based on statistical testing of the network outputs. We show by extensive experiments using the ImageNet, AwA2 and DAVIS datasets on a variety of ConvNets architectures that KS(conf) reliably detects out-of-specs situations. It furthermore has a number of properties that make it a promising candidate for practical deployment: it is easy to implement, adds almost no overhead to the system, works with all networks, including pretrained ones, and requires no a priori knowledge of how the data distribution could change.},
author = {Sun, Rémy and Lampert, Christoph},
booktitle = {Pattern Recognition: 40th German Conference, GCPR 2018},
isbn = {9783030129385},
issn = {0302-9743},
location = {Stuttgart, Germany},
pages = {244--259},
publisher = {Springer Nature},
series = {Lecture Notes in Computer Science},
title = {{KS(conf): A light-weight test if a ConvNet operates outside of its specifications}},
doi = {10.1007/978-3-030-12939-2_18},
volume = {11269},
year = {2019},
}
@article{321,
abstract = {The twelve papers in this special section focus on learning systems with shared information for computer vision and multimedia communication analysis. In the real world, a realistic setting for computer vision or multimedia recognition problems is that we have some classes containing lots of training data and many classes containing a small amount of training data. Therefore, how to use frequent classes to help learning rare classes for which it is harder to collect the training data is an open question. Learning with shared information is an emerging topic in machine learning, computer vision and multimedia analysis. There are different levels of components that can be shared during concept modeling and machine learning stages, such as sharing generic object parts, sharing attributes, sharing transformations, sharing regularization parameters and sharing training examples, etc. Regarding the specific methods, multi-task learning, transfer learning and deep learning can be seen as using different strategies to share information. These learning with shared information methods are very effective in solving real-world large-scale problems.},
author = {Darrell, Trevor and Lampert, Christoph and Sebe, Nico and Wu, Ying and Yan, Yan},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
number = {5},
pages = {1029--1031},
publisher = {IEEE},
title = {{Guest editors' introduction to the special section on learning with shared information for computer vision and multimedia analysis}},
doi = {10.1109/TPAMI.2018.2804998},
volume = {40},
year = {2018},
}
@inproceedings{6589,
abstract = {Distributed training of massive machine learning models, in particular deep neural networks, via Stochastic Gradient Descent (SGD) is becoming commonplace. Several families of communication-reduction methods, such as quantization, large-batch methods, and gradient sparsification, have been proposed. To date, gradient sparsification methods--where each node sorts gradients by magnitude, and only communicates a subset of the components, accumulating the rest locally--are known to yield some of the largest practical gains. Such methods can reduce the amount of communication per step by up to \emph{three orders of magnitude}, while preserving model accuracy. Yet, this family of methods currently has no theoretical justification. This is the question we address in this paper. We prove that, under analytic assumptions, sparsifying gradients by magnitude with local error correction provides convergence guarantees, for both convex and non-convex smooth objectives, for data-parallel SGD. The main insight is that sparsification methods implicitly maintain bounds on the maximum impact of stale updates, thanks to selection by magnitude. Our analysis and empirical validation also reveal that these methods do require analytical conditions to converge well, justifying existing heuristics.},
author = {Alistarh, Dan-Adrian and Hoefler, Torsten and Johansson, Mikael and Konstantinov, Nikola H and Khirirat, Sarit and Renggli, Cedric},
booktitle = {Advances in Neural Information Processing Systems},
location = {Montreal, Canada},
pages = {5973--5983},
publisher = {Neural Information Processing Systems},
title = {{The convergence of sparsified gradient methods}},
volume = {31},
year = {2018},
}
@inproceedings{6011,
abstract = {We establish a data-dependent notion of algorithmic stability for Stochastic Gradient Descent (SGD), and employ it to develop novel generalization bounds. This is in contrast to previous distribution-free algorithmic stability results for SGD which depend on the worst-case constants. By virtue of the data-dependent argument, our bounds provide new insights into learning with SGD on convex and non-convex problems. In the convex case, we show that the bound on the generalization error depends on the risk at the initialization point. In the non-convex case, we prove that the expected curvature of the objective function around the initialization point has crucial influence on the generalization error. In both cases, our results suggest a simple data-driven strategy to stabilize SGD by pre-screening its initialization. As a corollary, our results allow us to show optimistic generalization bounds that exhibit fast convergence rates for SGD subject to a vanishing empirical risk and low noise of stochastic gradient.},
author = {Kuzborskij, Ilja and Lampert, Christoph},
booktitle = {Proceedings of the 35th International Conference on Machine Learning},
location = {Stockholm, Sweden},
pages = {2815--2824},
publisher = {International Machine Learning Society},
title = {{Data-dependent stability of stochastic gradient descent}},
volume = {80},
year = {2018},
}
@article{6554,
abstract = {Due to the importance of zero-shot learning, i.e. classifying images where there is a lack of labeled training data, the number of proposed approaches has recently increased steadily. We argue that it is time to take a step back and to analyze the status quo of the area. The purpose of this paper is three-fold. First, given the fact that there is no agreed upon zero-shot learning benchmark, we first define a new benchmark by unifying both the evaluation protocols and data splits of publicly available datasets used for this task. This is an important contribution as published results are often not comparable and sometimes even flawed due to, e.g. pre-training on zero-shot test classes. Moreover, we propose a new zero-shot learning dataset, the Animals with Attributes 2 (AWA2) dataset which we make publicly available both in terms of image features and the images themselves. Second, we compare and analyze a significant number of the state-of-the-art methods in depth, both in the classic zero-shot setting but also in the more realistic generalized zero-shot setting. Finally, we discuss in detail the limitations of the current status of the area which can be taken as a basis for advancing it.},
author = {Xian, Yongqin and Lampert, Christoph and Schiele, Bernt and Akata, Zeynep},
issn = {0162-8828},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
number = {9},
pages = {2251--2265},
publisher = {IEEE},
title = {{Zero-shot learning - A comprehensive evaluation of the good, the bad and the ugly}},
doi = {10.1109/TPAMI.2018.2857768},
volume = {41},
year = {2019},
}
@phdthesis{68,
abstract = {The most common assumption made in statistical learning theory is the assumption of the independent and identically distributed (i.i.d.) data. While being very convenient mathematically, it is often very clearly violated in practice. This disparity between the machine learning theory and applications underlies a growing demand in the development of algorithms that learn from dependent data and theory that can provide generalization guarantees similar to the independent situations. This thesis is dedicated to two variants of dependencies that can arise in practice. One is a dependence on the level of samples in a single learning task. Another dependency type arises in the multi-task setting when the tasks are dependent on each other even though the data for them can be i.i.d. In both cases we model the data (samples or tasks) as stochastic processes and introduce new algorithms for both settings that take into account and exploit the resulting dependencies. We prove the theoretical guarantees on the performance of the introduced algorithms under different evaluation criteria and, in addition, we complement the theoretical study by the empirical one, where we evaluate some of the algorithms on two real world datasets to highlight their practical applicability.},
author = {Zimin, Alexander},
pages = {92},
publisher = {IST Austria},
school = {IST Austria},
title = {{Learning from dependent data}},
doi = {10.15479/AT:ISTA:TH1048},
year = {2018},
}
@inproceedings{6012,
  title     = {{Learning equations for extrapolation and control}},
  author    = {Sahoo, Subham and Lampert, Christoph and Martius, Georg S},
  booktitle = {Proceedings of the 35th International Conference on Machine Learning},
  volume    = {80},
  pages     = {4442--4450},
  publisher = {International Machine Learning Society},
  location  = {Stockholm, Sweden},
  year      = {2018},
  abstract  = {We present an approach to identify concise equations from data using a shallow neural network approach. In contrast to ordinary black-box regression, this approach allows understanding functional relations and generalizing them from observed data to unseen parts of the parameter space. We show how to extend the class of learnable equations for a recently proposed equation learning network to include divisions, and we improve the learning and model selection strategy to be useful for challenging real-world data. For systems governed by analytical expressions, our method can in many cases identify the true underlying equation and extrapolate to unseen domains. We demonstrate its effectiveness by experiments on a cart-pendulum system, where only 2 random rollouts are required to learn the forward dynamics and successfully achieve the swing-up task.},
}
@article{563,
abstract = {In continuous populations with local migration, nearby pairs of individuals have on average more similar genotypes than geographically well separated pairs. A barrier to gene flow distorts this classical pattern of isolation by distance. Genetic similarity is decreased for sample pairs on different sides of the barrier and increased for pairs on the same side near the barrier. Here, we introduce an inference scheme that utilizes this signal to detect and estimate the strength of a linear barrier to gene flow in two dimensions. We use a diffusion approximation to model the effects of a barrier on the geographical spread of ancestry backwards in time. This approach allows us to calculate the chance of recent coalescence and probability of identity by descent. We introduce an inference scheme that fits these theoretical results to the geographical covariance structure of biallelic genetic markers. It can estimate the strength of the barrier as well as several demographic parameters. We investigate the power of our inference scheme to detect barriers by applying it to a wide range of simulated data. We also showcase an example application to an Antirrhinum majus (snapdragon) flower color hybrid zone, where we do not detect any signal of a strong genome wide barrier to gene flow.},
author = {Ringbauer, Harald and Kolesnikov, Alexander and Field, David and Barton, Nicholas H},
journal = {Genetics},
number = {3},
pages = {1231--1245},
publisher = {Genetics Society of America},
title = {{Estimating barriers to gene flow from distorted isolation-by-distance patterns}},
doi = {10.1534/genetics.117.300638},
volume = {208},
year = {2018},
}
@misc{5584,
abstract = {This package contains data for the publication "Nonlinear decoding of a complex movie from the mammalian retina" by Deny S. et al, PLOS Comput Biol (2018).
The data consists of
(i) 91 spike sorted, isolated rat retinal ganglion cells that pass stability and quality criteria, recorded on the multi-electrode array, in response to the presentation of the complex movie with many randomly moving dark discs. The responses are represented as 648000 x 91 binary matrix, where the first index indicates the timebin of duration 12.5 ms, and the second index the neural identity. The matrix entry is 0/1 if the neuron didn't/did spike in the particular time bin.
(ii) README file and a graphical illustration of the structure of the experiment, specifying how the 648000 timebins are split into epochs where 1, 2, 4, or 10 discs were displayed, and which stimulus segments are exact repeats or unique ball trajectories.
(iii) a 648000 x 400 matrix of luminance traces for each of the 20 x 20 positions ("sites") in the movie frame, with time that is locked to the recorded raster. The luminance traces are produced as described in the manuscript by filtering the raw disc movie with a small Gaussian spatial kernel.},
author = {Deny, Stephane and Marre, Olivier and Botella-Soler, Vicente and Martius, Georg S and Tkacik, Gasper},
keywords = {retina, decoding, regression, neural networks, complex stimulus},
publisher = {IST Austria},
title = {{Nonlinear decoding of a complex movie from the mammalian retina}},
doi = {10.15479/AT:ISTA:98},
year = {2018},
}
@phdthesis{197,
abstract = {Modern computer vision systems heavily rely on statistical machine learning models, which typically require large amounts of labeled data to be learned reliably. Moreover, very recently computer vision research widely adopted techniques for representation learning, which further increase the demand for labeled data. However, for many important practical problems there is relatively small amount of labeled data available, so it is problematic to leverage full potential of the representation learning methods. One way to overcome this obstacle is to invest substantial resources into producing large labelled datasets. Unfortunately, this can be prohibitively expensive in practice. In this thesis we focus on the alternative way of tackling the aforementioned issue. We concentrate on methods, which make use of weakly-labeled or even unlabeled data. Specifically, the first half of the thesis is dedicated to the semantic image segmentation task. We develop a technique, which achieves competitive segmentation performance and only requires annotations in a form of global image-level labels instead of dense segmentation masks. Subsequently, we present a new methodology, which further improves segmentation performance by leveraging tiny additional feedback from a human annotator. By using our methods practitioners can greatly reduce the amount of data annotation effort, which is required to learn modern image segmentation models. In the second half of the thesis we focus on methods for learning from unlabeled visual data. We study a family of autoregressive models for modeling structure of natural images and discuss potential applications of these models. Moreover, we conduct in-depth study of one of these applications, where we develop the state-of-the-art model for the probabilistic image colorization task.},
author = {Kolesnikov, Alexander},
pages = {113},
publisher = {IST Austria},
school = {IST Austria},
title = {{Weakly-Supervised Segmentation and Unsupervised Modeling of Natural Images}},
doi = {10.15479/AT:ISTA:th_1021},
year = {2018},
}
@inproceedings{911,
abstract = {We develop a probabilistic technique for colorizing grayscale natural images. In light of the intrinsic uncertainty of this task, the proposed probabilistic framework has numerous desirable properties. In particular, our model is able to produce multiple plausible and vivid colorizations for a given grayscale image and is one of the first colorization models to provide a proper stochastic sampling scheme. Moreover, our training procedure is supported by a rigorous theoretical framework that does not require any ad hoc heuristics and allows for efficient modeling and learning of the joint pixel color distribution. We demonstrate strong quantitative and qualitative experimental results on the CIFAR-10 dataset and the challenging ILSVRC 2012 dataset.},
author = {Royer, Amélie and Kolesnikov, Alexander and Lampert, Christoph},
booktitle = {Proceedings of the British Machine Vision Conference (BMVC)},
location = {London, United Kingdom},
publisher = {BMVA Press},
title = {{Probabilistic image colorization}},
year = {2017},
}
@inproceedings{1000,
abstract = {We study probabilistic models of natural images and extend the autoregressive family of PixelCNN models by incorporating latent variables. Subsequently, we describe two new generative image models that exploit different image transformations as latent variables: a quantized grayscale view of the image or a multi-resolution image pyramid. The proposed models tackle two known shortcomings of existing PixelCNN models: 1) their tendency to focus on low-level image details, while largely ignoring high-level image information, such as object shapes, and 2) their computationally costly procedure for image sampling. We experimentally demonstrate benefits of our LatentPixelCNN models, in particular showing that they produce much more realistically looking image samples than previous state-of-the-art probabilistic models.},
author = {Kolesnikov, Alexander and Lampert, Christoph},
booktitle = {Proceedings of the 34th International Conference on Machine Learning},
isbn = {978-151085514-4},
location = {Sydney, Australia},
pages = {1905--1914},
publisher = {Omnipress},
title = {{PixelCNN models with auxiliary variables for natural image modeling}},
volume = {70},
year = {2017},
}
@inproceedings{652,
abstract = {We present an approach that enables robots to self-organize their sensorimotor behavior from scratch without providing specific information about neither the robot nor its environment. This is achieved by a simple neural control law that increases the consistency between external sensor dynamics and internal neural dynamics of the utterly simple controller. In this way, the embodiment and the agent-environment coupling are the only source of individual development. We show how an anthropomorphic tendon driven arm-shoulder system develops different behaviors depending on that coupling. For instance: Given a bottle half-filled with water, the arm starts to shake it, driven by the physical response of the water. When attaching a brush, the arm can be manipulated into wiping a table, and when connected to a revolvable wheel it finds out how to rotate it. Thus, the robot may be said to discover the affordances of the world. When allowing two (simulated) humanoid robots to interact physically, they engage into a joint behavior development leading to, for instance, spontaneous cooperation. More social effects are observed if the robots can visually perceive each other. Although, as an observer, it is tempting to attribute an apparent intentionality, there is nothing of the kind put in. As a conclusion, we argue that emergent behavior may be much less rooted in explicit intentions, internal motivations, or specific reward systems than is commonly believed.},
author = {Der, Ralf and Martius, Georg S},
booktitle = {2016 Joint IEEE International Conference on Development and Learning and Epigenetic Robotics (ICDL-EpiRob)},
isbn = {978-150905069-7},
location = {Cergy-Pontoise, France},
publisher = {IEEE},
title = {{Dynamical self consistency leads to behavioral development and emergent social interactions in robots}},
doi = {10.1109/DEVLRN.2016.7846789},
year = {2017},
}
@inproceedings{998,
abstract = {A major open problem on the road to artificial intelligence is the development of incrementally learning systems that learn about more and more concepts over time from a stream of data. In this work, we introduce a new training strategy, iCaRL, that allows learning in such a class-incremental way: only the training data for a small number of classes has to be present at the same time and new classes can be added progressively. iCaRL learns strong classifiers and a data representation simultaneously. This distinguishes it from earlier works that were fundamentally limited to fixed data representations and therefore incompatible with deep learning architectures. We show by experiments on CIFAR-100 and ImageNet ILSVRC 2012 data that iCaRL can learn many classes incrementally over a long period of time where other strategies quickly fail.},
author = {Rebuffi, Sylvestre Alvise and Kolesnikov, Alexander and Sperl, Georg and Lampert, Christoph},
booktitle = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
isbn = {978-153860457-1},
location = {Honolulu, HI, United States},
pages = {5533--5542},
publisher = {IEEE},
title = {{iCaRL: Incremental classifier and representation learning}},
doi = {10.1109/CVPR.2017.587},
volume = {2017},
year = {2017},
}
@inproceedings{6841,
  title     = {{Extrapolation and learning equations}},
  author    = {Martius, Georg S and Lampert, Christoph},
  booktitle = {5th International Conference on Learning Representations, ICLR 2017 - Workshop Track Proceedings},
  publisher = {International Conference on Learning Representations},
  location  = {Toulon, France},
  year      = {2017},
  abstract  = {In classical machine learning, regression is treated as a black box process of identifying a suitable function from a hypothesis set without attempting to gain insight into the mechanism connecting inputs and outputs. In the natural sciences, however, finding an interpretable function for a phenomenon is the prime goal as it allows to understand and generalize results. This paper proposes a novel type of function learning network, called equation learner (EQL), that can learn analytical expressions and is able to extrapolate to unseen domains. It is implemented as an end-to-end differentiable feed-forward network and allows for efficient gradient based training. Due to sparsity regularization concise interpretable expressions can be obtained. Often the true underlying source expression is identified.},
}
@article{658,
abstract = {With the accelerated development of robot technologies, control becomes one of the central themes of research. In traditional approaches, the controller, by its internal functionality, finds appropriate actions on the basis of specific objectives for the task at hand. While very successful in many applications, self-organized control schemes seem to be favored in large complex systems with unknown dynamics or which are difficult to model. Reasons are the expected scalability, robustness, and resilience of self-organizing systems. The paper presents a self-learning neurocontroller based on extrinsic differential plasticity introduced recently, applying it to an anthropomorphic musculoskeletal robot arm with attached objects of unknown physical dynamics. The central finding of the paper is the following effect: by the mere feedback through the internal dynamics of the object, the robot is learning to relate each of the objects with a very specific sensorimotor pattern. Specifically, an attached pendulum pilots the arm into a circular motion, a half-filled bottle produces axis oriented shaking behavior, a wheel is getting rotated, and wiping patterns emerge automatically in a table-plus-brush setting. By these object-specific dynamical patterns, the robot may be said to recognize the object's identity, or in other words, it discovers dynamical affordances of objects. Furthermore, when including hand coordinates obtained from a camera, a dedicated hand-eye coordination self-organizes spontaneously. These phenomena are discussed from a specific dynamical system perspective. Central is the dedicated working regime at the border to instability with its potentially infinite reservoir of (limit cycle) attractors "waiting" to be excited. Besides converging toward one of these attractors, variate behavior is also arising from a self-induced attractor morphing driven by the learning rule.
We claim that experimental investigations with this anthropomorphic, self-learning robot not only generate interesting and potentially useful behaviors, but may also help to better understand what subjective human muscle feelings are, how they can be rooted in sensorimotor patterns, and how these concepts may feed back on robotics.},
author = {Der, Ralf and Martius, Georg S},
issn = {1662-5218},
journal = {Frontiers in Neurorobotics},
month = mar,
publisher = {Frontiers Research Foundation},
title = {{Self organized behavior generation for musculoskeletal robots}},
doi = {10.3389/fnbot.2017.00008},
volume = {11},
year = {2017},
}
@inproceedings{999,
abstract = {In multi-task learning, a learner is given a collection of prediction tasks and needs to solve all of them. In contrast to previous work, which required that annotated training data must be available for all tasks, we consider a new setting, in which for some tasks, potentially most of them, only unlabeled training data is provided. Consequently, to solve all tasks, information must be transferred between tasks with labels and tasks without labels. Focusing on an instance-based transfer method we analyze two variants of this setting: when the set of labeled tasks is fixed, and when it can be actively selected by the learner. We state and prove a generalization bound that covers both scenarios and derive from it an algorithm for making the choice of labeled tasks (in the active case) and for transferring information between the tasks in a principled way. We also illustrate the effectiveness of the algorithm on synthetic and real data.},
author = {Pentina, Anastasia and Lampert, Christoph},
booktitle = {Proceedings of the 34th International Conference on Machine Learning},
isbn = {978-151085514-4},
location = {Sydney, Australia},
pages = {2807--2816},
publisher = {Omnipress},
title = {{Multi-task learning with labeled and unlabeled tasks}},
volume = {70},
year = {2017},
}
@inproceedings{1108,
abstract = {In this work we study the learnability of stochastic processes with respect to the conditional risk, i.e. the existence of a learning algorithm that improves its next-step performance with the amount of observed data. We introduce a notion of pairwise discrepancy between conditional distributions at different times steps and show how certain properties of these discrepancies can be used to construct a successful learning algorithm. Our main results are two theorems that establish criteria for learnability for many classes of stochastic processes, including all special cases studied previously in the literature.},
author = {Zimin, Alexander and Lampert, Christoph},
booktitle = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics},
location = {Fort Lauderdale, FL, United States},
pages = {213--222},
publisher = {JMLR, Inc. and Microtome Publishing},
title = {{Learning theory for conditional risk minimization}},
volume = {54},
year = {2017},
}