@inproceedings{6569,
abstract = {Knowledge distillation, i.e. one classifier being trained on the outputs of another classifier, is an empirically very successful technique for knowledge transfer between classifiers. It has even been observed that classifiers learn much faster and more reliably if trained with the outputs of another classifier as soft labels, instead of from ground truth data. So far, however, there is no satisfactory theoretical explanation of this phenomenon. In this work, we provide the first insights into the working mechanisms of distillation by studying the special case of linear and deep linear classifiers. Specifically, we prove a generalization bound that establishes fast convergence of the expected risk of a distillation-trained linear classifier. From the bound and its proof we extract three key factors that determine the success of distillation: data geometry – geometric properties of the data distribution, in particular class separation, have an immediate influence on the convergence speed of the risk; optimization bias – gradient descent optimization finds a very favorable minimum of the distillation objective; and strong monotonicity – the expected risk of the student classifier always decreases when the size of the training set grows.},
author = {Bui Thi Mai, Phuong and Lampert, Christoph},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
location = {Long Beach, CA, USA},
pages = {5142--5151},
publisher = {PMLR},
title = {{Towards understanding knowledge distillation}},
volume = {97},
year = {2019},
}
@inproceedings{6590,
abstract = {Modern machine learning methods often require more data for training than a single expert can provide. Therefore, it has become a standard procedure to collect data from external sources, e.g. via crowdsourcing. Unfortunately, the quality of these sources is not always guaranteed. As additional complications, the data might be stored in a distributed way, or might even have to remain private. In this work, we address the question of how to learn robustly in such scenarios. Studying the problem through the lens of statistical learning theory, we derive a procedure that allows for learning from all available sources, yet automatically suppresses irrelevant or corrupted data. We show by extensive experiments that our method provides significant improvements over alternative approaches from robust statistics and distributed optimization.},
author = {Konstantinov, Nikola H and Lampert, Christoph},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
location = {Long Beach, CA, USA},
publisher = {PMLR},
title = {{Robust learning from untrusted sources}},
volume = {97},
year = {2019},
}
@inproceedings{6482,
abstract = {Computer vision systems for automatic image categorization have become accurate and reliable enough that they can run continuously for days or even years as components of real-world commercial applications. A major open problem in this context, however, is quality control. Good classification performance can only be expected if systems run under the specific conditions, in particular data distributions, that they were trained for. Surprisingly, none of the currently used deep network architectures has a built-in functionality that could detect if a network operates on data from a distribution it was not trained for, so that a warning to the human users could potentially be triggered. In this work, we describe KS(conf), a procedure for detecting such outside-of-specifications (out-of-specs) operation, based on statistical testing of the network outputs. We show by extensive experiments using the ImageNet, AwA2 and DAVIS datasets on a variety of ConvNet architectures that KS(conf) reliably detects out-of-specs situations. It furthermore has a number of properties that make it a promising candidate for practical deployment: it is easy to implement, adds almost no overhead to the system, works with all networks, including pretrained ones, and requires no a priori knowledge of how the data distribution could change.},
author = {Sun, Rémy and Lampert, Christoph},
booktitle = {German Conference on Pattern Recognition (GCPR 2018)},
isbn = {9783030129385},
issn = {0302-9743},
location = {Stuttgart, Germany},
pages = {244--259},
publisher = {Springer Nature},
title = {{KS(conf): A light-weight test if a ConvNet operates outside of its specifications}},
doi = {10.1007/978-3-030-12939-2_18},
volume = {11269},
year = {2019},
}
@article{321,
abstract = {The twelve papers in this special section focus on learning systems with shared information for computer vision and multimedia communication analysis. In the real world, a realistic setting for computer vision or multimedia recognition problems is that some classes contain lots of training data while many other classes contain only a small amount. Therefore, how to use frequent classes to help learn rare classes, for which it is harder to collect training data, is an open question. Learning with shared information is an emerging topic in machine learning, computer vision and multimedia analysis. Different levels of components can be shared during the concept modeling and machine learning stages, such as generic object parts, attributes, transformations, regularization parameters and training examples. Regarding specific methods, multi-task learning, transfer learning and deep learning can be seen as different strategies for sharing information. These methods for learning with shared information are very effective at solving large-scale real-world problems.},
author = {Darrell, Trevor and Lampert, Christoph and Sebe, Nico and Wu, Ying and Yan, Yan},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
number = {5},
pages = {1029--1031},
publisher = {IEEE},
title = {{Guest editors' introduction to the special section on learning with shared information for computer vision and multimedia analysis}},
doi = {10.1109/TPAMI.2018.2804998},
volume = {40},
year = {2018},
}
@inproceedings{6589,
abstract = {Distributed training of massive machine learning models, in particular deep neural networks, via Stochastic Gradient Descent (SGD) is becoming commonplace. Several families of communication-reduction methods, such as quantization, large-batch methods, and gradient sparsification, have been proposed. To date, gradient sparsification methods--where each node sorts gradients by magnitude, and only communicates a subset of the components, accumulating the rest locally--are known to yield some of the largest practical gains. Such methods can reduce the amount of communication per step by up to \emph{three orders of magnitude}, while preserving model accuracy. Yet, this family of methods currently has no theoretical justification. This is the question we address in this paper. We prove that, under analytic assumptions, sparsifying gradients by magnitude with local error correction provides convergence guarantees, for both convex and non-convex smooth objectives, for data-parallel SGD. The main insight is that sparsification methods implicitly maintain bounds on the maximum impact of stale updates, thanks to selection by magnitude. Our analysis and empirical validation also reveal that these methods do require analytical conditions to converge well, justifying existing heuristics.},
author = {Alistarh, Dan-Adrian and Hoefler, Torsten and Johansson, Mikael and Konstantinov, Nikola H and Khirirat, Sarit and Renggli, Cedric},
booktitle = {Advances in Neural Information Processing Systems 31},
location = {Montreal, Canada},
pages = {5973--5983},
publisher = {Neural Information Processing Systems},
title = {{The convergence of sparsified gradient methods}},
volume = {31},
year = {2018},
}
@inproceedings{6011,
abstract = {We establish a data-dependent notion of algorithmic stability for Stochastic Gradient Descent (SGD), and employ it to develop novel generalization bounds. This is in contrast to previous distribution-free algorithmic stability results for SGD, which depend on worst-case constants. By virtue of the data-dependent argument, our bounds provide new insights into learning with SGD on convex and non-convex problems. In the convex case, we show that the bound on the generalization error depends on the risk at the initialization point. In the non-convex case, we prove that the expected curvature of the objective function around the initialization point has a crucial influence on the generalization error. In both cases, our results suggest a simple data-driven strategy to stabilize SGD by pre-screening its initialization. As a corollary, our results allow us to show optimistic generalization bounds that exhibit fast convergence rates for SGD, subject to a vanishing empirical risk and low noise of the stochastic gradients.},
author = {Kuzborskij, Ilja and Lampert, Christoph},
booktitle = {Proceedings of the 35th International Conference on Machine Learning},
location = {Stockholm, Sweden},
pages = {2815--2824},
publisher = {International Machine Learning Society},
title = {{Data-dependent stability of stochastic gradient descent}},
volume = {80},
year = {2018},
}
@article{6554,
abstract = {Due to the importance of zero-shot learning, i.e. classifying images for which there is a lack of labeled training data, the number of proposed approaches has recently increased steadily. We argue that it is time to take a step back and analyze the status quo of the area. The purpose of this paper is three-fold. First, given that there is no agreed-upon zero-shot learning benchmark, we define a new benchmark by unifying both the evaluation protocols and the data splits of publicly available datasets used for this task. This is an important contribution, as published results are often not comparable and sometimes even flawed due to, e.g., pre-training on zero-shot test classes. Moreover, we propose a new zero-shot learning dataset, the Animals with Attributes 2 (AWA2) dataset, which we make publicly available both in terms of image features and the images themselves. Second, we compare and analyze a significant number of state-of-the-art methods in depth, both in the classic zero-shot setting and in the more realistic generalized zero-shot setting. Finally, we discuss in detail the limitations of the current status of the area, which can be taken as a basis for advancing it.},
author = {Xian, Yongqin and Lampert, Christoph and Schiele, Bernt and Akata, Zeynep},
issn = {0162-8828},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
publisher = {IEEE},
title = {{Zero-shot learning - A comprehensive evaluation of the good, the bad and the ugly}},
doi = {10.1109/TPAMI.2018.2857768},
year = {2018},
}
@phdthesis{68,
abstract = {The most common assumption made in statistical learning theory is that of independent and identically distributed (i.i.d.) data. While being very convenient mathematically, it is often clearly violated in practice. This disparity between machine learning theory and applications underlies a growing demand for the development of algorithms that learn from dependent data, and of theory that can provide generalization guarantees similar to those for the independent situation. This thesis is dedicated to two variants of dependence that can arise in practice. One is dependence at the level of samples within a single learning task. The other type arises in the multi-task setting, when the tasks are dependent on each other even though the data for each of them can be i.i.d. In both cases we model the data (samples or tasks) as stochastic processes and introduce new algorithms for both settings that take into account and exploit the resulting dependencies. We prove theoretical guarantees on the performance of the introduced algorithms under different evaluation criteria and, in addition, we complement the theoretical study with an empirical one, where we evaluate some of the algorithms on two real-world datasets to highlight their practical applicability.},
author = {Zimin, Alexander},
pages = {92},
publisher = {IST Austria},
title = {{Learning from dependent data}},
doi = {10.15479/AT:ISTA:TH1048},
year = {2018},
}
@inproceedings{6012,
abstract = {We present an approach to identify concise equations from data using a shallow neural network. In contrast to ordinary black-box regression, this approach allows understanding functional relations and generalizing them from observed data to unseen parts of the parameter space. We show how to extend the class of learnable equations for a recently proposed equation learning network to include divisions, and we improve the learning and model selection strategy to be useful for challenging real-world data. For systems governed by analytical expressions, our method can in many cases identify the true underlying equation and extrapolate to unseen domains. We demonstrate its effectiveness by experiments on a cart-pendulum system, where only 2 random rollouts are required to learn the forward dynamics and successfully achieve the swing-up task.},
author = {Sahoo, Subham and Lampert, Christoph and Martius, Georg S},
booktitle = {Proceedings of the 35th International Conference on Machine Learning},
location = {Stockholm, Sweden},
pages = {4442--4450},
publisher = {International Machine Learning Society},
title = {{Learning equations for extrapolation and control}},
volume = {80},
year = {2018},
}
@article{563,
abstract = {In continuous populations with local migration, nearby pairs of individuals have on average more similar genotypes than geographically well-separated pairs. A barrier to gene flow distorts this classical pattern of isolation by distance. Genetic similarity is decreased for sample pairs on different sides of the barrier and increased for pairs on the same side near the barrier. Here, we introduce an inference scheme that utilizes this signal to detect and estimate the strength of a linear barrier to gene flow in two dimensions. We use a diffusion approximation to model the effects of a barrier on the geographical spread of ancestry backwards in time. This approach allows us to calculate the chance of recent coalescence and the probability of identity by descent. We introduce an inference scheme that fits these theoretical results to the geographical covariance structure of biallelic genetic markers. It can estimate the strength of the barrier as well as several demographic parameters. We investigate the power of our inference scheme to detect barriers by applying it to a wide range of simulated data. We also showcase an example application to an Antirrhinum majus (snapdragon) flower color hybrid zone, where we do not detect any signal of a strong genome-wide barrier to gene flow.},
author = {Ringbauer, Harald and Kolesnikov, Alexander and Field, David and Barton, Nicholas H},
journal = {Genetics},
number = {3},
pages = {1231--1245},
publisher = {Genetics Society of America},
title = {{Estimating barriers to gene flow from distorted isolation-by-distance patterns}},
doi = {10.1534/genetics.117.300638},
volume = {208},
year = {2018},
}