@inproceedings{7481,
abstract = {We address the following question: How redundant is the parameterisation of ReLU networks? Specifically, we consider transformations of the weight space which leave the function implemented by the network intact. Two such transformations are known for feed-forward architectures: permutation of neurons within a layer, and positive scaling of all incoming weights of a neuron coupled with inverse scaling of its outgoing weights. In this work, we show for architectures with non-increasing widths that permutation and scaling are in fact the only function-preserving weight transformations. For any eligible architecture we give an explicit construction of a neural network such that any other network that implements the same function can be obtained from the original one by the application of permutations and rescaling. The proof relies on a geometric understanding of boundaries between linear regions of ReLU networks, and we hope the developed mathematical tools are of independent interest.},
author = {Bui Thi Mai, Phuong and Lampert, Christoph},
title = {{Functional vs. parametric equivalence of ReLU networks}},
year = {2020},
}
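A minimal numpy sketch (illustrative only, not from the paper) of the two function-preserving transformations the abstract names: permuting the neurons of a hidden layer, and positively rescaling a neuron's incoming weights while inversely rescaling its outgoing ones. Both leave the network's output unchanged because ReLU is positively homogeneous, relu(c*z) = c*relu(z) for c > 0; all shapes and names below are hypothetical.

import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

rng = np.random.default_rng(0)
W1, b1 = rng.normal(size=(5, 3)), rng.normal(size=5)  # hidden layer: 3 inputs -> 5 neurons
W2, b2 = rng.normal(size=(2, 5)), rng.normal(size=2)  # output layer: 5 -> 2

def net(x, W1, b1, W2, b2):
    return W2.dot(relu(W1.dot(x) + b1)) + b2

x = rng.normal(size=3)
y = net(x, W1, b1, W2, b2)

# Permutation: reorder the hidden neurons and the columns of W2 to match.
P = np.eye(5)[rng.permutation(5)]
y_perm = net(x, P.dot(W1), P.dot(b1), W2.dot(P.T), b2)

# Scaling: multiply neuron i's incoming weights by c_i > 0 and divide its
# outgoing weights by c_i; positive homogeneity of ReLU makes this exact.
c = rng.uniform(0.1, 10.0, size=5)
y_scale = net(x, c[:, None] * W1, c * b1, W2 / c, b2)

assert np.allclose(y, y_perm) and np.allclose(y, y_scale)

The paper's result is the converse direction: for non-increasing widths, compositions of these two operations are the only weight transformations that preserve the network function.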
@inproceedings{7479,
abstract = {Multi-exit architectures, in which a stack of processing layers is interleaved with early output layers, allow the processing of a test example to stop early and thus save computation time and/or energy. In this work, we propose a new training procedure for multi-exit architectures based on the principle of knowledge distillation. The method encourages early exits to mimic later, more accurate exits by matching their output probabilities. Experiments on CIFAR100 and ImageNet show that distillation-based training significantly improves the accuracy of early exits while maintaining state-of-the-art accuracy for late ones. The method is particularly beneficial when training data is limited, and it allows a straightforward extension to semi-supervised learning, i.e., making use of unlabeled data at training time. Moreover, it takes only a few lines to implement and incurs almost no computational overhead at training time, and none at all at test time.},
author = {Bui Thi Mai, Phuong and Lampert, Christoph},
booktitle = {IEEE International Conference on Computer Vision},
location = {Seoul, Korea},
pages = {1355--1364},
publisher = {Computer Vision Foundation},
title = {{Distillation-based training for multi-exit architectures}},
year = {2019},
}
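A back-of-the-envelope sketch of the per-exit loss the abstract describes, not the authors' implementation: each early exit is trained on the ground-truth label plus a KL term matching its temperature-softened probabilities to those of the final, most accurate exit. The temperature T, weight alpha, and all function names are assumptions; the T**2 factor is the usual rescaling from the distillation literature.

import numpy as np

def softmax(z, T=1.0):
    e = np.exp(z / T - np.max(z / T))      # subtract max for numerical stability
    return e / e.sum()

def early_exit_loss(early_logits, final_logits, label, T=3.0, alpha=0.5):
    # Cross-entropy on the ground-truth label ...
    ce = -np.log(softmax(early_logits)[label])
    # ... plus KL(final || early) on softened probabilities, so the early
    # exit mimics the more accurate final exit.
    p_final = softmax(final_logits, T)     # fixed target, no gradient through it
    p_early = softmax(early_logits, T)
    kl = np.sum(p_final * (np.log(p_final) - np.log(p_early)))
    return (1.0 - alpha) * ce + alpha * T**2 * kl

In training, a loss of this form would be summed over all early exits and added to the final exit's ordinary cross-entropy.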
@inproceedings{6569,
abstract = {Knowledge distillation, i.e. one classifier being trained on the outputs of another classifier, is an empirically very successful technique for knowledge transfer between classifiers. It has even been observed that classifiers learn much faster and more reliably if trained with the outputs of another classifier as soft labels, instead of from ground truth data. So far, however, there is no satisfactory theoretical explanation of this phenomenon. In this work, we provide the first insights into the working mechanisms of distillation by studying the special case of linear and deep linear classifiers. Specifically, we prove a generalization bound that establishes fast convergence of the expected risk of a distillation-trained linear classifier. From the bound and its proof we extract three key factors that determine the success of distillation: data geometry – geometric properties of the data distribution, in particular class separation, have an immediate influence on the convergence speed of the risk; optimization bias – gradient descent optimization finds a very favorable minimum of the distillation objective; and strong monotonicity – the expected risk of the student classifier always decreases when the size of the training set grows.},
author = {Bui Thi Mai, Phuong and Lampert, Christoph},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
location = {Long Beach, CA, United States},
pages = {5142--5151},
publisher = {PMLR},
title = {{Towards understanding knowledge distillation}},
volume = {97},
year = {2019},
}
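For illustration only, a tiny numpy sketch of the linear setting this paper analyzes: a student linear classifier fitted by gradient descent to the teacher's soft labels rather than to hard ones. Dimensions, step size, and iteration count are arbitrary assumptions, not values from the paper.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(1)
n, d = 200, 10
X = rng.normal(size=(n, d))
w_teacher = rng.normal(size=d)
soft = sigmoid(X.dot(w_teacher))        # teacher's output probabilities

# Student trained by gradient descent on cross-entropy with soft targets;
# with soft labels the gradient of the logistic loss is X^T (p - soft) / n.
w = np.zeros(d)
for _ in range(2000):
    p = sigmoid(X.dot(w))
    w -= 0.1 * X.T.dot(p - soft) / n

agree = np.mean((X.dot(w) > 0) == (X.dot(w_teacher) > 0))
print(f"student/teacher sign agreement: {agree:.2%}")

Because the cross-entropy with soft targets is globally minimized when the student reproduces the teacher's probabilities, the student recovers the teacher's decision boundary; the paper's bound quantifies how fast the corresponding expected risk converges.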