@inproceedings{9210,
  author    = {Volhejn, Vaclav and Lampert, Christoph},
  title     = {Does {SGD} Implicitly Optimize for Smoothness?},
  booktitle = {42nd German Conference on Pattern Recognition},
  series    = {Lecture Notes in Computer Science},
  volume    = {12544},
  pages     = {246--259},
  publisher = {Springer},
  venue     = {T{\"u}bingen, Germany},
  year      = {2021},
  doi       = {10.1007/978-3-030-71278-5_18},
  isbn      = {9783030712778},
  issn      = {1611-3349},
  abstract  = {Modern neural networks can easily fit their training set perfectly. Surprisingly, despite being “overfit” in this way, they tend to generalize well to future data, thereby defying the classic bias–variance trade-off of machine learning theory. Of the many possible explanations, a prevalent one is that training by stochastic gradient descent (SGD) imposes an implicit bias that leads it to learn simple functions, and these simple functions generalize well. However, the specifics of this implicit bias are not well understood.
In this work, we explore the smoothness conjecture which states that SGD is implicitly biased towards learning functions that are smooth. We propose several measures to formalize the intuitive notion of smoothness, and we conduct experiments to determine whether SGD indeed implicitly optimizes for these measures. Our findings rule out the possibility that smoothness measures based on first-order derivatives are being implicitly enforced. They are supportive, though, of the smoothness conjecture for measures based on second-order derivatives.},
}