@inproceedings{14771, abstract = {Pruning—that is, setting a significant subset of the parameters of a neural network to zero—is one of the most popular methods of model compression. Yet, several recent works have raised the issue that pruning may induce or exacerbate bias in the output of the compressed model. Despite existing evidence for this phenomenon, the relationship between neural network pruning and induced bias is not well-understood. In this work, we systematically investigate and characterize this phenomenon in Convolutional Neural Networks for computer vision. First, we show that it is in fact possible to obtain highly sparse models, e.g. with less than 10% remaining weights, which neither decrease in accuracy nor substantially increase in bias when compared to dense models. At the same time, we also find that, at higher sparsities, pruned models exhibit higher uncertainty in their outputs, as well as increased correlations, which we directly link to increased bias. We propose easy-to-use criteria which, based only on the uncompressed model, establish whether bias will increase with pruning, and identify the samples most susceptible to biased predictions post-compression. Our code can be found at https://github.com/IST-DASLab/pruned-vision-model-bias.}, author = {Iofinova, Eugenia B and Peste, Elena-Alexandra and Alistarh, Dan-Adrian}, booktitle = {2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition}, issn = {2575-7075}, location = {Vancouver, BC, Canada}, pages = {24364--24373}, publisher = {IEEE}, title = {{Bias in pruned vision models: In-depth analysis and countermeasures}}, doi = {10.1109/cvpr52729.2023.02334}, year = {2023}, }

@inproceedings{14921, abstract = {Neural collapse (NC) refers to the surprising structure of the last layer of deep neural networks in the terminal phase of gradient descent training. Recently, an increasing amount of experimental evidence has pointed to the propagation of NC to earlier layers of neural networks. However, while the NC in the last layer is well studied theoretically, much less is known about its multi-layered counterpart, deep neural collapse (DNC). In particular, existing work focuses either on linear layers or only on the last two layers, at the price of an extra assumption. Our paper fills this gap by generalizing the established analytical framework for NC, the unconstrained features model, to multiple non-linear layers. Our key technical contribution is to show that, in a deep unconstrained features model, the unique global optimum for binary classification exhibits all the properties typical of DNC. This explains the existing experimental evidence of DNC. We also empirically show that (i) by optimizing deep unconstrained features models via gradient descent, the resulting solution agrees well with our theory, and (ii) trained networks recover the unconstrained features suitable for the occurrence of DNC, thus supporting the validity of this modeling principle.}, author = {Súkeník, Peter and Mondelli, Marco and Lampert, Christoph}, booktitle = {37th Annual Conference on Neural Information Processing Systems}, location = {New Orleans, LA, United States}, title = {{Deep neural collapse is provably optimal for the deep unconstrained features model}}, year = {2023}, }

@unpublished{15039, abstract = {A crucial property for achieving secure, trustworthy and interpretable deep learning systems is their robustness: small changes to a system's inputs should not result in large changes to its outputs.
Mathematically, this means one strives for networks with a small Lipschitz constant. Several recent works have focused on how to construct such Lipschitz networks, typically by imposing constraints on the weight matrices. In this work, we study an orthogonal aspect, namely the role of the activation function. We show that commonly used activation functions, such as MaxMin, as well as all piece-wise linear ones with two segments, unnecessarily restrict the class of representable functions, even in the simplest one-dimensional setting. We furthermore introduce the new N-activation function that is provably more expressive than currently popular activation functions. We provide code at this https URL.}, author = {Prach, Bernd and Lampert, Christoph}, booktitle = {arXiv}, title = {{1-Lipschitz neural networks are more expressive with N-activations}}, doi = {10.48550/arXiv.2311.06103}, year = {2023}, }

@unpublished{12660, abstract = {We present Cross-Client Label Propagation (XCLP), a new method for transductive federated learning. XCLP estimates a data graph jointly from the data of multiple clients and computes labels for the unlabeled data by propagating label information across the graph. To avoid clients having to share their data with anyone, XCLP employs two cryptographically secure protocols: secure Hamming distance computation and secure summation. We demonstrate two distinct applications of XCLP within federated learning. In the first, we use it in a one-shot way to predict labels for unseen test points. In the second, we use it to repeatedly pseudo-label unlabeled training data in a federated semi-supervised setting. Experiments on both real federated and standard benchmark datasets show that in both applications XCLP achieves higher classification accuracy than alternative approaches.}, author = {Scott, Jonathan A and Yeo, Michelle X and Lampert, Christoph}, booktitle = {arXiv}, title = {{Cross-client Label Propagation for transductive federated learning}}, doi = {10.48550/arXiv.2210.06434}, year = {2022}, }

@unpublished{12662, abstract = {Modern machine learning tasks often require considering not just one but multiple objectives. For example, besides the prediction quality, this could be the efficiency, robustness or fairness of the learned models, or any of their combinations. Multi-objective learning offers a natural framework for handling such problems without having to commit to early trade-offs. Surprisingly, statistical learning theory so far offers almost no insight into the generalization properties of multi-objective learning. In this work, we take first steps to fill this gap: we establish foundational generalization bounds for the multi-objective setting, as well as generalization and excess bounds for learning with scalarizations. We also provide the first theoretical analysis of the relation between the Pareto-optimal sets of the true objectives and the Pareto-optimal sets of their empirical approximations from training data. In particular, we show a surprising asymmetry: all Pareto-optimal solutions can be approximated by empirically Pareto-optimal ones, but not vice versa.}, author = {Súkeník, Peter and Lampert, Christoph}, booktitle = {arXiv}, title = {{Generalization in Multi-objective machine learning}}, doi = {10.48550/arXiv.2208.13499}, year = {2022}, }

@article{12495, abstract = {Fairness-aware learning aims at constructing classifiers that not only make accurate predictions, but also do not discriminate against specific groups.
It is a fast-growing area of machine learning with far-reaching societal impact. However, existing fair learning methods are vulnerable to accidental or malicious artifacts in the training data, which can cause them to unknowingly produce unfair classifiers. In this work we address the problem of fair learning from unreliable training data in the robust multisource setting, where the available training data comes from multiple sources, a fraction of which might not be representative of the true data distribution. We introduce FLEA, a filtering-based algorithm that identifies and suppresses those data sources that would have a negative impact on fairness or accuracy if they were used for training. As such, FLEA is not a replacement of prior fairness-aware learning methods but rather an augmentation that makes any of them robust against unreliable training data. We show the effectiveness of our approach through a diverse range of experiments on multiple datasets. Additionally, we prove formally that, given enough data, FLEA protects the learner against corruptions as long as the fraction of affected data sources is less than half. Our source code and documentation are available at https://github.com/ISTAustria-CVML/FLEA.}, author = {Iofinova, Eugenia B and Konstantinov, Nikola H and Lampert, Christoph}, issn = {2835-8856}, journal = {Transactions on Machine Learning Research}, publisher = {ML Research Press}, title = {{FLEA: Provably robust fair multisource learning from unreliable training data}}, year = {2022}, }

@inproceedings{11839, abstract = {It is a highly desirable property for deep networks to be robust against small input changes. One popular way to achieve this property is by designing networks with a small Lipschitz constant. In this work, we propose a new technique for constructing such Lipschitz networks that has a number of desirable properties: it can be applied to any linear network layer (fully-connected or convolutional), it provides formal guarantees on the Lipschitz constant, it is easy to implement and efficient to run, and it can be combined with any training objective and optimization method. In fact, our technique is the first one in the literature that achieves all of these properties simultaneously. Our main contribution is a rescaling-based weight matrix parametrization that guarantees each network layer to have a Lipschitz constant of at most 1 and results in learned weight matrices that are close to orthogonal. Hence we call such layers almost-orthogonal Lipschitz (AOL). Experiments and ablation studies in the context of image classification with certified robust accuracy confirm that AOL layers achieve results that are on par with most existing methods. Yet, they are simpler to implement and more broadly applicable, because they do not require computationally expensive matrix orthogonalization or inversion steps as part of the network architecture. We provide code at https://github.com/berndprach/AOL.}, author = {Prach, Bernd and Lampert, Christoph}, booktitle = {Computer Vision – ECCV 2022}, isbn = {9783031198021}, location = {Tel Aviv, Israel}, pages = {350--365}, publisher = {Springer Nature}, title = {{Almost-orthogonal layers for efficient general-purpose Lipschitz networks}}, doi = {10.1007/978-3-031-19803-8_21}, volume = {13681}, year = {2022}, }

@inproceedings{10752, abstract = {The digitalization of almost all aspects of our everyday lives has led to unprecedented amounts of data being freely available on the Internet.
In particular, social media platforms provide rich sources of user-generated data, though typically in unstructured form and with high diversity, for example written in many different languages. Automatically identifying meaningful information in such big data resources and extracting it efficiently is one of the ongoing challenges of our time. A common step for this is sentiment analysis, which forms the foundation for tasks such as opinion mining or trend prediction. Unfortunately, publicly available tools for this task are almost exclusively available for English-language texts. Consequently, the large fraction of Internet users who do not communicate in English is ignored in automated studies, a phenomenon called rare-language discrimination. In this work we propose a technique to overcome this problem with a truly multi-lingual model, which can be trained automatically without linguistic knowledge or even the ability to read the many target languages. The main step is to combine self-annotation, specifically the use of emoticons as a proxy for labels, with multi-lingual sentence representations. To evaluate our method we curated several large datasets from data obtained via the free Twitter streaming API. The results show that our proposed multi-lingual training is able to achieve sentiment predictions at the same quality level for rare languages as for frequent ones, and, in particular, clearly better than what mono-lingual training achieves on the same data.}, author = {Lampert, Jasmin and Lampert, Christoph}, booktitle = {2021 IEEE International Conference on Big Data}, isbn = {9781665439022}, location = {Orlando, FL, United States}, pages = {5185--5192}, publisher = {IEEE}, title = {{Overcoming rare-language discrimination in multi-lingual sentiment analysis}}, doi = {10.1109/bigdata52589.2021.9672003}, year = {2022}, }

@inproceedings{12161, abstract = {We introduce LIMES, a new method for learning with non-stationary streaming data, inspired by the recent success of meta-learning. The main idea is not to attempt to learn a single classifier that would have to work well across all occurring data distributions, nor many separate classifiers, but to exploit a hybrid strategy: we learn a single set of model parameters from which a specific classifier for any specific data distribution is derived via classifier adaptation. Assuming a multiclass classification setting with class-prior shift, the adaptation step can be performed analytically, with only the classifier’s bias terms being affected. Another contribution of our work is an extrapolation step that predicts suitable adaptation parameters for future time steps based on the previous data. In combination, we obtain a lightweight procedure for learning from streaming data with varying class distribution that adds no trainable parameters and almost no memory or computational overhead compared to training a single model.
Experiments on a set of exemplary tasks using Twitter data show that LIMES achieves higher accuracy than alternative approaches, especially with respect to the relevant real-world metric of lowest within-day accuracy.}, author = {Tomaszewska, Paulina and Lampert, Christoph}, booktitle = {26th International Conference on Pattern Recognition}, issn = {2831-7475}, location = {Montreal, Canada}, pages = {2128--2134}, publisher = {IEEE}, title = {{Lightweight conditional model extrapolation for streaming data under class-prior shift}}, doi = {10.1109/icpr56361.2022.9956195}, volume = {2022}, year = {2022}, }

@inproceedings{12299, abstract = {Transfer learning is a classic paradigm by which models pretrained on large “upstream” datasets are adapted to yield good results on “downstream” specialized datasets. Generally, more accurate models on the “upstream” dataset tend to provide better transfer accuracy “downstream”. In this work, we perform an in-depth investigation of this phenomenon in the context of convolutional neural networks (CNNs) trained on the ImageNet dataset, which have been pruned, that is, compressed by sparsifying their connections. We consider transfer using unstructured pruned models obtained by applying several state-of-the-art pruning methods, including magnitude-based, second-order, regrowth, lottery-ticket, and regularization approaches, in the context of twelve standard transfer tasks. In a nutshell, our study shows that sparse models can match or even outperform the transfer performance of dense models, even at high sparsities, and, while doing so, can lead to significant inference and even training speedups. At the same time, we observe and analyze significant differences in the behaviour of different pruning methods. The code is available at: https://github.com/IST-DASLab/sparse-imagenet-transfer.}, author = {Iofinova, Eugenia B and Peste, Elena-Alexandra and Kurtz, Mark and Alistarh, Dan-Adrian}, booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition}, issn = {2575-7075}, location = {New Orleans, LA, United States}, pages = {12256--12266}, publisher = {IEEE}, title = {{How well do sparse ImageNet models transfer?}}, doi = {10.1109/cvpr52688.2022.01195}, year = {2022}, }

@article{10802, abstract = {Addressing fairness concerns about machine learning models is a crucial step towards their long-term adoption in real-world automated systems. While many approaches have been developed for training fair models from data, little is known about the robustness of these methods to data corruption. In this work we consider fairness-aware learning under worst-case data manipulations. We show that an adversary can in some situations force any learner to return an overly biased classifier, regardless of the sample size and with or without degrading accuracy, and that the strength of the excess bias increases for learning problems with underrepresented protected groups in the data. We also prove that our hardness results are tight up to constant factors.
To this end, we study two natural learning algorithms that optimize for both accuracy and fairness and show that these algorithms enjoy guarantees that are order-optimal in terms of the corruption ratio and the protected groups' frequencies in the large data limit.}, author = {Konstantinov, Nikola H and Lampert, Christoph}, issn = {1533-7928}, journal = {Journal of Machine Learning Research}, keywords = {Fairness, robustness, data poisoning, trustworthy machine learning, PAC learning}, pages = {1--60}, publisher = {ML Research Press}, title = {{Fairness-aware PAC learning from corrupted data}}, volume = {23}, year = {2022}, }

@inproceedings{13241, abstract = {Addressing fairness concerns about machine learning models is a crucial step towards their long-term adoption in real-world automated systems. Many approaches for training fair models from data have been developed, and an implicit assumption about such algorithms is that they are able to recover a fair model despite potential historical biases in the data. In this work we show a number of impossibility results that indicate that there is no learning algorithm that can recover a fair model when a proportion of the dataset is subject to arbitrary manipulations. Specifically, we prove that there are situations in which an adversary can force any learner to return a biased classifier, with or without degrading accuracy, and that the strength of this bias increases for learning problems with underrepresented protected groups in the data. Our results emphasize the importance of studying further data corruption models of various strengths and of establishing stricter data collection practices for fairness-aware learning.}, author = {Konstantinov, Nikola H and Lampert, Christoph}, booktitle = {Proceedings of Machine Learning Research}, issn = {2640-3498}, pages = {59--83}, publisher = {ML Research Press}, title = {{On the impossibility of fairness-aware learning from corrupted data}}, volume = {171}, year = {2022}, }

@phdthesis{10799, abstract = {Because of the increasing popularity of machine learning methods, it is becoming important to understand the impact of learned components on automated decision-making systems and to guarantee that their consequences are beneficial to society. In other words, it is necessary to ensure that machine learning is sufficiently trustworthy to be used in real-world applications. This thesis studies two properties of machine learning models that are highly desirable for the sake of reliability: robustness and fairness. In the first part of the thesis we study the robustness of learning algorithms to training data corruption. Previous work has shown that machine learning models are vulnerable to a range of training set issues, varying from label noise through systematic biases to worst-case data manipulations. This is an especially relevant problem at present, since modern machine learning methods are particularly data-hungry and practitioners therefore often have to rely on data collected from various external sources, e.g. from the Internet, from app users or via crowdsourcing. Naturally, such sources vary greatly in the quality and reliability of the data they provide. With these considerations in mind, we study the problem of designing machine learning algorithms that are robust to corruptions in data coming from multiple sources.
We show that, in contrast to the case of a single dataset with outliers, successful learning within this model is possible both theoretically and practically, even under worst-case data corruptions. The second part of this thesis deals with fairness-aware machine learning. There are multiple areas where machine learning models have shown promising results, but where careful consideration is required in order to avoid discriminatory decisions by such learned components. Ensuring fairness can be particularly challenging, because real-world training datasets are expected to contain various forms of historical bias that may affect the learning process. In this thesis we show that data corruption can indeed render the problem of achieving fairness impossible, by tightly characterizing the theoretical limits of fair learning under worst-case data manipulations. However, assuming access to clean data, we also show how fairness-aware learning can be made practical in contexts beyond binary classification, in particular in the challenging learning-to-rank setting.}, author = {Konstantinov, Nikola H}, isbn = {978-3-99078-015-2}, issn = {2663-337X}, keywords = {robustness, fairness, machine learning, PAC learning, adversarial learning}, pages = {176}, publisher = {Institute of Science and Technology Austria}, title = {{Robustness and fairness in machine learning}}, doi = {10.15479/at:ista:10799}, year = {2022}, }

@inproceedings{9210, abstract = {Modern neural networks can easily fit their training set perfectly. Surprisingly, despite being “overfit” in this way, they tend to generalize well to future data, thereby defying the classic bias–variance trade-off of machine learning theory. Of the many possible explanations, a prevalent one is that training by stochastic gradient descent (SGD) imposes an implicit bias that leads it to learn simple functions, and these simple functions generalize well. However, the specifics of this implicit bias are not well understood. In this work, we explore the smoothness conjecture, which states that SGD is implicitly biased towards learning functions that are smooth. We propose several measures to formalize the intuitive notion of smoothness, and we conduct experiments to determine whether SGD indeed implicitly optimizes for these measures. Our findings rule out the possibility that smoothness measures based on first-order derivatives are being implicitly enforced. They are supportive, though, of the smoothness conjecture for measures based on second-order derivatives.}, author = {Volhejn, Vaclav and Lampert, Christoph}, booktitle = {42nd German Conference on Pattern Recognition}, isbn = {9783030712778}, issn = {1611-3349}, location = {Tübingen, Germany}, pages = {246--259}, publisher = {Springer}, title = {{Does SGD implicitly optimize for smoothness?}}, doi = {10.1007/978-3-030-71278-5_18}, volume = {12544}, year = {2021}, }

@inproceedings{9416, abstract = {We study the inductive bias of two-layer ReLU networks trained by gradient flow. We identify a class of easy-to-learn (`orthogonally separable') datasets, and characterise the solution that ReLU networks trained on such datasets converge to. Irrespective of network width, the solution turns out to be a combination of two max-margin classifiers: one corresponding to the positive data subset and one corresponding to the negative data subset. The proof is based on the recently introduced concept of extremal sectors, for which we prove a number of properties in the context of orthogonal separability.
In particular, we prove stationarity of activation patterns from some time onwards, which enables a reduction of the ReLU network to an ensemble of linear subnetworks.}, author = {Bui Thi Mai, Phuong and Lampert, Christoph}, booktitle = {9th International Conference on Learning Representations}, location = {Virtual}, title = {{The inductive bias of ReLU networks on orthogonally separable data}}, year = {2021}, }

@unpublished{10803, abstract = {Given the abundance of applications of ranking in recent years, addressing fairness concerns around automated ranking systems becomes necessary for increasing the trust among end-users. Previous work on fair ranking has mostly focused on application-specific fairness notions, often tailored to online advertising, and it rarely considers learning as part of the process. In this work, we show how to transfer numerous fairness notions from binary classification to a learning to rank setting. Our formalism allows us to design methods for incorporating fairness objectives with provable generalization guarantees. An extensive experimental evaluation shows that our method can improve ranking fairness substantially with little or no loss of model quality.}, author = {Konstantinov, Nikola H and Lampert, Christoph}, booktitle = {arXiv}, title = {{Fairness through regularization for learning to rank}}, doi = {10.48550/arXiv.2102.05996}, year = {2021}, }

@phdthesis{9418, abstract = {Deep learning is best known for its empirical success across a wide range of applications spanning computer vision, natural language processing and speech. Of equal significance, though perhaps less known, are its ramifications for learning theory: deep networks have been observed to perform surprisingly well in the high-capacity regime, aka the overfitting or underspecified regime. Classically, this regime on the far right of the bias-variance curve is associated with poor generalisation; however, recent experiments with deep networks challenge this view. This thesis is devoted to investigating various aspects of underspecification in deep learning. First, we argue that deep learning models are underspecified on two levels: a) any given training dataset can be fit by many different functions, and b) any given function can be expressed by many different parameter configurations. We refer to the second kind of underspecification as parameterisation redundancy, and we precisely characterise its extent. Second, we characterise the implicit criteria (the inductive bias) that guide learning in the underspecified regime. Specifically, we consider a nonlinear but tractable classification setting, and show that, given the choice, neural networks learn classifiers with a large margin. Third, we consider learning scenarios where the inductive bias is not by itself sufficient to deal with underspecification. We then study different ways of ‘tightening the specification’: i) In the setting of representation learning with variational autoencoders, we propose a hand-crafted regulariser based on mutual information. ii) In the setting of binary classification, we consider soft-label (real-valued) supervision. We derive a generalisation bound for linear networks supervised in this way and verify that soft labels facilitate fast learning.
Finally, we explore an application of soft-label supervision to the training of multi-exit models.}, author = {Bui Thi Mai, Phuong}, issn = {2663-337X}, pages = {125}, publisher = {Institute of Science and Technology Austria}, title = {{Underspecification in deep learning}}, doi = {10.15479/AT:ISTA:9418}, year = {2021}, }

@inbook{14987, abstract = {The goal of zero-shot learning is to construct a classifier that can identify object classes for which no training examples are available. When training data for some of the object classes is available but not for others, the name generalized zero-shot learning is commonly used. In a wider sense, the phrase zero-shot is also used to describe other machine learning-based approaches that require no training data from the problem of interest, such as zero-shot action recognition or zero-shot machine translation.}, author = {Lampert, Christoph}, booktitle = {Computer Vision}, editor = {Ikeuchi, Katsushi}, isbn = {9783030634155}, pages = {1395--1397}, publisher = {Springer}, title = {{Zero-Shot Learning}}, doi = {10.1007/978-3-030-63416-2_874}, year = {2021}, }

@unpublished{8063, abstract = {We present a generative model of images that explicitly reasons over the set of objects they show. Our model learns a structured latent representation that separates objects from each other and from the background; unlike prior works, it explicitly represents the 2D position and depth of each object, as well as an embedding of its segmentation mask and appearance. The model can be trained from images alone in a purely unsupervised fashion, without the need for object masks or depth information. Moreover, it always generates complete objects, even though a significant fraction of training images contain occlusions. Finally, we show that our model can infer decompositions of novel images into their constituent objects, including accurate prediction of depth ordering and segmentation of occluded parts.}, author = {Anciukevicius, Titas and Lampert, Christoph and Henderson, Paul M}, booktitle = {arXiv}, title = {{Object-centric image generation with factored depths, locations, and appearances}}, year = {2020}, }

@inproceedings{8188, abstract = {A natural approach to generative modeling of videos is to represent them as a composition of moving objects. Recent works model a set of 2D sprites over a slowly-varying background, but without considering the underlying 3D scene that gives rise to them. We instead propose to model a video as the view seen while moving through a scene with multiple 3D objects and a 3D background. Our model is trained from monocular videos without any supervision, yet learns to generate coherent 3D scenes containing several moving objects. We conduct detailed experiments on two datasets, going beyond the visual complexity supported by state-of-the-art generative approaches. We evaluate our method on depth prediction and 3D object detection, tasks which cannot be addressed by those earlier works, and show it outperforms them even on 2D instance segmentation and tracking.}, author = {Henderson, Paul M and Lampert, Christoph}, booktitle = {34th Conference on Neural Information Processing Systems}, isbn = {9781713829546}, location = {Vancouver, Canada}, pages = {3106--3117}, publisher = {Curran Associates}, title = {{Unsupervised object-centric video generation and decomposition in 3D}}, volume = {33}, year = {2020}, }
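A minimal LaTeX sketch of how entries from this file can be cited, assuming the file is saved under a hypothetical name such as publications.bib (BibTeX ignores text outside entries, so this note does not affect processing; the \cite keys below are the numeric entry keys used above):

% Hypothetical usage sketch; assumes this file is saved as publications.bib.
\documentclass{article}
\begin{document}
Pruning can preserve accuracy even at high sparsity~\cite{14771},
and AOL layers give certified Lipschitz guarantees~\cite{11839}.
\bibliographystyle{plain}
\bibliography{publications} % points at this .bib file
\end{document}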