@inproceedings{14105, abstract = {Despite their recent success, deep neural networks continue to perform poorly when they encounter distribution shifts at test time. Many recently proposed approaches try to counter this by aligning the model to the new distribution prior to inference. With no labels available, this requires unsupervised objectives to adapt the model on the observed test data. In this paper, we propose Test-Time Self-Training (TeST): a technique that takes as input a model trained on some source data and a novel data distribution at test time, and learns invariant and robust representations using a student-teacher framework. We find that models adapted using TeST significantly improve over baseline test-time adaptation algorithms. TeST achieves performance competitive with modern domain adaptation algorithms [4, 43], while having access to 5-10x less data at the time of adaptation. We thoroughly evaluate a variety of baselines on two tasks, object detection and image segmentation, and find that TeST sets a new state of the art for test-time domain adaptation algorithms.}, author = {Sinha, Samarth and Gehler, Peter and Locatello, Francesco and Schiele, Bernt}, booktitle = {2023 IEEE/CVF Winter Conference on Applications of Computer Vision}, isbn = {9781665493475}, issn = {2642-9381}, location = {Waikoloa, HI, United States}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{TeST: Test-time Self-Training under distribution shift}}, doi = {10.1109/wacv56688.2023.00278}, year = {2023}, }
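As an illustration of the student-teacher self-training loop described in the TeST abstract above, here is a minimal PyTorch-style sketch; the EMA rate, confidence threshold, and pseudo-labeling rule are assumptions for exposition, not the paper's exact recipe.

```python
# Minimal sketch of student-teacher test-time self-training (cf. TeST, entry 14105).
# The EMA rate, confidence threshold, and pseudo-labeling rule are illustrative
# assumptions, not the paper's exact procedure.
import copy
import torch
import torch.nn.functional as F

def adapt_test_time(student, unlabeled_test_loader, steps=1, lr=1e-4,
                    ema=0.999, conf_thresh=0.9):
    teacher = copy.deepcopy(student).eval()          # teacher = frozen EMA copy
    for p in teacher.parameters():
        p.requires_grad_(False)
    opt = torch.optim.SGD(student.parameters(), lr=lr)
    for _ in range(steps):
        for x in unlabeled_test_loader:              # unlabeled test batches
            with torch.no_grad():
                probs = F.softmax(teacher(x), dim=-1)
                conf, pseudo = probs.max(dim=-1)     # teacher pseudo-labels
            mask = conf > conf_thresh                # keep confident predictions only
            if mask.any():
                loss = F.cross_entropy(student(x)[mask], pseudo[mask])
                opt.zero_grad()
                loss.backward()
                opt.step()
            with torch.no_grad():                    # EMA update of the teacher
                for pt, ps in zip(teacher.parameters(), student.parameters()):
                    pt.mul_(ema).add_(ps, alpha=1 - ema)
    return student
```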
@article{14256, abstract = {Context. Space asteroseismology is revolutionizing our knowledge of the internal structure and dynamics of stars. A breakthrough is ongoing with the recent discoveries of signatures of strong magnetic fields in the core of red giant stars. The key signature for such a detection is the asymmetry these fields induce in the frequency splittings of observed dipolar mixed gravito-acoustic modes. Aims. We investigate the ability of the observed asymmetries of the frequency splittings of dipolar mixed modes to constrain the geometrical properties of deep magnetic fields. Methods. We used the analytical Racah-Wigner algebra from quantum mechanics to characterize the geometrical couplings of dipolar mixed oscillation modes with various realistically plausible topologies of fossil magnetic fields, and we computed the induced perturbation of their frequencies. Results. First, in the case of an oblique magnetic dipole, we provide the exact analytical expression of the asymmetry as a function of the angle between the rotation and magnetic axes; its value provides a direct measure of this angle. Second, considering a combination of axisymmetric dipolar and quadrupolar fields, we show that the asymmetry is blind to the relative strength and sign of each component. Finally, in the case of a given multipole, we show that a negative asymmetry is a signature of non-axisymmetric topologies. Conclusions. Asymmetries of dipolar mixed modes provide key information on the geometrical topology of deep fossil magnetic fields, but this is insufficient on its own. Asteroseismic constraints should therefore be combined with spectropolarimetric observations and numerical simulations, which aim to predict the most probable stable large-scale geometries.}, author = {Mathis, S. and Bugnet, Lisa Annabelle}, issn = {1432-0746}, journal = {Astronomy and Astrophysics}, publisher = {EDP Sciences}, title = {{Asymmetries of frequency splittings of dipolar mixed modes: A window on the topology of deep magnetic fields}}, doi = {10.1051/0004-6361/202346832}, volume = {676}, year = {2023}, }
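For readers of the entry above, one convention for the mixed-mode splitting asymmetry that is common in the red-giant literature (the paper's exact definition and normalization may differ) is:

```latex
% Splitting asymmetry of a dipolar (l = 1) mixed-mode triplet, in one common
% convention; the paper's exact definition and normalization may differ.
\[
  \delta_{\mathrm{asym}} \;=\; \nu_{m=-1} + \nu_{m=+1} - 2\,\nu_{m=0} .
\]
% Rotation alone shifts the m = +1 and m = -1 components symmetrically about
% the m = 0 component, giving delta_asym = 0; a magnetic perturbation depends
% on m^2 and lifts this symmetry, so a nonzero delta_asym signals an internal
% field, and its sign and magnitude constrain the field topology.
```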
@article{14261, abstract = {In this work, a generalized, adapted Numerov implementation capable of determining band structures of periodic quantum systems is outlined. Based on the input potential, the presented approach numerically solves the Schrödinger equation in position space at each momentum-space point. Thus, in addition to the band structure, the method inherently provides information about the state functions and probability densities in position space at each momentum-space point considered. The generalized, adapted Numerov framework provided reliable estimates for a variety of increasingly complex test suites in one, two, and three dimensions. The accuracy of the proposed methodology was benchmarked against results obtained for the analytically solvable Kronig-Penney model. Furthermore, the presented numerical solver was applied to a model potential representing a 2D optical lattice, a challenging application relevant, for example, to the field of quantum computing.}, author = {Gamper, Jakob and Kluibenschedl, Florian and Weiss, Alexander K.H. and Hofer, Thomas S.}, issn = {1948-7185}, journal = {Journal of Physical Chemistry Letters}, number = {33}, pages = {7395--7403}, publisher = {American Chemical Society}, title = {{Accessing position space wave functions in band structure calculations of periodic systems - a generalized, adapted Numerov implementation for one-, two-, and three-dimensional quantum problems}}, doi = {10.1021/acs.jpclett.3c01707}, volume = {14}, year = {2023}, }
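The band-structure idea in the entry above can be illustrated with a one-dimensional matrix-Numerov solver under Bloch boundary conditions. This is a sketch under assumed units (hbar = m = 1) and an assumed cosine test potential, not the paper's generalized implementation:

```python
# Sketch of a Bloch-periodic matrix-Numerov band-structure solver in 1D
# (illustrating the idea of entry 14261). Grid size, units (hbar = m = 1),
# and the cosine potential are assumptions, not the paper's implementation.
import numpy as np
from scipy.linalg import eig

def bands_1d(V, a=1.0, N=200, nk=41, nbands=4):
    """V: callable potential with period a. Returns k-grid and lowest bands."""
    d = a / N
    x = np.arange(N) * d
    ks = np.linspace(-np.pi / a, np.pi / a, nk)   # first Brillouin zone
    Vd = np.diag(V(x))
    bands = np.empty((nk, nbands))
    for j, k in enumerate(ks):
        ph = np.exp(1j * k * a)
        # Numerov matrices: A ~ second difference, B ~ [1, 10, 1]/12 weights
        A = (np.diag(np.ones(N - 1), -1) - 2 * np.eye(N)
             + np.diag(np.ones(N - 1), 1)).astype(complex)
        B = (np.diag(np.ones(N - 1), -1) + 10 * np.eye(N)
             + np.diag(np.ones(N - 1), 1)).astype(complex) / 12
        # Bloch boundary condition: psi(x + a) = exp(ika) psi(x)
        A[0, -1] += 1 / ph
        A[-1, 0] += ph
        B[0, -1] += 1 / (12 * ph)
        B[-1, 0] += ph / 12
        A /= d**2
        H = -0.5 * A + B @ Vd                      # Numerov form: H psi = E B psi
        E = np.sort(eig(H, B)[0].real)             # generalized eigenvalues
        bands[j] = E[:nbands]
    return ks, bands

ks, bands = bands_1d(lambda x: 10 * np.cos(2 * np.pi * x))  # simple 1D lattice
```

As the abstract notes, the eigenvectors of the same problem are the position-space wave functions at each k, which a plane-wave basis would only give indirectly.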
@inproceedings{14208, abstract = {This paper focuses on over-parameterized deep neural networks (DNNs) with ReLU activation functions and proves that, when the data distribution is well separated, DNNs can achieve Bayes-optimal test error for classification while obtaining (nearly) zero training error under the lazy training regime. For this purpose, we unify three interrelated concepts: over-parameterization, benign overfitting, and the Lipschitz constant of DNNs. Our results indicate that interpolating with smoother functions leads to better generalization. Furthermore, we investigate the special case where interpolation of smooth ground-truth functions is performed by DNNs under the Neural Tangent Kernel (NTK) regime. Our result demonstrates that the generalization error converges to a constant order that depends only on the label noise and the initialization noise, which theoretically verifies benign overfitting. Our analysis provides a tight lower bound on the normalized margin under non-smooth activation functions, as well as on the minimum eigenvalue of the NTK in high-dimensional settings, both of which are of independent interest in learning theory.}, author = {Zhu, Zhenyu and Liu, Fanghui and Chrysos, Grigorios G and Locatello, Francesco and Cevher, Volkan}, booktitle = {Proceedings of the 40th International Conference on Machine Learning}, location = {Honolulu, Hawaii, United States}, pages = {43105--43128}, publisher = {ML Research Press}, title = {{Benign overfitting in deep neural networks under lazy training}}, volume = {202}, year = {2023}, }

@unpublished{14209, abstract = {Diffusion models excel at generating photorealistic images from text queries. Naturally, many approaches have been proposed to use these generative abilities to augment training datasets for downstream tasks, such as classification. However, diffusion models are themselves trained on large, noisily supervised, but nonetheless annotated datasets. It is an open question whether the generalization capabilities of diffusion models, beyond reusing the additional pre-training data for augmentation, lead to improved downstream performance. We perform a systematic evaluation of existing methods to generate images from diffusion models and study new extensions to assess their benefit for data augmentation. While we find that personalizing diffusion models towards the target data outperforms simpler prompting strategies, we also show that using the training data of the diffusion model alone, via a simple nearest-neighbor retrieval procedure, leads to even stronger downstream performance. Overall, our study probes the limitations of diffusion models for data augmentation but also highlights their potential in generating new training data to improve performance on simple downstream vision tasks.}, author = {Burg, Max F. and Wenzel, Florian and Zietlow, Dominik and Horn, Max and Makansi, Osama and Locatello, Francesco and Russell, Chris}, booktitle = {arXiv}, title = {{A data augmentation perspective on diffusion models and retrieval}}, doi = {10.48550/arXiv.2304.10253}, year = {2023}, }

@inproceedings{14211, abstract = {Causal discovery methods are intrinsically constrained by the set of assumptions needed to ensure structure identifiability. Moreover, additional restrictions are often imposed in order to simplify the inference task: this is the case for the Gaussian noise assumption on additive non-linear models, which is common to many causal discovery approaches. In this paper we show the shortcomings of inference under this hypothesis, analyzing the risk of edge inversion under violation of Gaussianity of the noise terms. We then propose a novel method for inferring the topological ordering of the variables in the causal graph from data generated according to an additive non-linear model with a generic noise distribution. This leads to NoGAM (Not only Gaussian Additive noise Models), a causal discovery algorithm with a minimal set of assumptions and state-of-the-art performance, experimentally benchmarked on synthetic data.}, author = {Montagna, Francesco and Noceti, Nicoletta and Rosasco, Lorenzo and Zhang, Kun and Locatello, Francesco}, booktitle = {2nd Conference on Causal Learning and Reasoning}, location = {Tübingen, Germany}, title = {{Causal discovery with score matching on additive models with arbitrary noise}}, year = {2023}, }

@inproceedings{14212, abstract = {This paper demonstrates how to discover the whole causal graph from the second derivative of the log-likelihood in observational non-linear additive Gaussian noise models. Leveraging scalable machine learning approaches to approximate the score function ∇ log p(X), we extend the work of Rolland et al. (2022), which only recovers the topological order from the score and requires an expensive pruning step to remove spurious edges among those admitted by the ordering. Our analysis leads to DAS (an acronym for Discovery At Scale), a practical algorithm that reduces the complexity of the pruning step by a factor proportional to the graph size. In practice, DAS achieves accuracy competitive with the current state of the art while being over an order of magnitude faster. Overall, our approach enables principled and scalable causal discovery, significantly lowering the compute bar.}, author = {Montagna, Francesco and Noceti, Nicoletta and Rosasco, Lorenzo and Zhang, Kun and Locatello, Francesco}, booktitle = {2nd Conference on Causal Learning and Reasoning}, location = {Tübingen, Germany}, title = {{Scalable causal discovery with score matching}}, year = {2023}, }
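The two entries above build on the same score-based ordering idea, which can be sketched compactly: a leaf node of the causal graph is one whose score-Jacobian diagonal entry is constant across samples, so leaves can be identified and peeled off one at a time. The score oracle `score_fn` below is an assumption (e.g., the ground-truth score of a synthetic model); in practice it must be estimated, for example with score matching.

```python
# Sketch of score-based topological ordering (cf. entries 14211 and 14212,
# following the leaf criterion of Rolland et al. 2022): a variable j is a leaf
# when Var[d/dx_j of score_j(X)] is (near) zero across samples. `score_fn` is
# an assumed oracle here; in practice the score is estimated from data.
import numpy as np

def topological_order(X, score_fn, eps=1e-3):
    """X: (n, d) samples. score_fn(X, cols) -> (n, len(cols)) estimate of the
    score d/dx_j log p restricted to the still-active columns `cols`."""
    active = list(range(X.shape[1]))
    order = []
    while active:
        var_diag = []
        for j_idx, j in enumerate(active):
            # finite-difference estimate of the score-Jacobian diagonal
            Xp = X.copy()
            Xp[:, j] += eps
            ds = (score_fn(Xp, active)[:, j_idx]
                  - score_fn(X, active)[:, j_idx]) / eps
            var_diag.append(ds.var())
        leaf = active[int(np.argmin(var_diag))]   # most-constant diagonal entry
        order.append(leaf)
        active.remove(leaf)                        # peel the leaf and repeat
    return order[::-1]                             # sources first, sinks last
```

DAS's contribution, per the abstract, is then to read candidate edges off the same score Jacobian instead of running a full pruning pass over all pairs admitted by the ordering.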
@inproceedings{14214, abstract = {Recent years have seen a surge of interest in learning high-level causal representations from low-level image pairs under interventions. Yet, existing efforts are largely limited to simple synthetic settings that are far removed from real-world problems. In this paper, we present Causal Triplet, a causal representation learning benchmark featuring not only visually more complex scenes, but also two crucial desiderata commonly overlooked in previous works: (i) an actionable counterfactual setting, where only certain object-level variables allow for counterfactual observations whereas others do not; and (ii) an interventional downstream task with an emphasis on out-of-distribution robustness from the independent causal mechanisms principle. Through extensive experiments, we find that models built with the knowledge of disentangled or object-centric representations significantly outperform their distributed counterparts. However, recent causal representation learning methods still struggle to identify such latent structures, indicating substantial challenges and opportunities for future work.}, author = {Liu, Yuejiang and Alahi, Alexandre and Russell, Chris and Horn, Max and Zietlow, Dominik and Schölkopf, Bernhard and Locatello, Francesco}, booktitle = {2nd Conference on Causal Learning and Reasoning}, location = {Tübingen, Germany}, title = {{Causal triplet: An open challenge for intervention-centric causal representation learning}}, year = {2023}, }

@inproceedings{14217, abstract = {Neural networks embed the geometric structure of a data manifold lying in a high-dimensional space into latent representations. Ideally, the distribution of the data points in the latent space should depend only on the task, the data, the loss, and other architecture-specific constraints. However, factors such as random weight initialization, training hyperparameters, or other sources of randomness in the training phase may induce incoherent latent spaces that hinder any form of reuse. Nevertheless, we empirically observe that, under the same data and modeling choices, the angles between the encodings within distinct latent spaces do not change. In this work, we propose the latent similarity between each sample and a fixed set of anchors as an alternative data representation, demonstrating that it can enforce the desired invariances without any additional training. We show how neural architectures can leverage these relative representations to guarantee, in practice, invariance to latent isometries and rescalings, effectively enabling latent space communication: from zero-shot model stitching to latent space comparison between diverse settings. We extensively validate the generalization capability of our approach on different datasets, spanning various modalities (images, text, graphs), tasks (e.g., classification, reconstruction), and architectures (e.g., CNNs, GCNs, transformers).}, author = {Moschella, Luca and Maiorca, Valentino and Fumero, Marco and Norelli, Antonio and Locatello, Francesco and Rodolà, Emanuele}, booktitle = {The 11th International Conference on Learning Representations}, location = {Kigali, Rwanda}, title = {{Relative representations enable zero-shot latent space communication}}, year = {2023}, }

(A minimal code sketch of the relative-representation construction from entry 14217 follows the last entry below.)

@inproceedings{14222, abstract = {Learning generative object models from unlabelled videos is a long-standing problem that is required for causal scene modeling. We decompose this problem into three easier subtasks and provide candidate solutions for each of them. Inspired by the Common Fate Principle of Gestalt Psychology, we first extract (noisy) masks of moving objects via unsupervised motion segmentation. Second, generative models are trained on the masks of the background and the moving objects, respectively. Third, background and foreground models are combined in a conditional "dead leaves" scene model to sample novel scene configurations where occlusions and depth layering arise naturally. To evaluate the individual stages, we introduce the Fishbowl dataset, positioned between complex real-world scenes and common object-centric benchmarks of simplistic objects. We show that our approach allows learning generative models that generalize beyond the occlusions present in the input videos, and represent scenes in a modular fashion that allows sampling plausible scenes outside the training distribution by permitting, for instance, object numbers or densities not observed in the training set.}, author = {Tangemann, Matthias and Schneider, Steffen and Kügelgen, Julius von and Locatello, Francesco and Gehler, Peter and Brox, Thomas and Kümmerer, Matthias and Bethge, Matthias and Schölkopf, Bernhard}, booktitle = {2nd Conference on Causal Learning and Reasoning}, location = {Tübingen, Germany}, title = {{Unsupervised object learning via common fate}}, year = {2023}, }
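As promised above, the relative-representation construction from entry 14217 is easy to state in code: re-express every latent code by its cosine similarities to a fixed set of anchor samples. A minimal NumPy sketch (not the authors' implementation):

```python
# Sketch of relative representations (cf. entry 14217, not the paper's code):
# describe each embedding by its cosine similarities to a fixed anchor set,
# which is invariant to rotations and rescalings of the latent space.
import numpy as np

def relative_representation(Z, anchors):
    """Z: (n, d) latent codes; anchors: (k, d) codes of fixed anchor samples.
    Returns an (n, k) matrix of cosine similarities, usable as a
    space-agnostic representation."""
    Zn = Z / np.linalg.norm(Z, axis=1, keepdims=True)
    An = anchors / np.linalg.norm(anchors, axis=1, keepdims=True)
    return Zn @ An.T            # cosine similarity to each anchor

# Two encoders trained with different seeds produce incompatible codes Z1, Z2,
# but relative_representation(Z1, A1) and relative_representation(Z2, A2),
# where A1 and A2 encode the *same* anchor samples with each model, live in a
# shared space, which is what enables zero-shot model stitching.
```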