[{"day":"01","article_processing_charge":"Yes","has_accepted_license":"1","scopus_import":"1","date_published":"2023-08-01T00:00:00Z","article_type":"original","page":"175-183","publication":"Measurement Science Review","citation":{"mla":"Jakubík, Jozef, et al. “Against the Flow of Time with Multi-Output Models.” Measurement Science Review, vol. 23, no. 4, Sciendo, 2023, pp. 175–83, doi:10.2478/msr-2023-0023.","short":"J. Jakubík, M. Phuong, M. Chvosteková, A. Krakovská, Measurement Science Review 23 (2023) 175–183.","chicago":"Jakubík, Jozef, Mary Phuong, Martina Chvosteková, and Anna Krakovská. “Against the Flow of Time with Multi-Output Models.” Measurement Science Review. Sciendo, 2023. https://doi.org/10.2478/msr-2023-0023.","ama":"Jakubík J, Phuong M, Chvosteková M, Krakovská A. Against the flow of time with multi-output models. Measurement Science Review. 2023;23(4):175-183. doi:10.2478/msr-2023-0023","ista":"Jakubík J, Phuong M, Chvosteková M, Krakovská A. 2023. Against the flow of time with multi-output models. Measurement Science Review. 23(4), 175–183.","apa":"Jakubík, J., Phuong, M., Chvosteková, M., & Krakovská, A. (2023). Against the flow of time with multi-output models. Measurement Science Review. Sciendo. https://doi.org/10.2478/msr-2023-0023","ieee":"J. Jakubík, M. Phuong, M. Chvosteková, and A. Krakovská, “Against the flow of time with multi-output models,” Measurement Science Review, vol. 23, no. 4. Sciendo, pp. 175–183, 2023."},"abstract":[{"text":"Recent work has paid close attention to the first principle of Granger causality, according to which cause precedes effect. In this context, the question may arise whether the detected direction of causality also reverses after the time reversal of unidirectionally coupled data. Recently, it has been shown that for unidirectionally causally connected autoregressive (AR) processes X → Y, after time reversal of data, the opposite causal direction Y → X is indeed detected, although typically as part of the bidirectional X↔ Y link. As we argue here, the answer is different when the measured data are not from AR processes but from linked deterministic systems. When the goal is the usual forward data analysis, cross-mapping-like approaches correctly detect X → Y, while Granger causality-like approaches, which should not be used for deterministic time series, detect causal independence X → Y. The results of backward causal analysis depend on the predictability of the reversed data. Unlike AR processes, observables from deterministic dynamical systems, even complex nonlinear ones, can be predicted well forward, while backward predictions can be difficult (notably when the time reversal of a function leads to one-to-many relations). To address this problem, we propose an approach based on models that provide multiple candidate predictions for the target, combined with a loss function that consideres only the best candidate. 
The resulting good forward and backward predictability supports the view that, for unidirectionally causally linked deterministic dynamical systems X → Y, the same link can be expected to be detected both before and after time reversal.","lang":"eng"}],"issue":"4","type":"journal_article","oa_version":"Published Version","file":[{"access_level":"open_access","file_name":"2023_MeasurementScienceRev_Jakubik.pdf","creator":"dernst","content_type":"application/pdf","file_size":2639783,"file_id":"14476","relation":"main_file","success":1,"checksum":"b069cc10fa6a7c96b2bc9f728165f9e6","date_created":"2023-10-31T12:07:23Z","date_updated":"2023-10-31T12:07:23Z"}],"title":"Against the flow of time with multi-output models","status":"public","ddc":["510"],"intvolume":" 23","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","_id":"14446","month":"08","publication_identifier":{"eissn":["1335-8871"]},"language":[{"iso":"eng"}],"doi":"10.2478/msr-2023-0023","quality_controlled":"1","tmp":{"name":"Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)","legal_code_url":"https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode","short":"CC BY-NC-ND (4.0)","image":"/images/cc_by_nc_nd.png"},"oa":1,"file_date_updated":"2023-10-31T12:07:23Z","date_updated":"2023-10-31T12:12:47Z","date_created":"2023-10-22T22:01:15Z","volume":23,"author":[{"first_name":"Jozef","last_name":"Jakubík","full_name":"Jakubík, Jozef"},{"full_name":"Bui Thi Mai, Phuong","id":"3EC6EE64-F248-11E8-B48F-1D18A9856A87","first_name":"Phuong","last_name":"Bui Thi Mai"},{"first_name":"Martina","last_name":"Chvosteková","full_name":"Chvosteková, Martina"},{"full_name":"Krakovská, Anna","last_name":"Krakovská","first_name":"Anna"}],"publication_status":"published","department":[{"_id":"ChLa"}],"publisher":"Sciendo","acknowledgement":"The work was supported by the Scientific Grant Agency of the Ministry of Education of the Slovak Republic and the Slovak Academy of Sciences, projects APVV-21-0216, VEGA 2-0096-21 and VEGA 2-0023-22.","year":"2023"},{"file_date_updated":"2021-05-24T11:15:57Z","abstract":[{"text":"We study the inductive bias of two-layer ReLU networks trained by gradient flow. We identify a class of easy-to-learn (`orthogonally separable') datasets, and characterise the solution that ReLU networks trained on such datasets converge to. Irrespective of network width, the solution turns out to be a combination of two max-margin classifiers: one corresponding to the positive data subset and one corresponding to the negative data subset. The proof is based on the recently introduced concept of extremal sectors, for which we prove a number of properties in the context of orthogonal separability.
In particular, we prove stationarity of activation patterns from some time onwards, which enables a reduction of the ReLU network to an ensemble of linear subnetworks.","lang":"eng"}],"type":"conference","related_material":{"record":[{"status":"public","relation":"dissertation_contains","id":"9418"}]},"author":[{"full_name":"Bui Thi Mai, Phuong","id":"3EC6EE64-F248-11E8-B48F-1D18A9856A87","first_name":"Phuong","last_name":"Bui Thi Mai"},{"orcid":"0000-0001-8622-7887","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","last_name":"Lampert","first_name":"Christoph","full_name":"Lampert, Christoph"}],"file":[{"checksum":"f34ff17017527db5ba6927f817bdd125","date_created":"2021-05-24T11:15:57Z","date_updated":"2021-05-24T11:15:57Z","file_id":"9417","relation":"main_file","creator":"bphuong","file_size":502356,"content_type":"application/pdf","access_level":"open_access","file_name":"iclr2021_conference.pdf"}],"oa_version":"Published Version","date_updated":"2023-09-07T13:29:50Z","date_created":"2021-05-24T11:16:46Z","_id":"9416","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2021","department":[{"_id":"GradSch"},{"_id":"ChLa"}],"publication_status":"published","title":"The inductive bias of ReLU networks on orthogonally separable data","status":"public","ddc":["000"],"article_processing_charge":"No","has_accepted_license":"1","day":"01","month":"05","scopus_import":"1","date_published":"2021-05-01T00:00:00Z","conference":{"name":"ICLR: International Conference on Learning Representations","end_date":"2021-05-07","start_date":"2021-05-03","location":"Virtual"},"language":[{"iso":"eng"}],"main_file_link":[{"url":"https://openreview.net/pdf?id=krz7T0xU9Z_","open_access":"1"}],"citation":{"chicago":"Phuong, Mary, and Christoph Lampert. “The Inductive Bias of ReLU Networks on Orthogonally Separable Data.” In 9th International Conference on Learning Representations, 2021.","short":"M. Phuong, C. Lampert, in:, 9th International Conference on Learning Representations, 2021.","mla":"Phuong, Mary, and Christoph Lampert. “The Inductive Bias of ReLU Networks on Orthogonally Separable Data.” 9th International Conference on Learning Representations, 2021.","apa":"Phuong, M., & Lampert, C. (2021). The inductive bias of ReLU networks on orthogonally separable data. In 9th International Conference on Learning Representations. Virtual.","ieee":"M. Phuong and C. Lampert, “The inductive bias of ReLU networks on orthogonally separable data,” in 9th International Conference on Learning Representations, Virtual, 2021.","ista":"Phuong M, Lampert C. 2021. The inductive bias of ReLU networks on orthogonally separable data. 9th International Conference on Learning Representations. ICLR: International Conference on Learning Representations.","ama":"Phuong M, Lampert C. The inductive bias of ReLU networks on orthogonally separable data. In: 9th International Conference on Learning Representations;
2021."},"oa":1,"publication":"9th International Conference on Learning Representations","quality_controlled":"1"},{"file_date_updated":"2021-05-24T11:56:02Z","date_created":"2021-05-24T13:06:23Z","date_updated":"2023-09-08T11:11:12Z","related_material":{"record":[{"id":"7435","relation":"part_of_dissertation","status":"deleted"},{"relation":"part_of_dissertation","status":"public","id":"7481"},{"status":"public","relation":"part_of_dissertation","id":"9416"},{"status":"public","relation":"part_of_dissertation","id":"7479"}]},"author":[{"id":"3EC6EE64-F248-11E8-B48F-1D18A9856A87","last_name":"Bui Thi Mai","first_name":"Phuong","full_name":"Bui Thi Mai, Phuong"}],"publisher":"Institute of Science and Technology Austria","department":[{"_id":"GradSch"},{"_id":"ChLa"}],"publication_status":"published","year":"2021","publication_identifier":{"issn":["2663-337X"]},"month":"05","language":[{"iso":"eng"}],"supervisor":[{"first_name":"Christoph","last_name":"Lampert","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0001-8622-7887","full_name":"Lampert, Christoph"}],"acknowledged_ssus":[{"_id":"ScienComp"},{"_id":"CampIT"},{"_id":"E-Lib"}],"degree_awarded":"PhD","doi":"10.15479/AT:ISTA:9418","oa":1,"abstract":[{"text":"Deep learning is best known for its empirical success across a wide range of applications spanning computer vision, natural language processing and speech. Of equal significance, though perhaps less known, are its ramifications for learning theory: deep networks have been observed to perform surprisingly well in the high-capacity regime, aka the overfitting or underspecified regime. Classically, this regime on the far right of the bias-variance curve is associated with poor generalisation; however, recent experiments with deep networks challenge this view.\r\n\r\nThis thesis is devoted to investigating various aspects of underspecification in deep learning. First, we argue that deep learning models are underspecified on two levels: a) any given training dataset can be fit by many different functions, and b) any given function can be expressed by many different parameter configurations. We refer to the second kind of underspecification as parameterisation redundancy and we precisely characterise its extent. Second, we characterise the implicit criteria (the inductive bias) that guide learning in the underspecified regime. Specifically, we consider a nonlinear but tractable classification setting, and show that given the choice, neural networks learn classifiers with a large margin. Third, we consider learning scenarios where the inductive bias is not by itself sufficient to deal with underspecification. We then study different ways of ‘tightening the specification’: i) In the setting of representation learning with variational autoencoders, we propose a hand-crafted regulariser based on mutual information. ii) In the setting of binary classification, we consider soft-label (real-valued) supervision. We derive a generalisation bound for linear networks supervised in this way and verify that soft labels facilitate fast learning.
Finally, we explore an application of soft-label supervision to the training of multi-exit models.","lang":"eng"}],"alternative_title":["ISTA Thesis"],"type":"dissertation","oa_version":"Published Version","file":[{"file_id":"9419","relation":"main_file","success":1,"checksum":"4f0abe64114cfed264f9d36e8d1197e3","date_updated":"2021-05-24T11:22:29Z","date_created":"2021-05-24T11:22:29Z","access_level":"open_access","file_name":"mph-thesis-v519-pdfimages.pdf","creator":"bphuong","file_size":2673905,"content_type":"application/pdf"},{"checksum":"f5699e876bc770a9b0df8345a77720a2","date_created":"2021-05-24T11:56:02Z","date_updated":"2021-05-24T11:56:02Z","file_id":"9420","relation":"source_file","creator":"bphuong","file_size":92995100,"content_type":"application/zip","access_level":"closed","file_name":"thesis.zip"}],"title":"Underspecification in deep learning","status":"public","ddc":["000"],"user_id":"c635000d-4b10-11ee-a964-aac5a93f6ac1","_id":"9418","has_accepted_license":"1","article_processing_charge":"No","day":"30","date_published":"2021-05-30T00:00:00Z","page":"125","citation":{"ista":"Phuong M. 2021. Underspecification in deep learning. Institute of Science and Technology Austria.","ieee":"M. Phuong, “Underspecification in deep learning,” Institute of Science and Technology Austria, 2021.","apa":"Phuong, M. (2021). Underspecification in deep learning. Institute of Science and Technology Austria. https://doi.org/10.15479/AT:ISTA:9418","ama":"Phuong M. Underspecification in deep learning. 2021. doi:10.15479/AT:ISTA:9418","chicago":"Phuong, Mary. “Underspecification in Deep Learning.” Institute of Science and Technology Austria, 2021. https://doi.org/10.15479/AT:ISTA:9418.","mla":"Phuong, Mary. Underspecification in Deep Learning. Institute of Science and Technology Austria, 2021, doi:10.15479/AT:ISTA:9418.","short":"M. Phuong, Underspecification in Deep Learning, Institute of Science and Technology Austria, 2021."}},{"status":"public","publication_status":"published","ddc":["000"],"title":"Functional vs. parametric equivalence of ReLU networks","department":[{"_id":"ChLa"}],"year":"2020","_id":"7481","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_created":"2020-02-11T09:07:37Z","date_updated":"2023-09-07T13:29:50Z","oa_version":"Published Version","file":[{"access_level":"open_access","file_name":"main.pdf","content_type":"application/pdf","file_size":405469,"creator":"bphuong","relation":"main_file","file_id":"7482","checksum":"8d372ea5defd8cb8fdc430111ed754a9","date_created":"2020-02-11T09:07:27Z","date_updated":"2020-07-14T12:47:59Z"}],"author":[{"full_name":"Bui Thi Mai, Phuong","last_name":"Bui Thi Mai","first_name":"Phuong","id":"3EC6EE64-F248-11E8-B48F-1D18A9856A87"},{"last_name":"Lampert","first_name":"Christoph","orcid":"0000-0001-8622-7887","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","full_name":"Lampert, Christoph"}],"related_material":{"record":[{"id":"9418","relation":"dissertation_contains","status":"public"}],"link":[{"url":"https://iclr.cc/virtual_2020/poster_Bylx-TNKvH.html","relation":"supplementary_material"}]},"type":"conference","file_date_updated":"2020-07-14T12:47:59Z","abstract":[{"lang":"eng","text":"We address the following question: How redundant is the parameterisation of ReLU networks? Specifically, we consider transformations of the weight space which leave the function implemented by the network intact.
Two such transformations are known for feed-forward architectures: permutation of neurons within a layer, and positive scaling of all incoming weights of a neuron coupled with inverse scaling of its outgoing weights. In this work, we show for architectures with non-increasing widths that permutation and scaling are in fact the only function-preserving weight transformations. For any eligible architecture we give an explicit construction of a neural network such that any other network that implements the same function can be obtained from the original one by the application of permutations and rescaling. The proof relies on a geometric understanding of boundaries between linear regions of ReLU networks, and we hope the developed mathematical tools are of independent interest."}],"quality_controlled":"1","publication":"8th International Conference on Learning Representations","citation":{"mla":"Phuong, Mary, and Christoph Lampert. “Functional vs. Parametric Equivalence of ReLU Networks.” 8th International Conference on Learning Representations, 2020.","short":"M. Phuong, C. Lampert, in:, 8th International Conference on Learning Representations, 2020.","chicago":"Phuong, Mary, and Christoph Lampert. “Functional vs. Parametric Equivalence of ReLU Networks.” In 8th International Conference on Learning Representations, 2020.","ama":"Phuong M, Lampert C. Functional vs. parametric equivalence of ReLU networks. In: 8th International Conference on Learning Representations; 2020.","ista":"Phuong M, Lampert C. 2020. Functional vs. parametric equivalence of ReLU networks. 8th International Conference on Learning Representations. ICLR: International Conference on Learning Representations.","apa":"Phuong, M., & Lampert, C. (2020). Functional vs. parametric equivalence of ReLU networks. In 8th International Conference on Learning Representations. Online.","ieee":"M. Phuong and C. Lampert, “Functional vs. parametric equivalence of ReLU networks,” in 8th International Conference on Learning Representations, Online, 2020."},"oa":1,"language":[{"iso":"eng"}],"conference":{"location":"Online","start_date":"2020-04-27","end_date":"2020-04-30","name":"ICLR: International Conference on Learning Representations"},"date_published":"2020-04-26T00:00:00Z","day":"26","month":"04","has_accepted_license":"1","article_processing_charge":"No"},{"file":[{"creator":"bphuong","content_type":"application/pdf","file_size":735768,"access_level":"open_access","file_name":"main.pdf","checksum":"7b77fb5c2d27c4c37a7612ba46a66117","date_updated":"2020-07-14T12:47:59Z","date_created":"2020-02-11T09:06:39Z","file_id":"7480","relation":"main_file"}],"oa_version":"Submitted Version","user_id":"c635000d-4b10-11ee-a964-aac5a93f6ac1","_id":"7479","status":"public","title":"Distillation-based training for multi-exit architectures","ddc":["000"],"abstract":[{"text":"Multi-exit architectures, in which a stack of processing layers is interleaved with early output layers, allow the processing of a test example to stop early and thus save computation time and/or energy. In this work, we propose a new training procedure for multi-exit architectures based on the principle of knowledge distillation. The method encourages early exits to mimic later, more accurate exits, by matching their output probabilities. Experiments on CIFAR100 and ImageNet show that distillation-based training significantly improves the accuracy of early exits while maintaining state-of-the-art accuracy for late ones.
The method is particularly beneficial when training data is limited and it allows a straightforward extension to semi-supervised learning, i.e. making use of unlabeled data at training time. Moreover, it takes only a few lines to implement and incurs almost no computational overhead at training time, and none at all at test time.","lang":"eng"}],"type":"conference","date_published":"2019-10-01T00:00:00Z","citation":{"ama":"Phuong M, Lampert C. Distillation-based training for multi-exit architectures. In: IEEE International Conference on Computer Vision. Vol 2019-October. IEEE; 2019:1355-1364. doi:10.1109/ICCV.2019.00144","apa":"Phuong, M., & Lampert, C. (2019). Distillation-based training for multi-exit architectures. In IEEE International Conference on Computer Vision (Vol. 2019–October, pp. 1355–1364). Seoul, Korea: IEEE. https://doi.org/10.1109/ICCV.2019.00144","ieee":"M. Phuong and C. Lampert, “Distillation-based training for multi-exit architectures,” in IEEE International Conference on Computer Vision, Seoul, Korea, 2019, vol. 2019–October, pp. 1355–1364.","ista":"Phuong M, Lampert C. 2019. Distillation-based training for multi-exit architectures. IEEE International Conference on Computer Vision. ICCV: International Conference on Computer Vision vol. 2019–October, 1355–1364.","short":"M. Phuong, C. Lampert, in:, IEEE International Conference on Computer Vision, IEEE, 2019, pp. 1355–1364.","mla":"Phuong, Mary, and Christoph Lampert. “Distillation-Based Training for Multi-Exit Architectures.” IEEE International Conference on Computer Vision, vol. 2019–October, IEEE, 2019, pp. 1355–64, doi:10.1109/ICCV.2019.00144.","chicago":"Phuong, Mary, and Christoph Lampert. “Distillation-Based Training for Multi-Exit Architectures.” In IEEE International Conference on Computer Vision, 2019–October:1355–64. IEEE, 2019.
https://doi.org/10.1109/ICCV.2019.00144."},"publication":"IEEE International Conference on Computer Vision","page":"1355-1364","article_processing_charge":"No","has_accepted_license":"1","day":"01","scopus_import":"1","related_material":{"record":[{"id":"9418","relation":"dissertation_contains","status":"public"}]},"author":[{"id":"3EC6EE64-F248-11E8-B48F-1D18A9856A87","first_name":"Phuong","last_name":"Bui Thi Mai","full_name":"Bui Thi Mai, Phuong"},{"full_name":"Lampert, Christoph","last_name":"Lampert","first_name":"Christoph","orcid":"0000-0001-8622-7887","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"}],"volume":"2019-October","date_updated":"2023-09-08T11:11:12Z","date_created":"2020-02-11T09:06:57Z","year":"2019","publisher":"IEEE","department":[{"_id":"ChLa"}],"publication_status":"published","ec_funded":1,"file_date_updated":"2020-07-14T12:47:59Z","doi":"10.1109/ICCV.2019.00144","conference":{"location":"Seoul, Korea","start_date":"2019-10-27","end_date":"2019-11-02","name":"ICCV: International Conference on Computer Vision"},"language":[{"iso":"eng"}],"oa":1,"external_id":{"isi":["000531438101047"]},"project":[{"call_identifier":"FP7","name":"Lifelong Learning of Visual Scene Understanding","_id":"2532554C-B435-11E9-9278-68D0E5697425","grant_number":"308036"}],"quality_controlled":"1","isi":1,"publication_identifier":{"issn":["15505499"],"isbn":["9781728148038"]},"month":"10"},{"file_date_updated":"2020-07-14T12:47:33Z","year":"2019","department":[{"_id":"ChLa"}],"publisher":"ML Research Press","publication_status":"published","author":[{"id":"3EC6EE64-F248-11E8-B48F-1D18A9856A87","last_name":"Bui Thi Mai","first_name":"Phuong","full_name":"Bui Thi Mai, Phuong"},{"full_name":"Lampert, Christoph","last_name":"Lampert","first_name":"Christoph","orcid":"0000-0001-8622-7887","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"}],"volume":97,"date_updated":"2023-10-17T12:31:38Z","date_created":"2019-06-20T18:23:03Z","month":"06","oa":1,"quality_controlled":"1","conference":{"name":"ICML: International Conference on Machine Learning","end_date":"2019-06-15","start_date":"2019-06-10","location":"Long Beach, CA, United States"},"language":[{"iso":"eng"}],"type":"conference","abstract":[{"lang":"eng","text":"Knowledge distillation, i.e. one classifier being trained on the outputs of another classifier, is an empirically very successful technique for knowledge transfer between classifiers. It has even been observed that classifiers learn much faster and more reliably if trained with the outputs of another classifier as soft labels, instead of from ground truth data. So far, however, there is no satisfactory theoretical explanation of this phenomenon. In this work, we provide the first insights into the working mechanisms of distillation by studying the special case of linear and deep linear classifiers. Specifically, we prove a generalization bound that establishes fast convergence of the expected risk of a distillation-trained linear classifier. 
From the bound and its proof we extract three key factors that determine the success of distillation: data geometry – geometric properties of the data distribution, in particular class separation, has an immediate influence on the convergence speed of the risk; optimization bias – gradient descent optimization finds a very favorable minimum of the distillation objective; and strong monotonicity – the expected risk of the student classifier always decreases when the size of the training set grows."}],"_id":"6569","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","intvolume":" 97","title":"Towards understanding knowledge distillation","ddc":["000"],"status":"public","file":[{"relation":"main_file","file_id":"6570","date_created":"2019-06-20T18:22:56Z","date_updated":"2020-07-14T12:47:33Z","checksum":"a66d00e2694d749250f8507f301320ca","file_name":"paper.pdf","access_level":"open_access","content_type":"application/pdf","file_size":686432,"creator":"bphuong"}],"oa_version":"Published Version","scopus_import":"1","has_accepted_license":"1","article_processing_charge":"No","day":"13","citation":{"ista":"Phuong M, Lampert C. 2019. Towards understanding knowledge distillation. Proceedings of the 36th International Conference on Machine Learning. ICML: International Conference on Machine Learning vol. 97, 5142–5151.","apa":"Phuong, M., & Lampert, C. (2019). Towards understanding knowledge distillation. In Proceedings of the 36th International Conference on Machine Learning (Vol. 97, pp. 5142–5151). Long Beach, CA, United States: ML Research Press.","ieee":"M. Phuong and C. Lampert, “Towards understanding knowledge distillation,” in Proceedings of the 36th International Conference on Machine Learning, Long Beach, CA, United States, 2019, vol. 97, pp. 5142–5151.","ama":"Phuong M, Lampert C. Towards understanding knowledge distillation. In: Proceedings of the 36th International Conference on Machine Learning. Vol 97. ML Research Press; 2019:5142-5151.","chicago":"Phuong, Mary, and Christoph Lampert. “Towards Understanding Knowledge Distillation.” In Proceedings of the 36th International Conference on Machine Learning, 97:5142–51. ML Research Press, 2019.","mla":"Phuong, Mary, and Christoph Lampert. “Towards Understanding Knowledge Distillation.” Proceedings of the 36th International Conference on Machine Learning, vol. 97, ML Research Press, 2019, pp. 5142–51.","short":"M. Phuong, C. Lampert, in:, Proceedings of the 36th International Conference on Machine Learning, ML Research Press, 2019, pp. 5142–5151."},"publication":"Proceedings of the 36th International Conference on Machine Learning","page":"5142-5151","date_published":"2019-06-13T00:00:00Z"}]