@inproceedings{henter2016robust,
  author    = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon},
  title     = {Robust {TTS} duration modelling using {DNN}s},
  booktitle = {Proc. ICASSP},
  year      = {2016},
  month     = mar,
  address   = {Shanghai, China},
  publisher = {IEEE},
  volume    = {41},
  pages     = {5130--5134},
  doi       = {10.1109/ICASSP.2016.7472655},
  keywords  = {speech synthesis, duration modelling, robust statistics},
  abstract  = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.}
}
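
% A minimal sketch of the robust fitting criterion named in the abstract,
% assuming the standard minimum density power divergence (beta-divergence)
% estimator of Basu et al. (1998); the paper's exact objective may differ:
%
%   \hat{\theta}_\beta = \arg\min_\theta \; \frac{1}{n} \sum_{i=1}^{n}
%     \left[ \int p_\theta(x)^{1+\beta} \, dx
%            - \left(1 + \tfrac{1}{\beta}\right) p_\theta(x_i)^{\beta} \right]
%
% As \beta \to 0^+ this recovers maximum likelihood, while \beta > 0
% downweights low-density (outlying) training points, which is what confers
% robustness to transcription and forced-alignment errors.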