@inproceedings{henter2016robust,
  author    = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon},
  title     = {Robust {TTS} duration modelling using {DNN}s},
  booktitle = {Proc. ICASSP},
  year      = {2016},
  month     = mar,
  address   = {Shanghai, China},
  publisher = {IEEE},
  volume    = {41},
  pages     = {5130--5134},
  doi       = {10.1109/ICASSP.2016.7472655},
  keywords  = {speech synthesis, duration modelling, robust statistics},
  abstract  = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.}
}
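
% A minimal sketch of the robust fitting criterion named in the abstract,
% assuming the standard minimum density power divergence (beta-divergence)
% estimator of Basu et al. (1998); the paper's exact objective may differ:
%
%   \hat{\theta}_\beta = \arg\min_\theta \; \frac{1}{n} \sum_{i=1}^{n}
%     \left[ \int p_\theta(x)^{1+\beta} \, dx
%            - \left(1 + \tfrac{1}{\beta}\right) p_\theta(x_i)^{\beta} \right]
%
% As \beta \to 0^+ this recovers maximum likelihood, while \beta > 0
% downweights low-density (outlying) training points, which is what confers
% robustness to transcription and forced-alignment errors.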