% Ronanki, Henter, Wu, King (2016): template-based intonation (F0) generation
% using LSTMs; abstract presented at UK Speech, Sheffield.
% The month field uses the standard unquoted three-letter macro so that the
% bibliography style decides how the month is rendered.
@inproceedings{ronanki2016ukspeech,
  author    = {Ronanki, Srikanth and Henter, Gustav Eje and Wu, Zhizheng and King, Simon},
  title     = {A template-based approach for intonation generation using {LSTM}s},
  booktitle = {Proc. UK Speech},
  address   = {Sheffield, UK},
  month     = jun,
  year      = {2016},
  volume    = {5},
  pages     = {22},
  url       = {http://ukspeech.dcs.shef.ac.uk/ukspeech16\_abstracts.pdf},
  abstract  = {The lack of convincing intonation makes current parametric speech synthesis systems sound dull and lifeless, even when trained on expressive speech data. Typically, these systems predict the fundamental frequency (F0) frame-by-frame using regression models. This approach leads to overly-smooth pitch contours and fails to construct an appropriate prosodic structure across the full utterance. In order to capture and reproduce larger-scale pitch patterns, we propose a classification-based approach to automatic F0 generation, where per-syllable pitch-contour templates (from a small, automatically-learned set) are predicted by a recurrent neural network (RNN). The use of templates mitigates the over-smoothing problem: with only six templates, we can reconstruct pitch patterns observed in the data well (small RMSE). The long memory of RNNs in principle enables the prediction of pitch-contour structure spanning the entire utterance. To construct a complete text-to-speech system, this novel F0 prediction system is used alongside separate LSTMs for predicting phone durations and remaining acoustic features. The objective results are encouraging, but listening tests with oracle reconstructions suggest that further work (beyond a simple smoothing) is necessary to reduce subjective artefacts in the template-based F0 reconstructions.}
}